summaryrefslogtreecommitdiffstats
path: root/sys/netinet
diff options
context:
space:
mode:
authorluigi <luigi@FreeBSD.org>2000-01-08 11:24:46 +0000
committerluigi <luigi@FreeBSD.org>2000-01-08 11:24:46 +0000
commit554cd7f4044078eb3ae437f0e33d605b81b87042 (patch)
treeced11a33896913aef6e1122ee97881d0f045baec /sys/netinet
parent8820c08f151489a928659376ea7852c5ab7c823d (diff)
downloadFreeBSD-src-554cd7f4044078eb3ae437f0e33d605b81b87042.zip
FreeBSD-src-554cd7f4044078eb3ae437f0e33d605b81b87042.tar.gz
Implement per-flow queueing. Using a single pipe config rule,
now you can dynamically create rate-limited queues for different flows using masks on dst/src IP, port and protocols. Read the ipfw(8) manpage for details and examples. Restructure the internals of the traffic shaper to use heaps, so that it manages efficiently large number of queues. Fix a bug which was present in the previous versions which could cause, under certain unfrequent conditions, to send out very large bursts of traffic. All in all, this new code is much cleaner than the previous one and should also perform better. Work supported by Akamba Corp.
Diffstat (limited to 'sys/netinet')
-rw-r--r--sys/netinet/ip_dummynet.c1030
-rw-r--r--sys/netinet/ip_dummynet.h141
2 files changed, 765 insertions, 406 deletions
diff --git a/sys/netinet/ip_dummynet.c b/sys/netinet/ip_dummynet.c
index 9791a7f..28e877a 100644
--- a/sys/netinet/ip_dummynet.c
+++ b/sys/netinet/ip_dummynet.c
@@ -1,38 +1,47 @@
/*
- * Copyright (c) 1998 Luigi Rizzo
+ * Copyright (c) 1998-2000 Luigi Rizzo, Universita` di Pisa
+ * Portions Copyright (c) 2000 Akamba Corp.
+ * All rights reserved
*
- * Redistribution and use in source forms, with and without modification,
- * are permitted provided that this entire comment appears intact.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
*
- * Redistribution in binary form may occur without any restrictions.
- * Obviously, it would be nice if you gave credit where credit is due
- * but requiring it would be too onerous.
- *
- * This software is provided ``AS IS'' without any warranties of any kind.
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
*
* $FreeBSD$
*/
+#define DEB(x)
+#define DDB(x) x
+
/*
* This module implements IP dummynet, a bandwidth limiter/delay emulator
* used in conjunction with the ipfw package.
*
- * Changes:
+ * Most important Changes:
*
- * 980821: changed conventions in the queueing logic
- * packets passed from dummynet to ip_in/out are prepended with
- * a vestigial mbuf type MT_DUMMYNET which contains a pointer
- * to the matching rule.
- * ip_input/output will extract the parameters, free the vestigial mbuf,
- * and do the processing.
- *
- * 980519: fixed behaviour when deleting rules.
- * 980518: added splimp()/splx() to protect against races
+ * 000106: large rewrite, use heaps to handle very many pipes.
* 980513: initial release
+ *
+ * include files marked with XXX are probably not needed
*/
-/* include files marked with XXX are probably not needed */
-
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
@@ -60,54 +69,63 @@
#include <net/bridge.h>
#endif
+/*
+ * the addresses/ports of last pkt matched by the firewall are
+ * in this structure. This is so that we can easily find them without
+ * navigating through the mbuf.
+ */
+struct dn_flow_id dn_last_pkt ;
+
+/*
+ * we keep a private variable for the simulation time, but probably
+ * it would be better to use the already existing one "softticks"
+ * (in sys/kern/kern_timer.c)
+ */
+static dn_key curr_time = 0 ; /* current simulation time */
+
+static int dn_hash_size = 64 ; /* default hash size */
+
+/* statistics on number of queue searches and search steps */
+static int searches, search_steps ;
+
+static struct dn_heap ready_heap, extract_heap ;
+static int heap_init(struct dn_heap *h, int size) ;
+static int heap_insert (struct dn_heap *h, dn_key key1, void *p);
+static void heap_extract(struct dn_heap *h);
+static void transmit_event(struct dn_pipe *pipe);
+static void ready_event(struct dn_flow_queue *q);
+
static struct dn_pipe *all_pipes = NULL ; /* list of all pipes */
-static int dn_debug = 0 ; /* verbose */
-static int dn_calls = 0 ; /* number of calls */
-static int dn_idle = 1;
#ifdef SYSCTL_NODE
-SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet");
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, CTLFLAG_RW, &dn_debug, 0, "");
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, calls, CTLFLAG_RD, &dn_calls, 0, "");
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, idle, CTLFLAG_RD, &dn_idle, 0, "");
+SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet,
+ CTLFLAG_RW, 0, "Dummynet");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size,
+ CTLFLAG_RD, &dn_hash_size, 0, "Default hash table size");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, curr_time,
+ CTLFLAG_RD, &curr_time, 0, "Current tick");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, ready_heap,
+ CTLFLAG_RD, &ready_heap.size, 0, "Size of ready heap");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, extract_heap,
+ CTLFLAG_RD, &extract_heap.size, 0, "Size of extract heap");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, searches,
+ CTLFLAG_RD, &searches, 0, "Number of queue searches");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, search_steps,
+ CTLFLAG_RD, &search_steps, 0, "Number of queue search steps");
#endif
static int ip_dn_ctl(struct sockopt *sopt);
static void rt_unref(struct rtentry *);
static void dummynet(void *);
-static void dn_restart(void);
-static void dn_move(struct dn_pipe *pipe, int immediate);
static void dummynet_flush(void);
/*
- * the following is needed when deleting a pipe, because rules can
+ * ip_fw_chain is used when deleting a pipe, because ipfw rules can
* hold references to the pipe.
*/
extern LIST_HEAD (ip_fw_head, ip_fw_chain) ip_fw_chain;
-/*
- * invoked to reschedule the periodic task if necessary.
- * Should only be called when dn_idle = 1 ;
- */
-static void
-dn_restart()
-{
- struct dn_pipe *pipe;
-
- if (!dn_idle)
- return;
-
- for (pipe = all_pipes ; pipe ; pipe = pipe->next ) {
- /* if there any pipe that needs work, restart */
- if (pipe->r.head || pipe->p.head || pipe->numbytes < 0 ) {
- dn_idle = 0;
- timeout(dummynet, NULL, 1);
- return ;
- }
- }
-}
-
static void
rt_unref(struct rtentry *rt)
{
@@ -119,124 +137,192 @@ rt_unref(struct rtentry *rt)
}
/*
- * move packets from R-queue to P-queue
+ * Heap management functions.
+ *
+ * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2.
+ * Some macros help finding parent/children so we can optimize them.
+ #
+ * heap_init() is called to expand the heap when needed.
+ * Increment size in blocks of 256 entries (which make one 4KB page)
+ * XXX failure to allocate a new element is a pretty bad failure
+ * as we basically stall a whole queue forever!!
+ * Returns 1 on error, 0 on success
*/
-static void
-dn_move(struct dn_pipe *pipe, int immediate)
-{
- struct dn_pkt *pkt;
-
- /*
- * consistency check, should catch new pipes which are
- * not initialized properly.
- */
- if ( pipe->p.head == NULL &&
- pipe->ticks_from_last_insert != pipe->delay) {
- printf("Warning, empty pipe and delay %d (should be %d)\n",
- pipe->ticks_from_last_insert, pipe->delay);
- pipe->ticks_from_last_insert = pipe->delay;
+#define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 )
+#define HEAP_LEFT(x) ( 2*(x) + 1 )
+#define HEAP_IS_LEFT(x) ( (x) & 1 )
+#define HEAP_RIGHT(x) ( 2*(x) + 1 )
+#define HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; }
+#define HEAP_INCREMENT 255
+
+static int
+heap_init(struct dn_heap *h, int new_size)
+{
+ struct dn_heap_entry *p;
+
+ if (h->size >= new_size ) {
+ printf("heap_init, Bogus call, have %d want %d\n",
+ h->size, new_size);
+ return 0 ;
+ }
+ new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT ;
+ p = malloc(new_size * sizeof(*p), M_IPFW, M_DONTWAIT );
+ if (p == NULL) {
+ printf(" heap_init, resize %d failed\n", new_size );
+ return 1 ; /* error */
+ }
+ if (h->size > 0) {
+ bcopy(h->p, p, h->size * sizeof(*p) );
+ free(h->p, M_IPFW);
}
- /* this ought to go in dn_dequeue() */
- if (!immediate && pipe->ticks_from_last_insert < pipe->delay)
- pipe->ticks_from_last_insert++;
- if ((pkt = pipe->r.head) != NULL) {
+ h->p = p ;
+ h->size = new_size ;
+ return 0 ;
+}
+
+/*
+ * Insert element in heap. Normally, p != NULL, we insert p in
+ * a new position and bubble up. If p == NULL, then the element is
+ * already in place, and key is the position where to start the
+ * bubble-up.
+ * Returns 1 on failure (cannot allocate new heap entry)
+ */
+static int
+heap_insert(struct dn_heap *h, dn_key key1, void *p)
+{
+ int son = h->elements ;
+
+ if (p == NULL) /* data already there, set starting point */
+ son = key1 ;
+ else { /* insert new element at the end, possibly resize */
+ son = h->elements ;
+ if (son == h->size) /* need resize... */
+ if (heap_init(h, h->elements+1) )
+ return 1 ; /* failure... */
+ h->p[son].object = p ;
+ h->p[son].key = key1 ;
+ h->elements++ ;
+ }
+ while (son > 0) { /* bubble up */
+ int father = HEAP_FATHER(son) ;
+ struct dn_heap_entry tmp ;
+
+ if (DN_KEY_LT( h->p[father].key, h->p[son].key ) )
+ break ; /* found right position */
+ /* son smaller than father, swap and try again */
+ HEAP_SWAP(h->p[son], h->p[father], tmp) ;
+ son = father ;
+ }
+ return 0 ;
+}
+
+/*
+ * remove top element from heap
+ */
+static void
+heap_extract(struct dn_heap *h)
+{
+ int child, father, max = h->elements - 1 ;
+ if (max < 0)
+ return ;
+
+ /* move up smallest child */
+ father = 0 ;
+ child = HEAP_LEFT(father) ; /* left child */
+ while (child <= max) { /* valid entry */
+ if (child != max && DN_KEY_LT(h->p[child+1].key, h->p[child].key) )
+ child = child+1 ; /* take right child, otherwise left */
+ h->p[father] = h->p[child] ;
+ father = child ;
+ child = HEAP_LEFT(child) ; /* left child for next loop */
+ }
+ h->elements-- ;
+ if (father != max) {
/*
- * Move at most numbytes bytes from src and move to dst.
- * delay is set to ticks_from_last_insert, which
- * is reset after the first insertion;
+ * Fill hole with last entry and bubble up, reusing the insert code
*/
- while ( pkt ) {
- int len = pkt->dn_m->m_pkthdr.len ;
+ h->p[father] = h->p[max] ;
+ heap_insert(h, father, NULL); /* this one cannot fail */
+ }
+}
- /*
- * queue limitation: pass packets down if the len is
- * such that the pkt would go out before the next tick.
- */
- if (pipe->bandwidth) {
- int len_scaled = len*8*hz ;
- /* numbytes is in bit/sec, scaled 8*hz ... */
- if (pipe->numbytes < len_scaled)
- break;
- pipe->numbytes -= len_scaled;
- }
- pipe->r_len--; /* elements in queue */
- pipe->r_len_bytes -= len ;
+/*
+ * heapify() will reorganize data inside an array to maintain the
+ * heap property. It is needed when we delete a bunch of entries.
+ */
+static void
+heapify(struct dn_heap *h)
+{
+ int father, i ;
+ struct dn_heap_entry tmp ;
- /*
- * to add delay jitter, must act here. A lower value
- * (bounded to 0) means lower delay.
- */
- pkt->delay = pipe->ticks_from_last_insert;
- pipe->ticks_from_last_insert = 0;
- /* compensate the decrement done next in dn_dequeue */
- if (!immediate && pkt->delay >0 && pipe->p.head==NULL)
- pkt->delay++;
- if (pipe->p.head == NULL)
- pipe->p.head = pkt;
- else
- (struct dn_pkt *)pipe->p.tail->dn_next = pkt;
- pipe->p.tail = pkt;
- pkt = (struct dn_pkt *)pkt->dn_next;
- pipe->p.tail->dn_next = NULL;
- }
- pipe->r.head = pkt;
-
- /*** XXX just a sanity check */
- if ( ( pkt == NULL && pipe->r_len != 0) ||
- ( pkt != NULL && pipe->r_len == 0) )
- printf("-- Warning, pipe head %p len %d\n",
- (void *)pkt, pipe->r_len);
+ for (i = h->elements - 1 ; i > 0 ; i-- ) {
+ father = HEAP_FATHER(i) ;
+ if ( DN_KEY_LT(h->p[i].key, h->p[father].key) )
+ HEAP_SWAP(h->p[father], h->p[i], tmp) ;
}
-
- /*
- * deliver packets downstream after the delay in the P-queue.
- */
+}
+/*
+ * --- end of heap management functions ---
+ */
- if (pipe->p.head == NULL)
- return;
- if (!immediate)
- pipe->p.head->delay--;
- while ( (pkt = pipe->p.head) && pkt->delay < 1) {
+/*
+ * Scheduler functions -- transmit_event(), ready_event()
+ *
+ * transmit_event() is called when the delay-line needs to enter
+ * the scheduler, either because of existing pkts getting ready,
+ * or new packets entering the queue. The event handled is the delivery
+ * time of the packet.
+ *
+ * ready_event() does something similar with flow queues, and the
+ * event handled is the finish time of the head pkt.
+ *
+ * In both cases, we make sure that the data structures are consistent
+ * before passing pkts out, because this might trigger recursive
+ * invocations of the procedures.
+ */
+static void
+transmit_event(struct dn_pipe *pipe)
+{
+ struct dn_pkt *pkt ;
+
+ while ( (pkt = pipe->p.head) && DN_KEY_LEQ(pkt->output_time, curr_time) ) {
/*
- * first unlink, then call procedures since ip_input()
- * can result in a call to ip_output cnd viceversa,
- * thus causing nested calls
+ * first unlink, then call procedures, since ip_input() can invoke
+ * ip_output() and viceversa, thus causing nested calls
*/
- pipe->p.head = (struct dn_pkt *) pkt->dn_next ;
+ pipe->p.head = DN_NEXT(pkt) ;
/*
- * the trick to avoid flow-id settings here is to prepend a
- * vestigial mbuf to the packet, with the following values:
- * m_type = MT_DUMMYNET
- * m_next = the actual mbuf to be processed by ip_input/output
- * m_data = the matching rule
- * The vestigial element is the same memory area used by
- * the dn_pkt, and IS FREED HERE because it can contain
- * parameters passed to the called routine. The buffer IS NOT
- * A REAL MBUF, just a block of memory acquired with malloc().
+ * The actual mbuf is preceded by a struct dn_pkt, resembling an mbuf
+ * (NOT A REAL one, just a small block of malloc'ed memory) with
+ * m_type = MT_DUMMYNET
+ * m_next = actual mbuf to be processed by ip_input/output
+ * m_data = the matching rule
+ * and some other fields.
+ * The block IS FREED HERE because it contains parameters passed
+ * to the called routine.
*/
switch (pkt->dn_dir) {
- case DN_TO_IP_OUT: {
- struct route *ro = &(pkt->ro) ;
-
- (void)ip_output((struct mbuf *)pkt, (struct mbuf *)pkt->ifp,
- ro, pkt->flags, NULL);
- rt_unref (ro->ro_rt) ;
- }
+ case DN_TO_IP_OUT:
+ (void)ip_output((struct mbuf *)pkt, NULL, NULL, 0, NULL);
+ rt_unref (pkt->ro.ro_rt) ;
break ;
+
case DN_TO_IP_IN :
ip_input((struct mbuf *)pkt) ;
break ;
+
#ifdef BRIDGE
case DN_TO_BDG_FWD : {
struct mbuf *m = (struct mbuf *)pkt ;
-
bdg_forward(&m, pkt->ifp);
if (m)
m_freem(m);
}
break ;
#endif
+
default:
printf("dummynet: bad switch %d!\n", pkt->dn_dir);
m_freem(pkt->dn_m);
@@ -244,47 +330,191 @@ dn_move(struct dn_pipe *pipe, int immediate)
}
FREE(pkt, M_IPFW);
}
+ /* if there are leftover packets, put into the heap for next event */
+ if ( (pkt = pipe->p.head) )
+ heap_insert(&extract_heap, pkt->output_time, pipe ) ;
+ /* XXX should check errors on heap_insert, by draining the
+ * whole pipe p and hoping in the future we are more successful
+ */
}
+
/*
- * this is the periodic task that moves packets between the R-
- * and the P- queue
+ * ready_event() is invoked every time the queue must enter the
+ * scheduler, either because the first packet arrives, or because
+ * a previously scheduled event fired.
+ * On invokation, drain as many pkts as possible (could be 0) and then
+ * if there are leftover packets reinsert the pkt in the scheduler.
*/
-/*ARGSUSED*/
-void
-dummynet(void * __unused unused)
+static void
+ready_event(struct dn_flow_queue *q)
{
- struct dn_pipe *p ;
- int s ;
+ struct dn_pkt *pkt;
+ struct dn_pipe *p = q->p ;
+ int p_was_empty = (p->p.head == NULL) ;
- dn_calls++ ;
- for (p = all_pipes ; p ; p = p->next ) {
+ while ( (pkt = q->r.head) != NULL ) {
+ int len = pkt->dn_m->m_pkthdr.len;
+ int len_scaled = p->bandwidth ? len*8*hz : 0 ;
/*
- * Increment the amount of data that can be sent. However,
- * don't do that if the channel is idle
- * (r.head == NULL && numbytes >= bandwidth).
- * This bug fix is from tim shepard (shep@bbn.com)
+ * bandwidth==0 (no limit) means we can drain as many pkts as
+ * needed from the queue. Setting len_scaled = 0 does the job.
*/
- s = splimp();
- if (p->r.head != NULL || p->numbytes < p->bandwidth )
- p->numbytes += p->bandwidth ;
- dn_move(p, 0); /* is it really 0 (also below) ? */
- splx(s);
+ if (len_scaled > q->numbytes )
+ break ;
+ /*
+ * extract pkt from queue, compute output time (could be now)
+ * and put into delay line (p_queue)
+ */
+ q->numbytes -= len_scaled ;
+ q->r.head = DN_NEXT(pkt) ;
+ q->len-- ;
+ q->len_bytes -= len ;
+
+ pkt->output_time = curr_time + p->delay ;
+
+ if (p->p.head == NULL)
+ p->p.head = pkt;
+ else
+ DN_NEXT(p->p.tail) = pkt;
+ p->p.tail = pkt;
+ DN_NEXT(p->p.tail) = NULL;
}
-
/*
- * finally, if some queue has data, restart the timer.
+ * If the delay line was empty call transmit_event(p) now.
+ * Otherwise, the scheduler will take care of it.
*/
- s = splimp();
- dn_idle = 1;
- dn_restart();
+ if (p_was_empty)
+ transmit_event(p);
+ /*
+ * If we have more packets queued, schedule next ready event
+ * (can only occur when bandwidth != 0, otherwise we would have
+ * flushed the whole queue in the previous loop).
+ * To this purpose compute how many ticks to go for the next
+ * event, accounting for packet size and residual credit. This means
+ * we compute the finish time of the packet.
+ */
+ if ( (pkt = q->r.head) != NULL ) { /* this implies bandwidth != 0 */
+ dn_key t ;
+ t = (pkt->dn_m->m_pkthdr.len*8*hz - q->numbytes + p->bandwidth - 1 ) /
+ p->bandwidth ;
+ q->numbytes += t * p->bandwidth ;
+ heap_insert(&ready_heap, curr_time + t, (void *)q );
+ /* XXX should check errors on heap_insert, and drain the whole
+ * queue on error hoping next time we are luckier.
+ */
+ }
+}
+
+/*
+ * this is called once per tick, or HZ times per second. It is used to
+ * increment the current tick counter and schedule expired events.
+ */
+static void
+dummynet(void * __unused unused)
+{
+ void *p ; /* generic parameter to handler */
+ struct dn_heap *h ;
+ int s ;
+
+ s = splnet(); /* avoid network interrupts... */
+ curr_time++ ;
+ h = &ready_heap ;
+ while (h->elements > 0 && DN_KEY_LEQ(h->p[0].key, curr_time) ) {
+ /*
+ * XXX if the event is late, we should probably credit the queue
+ * by q->p->bandwidth * (delta_ticks). On the other hand, i dont
+ * think this can ever occur with this code (i.e. curr_time will
+ * still be incremented by one at each tick. Things might be
+ * different if we were using the counter from the high priority
+ * timer.
+ */
+ if (h->p[0].key != curr_time)
+ printf("-- dummynet: warning, event is %d ticks late\n",
+ curr_time - h->p[0].key);
+ p = h->p[0].object ;
+ heap_extract(h); /* need to extract before processing */
+ ready_event(p) ;
+ }
+ h = &extract_heap ;
+ while (h->elements > 0 && DN_KEY_LEQ(h->p[0].key, curr_time) ) {
+ if (h->p[0].key != curr_time) /* XXX same as above */
+ printf("-- dummynet: warning, event is %d ticks late\n",
+ curr_time - h->p[0].key);
+ p = h->p[0].object ;
+ heap_extract(&extract_heap);
+ transmit_event(p);
+ }
splx(s);
+ timeout(dummynet, NULL, 1);
+}
+
+/*
+ * Given a pipe and a pkt in dn_last_pkt, find a matching queue
+ * after appropriate masking. The queue is moved to front
+ * so that further searches take less time.
+ * XXX if the queue is longer than some threshold should consider
+ * purging old unused entries. They will get in the way every time
+ * we have a new flow.
+ */
+static struct dn_flow_queue *
+find_queue(struct dn_pipe *pipe)
+{
+ int i = 0 ; /* we need i and q for new allocations */
+ struct dn_flow_queue *q, *prev;
+
+ if ( !(pipe->flags & DN_HAVE_FLOW_MASK) )
+ q = pipe->rq[0] ;
+ else {
+ /* first, do the masking */
+ dn_last_pkt.dst_ip &= pipe->flow_mask.dst_ip ;
+ dn_last_pkt.src_ip &= pipe->flow_mask.src_ip ;
+ dn_last_pkt.dst_port &= pipe->flow_mask.dst_port ;
+ dn_last_pkt.src_port &= pipe->flow_mask.src_port ;
+ dn_last_pkt.proto &= pipe->flow_mask.proto ;
+ /* then, hash function */
+ i = ( (dn_last_pkt.dst_ip) & 0xffff ) ^
+ ( (dn_last_pkt.dst_ip >> 15) & 0xffff ) ^
+ ( (dn_last_pkt.src_ip << 1) & 0xffff ) ^
+ ( (dn_last_pkt.src_ip >> 16 ) & 0xffff ) ^
+ (dn_last_pkt.dst_port << 1) ^ (dn_last_pkt.src_port) ^
+ (dn_last_pkt.proto );
+ i = i % pipe->rq_size ;
+ /* finally, scan the current list for a match */
+ searches++ ;
+ for (prev=NULL, q = pipe->rq[i] ; q ; prev = q , q = q->next ) {
+ search_steps++;
+ if (bcmp(&dn_last_pkt, &(q->id), sizeof(q->id) ) == 0)
+ break ; /* found */
+ }
+ if (q && prev != NULL) { /* found and not in front */
+ prev->next = q->next ;
+ q->next = pipe->rq[i] ;
+ pipe->rq[i] = q ;
+ }
+ }
+ if (q == NULL) { /* no match, need to allocate a new entry */
+ q = malloc(sizeof(*q), M_IPFW, M_DONTWAIT) ;
+ if (q == NULL) {
+ printf("sorry, cannot allocate new flow\n");
+ return NULL ;
+ }
+ bzero(q, sizeof(*q) ); /* needed */
+ q->id = dn_last_pkt ;
+ q->p = pipe ;
+ q->hash_slot = i ;
+ q->next = pipe->rq[i] ;
+ pipe->rq[i] = q ;
+ pipe->rq_elements++ ;
+ DEB(printf("++ new queue (%d) for 0x%08x/0x%04x -> 0x%08x/0x%04x\n",
+ pipe->rq_elements,
+ dn_last_pkt.src_ip, dn_last_pkt.src_port,
+ dn_last_pkt.dst_ip, dn_last_pkt.dst_port); )
+ }
+ return q ;
}
/*
* dummynet hook for packets.
- * input and output use the same code, so i use bit 16 in the pipe
- * number to chose the direction: 1 for output packets, 0 for input.
- * for input, only m is significant. For output, also the others.
*/
int
dummynet_io(int pipe_nr, int dir,
@@ -293,174 +523,196 @@ dummynet_io(int pipe_nr, int dir,
struct ip_fw_chain *rule, int flags)
{
struct dn_pkt *pkt;
- struct dn_pipe *pipe;
+ struct dn_pipe *p;
int len = m->m_pkthdr.len ;
+ struct dn_flow_queue *q = NULL ;
+ int s ;
- int s=splimp();
+ s = splimp();
+ /* XXX check the spl protection. It might be unnecessary since we
+ * run this at splnet() already.
+ */
+
+ DEB(printf("-- last_pkt dst 0x%08x/0x%04x src 0x%08x/0x%04x\n",
+ dn_last_pkt.dst_ip, dn_last_pkt.dst_port,
+ dn_last_pkt.src_ip, dn_last_pkt.src_port);)
pipe_nr &= 0xffff ;
/*
* locate pipe. First time is expensive, next have direct access.
*/
-
- if ( (pipe = rule->rule->pipe_ptr) == NULL ) {
- for (pipe=all_pipes; pipe && pipe->pipe_nr !=pipe_nr; pipe=pipe->next)
+ if ( (p = rule->rule->pipe_ptr) == NULL ) {
+ for (p = all_pipes; p && p->pipe_nr != pipe_nr; p = p->next)
;
- if (pipe == NULL) {
- splx(s);
- if (dn_debug)
- printf("warning, pkt for no pipe %d\n", pipe_nr);
- m_freem(m);
- return 0 ;
- } else
- rule->rule->pipe_ptr = pipe ;
+ if (p == NULL)
+ goto dropit ; /* this pipe does not exist! */
+ rule->rule->pipe_ptr = p ; /* record pipe ptr for the future */
}
-
+ q = find_queue(p);
/*
- * should i drop ?
- * This section implements random packet drop.
+ * update statistics, then do various check on reasons to drop pkt
*/
- if ( (pipe->plr && random() < pipe->plr) ||
- (pipe->queue_size && pipe->r_len >= pipe->queue_size) ||
- (pipe->queue_size_bytes &&
- len + pipe->r_len_bytes > pipe->queue_size_bytes) ||
- (pkt = (struct dn_pkt *)malloc(sizeof (*pkt),
- M_IPFW, M_NOWAIT) ) == NULL ) {
- splx(s);
- if (dn_debug)
- printf("-- dummynet: drop from pipe %d, have %d pks, %d bytes\n",
- pipe_nr, pipe->r_len, pipe->r_len_bytes);
- pipe->r_drops++ ;
- m_freem(m);
- return 0 ; /* XXX error */
- }
- bzero(pkt, sizeof(*pkt) );
- /* build and enqueue packet */
+ if ( q == NULL )
+ goto dropit ; /* cannot allocate queue */
+ q->tot_bytes += len ;
+ q->tot_pkts++ ;
+ if ( p->plr && random() < p->plr )
+ goto dropit ; /* random pkt drop */
+ if ( p->queue_size && q->len >= p->queue_size)
+ goto dropit ; /* queue count overflow */
+ if ( p->queue_size_bytes && len + q->len_bytes > p->queue_size_bytes)
+ goto dropit ; /* queue size overflow */
+ /*
+ * can implement RED drops here if needed.
+ */
+
+ pkt = (struct dn_pkt *)malloc(sizeof (*pkt), M_IPFW, M_NOWAIT) ;
+ if ( pkt == NULL )
+ goto dropit ; /* cannot allocate packet header */
+ /* ok, i can handle the pkt now... */
+ bzero(pkt, sizeof(*pkt) ); /* XXX expensive, see if we can remove it*/
+ /* build and enqueue packet + parameters */
pkt->hdr.mh_type = MT_DUMMYNET ;
(struct ip_fw_chain *)pkt->hdr.mh_data = rule ;
- pkt->dn_next = NULL;
+ DN_NEXT(pkt) = NULL;
pkt->dn_m = m;
pkt->dn_dir = dir ;
- pkt->delay = 0;
pkt->ifp = ifp;
if (dir == DN_TO_IP_OUT) {
/*
- * we need to copy *ro because for icmp pkts (and maybe others)
- * the caller passed a pointer into the stack.
+ * We need to copy *ro because for ICMP pkts (and maybe others)
+ * the caller passed a pointer into the stack; and, dst might
+ * also be a pointer into *ro so it needs to be updated.
*/
pkt->ro = *ro;
if (ro->ro_rt)
ro->ro_rt->rt_refcnt++ ; /* XXX */
- /*
- * and again, dst might be a pointer into *ro...
- */
if (dst == (struct sockaddr_in *)&ro->ro_dst) /* dst points into ro */
dst = (struct sockaddr_in *)&(pkt->ro.ro_dst) ;
- pkt->dn_dst = dst; /* XXX this can't be right! */
- /*
- * 'flags' also need to be kept for later packet treatment
- * such as IPSEC. IPSEC consider sending packet's m->m_pkthdr.rcvif
- * as 'socket *' at ip_output(), if IP_SOCKINMRCVIF is set.
- */
- pkt->flags = flags;
+ pkt->dn_dst = dst;
+ pkt->flags = flags ;
}
- if (pipe->r.head == NULL)
- pipe->r.head = pkt;
+ if (q->r.head == NULL)
+ q->r.head = pkt;
else
- (struct dn_pkt *)pipe->r.tail->dn_next = pkt;
- pipe->r.tail = pkt;
- pipe->r_len++;
- pipe->r_len_bytes += len ;
+ DN_NEXT(q->r.tail) = pkt;
+ q->r.tail = pkt;
+ q->len++;
+ q->len_bytes += len ;
- /*
- * here we could implement RED if we like to
+ /*
+ * If queue was empty (this is first pkt) then call ready_event()
+ * now to make the pkt go out at the right time. Otherwise we are done,
+ * as there must be a ready event already scheduled.
*/
-
- if (pipe->r.head == pkt) { /* process immediately */
- dn_move(pipe, 1);
- }
- if (dn_idle)
- dn_restart();
+ if (q->r.head == pkt) /* r_queue was empty */
+ ready_event( q );
splx(s);
return 0;
+
+dropit:
+ splx(s);
+ if (q)
+ q->drops++ ;
+ m_freem(m);
+ return 0 ; /* XXX should I return an error ? */
}
/*
+ * below, the rt_unref is only needed when (pkt->dn_dir == DN_TO_IP_OUT)
+ * Doing this would probably save us the initial bzero of dn_pkt
+ */
+#define DN_FREE_PKT(pkt) { \
+ struct dn_pkt *n = pkt ; \
+ rt_unref ( n->ro.ro_rt ) ; \
+ m_freem(n->dn_m); \
+ pkt = DN_NEXT(n) ; \
+ free(n, M_IPFW) ; }
+/*
* dispose all packets queued on a pipe
*/
static void
purge_pipe(struct dn_pipe *pipe)
{
- struct dn_pkt *pkt, *n ;
- struct rtentry *tmp_rt ;
-
- for (pkt = pipe->r.head ; pkt ; ) {
- rt_unref (tmp_rt = pkt->ro.ro_rt ) ;
- m_freem(pkt->dn_m);
- n = pkt ;
- pkt = (struct dn_pkt *)pkt->dn_next ;
- free(n, M_IPFW) ;
- }
- for (pkt = pipe->p.head ; pkt ; ) {
- rt_unref (tmp_rt = pkt->ro.ro_rt ) ;
- m_freem(pkt->dn_m);
- n = pkt ;
- pkt = (struct dn_pkt *)pkt->dn_next ;
- free(n, M_IPFW) ;
- }
+ struct dn_pkt *pkt ;
+ struct dn_flow_queue *q, *qn ;
+ int i ;
+
+ for (i = 0 ; i < pipe->rq_size ; i++ )
+ for (q = pipe->rq[i] ; q ; q = qn ) {
+ for (pkt = q->r.head ; pkt ; )
+ DN_FREE_PKT(pkt) ;
+ qn = q->next ;
+ free(q, M_IPFW);
+ }
+ for (pkt = pipe->p.head ; pkt ; )
+ DN_FREE_PKT(pkt) ;
}
/*
- * delete all pipes returning memory
+ * Delete all pipes and heaps returning memory. Must also
+ * remove references from all ipfw rules to all pipes.
*/
static void
dummynet_flush()
{
- struct dn_pipe *q, *p = all_pipes ;
- int s = splnet() ;
+ struct dn_pipe *curr_p, *p ;
+ struct ip_fw_chain *chain ;
+ int s ;
- all_pipes = NULL ;
+ s = splnet() ;
+
+ /* remove all references to pipes ...*/
+ for (chain= ip_fw_chain.lh_first ; chain; chain = chain->chain.le_next)
+ chain->rule->pipe_ptr = NULL ;
+ /* prevent future matches... */
+ p = all_pipes ;
+ all_pipes = NULL ;
+ /* and free heaps so we don't have unwanted events */
+ if (ready_heap.size >0 )
+ free(ready_heap.p, M_IPFW);
+ ready_heap.elements = ready_heap.size = 0 ;
+ if (extract_heap.size >0 )
+ free(extract_heap.p, M_IPFW);
+ extract_heap.elements = extract_heap.size = 0 ;
splx(s) ;
/*
- * purge all queued pkts and delete all pipes
+ * Now purge all queued pkts and delete all pipes
*/
for ( ; p ; ) {
purge_pipe(p);
- q = p ;
+ curr_p = p ;
p = p->next ;
- free(q, M_IPFW);
+ free(curr_p->rq, M_IPFW);
+ free(curr_p, M_IPFW);
}
}
extern struct ip_fw_chain *ip_fw_default_rule ;
/*
- * when a firewall rule is deleted, scan all pipes and remove the flow-id
+ * when a firewall rule is deleted, scan all queues and remove the flow-id
* from packets matching this rule.
*/
void
dn_rule_delete(void *r)
{
struct dn_pipe *p ;
- int matches = 0 ;
+ struct dn_flow_queue *q ;
+ struct dn_pkt *pkt ;
+ int i ;
for ( p = all_pipes ; p ; p = p->next ) {
- struct dn_pkt *x ;
- for (x = p->r.head ; x ; x = (struct dn_pkt *)x->dn_next )
- if (x->hdr.mh_data == r) {
- matches++ ;
- x->hdr.mh_data = (void *)ip_fw_default_rule ;
- }
- for (x = p->p.head ; x ; x = (struct dn_pkt *)x->dn_next )
- if (x->hdr.mh_data == r) {
- matches++ ;
- x->hdr.mh_data = (void *)ip_fw_default_rule ;
- }
+ for (i = 0 ; i < p->rq_size ; i++)
+ for (q = p->rq[i] ; q ; q = q->next )
+ for (pkt = q->r.head ; pkt ; pkt = DN_NEXT(pkt) )
+ if (pkt->hdr.mh_data == r)
+ pkt->hdr.mh_data = (void *)ip_fw_default_rule ;
+ for (pkt = p->p.head ; pkt ; pkt = DN_NEXT(pkt) )
+ if (pkt->hdr.mh_data == r)
+ pkt->hdr.mh_data = (void *)ip_fw_default_rule ;
}
- printf("dn_rule_delete, r %p, default %p%s, %d matches\n",
- (void *)r, (void *)ip_fw_default_rule,
- r == ip_fw_default_rule ? " AARGH!":"", matches);
}
/*
@@ -472,7 +724,7 @@ ip_dn_ctl(struct sockopt *sopt)
{
int error = 0 ;
size_t size ;
- char *buf, *bp ;
+ char *buf, *bp ; /* bp is the "copy-pointer" */
struct dn_pipe *p, tmp_pipe ;
struct dn_pipe *x, *a, *b ;
@@ -487,92 +739,116 @@ ip_dn_ctl(struct sockopt *sopt)
case IP_DUMMYNET_GET :
for (p = all_pipes, size = 0 ; p ; p = p->next )
- size += sizeof( *p ) ;
+ size += sizeof( *p ) +
+ p->rq_elements * sizeof(struct dn_flow_queue);
buf = malloc(size, M_TEMP, M_WAITOK);
if (buf == 0) {
error = ENOBUFS ;
break ;
}
for (p = all_pipes, bp = buf ; p ; p = p->next ) {
- struct dn_pipe *q = (struct dn_pipe *)bp ;
+ int i ;
+ struct dn_pipe *pipe_bp = (struct dn_pipe *)bp ;
+ struct dn_flow_queue *q;
- bcopy(p, bp, sizeof( *p ) );
/*
- * return bw and delay in bits/s and ms, respectively
+ * copy the pipe descriptor into *bp, convert delay back to ms,
+ * then copy the queue descriptor(s) one at a time.
*/
- q->delay = (q->delay * 1000) / hz ;
+ bcopy(p, bp, sizeof( *p ) );
+ pipe_bp->delay = (pipe_bp->delay * 1000) / hz ;
bp += sizeof( *p ) ;
+ for (i = 0 ; i < p->rq_size ; i++)
+ for (q = p->rq[i] ; q ; q = q->next, bp += sizeof(*q) )
+ bcopy(q, bp, sizeof( *q ) );
}
error = sooptcopyout(sopt, buf, size);
FREE(buf, M_TEMP);
break ;
+
case IP_DUMMYNET_FLUSH :
- dummynet_flush() ;
+ dummynet_flush() ;
break ;
+
case IP_DUMMYNET_CONFIGURE :
p = &tmp_pipe ;
error = sooptcopyin(sopt, p, sizeof *p, sizeof *p);
if (error)
break ;
- /*
- * The config program passes parameters as follows:
- * bandwidth = bits/second (0 = no limits);
- * delay = ms
- * must be translated in ticks.
- * queue_size = slots (0 = no limit)
- * queue_size_bytes = bytes (0 = no limit)
- * only one can be set, must be bound-checked
- */
- p->delay = ( p->delay * hz ) / 1000 ;
- if (p->queue_size == 0 && p->queue_size_bytes == 0)
- p->queue_size = 50 ;
- if (p->queue_size != 0 ) /* buffers are prevailing */
- p->queue_size_bytes = 0 ;
- if (p->queue_size > 100)
- p->queue_size = 50 ;
- if (p->queue_size_bytes > 1024*1024)
- p->queue_size_bytes = 1024*1024 ;
-#if 0
- printf("ip_dn: config pipe %d %d bit/s %d ms %d bufs\n",
- p->pipe_nr,
- p->bandwidth * 8 * hz ,
- p->delay * 1000 / hz , p->queue_size);
-#endif
- for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ;
+ /*
+ * The config program passes parameters as follows:
+ * bandwidth = bits/second (0 means no limits);
+ * delay = millisec., must be translated into ticks.
+ * queue_size = slots (0 means no limit)
+ * queue_size_bytes = bytes (0 means no limit)
+ * only one can be set, must be bound-checked
+ */
+ p->delay = ( p->delay * hz ) / 1000 ;
+ if (p->queue_size == 0 && p->queue_size_bytes == 0)
+ p->queue_size = 50 ;
+ if (p->queue_size != 0 ) /* buffers are prevailing */
+ p->queue_size_bytes = 0 ;
+ if (p->queue_size > 100)
+ p->queue_size = 50 ;
+ if (p->queue_size_bytes > 1024*1024)
+ p->queue_size_bytes = 1024*1024 ;
+ for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ;
a = b , b = b->next) ;
- if (b && b->pipe_nr == p->pipe_nr) {
- /* XXX should spl and flush old pipe... */
- b->bandwidth = p->bandwidth ;
- b->delay = p->delay ;
- b->ticks_from_last_insert = p->delay ;
- b->queue_size = p->queue_size ;
- b->queue_size_bytes = p->queue_size_bytes ;
- b->plr = p->plr ;
- } else {
- int s ;
- x = malloc(sizeof(struct dn_pipe), M_IPFW, M_DONTWAIT) ;
- if (x == NULL) {
- printf("ip_dummynet.c: sorry no memory\n");
+ if (b && b->pipe_nr == p->pipe_nr) {
+ b->bandwidth = p->bandwidth ;
+ b->delay = p->delay ;
+ b->queue_size = p->queue_size ;
+ b->queue_size_bytes = p->queue_size_bytes ;
+ b->plr = p->plr ;
+ b->flow_mask = p->flow_mask ;
+ b->flags = p->flags ;
+ } else { /* completely new pipe */
+ int s ;
+ x = malloc(sizeof(struct dn_pipe), M_IPFW, M_DONTWAIT) ;
+ if (x == NULL) {
+ printf("ip_dummynet.c: no memory for new pipe\n");
error = ENOSPC ;
break ;
- }
- bzero(x, sizeof(*x) );
- x->bandwidth = p->bandwidth ;
- x->delay = p->delay ;
- x->ticks_from_last_insert = p->delay ;
- x->pipe_nr = p->pipe_nr ;
- x->queue_size = p->queue_size ;
- x->queue_size_bytes = p->queue_size_bytes ;
- x->plr = p->plr ;
-
- s = splnet() ;
- x->next = b ;
- if (a == NULL)
- all_pipes = x ;
- else
- a->next = x ;
- splx(s);
}
+ bzero(x, sizeof(*x) );
+ x->bandwidth = p->bandwidth ;
+ x->delay = p->delay ;
+ x->pipe_nr = p->pipe_nr ;
+ x->queue_size = p->queue_size ;
+ x->queue_size_bytes = p->queue_size_bytes ;
+ x->plr = p->plr ;
+ x->flow_mask = p->flow_mask ;
+ x->flags = p->flags ;
+ if (x->flags & DN_HAVE_FLOW_MASK) {/* allocate some slots */
+ int l = p->rq_size ;
+ if (l == 0)
+ l = dn_hash_size ;
+ if (l < 4)
+ l = 4 ;
+ else if (l > 1024)
+ l = 1024 ;
+ x->rq_size = l ;
+ } else /* one is enough for null mask */
+ x->rq_size = 1 ;
+ x->rq = malloc(x->rq_size * sizeof(struct dn_flow_queue *),
+ M_IPFW, M_DONTWAIT) ;
+ if (x->rq == NULL ) {
+ printf("sorry, cannot allocate queue\n");
+ free(x, M_IPFW);
+ error = ENOSPC ;
+ break ;
+ }
+ bzero(x->rq, x->rq_size * sizeof(struct dn_flow_queue *) );
+ x->rq_elements = 0 ;
+
+ s = splnet() ;
+ x->next = b ;
+ if (a == NULL)
+ all_pipes = x ;
+ else
+ a->next = x ;
+ splx(s);
+ }
break ;
case IP_DUMMYNET_DEL :
@@ -581,48 +857,82 @@ ip_dn_ctl(struct sockopt *sopt)
if (error)
break ;
- for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ;
+ for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ;
a = b , b = b->next) ;
- if (b && b->pipe_nr == p->pipe_nr) { /* found pipe */
- int s = splnet() ;
- struct ip_fw_chain *chain = ip_fw_chain.lh_first;
-
- if (a == NULL)
- all_pipes = b->next ;
- else
- a->next = b->next ;
- /*
- * remove references to this pipe from the ip_fw rules.
- */
- for (; chain; chain = chain->chain.le_next) {
- register struct ip_fw *const f = chain->rule;
- if (f->pipe_ptr == b)
- f->pipe_ptr = NULL ;
+ if (b && b->pipe_nr == p->pipe_nr) { /* found pipe */
+ int s ;
+ struct ip_fw_chain *chain ;
+
+ s = splnet() ;
+ chain = ip_fw_chain.lh_first;
+
+ if (a == NULL)
+ all_pipes = b->next ;
+ else
+ a->next = b->next ;
+ /*
+ * remove references to this pipe from the ip_fw rules.
+ */
+ for (; chain; chain = chain->chain.le_next)
+ if (chain->rule->pipe_ptr == b)
+ chain->rule->pipe_ptr = NULL ;
+ /* remove all references to b from heaps */
+ if (ready_heap.elements > 0) {
+ struct dn_heap *h = &ready_heap ;
+ int i = 0, found = 0 ;
+ while ( i < h->elements ) {
+ if (((struct dn_flow_queue *)(h->p[i].object))->p == b) {
+ /* found one */
+ h->elements-- ;
+ h->p[i] = h->p[h->elements] ;
+ found++ ;
+ } else
+ i++ ;
}
- splx(s);
- purge_pipe(b); /* remove pkts from here */
- free(b, M_IPFW);
+ if (found)
+ heapify(h);
}
- break ;
+ if (extract_heap.elements > 0) {
+ struct dn_heap *h = &extract_heap ;
+ int i = 0, found = 0 ;
+ while ( i < h->elements ) {
+ if (h->p[i].object == b) { /* found one */
+ h->elements-- ;
+ h->p[i] = h->p[h->elements] ;
+ found++ ;
+ } else
+ i++ ;
+ }
+ if (found)
+ heapify(h);
+ }
+ splx(s);
+ purge_pipe(b); /* remove pkts from here */
+ free(b->rq, M_IPFW);
+ free(b, M_IPFW);
}
+ break ;
+ }
return error ;
}
static void
ip_dn_init(void)
{
- printf("DUMMYNET initialized (990811)\n");
+ printf("DUMMYNET initialized (000106)\n");
all_pipes = NULL ;
+ ready_heap.size = ready_heap.elements = 0 ;
+ extract_heap.size = extract_heap.elements = 0 ;
ip_dn_ctl_ptr = ip_dn_ctl;
+ timeout(dummynet, NULL, 1);
}
-static ip_dn_ctl_t *old_dn_ctl_ptr;
+static ip_dn_ctl_t *old_dn_ctl_ptr ;
static int
dummynet_modevent(module_t mod, int type, void *data)
{
- int s;
-
+ int s ;
switch (type) {
case MOD_LOAD:
s = splnet();
@@ -632,19 +942,19 @@ dummynet_modevent(module_t mod, int type, void *data)
break;
case MOD_UNLOAD:
s = splnet();
- ip_dn_ctl_ptr = old_dn_ctl_ptr;
+ ip_dn_ctl_ptr = old_dn_ctl_ptr;
splx(s);
dummynet_flush();
- break;
+ break ;
default:
- break;
+ break ;
}
- return 0;
+ return 0 ;
}
static moduledata_t dummynet_mod = {
"dummynet",
dummynet_modevent,
NULL
-};
+} ;
DECLARE_MODULE(dummynet, dummynet_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
diff --git a/sys/netinet/ip_dummynet.h b/sys/netinet/ip_dummynet.h
index 61ca6dd..dbfa8a2 100644
--- a/sys/netinet/ip_dummynet.h
+++ b/sys/netinet/ip_dummynet.h
@@ -1,14 +1,28 @@
/*
- * Copyright (c) 1998 Luigi Rizzo
+ * Copyright (c) 1998-2000 Luigi Rizzo, Universita` di Pisa
+ * Portions Copyright (c) 2000 Akamba Corp.
+ * All rights reserved
*
- * Redistribution and use in source forms, with and without modification,
- * are permitted provided that this entire comment appears intact.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
*
- * Redistribution in binary form may occur without any restrictions.
- * Obviously, it would be nice if you gave credit where credit is due
- * but requiring it would be too onerous.
- *
- * This software is provided ``AS IS'' without any warranties of any kind.
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
*
* $FreeBSD$
*/
@@ -18,8 +32,7 @@
/*
* Definition of dummynet data structures.
- * Dummynet handles a list of pipes, each one identified by a unique
- * number (hopefully the list is short so we use a linked list).
+ * We first start with the heap which is used by the scheduler.
*
* Each list contains a set of parameters identifying the pipe, and
* a set of packets queued on the pipe itself.
@@ -28,6 +41,31 @@
* is pretty simple and this makes the code more portable.
*/
+typedef u_int32_t dn_key ; /* sorting key */
+#define DN_KEY_LT(a,b) ((int)((a)-(b)) < 0)
+#define DN_KEY_LEQ(a,b) ((int)((a)-(b)) <= 0)
+#define DN_KEY_GT(a,b) ((int)((a)-(b)) > 0)
+#define DN_KEY_GEQ(a,b) ((int)((a)-(b)) >= 0)
+
+struct dn_heap_entry {
+ dn_key key ; /* sorting key. Topmost element is smallest one */
+ void *object ; /* object pointer */
+} ;
+
+struct dn_heap {
+ int size ;
+ int elements ;
+ struct dn_heap_entry *p ; /* really an array of "size" entries */
+} ;
+
+/*
+ * MT_DUMMYNET is a new (fake) mbuf type that is prepended to the
+ * packet when it comes out of a pipe. The definition
+ * ought to go in /sys/sys/mbuf.h but here it is less intrusive.
+ */
+
+#define MT_DUMMYNET MT_CONTROL
+
/*
* struct dn_pkt identifies a packet in the dummynet queue. The
* first part is really an m_hdr for implementation purposes, and some
@@ -38,19 +76,19 @@
struct dn_pkt {
struct m_hdr hdr ;
#define dn_next hdr.mh_nextpkt /* next element in queue */
+#define DN_NEXT(x) (struct dn_pkt *)(x)->dn_next
#define dn_m hdr.mh_next /* packet to be forwarded */
-#define dn_dst hdr.mh_len /* dst, for ip_output */
-#define dn_dir hdr.mh_flags /* IP_FW_F_IN or IP_FW_F_OUT */
- int delay; /* stays queued until delay=0 */
+/* #define dn_dst hdr.mh_len -* dst, for ip_output */
+#define dn_dir hdr.mh_flags /* action when pkt extracted from a queue */
+#define DN_TO_IP_OUT 1
+#define DN_TO_IP_IN 2
+#define DN_TO_BDG_FWD 3
+
+ dn_key output_time; /* when the pkt is due for delivery */
struct ifnet *ifp; /* interface, for ip_output */
+ struct sockaddr_in *dn_dst ;
struct route ro; /* route, for ip_output. MUST COPY */
- int flags; /* flags, for ip_output */
-
-#ifdef DUMMYNET_DEBUG
- struct timeval beg, mid; /* testing only */
- int act_delay; /* testing only */
- int in_delay; /* testing only */
-#endif
+ int flags ; /* flags, for ip_output (IPv6 ?) */
};
struct dn_queue {
@@ -58,53 +96,64 @@ struct dn_queue {
} ;
/*
- * descriptor of a pipe. The flags field will be used to speed up the
- * forwarding code paths, in case some of the parameters are not
- * used.
+ * Flow mask/flow id for each queue.
+ */
+struct dn_flow_id {
+ u_int32_t dst_ip, src_ip ;
+ u_int16_t dst_port, src_port ;
+ u_int8_t proto ;
+} ;
+
+/*
+ * We use per flow queues. Hashing is used to select the right slot,
+ * then we scan the list to match the flow-id.
+ * The pipe is shared as it is only a delay line and thus one is enough.
+ */
+struct dn_flow_queue {
+ struct dn_flow_queue *next ;
+ struct dn_flow_id id ;
+ struct dn_pipe *p ; /* parent pipe */
+ struct dn_queue r;
+ long numbytes ;
+ u_int len ;
+ u_int len_bytes ;
+
+ u_int64_t tot_pkts ; /* statistics counters */
+ u_int64_t tot_bytes ;
+ u_int32_t drops ;
+ int hash_slot ; /* debugging/diagnostic */
+} ;
+
+/*
+ * Pipe descriptor. Contains global parameters, delay-line queue,
+ * and the hash array of the per-flow queues.
*/
struct dn_pipe { /* a pipe */
struct dn_pipe *next ;
u_short pipe_nr ; /* number */
u_short flags ; /* to speed up things */
-#define DN_HAVE_BW 1
-#define DN_HAVE_QUEUE 2
-#define DN_HAVE_DELAY 4
+#define DN_HAVE_FLOW_MASK 8
int bandwidth; /* really, bytes/tick. */
int queue_size ;
int queue_size_bytes ;
int delay ; /* really, ticks */
int plr ; /* pkt loss rate (2^31-1 means 100%) */
- struct dn_queue r;
- int r_len; /* elements in r_queue */
- int r_len_bytes; /* bytes in r_queue */
- int r_drops; /* drops from r_queue */
struct dn_queue p ;
- int ticks_from_last_insert;
- long numbytes; /* which can send or receive */
+ struct dn_flow_id flow_mask ;
+ int rq_size ;
+ int rq_elements ;
+ struct dn_flow_queue **rq ; /* array of rq_size entries */
};
-/*
- * The following is used to define a new mbuf type that is
- * prepended to the packet when it comes out of a pipe. The definition
- * ought to go in /sys/sys/mbuf.h but here it is less intrusive.
- */
-
-#define MT_DUMMYNET MT_CONTROL
-/*
- * what to do of a packet when it comes out of a pipe
- */
-#define DN_TO_IP_OUT 1
-#define DN_TO_IP_IN 2
-#define DN_TO_BDG_FWD 3
-
#ifdef _KERNEL
MALLOC_DECLARE(M_IPFW);
typedef int ip_dn_ctl_t __P((struct sockopt *)) ;
extern ip_dn_ctl_t *ip_dn_ctl_ptr;
+extern struct dn_flow_id dn_last_pkt ;
void dn_rule_delete(void *r); /* used in ip_fw.c */
int dummynet_io(int pipe, int dir,
OpenPOWER on IntegriCloud