Bring in the most recent version of ipfw and dummynet, developed

and tested over the past two months in the ipfw3-head branch. This also happens to be the same code available in the Linux and Windows ports of ipfw and dummynet. The major enhancement is a completely restructured version of dummynet, with support for different packet scheduling algorithms (loadable at runtime), faster queue/pipe lookup, and a much cleaner internal architecture and kernel/userland ABI which simplifies future extensions. In addition to the existing schedulers (FIFO and WF2Q+), we include a Deficit Round Robin (DRR or RR for brevity) scheduler, and a new, very fast version of WF2Q+ called QFQ. Some test code is also present (in sys/netinet/ipfw/test) that lets you build and test schedulers in userland. Also, we have added a compatibility layer that understands requests from the RELENG_7 and RELENG_8 versions of the /sbin/ipfw binaries, and replies correctly (at least, it does its best; sometimes you just cannot tell who sent the request and how to answer). The compatibility layer should make it possible to MFC this code in a relatively short time. Some minor glitches (e.g. handling of ipfw set enable/disable, and a workaround for a bug in RELENG_7's /sbin/ipfw) will be fixed with separate commits. CREDITS: This work has been partly supported by the ONELAB2 project, and mostly developed by Riccardo Panicucci and myself. The code for the qfq scheduler is mostly from Fabio Checconi, and Marta Carbone and Francesco Magno have helped with testing, debugging and some bug fixes.
author: luigi <luigi@FreeBSD.org> 2010-03-02 17:40:48 +0000
committer: luigi <luigi@FreeBSD.org> 2010-03-02 17:40:48 +0000
commit: 5ceeac4aa882074e2b1e42fbc20c79ebbd24f4c5 (patch)
tree: 225c58cb171a7858ca9880ee492dd9f3d281f623
parent: a8f766213ad48d0cb09214bdf6ac6a3b74221b03 (diff)
download: FreeBSD-src-5ceeac4aa882074e2b1e42fbc20c79ebbd24f4c5.zip
FreeBSD-src-5ceeac4aa882074e2b1e42fbc20c79ebbd24f4c5.tar.gz
37 files changed, 9756 insertions, 2856 deletions
diff --git a/sbin/ipfw/Makefile b/sbin/ipfw/Makefile
index b25f38c..c09ebca 100644
--- a/sbin/ipfw/Makefile
+++ b/sbin/ipfw/Makefile
@@ -3,7 +3,6 @@
 PROG=	ipfw
 SRCS=	ipfw2.c dummynet.c ipv6.c main.c nat.c altq.c
 WARNS?=	2
-DPADD=	${LIBUTIL}
 LDADD=	-lutil
 MAN=	ipfw.8
 
diff --git a/sbin/ipfw/altq.c b/sbin/ipfw/altq.c
index b00a1e0..8cf19e5 100644
--- a/sbin/ipfw/altq.c
+++ b/sbin/ipfw/altq.c
@@ -39,6 +39,7 @@
 
 #include <net/if.h>		/* IFNAMSIZ */
 #include <net/pfvar.h>
+#include <netinet/in.h>	/* in_addr */
 #include <netinet/ip_fw.h>
 
 /*
diff --git a/sbin/ipfw/dummynet.c b/sbin/ipfw/dummynet.c
index 490aa53..68b2220 100644
--- a/sbin/ipfw/dummynet.c
+++ b/sbin/ipfw/dummynet.c
@@ -1,10 +1,5 @@
 /*
- * Copyright (c) 2002-2003 Luigi Rizzo
- * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp
- * Copyright (c) 1994 Ugen J.S.Antsilevich
- *
- * Idea and grammar partially left from:
- * Copyright (c) 1993 Daniel Boulet
+ * Copyright (c) 2002-2003,2010 Luigi Rizzo
  *
  * Redistribution and use in source forms, with and without modification,
  * are permitted provided that this entire comment appears intact.
@@ -24,7 +19,6 @@
 
 #include <sys/types.h>
 #include <sys/socket.h>
-#include <sys/queue.h>
 /* XXX there are several sysctl leftover here */
 #include <sys/sysctl.h>
 
@@ -46,6 +40,7 @@
 #include <netinet/ip_dummynet.h>
 #include <arpa/inet.h>	/* inet_ntoa */
 
+
 static struct _s_x dummynet_params[] = {
 	{ "plr",		TOK_PLR },
 	{ "noerror",		TOK_NOERROR },
@@ -56,27 +51,59 @@ static struct _s_x dummynet_params[] = {
 	{ "src-port",		TOK_SRCPORT },
 	{ "proto",		TOK_PROTO },
 	{ "weight",		TOK_WEIGHT },
+	{ "lmax",		TOK_LMAX },
+	{ "maxlen",		TOK_LMAX },
 	{ "all",		TOK_ALL },
-	{ "mask",		TOK_MASK },
+	{ "mask",		TOK_MASK }, /* alias for both */
+	{ "sched_mask",		TOK_SCHED_MASK },
+	{ "flow_mask",		TOK_FLOW_MASK },
 	{ "droptail",		TOK_DROPTAIL },
 	{ "red",		TOK_RED },
 	{ "gred",		TOK_GRED },
 	{ "bw",			TOK_BW },
 	{ "bandwidth",		TOK_BW },
 	{ "delay",		TOK_DELAY },
+	{ "link",		TOK_LINK },
 	{ "pipe",		TOK_PIPE },
 	{ "queue",		TOK_QUEUE },
+	{ "flowset",		TOK_FLOWSET },
+	{ "sched",		TOK_SCHED },
+	{ "pri",		TOK_PRI },
+	{ "priority",		TOK_PRI },
+	{ "type",		TOK_TYPE },
 	{ "flow-id",		TOK_FLOWID},
 	{ "dst-ipv6",		TOK_DSTIP6},
 	{ "dst-ip6",		TOK_DSTIP6},
 	{ "src-ipv6",		TOK_SRCIP6},
 	{ "src-ip6",		TOK_SRCIP6},
-	{ "profile",		TOK_PIPE_PROFILE},
+	{ "profile",		TOK_PROFILE},
 	{ "burst",		TOK_BURST},
 	{ "dummynet-params",	TOK_NULL },
 	{ NULL, 0 }	/* terminator */
 };
 
+#define O_NEXT(p, len) ((void *)((char *)p + len))
+
+static void
+oid_fill(struct dn_id *oid, int len, int type, uintptr_t id)
+{
+	oid->len = len;
+	oid->type = type;
+	oid->subtype = 0;
+	oid->id = id;
+}
+
+/* make room in the buffer and move the pointer forward */
+static void *
+o_next(struct dn_id **o, int len, int type)
+{
+	struct dn_id *ret = *o;
+	oid_fill(ret, len, type, 0);
+	*o = O_NEXT(*o, len);
+	return ret;
+}
+
+#if 0
 static int
 sort_q(void *arg, const void *pa, const void *pb)
 {
@@ -108,117 +135,81 @@ sort_q(void *arg, const void *pa, const void *pb)
 		res = 1;
 	return (int)(rev ? res : -res);
 }
+#endif
 
+/* print a mask and header for the subsequent list of flows */
 static void
-list_queues(struct dn_flow_set *fs, struct dn_flow_queue *q)
+print_mask(struct ipfw_flow_id *id)
+{
+	if (!IS_IP6_FLOW_ID(id)) {
+		printf("    "
+		    "mask: 0x%02x 0x%08x/0x%04x -> 0x%08x/0x%04x\n",
+		    id->proto,
+		    id->src_ip, id->src_port,
+		    id->dst_ip, id->dst_port);
+
+		printf("BKT Prot ___Source IP/port____ "
+		    "____Dest. IP/port____ "
+		    "Tot_pkt/bytes Pkt/Byte Drp\n");
+	} else {
+		char buf[255];
+		printf("\n        mask: proto: 0x%02x, flow_id: 0x%08x,  ",
+		    id->proto, id->flow_id6);
+		inet_ntop(AF_INET6, &(id->src_ip6), buf, sizeof(buf));
+		printf("%s/0x%04x -> ", buf, id->src_port);
+		inet_ntop(AF_INET6, &(id->dst_ip6), buf, sizeof(buf));
+		printf("%s/0x%04x\n", buf, id->dst_port);
+
+		printf("BKT ___Prot___ _flow-id_ "
+		    "______________Source IPv6/port_______________ "
+		    "_______________Dest. IPv6/port_______________ "
+		    "Tot_pkt/bytes Pkt/Byte Drp\n");
+	}
+}
+
+static void
+list_flow(struct dn_flow *ni)
 {
-	int l;
-	int index_printed, indexes = 0;
 	char buff[255];
 	struct protoent *pe;
+	struct in_addr ina;
+	struct ipfw_flow_id *id = &ni->fid;
 
-	if (fs->rq_elements == 0)
-		return;
-
-	if (co.do_sort != 0)
-		qsort_r(q, fs->rq_elements, sizeof *q, NULL, sort_q);
-
-	/* Print IPv4 flows */
-	index_printed = 0;
-	for (l = 0; l < fs->rq_elements; l++) {
-		struct in_addr ina;
-
+	pe = getprotobynumber(id->proto);
 		/* XXX: Should check for IPv4 flows */
-		if (IS_IP6_FLOW_ID(&(q[l].id)))
-			continue;
-
-		if (!index_printed) {
-			index_printed = 1;
-			if (indexes > 0)	/* currently a no-op */
-				printf("\n");
-			indexes++;
-			printf("    "
-			    "mask: 0x%02x 0x%08x/0x%04x -> 0x%08x/0x%04x\n",
-			    fs->flow_mask.proto,
-			    fs->flow_mask.src_ip, fs->flow_mask.src_port,
-			    fs->flow_mask.dst_ip, fs->flow_mask.dst_port);
-
-			printf("BKT Prot ___Source IP/port____ "
-			    "____Dest. IP/port____ "
-			    "Tot_pkt/bytes Pkt/Byte Drp\n");
-		}
-
-		printf("%3d ", q[l].hash_slot);
-		pe = getprotobynumber(q[l].id.proto);
+	printf("%3u ", (ni->oid.id) & 0xff);
+	if (!IS_IP6_FLOW_ID(id)) {
 		if (pe)
 			printf("%-4s ", pe->p_name);
 		else
-			printf("%4u ", q[l].id.proto);
-		ina.s_addr = htonl(q[l].id.src_ip);
+			printf("%4u ", id->proto);
+		ina.s_addr = htonl(id->src_ip);
 		printf("%15s/%-5d ",
-		    inet_ntoa(ina), q[l].id.src_port);
-		ina.s_addr = htonl(q[l].id.dst_ip);
+		    inet_ntoa(ina), id->src_port);
+		ina.s_addr = htonl(id->dst_ip);
 		printf("%15s/%-5d ",
-		    inet_ntoa(ina), q[l].id.dst_port);
-		printf("%4llu %8llu %2u %4u %3u\n",
-		    align_uint64(&q[l].tot_pkts),
-		    align_uint64(&q[l].tot_bytes),
-		    q[l].len, q[l].len_bytes, q[l].drops);
-		if (co.verbose)
-			printf("   S %20llu  F %20llu\n",
-			    align_uint64(&q[l].S), align_uint64(&q[l].F));
-	}
-
-	/* Print IPv6 flows */
-	index_printed = 0;
-	for (l = 0; l < fs->rq_elements; l++) {
-		if (!IS_IP6_FLOW_ID(&(q[l].id)))
-			continue;
-
-		if (!index_printed) {
-			index_printed = 1;
-			if (indexes > 0)
-				printf("\n");
-			indexes++;
-			printf("\n        mask: proto: 0x%02x, flow_id: 0x%08x,  ",
-			    fs->flow_mask.proto, fs->flow_mask.flow_id6);
-			inet_ntop(AF_INET6, &(fs->flow_mask.src_ip6),
-			    buff, sizeof(buff));
-			printf("%s/0x%04x -> ", buff, fs->flow_mask.src_port);
-			inet_ntop( AF_INET6, &(fs->flow_mask.dst_ip6),
-			    buff, sizeof(buff) );
-			printf("%s/0x%04x\n", buff, fs->flow_mask.dst_port);
-
-			printf("BKT ___Prot___ _flow-id_ "
-			    "______________Source IPv6/port_______________ "
-			    "_______________Dest. IPv6/port_______________ "
-			    "Tot_pkt/bytes Pkt/Byte Drp\n");
-		}
-		printf("%3d ", q[l].hash_slot);
-		pe = getprotobynumber(q[l].id.proto);
+		    inet_ntoa(ina), id->dst_port);
+	} else {
+		/* Print IPv6 flows */
 		if (pe != NULL)
 			printf("%9s ", pe->p_name);
 		else
-			printf("%9u ", q[l].id.proto);
-		printf("%7d  %39s/%-5d ", q[l].id.flow_id6,
-		    inet_ntop(AF_INET6, &(q[l].id.src_ip6), buff, sizeof(buff)),
-		    q[l].id.src_port);
+			printf("%9u ", id->proto);
+		printf("%7d  %39s/%-5d ", id->flow_id6,
+		    inet_ntop(AF_INET6, &(id->src_ip6), buff, sizeof(buff)),
+		    id->src_port);
 		printf(" %39s/%-5d ",
-		    inet_ntop(AF_INET6, &(q[l].id.dst_ip6), buff, sizeof(buff)),
-		    q[l].id.dst_port);
-		printf(" %4llu %8llu %2u %4u %3u\n",
-		    align_uint64(&q[l].tot_pkts),
-		    align_uint64(&q[l].tot_bytes),
-		    q[l].len, q[l].len_bytes, q[l].drops);
-		if (co.verbose)
-			printf("   S %20llu  F %20llu\n",
-			    align_uint64(&q[l].S),
-			    align_uint64(&q[l].F));
+		    inet_ntop(AF_INET6, &(id->dst_ip6), buff, sizeof(buff)),
+		    id->dst_port);
 	}
+	printf("%4llu %8llu %2u %4u %3u\n",
+	    align_uint64(&ni->tot_pkts),
+	    align_uint64(&ni->tot_bytes),
+	    ni->length, ni->len_bytes, ni->drops);
 }
 
 static void
-print_flowset_parms(struct dn_flow_set *fs, char *prefix)
+print_flowset_parms(struct dn_fs *fs, char *prefix)
 {
 	int l;
 	char qs[30];
@@ -226,7 +217,7 @@ print_flowset_parms(struct dn_flow_set *fs, char *prefix)
 	char red[90];	/* Display RED parameters */
 
 	l = fs->qsize;
-	if (fs->flags_fs & DN_QSIZE_IS_BYTES) {
+	if (fs->flags & DN_QSIZE_BYTES) {
 		if (l >= 8192)
 			sprintf(qs, "%d KB", l / 1024);
 		else
@@ -237,23 +228,34 @@ print_flowset_parms(struct dn_flow_set *fs, char *prefix)
 		sprintf(plr, "plr %f", 1.0 * fs->plr / (double)(0x7fffffff));
 	else
 		plr[0] = '\0';
-	if (fs->flags_fs & DN_IS_RED)	/* RED parameters */
+
+	if (fs->flags & DN_IS_RED)	/* RED parameters */
 		sprintf(red,
 		    "\n\t %cRED w_q %f min_th %d max_th %d max_p %f",
-		    (fs->flags_fs & DN_IS_GENTLE_RED) ? 'G' : ' ',
+		    (fs->flags & DN_IS_GENTLE_RED) ? 'G' : ' ',
 		    1.0 * fs->w_q / (double)(1 << SCALE_RED),
-		    SCALE_VAL(fs->min_th),
-		    SCALE_VAL(fs->max_th),
+		    fs->min_th,
+		    fs->max_th,
 		    1.0 * fs->max_p / (double)(1 << SCALE_RED));
 	else
 		sprintf(red, "droptail");
 
-	printf("%s %s%s %d queues (%d buckets) %s\n",
-	    prefix, qs, plr, fs->rq_elements, fs->rq_size, red);
+	if (prefix[0]) {
+	    printf("%s %s%s %d queues (%d buckets) %s\n",
+		prefix, qs, plr, fs->oid.id, fs->buckets, red);
+	    prefix[0] = '\0';
+	} else {
+	    printf("q%05d %s%s %d flows (%d buckets) sched %d "
+			"weight %d lmax %d pri %d %s\n",
+		fs->fs_nr, qs, plr, fs->oid.id, fs->buckets,
+		fs->sched_nr, fs->par[0], fs->par[1], fs->par[2], red);
+	    if (fs->flags & DN_HAVE_MASK)
+		print_mask(&fs->flow_mask);
+	}
 }
 
 static void
-print_extra_delay_parms(struct dn_pipe *p)
+print_extra_delay_parms(struct dn_profile *p)
 {
 	double loss;
 	if (p->samples_no <= 0)
@@ -265,105 +267,126 @@ print_extra_delay_parms(struct dn_pipe *p)
 		p->name, loss, p->samples_no);
 }
 
-void
-ipfw_list_pipes(void *data, uint nbytes, int ac, char *av[])
+static void
+flush_buf(char *buf)
 {
-	int rulenum;
-	void *next = data;
-	struct dn_pipe *p = (struct dn_pipe *) data;
-	struct dn_flow_set *fs;
-	struct dn_flow_queue *q;
-	int l;
-
-	if (ac > 0)
-		rulenum = strtoul(*av++, NULL, 10);
-	else
-		rulenum = 0;
-	for (; nbytes >= sizeof *p; p = (struct dn_pipe *)next) {
-		double b = p->bandwidth;
-		char buf[30];
-		char prefix[80];
-		char burst[5 + 7];
-
-		if (SLIST_NEXT(p, next) != (struct dn_pipe *)DN_IS_PIPE)
-			break;	/* done with pipes, now queues */
-
-		/*
-		 * compute length, as pipe have variable size
-		 */
-		l = sizeof(*p) + p->fs.rq_elements * sizeof(*q);
-		next = (char *)p + l;
-		nbytes -= l;
-
-		if ((rulenum != 0 && rulenum != p->pipe_nr) || co.do_pipe == 2)
-			continue;
-
-		/*
-		 * Print rate (or clocking interface)
-		 */
-		if (p->if_name[0] != '\0')
-			sprintf(buf, "%s", p->if_name);
-		else if (b == 0)
-			sprintf(buf, "unlimited");
-		else if (b >= 1000000)
-			sprintf(buf, "%7.3f Mbit/s", b/1000000);
-		else if (b >= 1000)
-			sprintf(buf, "%7.3f Kbit/s", b/1000);
-		else
-			sprintf(buf, "%7.3f bit/s ", b);
-
-		sprintf(prefix, "%05d: %s %4d ms ",
-		    p->pipe_nr, buf, p->delay);
-
-		print_flowset_parms(&(p->fs), prefix);
-
-		if (humanize_number(burst, sizeof(burst), p->burst,
-		    "Byte", HN_AUTOSCALE, 0) < 0 || co.verbose)
-			printf("\t burst: %ju Byte\n", p->burst);
-		else
-			printf("\t burst: %s\n", burst);
-
-		print_extra_delay_parms(p);
-
-		q = (struct dn_flow_queue *)(p+1);
-		list_queues(&(p->fs), q);
-	}
-	for (fs = next; nbytes >= sizeof *fs; fs = next) {
-		char prefix[80];
-
-		if (SLIST_NEXT(fs, next) != (struct dn_flow_set *)DN_IS_QUEUE)
-			break;
-		l = sizeof(*fs) + fs->rq_elements * sizeof(*q);
-		next = (char *)fs + l;
-		nbytes -= l;
-
-		if (rulenum != 0 && ((rulenum != fs->fs_nr && co.do_pipe == 2) ||
-		    (rulenum != fs->parent_nr && co.do_pipe == 1))) {
-			continue;
-		}
-
-		q = (struct dn_flow_queue *)(fs+1);
-		sprintf(prefix, "q%05d: weight %d pipe %d ",
-		    fs->fs_nr, fs->weight, fs->parent_nr);
-		print_flowset_parms(fs, prefix);
-		list_queues(fs, q);
+	if (buf[0])
+		printf("%s\n", buf);
+	buf[0] = '\0';
+}
+	
+/*
+ * generic list routine. We expect objects in a specific order, i.e.
+ * PIPES AND SCHEDULERS:
+ *	link; scheduler; internal flowset if any; instances
+ * we can tell a pipe from the number.
+ *
+ * FLOWSETS:
+ *	flowset; queues;
+ * link i (int queue); scheduler i; si(i) { flowsets() : queues }
+ */
+static void
+list_pipes(struct dn_id *oid, struct dn_id *end)
+{
+    char buf[160];	/* pending buffer */
+    buf[0] = '\0';
+
+    for (; oid != end; oid = O_NEXT(oid, oid->len)) {
+	if (oid->len < sizeof(*oid))
+		errx(1, "invalid oid len %d\n", oid->len);
+
+	switch (oid->type) {
+	default:
+	    flush_buf(buf);
+	    printf("unrecognized object %d size %d\n", oid->type, oid->len);
+	    break;
+	case DN_TEXT: /* list of attached flowsets */
+	    {
+		int i, l;
+		struct {
+			struct dn_id id;
+			uint32_t p[0];
+		} *d = (void *)oid;
+		l = (oid->len - sizeof(*oid))/sizeof(d->p[0]);
+		if (l == 0)
+		    break;
+		printf("   Children flowsets: ");
+		for (i = 0; i < l; i++)
+			printf("%u ", d->p[i]);
+		printf("\n");
+		break;
+	    }
+	case DN_CMD_GET:
+	    if (co.verbose)
+		printf("answer for cmd %d, len %d\n", oid->type, oid->id);
+	    break;
+	case DN_SCH: {
+	    struct dn_sch *s = (struct dn_sch *)oid;
+	    flush_buf(buf);
+	    printf(" sched %d type %s flags 0x%x %d buckets %d active\n",
+			s->sched_nr,
+			s->name, s->flags, s->buckets, s->oid.id);
+	    if (s->flags & DN_HAVE_MASK)
+		print_mask(&s->sched_mask);
+	    }
+	    break;
+
+	case DN_FLOW:
+	    list_flow((struct dn_flow *)oid);
+	    break;
+
+	case DN_LINK: {
+	    struct dn_link *p = (struct dn_link *)oid;
+	    double b = p->bandwidth;
+	    char bwbuf[30];
+	    char burst[5 + 7];
+
+	    /* This starts a new object so flush buffer */
+	    flush_buf(buf);
+	    /* data rate */
+	    if (b == 0)
+		sprintf(bwbuf, "unlimited     ");
+	    else if (b >= 1000000)
+		sprintf(bwbuf, "%7.3f Mbit/s", b/1000000);
+	    else if (b >= 1000)
+		sprintf(bwbuf, "%7.3f Kbit/s", b/1000);
+	    else
+		sprintf(bwbuf, "%7.3f bit/s ", b);
+
+	    if (humanize_number(burst, sizeof(burst), p->burst,
+		    "", HN_AUTOSCALE, 0) < 0 || co.verbose)
+		sprintf(burst, "%d", (int)p->burst);
+	    sprintf(buf, "%05d: %s %4d ms burst %s",
+		p->link_nr % DN_MAX_ID, bwbuf, p->delay, burst);
+	    }
+	    break;
+
+	case DN_FS:
+	    print_flowset_parms((struct dn_fs *)oid, buf);
+	    break;
+	case DN_PROFILE:
+	    flush_buf(buf);
+	    print_extra_delay_parms((struct dn_profile *)oid);
 	}
+	flush_buf(buf); // XXX does it really go here ?
+    }
 }
 
 /*
- * Delete pipe or queue i
+ * Delete pipe, queue or scheduler i
  */
 int
-ipfw_delete_pipe(int pipe_or_queue, int i)
+ipfw_delete_pipe(int do_pipe, int i)
 {
-	struct dn_pipe p;
-
-	memset(&p, 0, sizeof p);
-	if (pipe_or_queue == 1)
-		p.pipe_nr = i;		/* pipe */
-	else
-		p.fs.fs_nr = i;		/* queue */
-	i = do_cmd(IP_DUMMYNET_DEL, &p, sizeof p);
+	struct {
+		struct dn_id oid;
+		uintptr_t a[1];	/* add more if we want a list */
+	} cmd;
+	oid_fill((void *)&cmd, sizeof(cmd), DN_CMD_DELETE, DN_API_VERSION);
+	cmd.oid.subtype = (do_pipe == 1) ? DN_LINK :
+		( (do_pipe == 2) ? DN_FS : DN_SCH);
+	cmd.a[0] = i;
+	i = do_cmd(IP_DUMMYNET3, &cmd, cmd.oid.len);
 	if (i) {
 		i = 1;
 		warn("rule %u: setsockopt(IP_DUMMYNET_DEL)", i);
@@ -400,7 +423,7 @@ ipfw_delete_pipe(int pipe_or_queue, int i)
  * The empirical curve may have both vertical and horizontal lines.
  * Vertical lines represent constant delay for a range of
  * probabilities; horizontal lines correspond to a discontinuty
- * in the delay distribution: the pipe will use the largest delay
+ * in the delay distribution: the link will use the largest delay
  * for a given probability.
  * 
  * To pass the curve to dummynet, we must store the parameters
@@ -490,9 +513,12 @@ static void
 read_bandwidth(char *arg, int *bandwidth, char *if_name, int namelen)
 {
 	if (*bandwidth != -1)
-		warn("duplicate token, override bandwidth value!");
+		warnx("duplicate token, override bandwidth value!");
 
 	if (arg[0] >= 'a' && arg[0] <= 'z') {
+		if (!if_name) {
+			errx(1, "no if support");
+		}
 		if (namelen >= IFNAMSIZ)
 			warn("interface name truncated");
 		namelen--;
@@ -521,7 +547,8 @@ read_bandwidth(char *arg, int *bandwidth, char *if_name, int namelen)
 			errx(EX_DATAERR, "bandwidth too large");
 
 		*bandwidth = bw;
-		if_name[0] = '\0';
+		if (if_name)
+			if_name[0] = '\0';
 	}
 }
 
@@ -551,7 +578,8 @@ compare_points(const void *vp1, const void *vp2)
 #define ED_EFMT(s) EX_DATAERR,"error in %s at line %d: "#s,filename,lineno
 
 static void
-load_extra_delays(const char *filename, struct dn_pipe *p)
+load_extra_delays(const char *filename, struct dn_profile *p,
+	struct dn_link *link)
 {
 	char    line[ED_MAX_LINE_LEN];
 	FILE    *f;
@@ -566,6 +594,9 @@ load_extra_delays(const char *filename, struct dn_pipe *p)
 	struct point    points[ED_MAX_SAMPLES_NO];
 	int     points_no = 0;
 
+	/* XXX link never NULL? */
+	p->link_nr = link->link_nr;
+
 	profile_name[0] = '\0';
 	f = fopen(filename, "r");
 	if (f == NULL)
@@ -606,7 +637,8 @@ load_extra_delays(const char *filename, struct dn_pipe *p)
 				ED_MAX_SAMPLES_NO);
 		    do_points = 0;
 		} else if (!strcasecmp(name, ED_TOK_BW)) {
-		    read_bandwidth(arg, &p->bandwidth, p->if_name, sizeof(p->if_name));
+		    char buf[IFNAMSIZ];
+		    read_bandwidth(arg, &link->bandwidth, buf, sizeof(buf));
 		} else if (!strcasecmp(name, ED_TOK_LOSS)) {
 		    if (loss != -1.0)
 			errx(ED_EFMT("duplicated token: %s"), name);
@@ -676,17 +708,17 @@ load_extra_delays(const char *filename, struct dn_pipe *p)
 	    double y2 = points[i+1].prob * samples;
 	    double x2 = points[i+1].delay;
 
-	    int index = y1;
+	    int ix = y1;
 	    int stop = y2;
 
 	    if (x1 == x2) {
-		for (; index<stop; ++index)
-		    p->samples[index] = x1;
+		for (; ix<stop; ++ix)
+		    p->samples[ix] = x1;
 	    } else {
 		double m = (y2-y1)/(x2-x1);
 		double c = y1 - m*x1;
-		for (; index<stop ; ++index)
-		    p->samples[index] = (index - c)/m;
+		for (; ix<stop ; ++ix)
+		    p->samples[ix] = (ix - c)/m;
 	    }
 	}
 	p->samples_no = samples;
@@ -694,27 +726,120 @@ load_extra_delays(const char *filename, struct dn_pipe *p)
 	strncpy(p->name, profile_name, sizeof(p->name));
 }
 
+/*
+ * configuration of pipes, schedulers, flowsets.
+ * When we configure a new scheduler, an empty pipe is created, so:
+ * 
+ * do_pipe = 1 -> "pipe N config ..." only for backward compatibility
+ *	sched N+Delta type fifo sched_mask ...
+ *	pipe N+Delta <parameters>
+ *	flowset N+Delta pipe N+Delta (no parameters)
+ *	sched N type wf2q+ sched_mask ...
+ *	pipe N <parameters>
+ *
+ * do_pipe = 2 -> flowset N config
+ *	flowset N parameters
+ *
+ * do_pipe = 3 -> sched N config
+ *	sched N parameters (default no pipe)
+ *	optional Pipe N config ...
+ * pipe ==>
+ */
 void
 ipfw_config_pipe(int ac, char **av)
 {
-	int samples[ED_MAX_SAMPLES_NO];
-	struct dn_pipe p;
-	int i;
+	int i, j;
 	char *end;
 	void *par = NULL;
-
-	memset(&p, 0, sizeof p);
-	p.bandwidth = -1;
+	struct dn_id *buf, *base;
+	struct dn_sch *sch = NULL;
+	struct dn_link *p = NULL;
+	struct dn_fs *fs = NULL;
+	struct dn_profile *pf = NULL;
+	struct ipfw_flow_id *mask = NULL;
+	int lmax;
+	uint32_t _foo = 0, *flags = &_foo , *buckets = &_foo;
+
+	/*
+	 * allocate space for 1 header,
+	 * 1 scheduler, 1 link, 1 flowset, 1 profile
+	 */
+	lmax = sizeof(struct dn_id);	/* command header */
+	lmax += sizeof(struct dn_sch) + sizeof(struct dn_link) +
+		sizeof(struct dn_fs) + sizeof(struct dn_profile);
 
 	av++; ac--;
 	/* Pipe number */
 	if (ac && isdigit(**av)) {
 		i = atoi(*av); av++; ac--;
-		if (co.do_pipe == 1)
-			p.pipe_nr = i;
-		else
-			p.fs.fs_nr = i;
+	} else
+		i = -1;
+	if (i <= 0)
+		errx(EX_USAGE, "need a pipe/flowset/sched number");
+	base = buf = safe_calloc(1, lmax);
+	/* all commands start with a 'CONFIGURE' and a version */
+	o_next(&buf, sizeof(struct dn_id), DN_CMD_CONFIG);
+	base->id = DN_API_VERSION;
+
+	switch (co.do_pipe) {
+	case 1: /* "pipe N config ..." */
+		/* Allocate space for the WF2Q+ scheduler, its link
+		 * and the FIFO flowset. Set the number, but leave
+		 * the scheduler subtype and other parameters to 0
+		 * so the kernel will use appropriate defaults.
+		 * XXX todo: add a flag to record if a parameter
+		 * is actually configured.
+		 * If we do a 'pipe config' mask -> sched_mask.
+		 * The FIFO scheduler and link are derived from the
+		 * WF2Q+ one in the kernel.
+		 */
+		sch = o_next(&buf, sizeof(*sch), DN_SCH);
+		p = o_next(&buf, sizeof(*p), DN_LINK);
+		fs = o_next(&buf, sizeof(*fs), DN_FS);
+
+		sch->sched_nr = i;
+		sch->oid.subtype = 0;	/* defaults to WF2Q+ */
+		mask = &sch->sched_mask;
+		flags = &sch->flags;
+		buckets = &sch->buckets;
+		*flags |= DN_PIPE_CMD;
+
+		p->link_nr = i;
+
+		/* This flowset is only for the FIFO scheduler */
+		fs->fs_nr = i + 2*DN_MAX_ID;
+		fs->sched_nr = i + DN_MAX_ID;
+		break;
+
+	case 2: /* "queue N config ... " */
+		fs = o_next(&buf, sizeof(*fs), DN_FS);
+		fs->fs_nr = i;
+		mask = &fs->flow_mask;
+		flags = &fs->flags;
+		buckets = &fs->buckets;
+		break;
+
+	case 3: /* "sched N config ..." */
+		sch = o_next(&buf, sizeof(*sch), DN_SCH);
+		fs = o_next(&buf, sizeof(*fs), DN_FS);
+		sch->sched_nr = i;
+		mask = &sch->sched_mask;
+		flags = &sch->flags;
+		buckets = &sch->buckets;
+		/* fs is used only with !MULTIQUEUE schedulers */
+		fs->fs_nr = i + DN_MAX_ID;
+		fs->sched_nr = i;
+		break;
 	}
+	/* set to -1 those fields for which we want to reuse existing
+	 * values from the kernel.
+	 * Also, *_nr and subtype = 0 mean reuse the value from the kernel.
+	 * XXX todo: support reuse of the mask.
+	 */
+	if (p)
+		p->bandwidth = -1;
+	for (j = 0; j < sizeof(fs->par)/sizeof(fs->par[0]); j++)
+		fs->par[j] = -1;
 	while (ac > 0) {
 		double d;
 		int tok = match_token(dummynet_params, *av);
@@ -722,41 +847,48 @@ ipfw_config_pipe(int ac, char **av)
 
 		switch(tok) {
 		case TOK_NOERROR:
-			p.fs.flags_fs |= DN_NOERROR;
+			NEED(fs, "noerror is only for pipes");
+			fs->flags |= DN_NOERROR;
 			break;
 
 		case TOK_PLR:
+			NEED(fs, "plr is only for pipes");
 			NEED1("plr needs argument 0..1\n");
 			d = strtod(av[0], NULL);
 			if (d > 1)
 				d = 1;
 			else if (d < 0)
 				d = 0;
-			p.fs.plr = (int)(d*0x7fffffff);
+			fs->plr = (int)(d*0x7fffffff);
 			ac--; av++;
 			break;
 
 		case TOK_QUEUE:
+			NEED(fs, "queue is only for pipes or flowsets");
 			NEED1("queue needs queue size\n");
 			end = NULL;
-			p.fs.qsize = strtoul(av[0], &end, 0);
+			fs->qsize = strtoul(av[0], &end, 0);
 			if (*end == 'K' || *end == 'k') {
-				p.fs.flags_fs |= DN_QSIZE_IS_BYTES;
-				p.fs.qsize *= 1024;
+				fs->flags |= DN_QSIZE_BYTES;
+				fs->qsize *= 1024;
 			} else if (*end == 'B' ||
 			    _substrcmp2(end, "by", "bytes") == 0) {
-				p.fs.flags_fs |= DN_QSIZE_IS_BYTES;
+				fs->flags |= DN_QSIZE_BYTES;
 			}
 			ac--; av++;
 			break;
 
 		case TOK_BUCKETS:
+			NEED(fs, "buckets is only for pipes or flowsets");
 			NEED1("buckets needs argument\n");
-			p.fs.rq_size = strtoul(av[0], NULL, 0);
+			*buckets = strtoul(av[0], NULL, 0);
 			ac--; av++;
 			break;
 
+		case TOK_FLOW_MASK:
+		case TOK_SCHED_MASK:
 		case TOK_MASK:
+			NEED(mask, "tok_mask");
 			NEED1("mask needs mask specifier\n");
 			/*
 			 * per-flow queue, mask is dst_ip, dst_port,
@@ -764,7 +896,7 @@ ipfw_config_pipe(int ac, char **av)
 			 */
 			par = NULL;
 
-			bzero(&p.fs.flow_mask, sizeof(p.fs.flow_mask));
+			bzero(mask, sizeof(*mask));
 			end = NULL;
 
 			while (ac >= 1) {
@@ -781,43 +913,48 @@ ipfw_config_pipe(int ac, char **av)
 				    /*
 				     * special case, all bits significant
 				     */
-				    p.fs.flow_mask.dst_ip = ~0;
-				    p.fs.flow_mask.src_ip = ~0;
-				    p.fs.flow_mask.dst_port = ~0;
-				    p.fs.flow_mask.src_port = ~0;
-				    p.fs.flow_mask.proto = ~0;
-				    n2mask(&(p.fs.flow_mask.dst_ip6), 128);
-				    n2mask(&(p.fs.flow_mask.src_ip6), 128);
-				    p.fs.flow_mask.flow_id6 = ~0;
-				    p.fs.flags_fs |= DN_HAVE_FLOW_MASK;
+				    mask->dst_ip = ~0;
+				    mask->src_ip = ~0;
+				    mask->dst_port = ~0;
+				    mask->src_port = ~0;
+				    mask->proto = ~0;
+				    n2mask(&mask->dst_ip6, 128);
+				    n2mask(&mask->src_ip6, 128);
+				    mask->flow_id6 = ~0;
+				    *flags |= DN_HAVE_MASK;
 				    goto end_mask;
 
 			    case TOK_DSTIP:
-				    p32 = &p.fs.flow_mask.dst_ip;
+				    mask->addr_type = 4;
+				    p32 = &mask->dst_ip;
 				    break;
 
 			    case TOK_SRCIP:
-				    p32 = &p.fs.flow_mask.src_ip;
+				    mask->addr_type = 4;
+				    p32 = &mask->src_ip;
 				    break;
 
 			    case TOK_DSTIP6:
-				    pa6 = &(p.fs.flow_mask.dst_ip6);
+				    mask->addr_type = 6;
+				    pa6 = &mask->dst_ip6;
 				    break;
 			    
 			    case TOK_SRCIP6:
-				    pa6 = &(p.fs.flow_mask.src_ip6);
+				    mask->addr_type = 6;
+				    pa6 = &mask->src_ip6;
 				    break;
 
 			    case TOK_FLOWID:
-				    p20 = &p.fs.flow_mask.flow_id6;
+				    mask->addr_type = 6;
+				    p20 = &mask->flow_id6;
 				    break;
 
 			    case TOK_DSTPORT:
-				    p16 = &p.fs.flow_mask.dst_port;
+				    p16 = &mask->dst_port;
 				    break;
 
 			    case TOK_SRCPORT:
-				    p16 = &p.fs.flow_mask.src_port;
+				    p16 = &mask->src_port;
 				    break;
 
 			    case TOK_PROTO:
@@ -857,10 +994,10 @@ ipfw_config_pipe(int ac, char **av)
 				    if (a > 0xFF)
 					    errx(EX_DATAERR,
 						"proto mask must be 8 bit");
-				    p.fs.flow_mask.proto = (uint8_t)a;
+				    fs->flow_mask.proto = (uint8_t)a;
 			    }
 			    if (a != 0)
-				    p.fs.flags_fs |= DN_HAVE_FLOW_MASK;
+				    *flags |= DN_HAVE_MASK;
 			    ac--; av++;
 			} /* end while, config masks */
 end_mask:
@@ -869,9 +1006,9 @@ end_mask:
 		case TOK_RED:
 		case TOK_GRED:
 			NEED1("red/gred needs w_q/min_th/max_th/max_p\n");
-			p.fs.flags_fs |= DN_IS_RED;
+			fs->flags |= DN_IS_RED;
 			if (tok == TOK_GRED)
-				p.fs.flags_fs |= DN_IS_GENTLE_RED;
+				fs->flags |= DN_IS_GENTLE_RED;
 			/*
 			 * the format for parameters is w_q/min_th/max_th/max_p
 			 */
@@ -879,82 +1016,108 @@ end_mask:
 			    double w_q = strtod(end, NULL);
 			    if (w_q > 1 || w_q <= 0)
 				errx(EX_DATAERR, "0 < w_q <= 1");
-			    p.fs.w_q = (int) (w_q * (1 << SCALE_RED));
+			    fs->w_q = (int) (w_q * (1 << SCALE_RED));
 			}
 			if ((end = strsep(&av[0], "/"))) {
-			    p.fs.min_th = strtoul(end, &end, 0);
+			    fs->min_th = strtoul(end, &end, 0);
 			    if (*end == 'K' || *end == 'k')
-				p.fs.min_th *= 1024;
+				fs->min_th *= 1024;
 			}
 			if ((end = strsep(&av[0], "/"))) {
-			    p.fs.max_th = strtoul(end, &end, 0);
+			    fs->max_th = strtoul(end, &end, 0);
 			    if (*end == 'K' || *end == 'k')
-				p.fs.max_th *= 1024;
+				fs->max_th *= 1024;
 			}
 			if ((end = strsep(&av[0], "/"))) {
 			    double max_p = strtod(end, NULL);
 			    if (max_p > 1 || max_p <= 0)
 				errx(EX_DATAERR, "0 < max_p <= 1");
-			    p.fs.max_p = (int)(max_p * (1 << SCALE_RED));
+			    fs->max_p = (int)(max_p * (1 << SCALE_RED));
 			}
 			ac--; av++;
 			break;
 
 		case TOK_DROPTAIL:
-			p.fs.flags_fs &= ~(DN_IS_RED|DN_IS_GENTLE_RED);
+			NEED(fs, "droptail is only for flowsets");
+			fs->flags &= ~(DN_IS_RED|DN_IS_GENTLE_RED);
 			break;
 
 		case TOK_BW:
+			NEED(p, "bw is only for links");
 			NEED1("bw needs bandwidth or interface\n");
-			if (co.do_pipe != 1)
-			    errx(EX_DATAERR, "bandwidth only valid for pipes");
-			read_bandwidth(av[0], &p.bandwidth, p.if_name, sizeof(p.if_name));
+			read_bandwidth(av[0], &p->bandwidth, NULL, 0);
 			ac--; av++;
 			break;
 
 		case TOK_DELAY:
-			if (co.do_pipe != 1)
-				errx(EX_DATAERR, "delay only valid for pipes");
+			NEED(p, "delay is only for links");
 			NEED1("delay needs argument 0..10000ms\n");
-			p.delay = strtoul(av[0], NULL, 0);
+			p->delay = strtoul(av[0], NULL, 0);
+			ac--; av++;
+			break;
+
+		case TOK_TYPE: {
+			int l;
+			NEED(sch, "type is only for schedulers");
+			NEED1("type needs a string");
+			l = strlen(av[0]);
+			if (l == 0 || l > 15)
+				errx(1, "type %s too long\n", av[0]);
+			strcpy(sch->name, av[0]);
+			sch->oid.subtype = 0; /* use string */
 			ac--; av++;
 			break;
+		    }
 
 		case TOK_WEIGHT:
-			if (co.do_pipe == 1)
-				errx(EX_DATAERR,"weight only valid for queues");
-			NEED1("weight needs argument 0..100\n");
-			p.fs.weight = strtoul(av[0], &end, 0);
+			NEED(fs, "weight is only for flowsets");
+			NEED1("weight needs argument\n");
+			fs->par[0] = strtol(av[0], &end, 0);
+			ac--; av++;
+			break;
+
+		case TOK_LMAX:
+			NEED(fs, "lmax is only for flowsets");
+			NEED1("lmax needs argument\n");
+			fs->par[1] = strtol(av[0], &end, 0);
 			ac--; av++;
 			break;
 
+		case TOK_PRI:
+			NEED(fs, "priority is only for flowsets");
+			NEED1("priority needs argument\n");
+			fs->par[2] = strtol(av[0], &end, 0);
+			ac--; av++;
+			break;
+
+		case TOK_SCHED:
 		case TOK_PIPE:
-			if (co.do_pipe == 1)
-				errx(EX_DATAERR,"pipe only valid for queues");
-			NEED1("pipe needs pipe_number\n");
-			p.fs.parent_nr = strtoul(av[0], &end, 0);
+			NEED(fs, "pipe/sched");
+			NEED1("pipe/link/sched needs number\n");
+			fs->sched_nr = strtoul(av[0], &end, 0);
 			ac--; av++;
 			break;
 
-		case TOK_PIPE_PROFILE:
-			if (co.do_pipe != 1)
-			    errx(EX_DATAERR, "extra delay only valid for pipes");
+		case TOK_PROFILE:
+			NEED((!pf), "profile already set");
+			NEED(p, "profile");
+		    {
 			NEED1("extra delay needs the file name\n");
-			p.samples = &samples[0];
-			load_extra_delays(av[0], &p);
+			pf = o_next(&buf, sizeof(*pf), DN_PROFILE);
+			load_extra_delays(av[0], pf, p); //XXX can't fail?
 			--ac; ++av;
+		    }
 			break;
 
 		case TOK_BURST:
-			if (co.do_pipe != 1)
-				errx(EX_DATAERR, "burst only valid for pipes");
+			NEED(p, "burst");
 			NEED1("burst needs argument\n");
 			errno = 0;
-			if (expand_number(av[0], (int64_t *)&p.burst) < 0)
+			if (expand_number(av[0], (int64_t *)&p->burst) < 0)
 				if (errno != ERANGE)
 					errx(EX_DATAERR,
 					    "burst: invalid argument");
-			if (errno || p.burst > (1ULL << 48) - 1)
+			if (errno || p->burst > (1ULL << 48) - 1)
 				errx(EX_DATAERR,
 				    "burst: out of range (0..2^48-1)");
 			ac--; av++;
@@ -964,26 +1127,17 @@ end_mask:
 			errx(EX_DATAERR, "unrecognised option ``%s''", av[-1]);
 		}
 	}
-	if (co.do_pipe == 1) {
-		if (p.pipe_nr == 0)
-			errx(EX_DATAERR, "pipe_nr must be > 0");
-		if (p.delay > 10000)
-			errx(EX_DATAERR, "delay must be < 10000");
-	} else { /* co.do_pipe == 2, queue */
-		if (p.fs.parent_nr == 0)
-			errx(EX_DATAERR, "pipe must be > 0");
-		if (p.fs.weight >100)
-			errx(EX_DATAERR, "weight must be <= 100");
-	}
 
-	/* check for bandwidth value */
-	if (p.bandwidth == -1) {
-		p.bandwidth = 0;
-		if (p.samples_no > 0)
-			errx(EX_DATAERR, "profile requires a bandwidth limit");
+	/* check validity of parameters */
+	if (p) {
+		if (p->delay > 10000)
+			errx(EX_DATAERR, "delay must be < 10000");
+		if (p->bandwidth == -1)
+			p->bandwidth = 0;
 	}
-
-	if (p.fs.flags_fs & DN_QSIZE_IS_BYTES) {
+	if (fs) {
+		/* XXX accept a 0 scheduler to keep the default */
+	    if (fs->flags & DN_QSIZE_BYTES) {
 		size_t len;
 		long limit;
 
@@ -991,9 +1145,9 @@ end_mask:
 		if (sysctlbyname("net.inet.ip.dummynet.pipe_byte_limit",
 			&limit, &len, NULL, 0) == -1)
 			limit = 1024*1024;
-		if (p.fs.qsize > limit)
+		if (fs->qsize > limit)
 			errx(EX_DATAERR, "queue size must be < %ldB", limit);
-	} else {
+	    } else {
 		size_t len;
 		long limit;
 
@@ -1001,27 +1155,25 @@ end_mask:
 		if (sysctlbyname("net.inet.ip.dummynet.pipe_slot_limit",
 			&limit, &len, NULL, 0) == -1)
 			limit = 100;
-		if (p.fs.qsize > limit)
+		if (fs->qsize > limit)
 			errx(EX_DATAERR, "2 <= queue size <= %ld", limit);
-	}
-	if (p.fs.flags_fs & DN_IS_RED) {
+	    }
+
+	    if (fs->flags & DN_IS_RED) {
 		size_t len;
 		int lookup_depth, avg_pkt_size;
-		double s, idle, weight, w_q;
-		struct clockinfo ck;
-		int t;
+		double w_q;
 
-		if (p.fs.min_th >= p.fs.max_th)
+		if (fs->min_th >= fs->max_th)
 		    errx(EX_DATAERR, "min_th %d must be < than max_th %d",
-			p.fs.min_th, p.fs.max_th);
-		if (p.fs.max_th == 0)
+			fs->min_th, fs->max_th);
+		if (fs->max_th == 0)
 		    errx(EX_DATAERR, "max_th must be > 0");
 
 		len = sizeof(int);
 		if (sysctlbyname("net.inet.ip.dummynet.red_lookup_depth",
 			&lookup_depth, &len, NULL, 0) == -1)
-		    errx(1, "sysctlbyname(\"%s\")",
-			"net.inet.ip.dummynet.red_lookup_depth");
+			lookup_depth = 256;
 		if (lookup_depth == 0)
 		    errx(EX_DATAERR, "net.inet.ip.dummynet.red_lookup_depth"
 			" must be greater than zero");
@@ -1029,18 +1181,13 @@ end_mask:
 		len = sizeof(int);
 		if (sysctlbyname("net.inet.ip.dummynet.red_avg_pkt_size",
 			&avg_pkt_size, &len, NULL, 0) == -1)
+			avg_pkt_size = 512;
 
-		    errx(1, "sysctlbyname(\"%s\")",
-			"net.inet.ip.dummynet.red_avg_pkt_size");
 		if (avg_pkt_size == 0)
 			errx(EX_DATAERR,
 			    "net.inet.ip.dummynet.red_avg_pkt_size must"
 			    " be greater than zero");
 
-		len = sizeof(struct clockinfo);
-		if (sysctlbyname("kern.clockrate", &ck, &len, NULL, 0) == -1)
-			errx(1, "sysctlbyname(\"%s\")", "kern.clockrate");
-
 		/*
 		 * Ticks needed for sending a medium-sized packet.
 		 * Unfortunately, when we are configuring a WF2Q+ queue, we
@@ -1050,38 +1197,77 @@ end_mask:
 		 * correct. But on the other hand, why do we want RED with
 		 * WF2Q+ ?
 		 */
+#if 0
 		if (p.bandwidth==0) /* this is a WF2Q+ queue */
 			s = 0;
 		else
 			s = (double)ck.hz * avg_pkt_size * 8 / p.bandwidth;
-
+#endif
 		/*
 		 * max idle time (in ticks) before avg queue size becomes 0.
 		 * NOTA:  (3/w_q) is approx the value x so that
 		 * (1-w_q)^x < 10^-3.
 		 */
-		w_q = ((double)p.fs.w_q) / (1 << SCALE_RED);
+		w_q = ((double)fs->w_q) / (1 << SCALE_RED);
+#if 0 // go in kernel
 		idle = s * 3. / w_q;
-		p.fs.lookup_step = (int)idle / lookup_depth;
-		if (!p.fs.lookup_step)
-			p.fs.lookup_step = 1;
+		fs->lookup_step = (int)idle / lookup_depth;
+		if (!fs->lookup_step)
+			fs->lookup_step = 1;
 		weight = 1 - w_q;
-		for (t = p.fs.lookup_step; t > 1; --t)
+		for (t = fs->lookup_step; t > 1; --t)
 			weight *= 1 - w_q;
-		p.fs.lookup_weight = (int)(weight * (1 << SCALE_RED));
+		fs->lookup_weight = (int)(weight * (1 << SCALE_RED));
+#endif
+	    }
 	}
-	if (p.samples_no <= 0) {
-		i = do_cmd(IP_DUMMYNET_CONFIGURE, &p, sizeof p);
-	} else {
-		struct dn_pipe_max pm;
-		int len = sizeof(pm);
-
-		memcpy(&pm.pipe, &p, sizeof(pm.pipe));
-		memcpy(&pm.samples, samples, sizeof(pm.samples));
 
-		i = do_cmd(IP_DUMMYNET_CONFIGURE, &pm, len);
-	}
+	i = do_cmd(IP_DUMMYNET3, base, (char *)buf - (char *)base);
 
 	if (i)
 		err(1, "setsockopt(%s)", "IP_DUMMYNET_CONFIGURE");
 }
+
+void
+dummynet_flush(void)
+{
+	struct dn_id oid;
+	oid_fill(&oid, sizeof(oid), DN_CMD_FLUSH, DN_API_VERSION);
+	do_cmd(IP_DUMMYNET3, &oid, oid.len);
+}
+
+/* main entry point for dummynet list functions. co.do_pipe indicates
+ * which function we want to support.
+ * XXX todo- accept filtering arguments.
+ */
+void
+dummynet_list(int ac, char *av[], int show_counters)
+{
+	struct dn_id oid, *x;
+	int ret, l = sizeof(oid);
+
+	oid_fill(&oid, l, DN_CMD_GET, DN_API_VERSION);
+	switch (co.do_pipe) {
+	case 1:
+		oid.subtype = DN_LINK;	/* list pipe */
+		break;
+	case 2:
+		oid.subtype = DN_FS;	/* list queue */
+		break;
+	case 3:
+		oid.subtype = DN_SCH;	/* list sched */
+		break;
+	}
+	ret = do_cmd(-IP_DUMMYNET3, &oid, (uintptr_t)&l);
+	// printf("%s returns %d need %d\n", __FUNCTION__, ret, oid.id);
+	if (ret != 0 || oid.id <= sizeof(oid))
+		return;
+	l = oid.id;
+	x = safe_calloc(1, l);
+	*x = oid;
+	ret = do_cmd(-IP_DUMMYNET3, x, (uintptr_t)&l);
+	// printf("%s returns %d need %d\n", __FUNCTION__, ret, oid.id);
+	// XXX filter on ac, av
+	list_pipes(x, O_NEXT(x, l));
+	free(x);
+}
diff --git a/sbin/ipfw/ipfw.8 b/sbin/ipfw/ipfw.8
index fc83ecc..d605d5e 100644
--- a/sbin/ipfw/ipfw.8
+++ b/sbin/ipfw/ipfw.8
@@ -6,8 +6,10 @@
 .Os
 .Sh NAME
 .Nm ipfw
-.Nd IP firewall and traffic shaper control program
+.Nd User interface for firewall, traffic shaper, packet scheduler,
+in-kernel NAT.
 .Sh SYNOPSIS
+.Ss FIREWALL CONFIGURATION
 .Nm
 .Op Fl cq
 .Cm add
@@ -26,12 +28,6 @@
 .Op Cm set Ar N
 .Brq Cm delete | zero | resetlog
 .Op Ar number ...
-.Nm
-.Cm enable
-.Brq Cm firewall | altq | one_pass | debug | verbose | dyn_keepalive
-.Nm
-.Cm disable
-.Brq Cm firewall | altq | one_pass | debug | verbose | dyn_keepalive
 .Pp
 .Nm
 .Cm set Oo Cm disable Ar number ... Oc Op Cm enable Ar number ...
@@ -43,8 +39,17 @@
 .Cm set swap Ar number number
 .Nm
 .Cm set show
+.Ss SYSCTL SHORTCUTS
 .Pp
 .Nm
+.Cm enable
+.Brq Cm firewall | altq | one_pass | debug | verbose | dyn_keepalive
+.Nm
+.Cm disable
+.Brq Cm firewall | altq | one_pass | debug | verbose | dyn_keepalive
+.Pp
+.Ss LOOKUP TABLES
+.Nm
 .Cm table Ar number Cm add Ar addr Ns Oo / Ns Ar masklen Oc Op Ar value
 .Nm
 .Cm table Ar number Cm delete Ar addr Ns Op / Ns Ar masklen
@@ -57,17 +62,19 @@
 .Brq Ar number | all
 .Cm list
 .Pp
+.Ss DUMMYNET CONFIGURATION (TRAFFIC SHAPER AND PACKET SCHEDULER)
 .Nm
-.Brq Cm pipe | queue
+.Brq Cm pipe | queue | sched
 .Ar number
 .Cm config
 .Ar config-options
 .Nm
 .Op Fl s Op Ar field
-.Brq Cm pipe | queue
+.Brq Cm pipe | queue | sched
 .Brq Cm delete | list | show
 .Op Ar number ...
 .Pp
+.Ss IN-KERNEL NAT
 .Nm
 .Op Fl q
 .Cm nat
@@ -89,28 +96,27 @@ The
 .Nm
 utility is the user interface for controlling the
 .Xr ipfw 4
-firewall and the
+firewall, the
 .Xr dummynet 4
-traffic shaper in
-.Fx .
+traffic shaper/packet scheduler, and the
+in-kernel NAT services.
 .Pp
-An
-.Nm
-configuration, or
+A firewall configuration, or
 .Em ruleset ,
 is made of a list of
 .Em rules
 numbered from 1 to 65535.
-Packets are passed to
-.Nm
+Packets are passed to the firewall
 from a number of different places in the protocol stack
 (depending on the source and destination of the packet,
-it is possible that
-.Nm
-is invoked multiple times on the same packet).
+it is possible for the firewall to be
+invoked multiple times on the same packet).
 The packet passed to the firewall is compared
-against each of the rules in the firewall
-.Em ruleset .
+against each of the rules in the
+.Em ruleset ,
+in rule-number order
+(multiple rules with the same number are permitted, in which case
+they are processed in order of insertion).
 When a match is found, the action corresponding to the
 matching rule is performed.
 .Pp
@@ -118,9 +124,7 @@ Depending on the action and certain system settings, packets
 can be reinjected into the firewall at some rule after the
 matching one for further processing.
 .Pp
-An
-.Nm
-ruleset always includes a
+A ruleset always includes a
 .Em default
 rule (numbered 65535) which cannot be modified or deleted,
 and matches all packets.
@@ -137,14 +141,14 @@ If the ruleset includes one or more rules with the
 or
 .Cm limit
 option,
-.Nm
-will have a
+the firewall will have a
 .Em stateful
-behaviour, i.e., upon a match it will create dynamic rules matching
-the exact parameters (source and destination addresses and ports)
-of the matching packet.
-.Pp
-These dynamic rules, which have a limited lifetime, are checked
+behaviour, i.e., upon a match it will create
+.Em dynamic rules ,
+i.e. rules that match packets with the same 5-tuple
+(protocol, source and destination addresses and ports)
+as the packet which caused their creation.
+Dynamic rules, which have a limited lifetime, are checked
 at the first occurrence of a
 .Cm check-state ,
 .Cm keep-state
@@ -283,6 +287,7 @@ When listing, show last match timestamp as seconds from the epoch.
 This form can be more convenient for postprocessing by scripts.
 .El
 .Pp
+.Ss LIST OF RULES AND PREPROCESSING
 To ease configuration, rules can be put into a file which is
 processed using
 .Nm
@@ -322,14 +327,16 @@ This allows for flexible configuration files (like conditionalizing
 them on the local hostname) and the use of macros to centralize
 frequently required arguments like IP addresses.
 .Pp
+.Ss TRAFFIC SHAPER CONFIGURATION
 The
 .Nm
-.Cm pipe
+.Cm pipe , queue
 and
-.Cm queue
-commands are used to configure the traffic shaper, as shown in the
+.Cm sched
+commands are used to configure the traffic shaper and packet scheduler.
+See the
 .Sx TRAFFIC SHAPER (DUMMYNET) CONFIGURATION
-Section below.
+Section below for details.
 .Pp
 If the world and the kernel get out of sync the
 .Nm
@@ -362,7 +369,7 @@ have this picture in mind in order to design a correct ruleset.
        |      to devices       |
 .Ed
 .Pp
-As can be noted from the above picture, the number of
+The number of
 times the same packet goes through the firewall can
 vary between 0 and 4 depending on packet source and
 destination, and system configuration.
@@ -421,9 +428,9 @@ Keywords are case-sensitive, whereas arguments may
 or may not be case-sensitive depending on their nature
 (e.g.\& uid's are, hostnames are not).
 .Pp
-In
-.Nm ipfw2
-you can introduce spaces after commas ',' to make
+Some arguments (e.g. port or address lists) are comma-separated
+lists of values.
+In this case, spaces after commas ',' are allowed to make
 the line more readable.
 You can also put the entire
 command (including flags) into a single argument.
@@ -434,9 +441,7 @@ ipfw -q add deny src-ip 10.0.0.0/24, 127.0.0.1/8
 ipfw "-q add deny src-ip 10.0.0.0/24, 127.0.0.1/8"
 .Ed
 .Sh RULE FORMAT
-The format of
-.Nm
-rules is the following:
+The format of firewall rules is the following:
 .Bd -ragged -offset indent
 .Bk -words
 .Op Ar rule_number
@@ -496,7 +501,7 @@ in future forwarding decisions.
 .El
 .Pp
 Note that some of the above information, e.g.\& source MAC or IP addresses and
-TCP/UDP ports, could easily be spoofed, so filtering on those fields
+TCP/UDP ports, can be easily spoofed, so filtering on those fields
 alone might not guarantee the desired results.
 .Bl -tag -width indent
 .It Ar rule_number
@@ -1643,15 +1648,17 @@ because it engages only on packets with source addresses of directly
 connected networks instead of all source addresses.
 .El
 .Sh LOOKUP TABLES
-Lookup tables are useful to handle large sparse address sets,
-typically from a hundred to several thousands of entries.
+Lookup tables are useful to handle large sparse sets of
+addresses or other search keys (e.g. ports, jail IDs).
+In the rest of this section we will use the term ``address''
+to mean any unsigned value of up to 32-bit.
 There may be up to 128 different lookup tables, numbered 0 to 127.
 .Pp
 Each entry is represented by an
 .Ar addr Ns Op / Ns Ar masklen
 and will match all addresses with base
 .Ar addr
-(specified as an IP address or a hostname)
+(specified as an IP address, a hostname or an unsigned integer)
 and mask width of
 .Ar masklen
 bits.
@@ -1669,9 +1676,9 @@ is not specified, it defaults to 0.
 .Pp
 An entry can be added to a table
 .Pq Cm add ,
-removed from a table
-.Pq Cm delete ,
-a table can be examined
+or removed from a table
+.Pq Cm delete .
+A table can be examined
 .Pq Cm list
 or flushed
 .Pq Cm flush .
@@ -1680,7 +1687,7 @@ Internally, each table is stored in a Radix tree, the same way as
 the routing table (see
 .Xr route 4 ) .
 .Pp
-Lookup tables currently support IPv4 addresses only.
+Lookup tables currently support only ports, jail IDs and IPv4 addresses.
 .Pp
 The
 .Cm tablearg
@@ -1838,7 +1845,7 @@ for more examples on how to use dynamic rules.
 .Nm
 is also the user interface for the
 .Nm dummynet
-traffic shaper and network emulator, a subsystem that
+traffic shaper, packet scheduler and network emulator, a subsystem that
 can artificially queue, delay or drop packets
 emulator the behaviour of certain network links
 or queueing systems.
@@ -1858,17 +1865,17 @@ Packets are queued in front of the pipe as they come out from the classifier,
 and then transferred to the pipe according to the pipe's parameters.
 .It Em queue
 A queue
-is an abstraction used to implement the WF2Q+
-(Worst-case Fair Weighted Fair Queueing) policy, which is
-an efficient variant of the WFQ policy.
+is an abstraction used to implement packet scheduling
+using one of several packet scheduling algorithms.
 .Pp
 The queue associates a
 .Em weight
-and a reference pipe to each flow (a flow is a set of packets
+and a reference scheduler to each flow (a flow is a set of packets
 with the same addresses and ports after masking).
-All backlogged flows (i.e., those
-with packets queued) linked to the same pipe share the pipe's
-bandwidth proportionally to their weights.
+A scheduler in turn is connected to a pipe, and arbitrates
+the pipe's bandwidth among backlogged flows according to
+weights and to the features of the scheduling algorithm in use.
+.Pp
 Note that weights are not priorities; a flow with a lower weight
 is still guaranteed to get its fraction of the bandwidth even if a
 flow with a higher weight is permanently backlogged.
@@ -1911,16 +1918,19 @@ mode can be enabled by setting the
 .Xr sysctl 8
 variable to a non-zero value.
 .Pp
-.Ss PIPE AND QUEUE CONFIGURATION
+.Ss PIPE, QUEUE AND SCHEDULER CONFIGURATION
 The
-.Em pipe
-and
+.Em pipe ,
 .Em queue
+and
+.Em scheduler
 configuration commands are the following:
 .Bd -ragged -offset indent
 .Cm pipe Ar number Cm config Ar pipe-configuration
 .Pp
 .Cm queue Ar number Cm config Ar queue-configuration
+.Pp
+.Cm sched Ar number Cm config Ar sched-configuration
 .Ed
 .Pp
 The following parameters can be configured for a pipe:
@@ -2073,6 +2083,14 @@ Specifies the weight to be used for flows matching this queue.
 The weight must be in the range 1..100, and defaults to 1.
 .El
 .Pp
+The following parameters can be configured for a scheduler:
+.Pp
+.Bl -tag -width indent -compact
+.It Cm type Ar {fifo | wf2qp | rr | qfq}
+.El
+.Pp
+plus all the parameters allowed for a pipe.
+.Pp
 Finally, the following parameters can be configured for both
 pipes and queues:
 .Pp
diff --git a/sbin/ipfw/ipfw2.c b/sbin/ipfw/ipfw2.c
index d4740c9..9adce1b 100644
--- a/sbin/ipfw/ipfw2.c
+++ b/sbin/ipfw/ipfw2.c
@@ -57,7 +57,7 @@ struct cmdline_opts co;	/* global options */
 int resvd_set_number = RESVD_SET;
 
 #define GET_UINT_ARG(arg, min, max, tok, s_x) do {			\
-	if (!ac)							\
+	if (!av[0])							\
 		errx(EX_USAGE, "%s: missing argument", match_value(s_x, tok)); \
 	if (_substrcmp(*av, "tablearg") == 0) {				\
 		arg = IP_FW_TABLEARG;					\
@@ -65,23 +65,23 @@ int resvd_set_number = RESVD_SET;
 	}								\
 									\
 	{								\
-	long val;							\
+	long _xval;							\
 	char *end;							\
 									\
-	val = strtol(*av, &end, 10);					\
+	_xval = strtol(*av, &end, 10);					\
 									\
-	if (!isdigit(**av) || *end != '\0' || (val == 0 && errno == EINVAL)) \
+	if (!isdigit(**av) || *end != '\0' || (_xval == 0 && errno == EINVAL)) \
 		errx(EX_DATAERR, "%s: invalid argument: %s",		\
 		    match_value(s_x, tok), *av);			\
 									\
-	if (errno == ERANGE || val < min || val > max)			\
+	if (errno == ERANGE || _xval < min || _xval > max)		\
 		errx(EX_DATAERR, "%s: argument is out of range (%u..%u): %s", \
 		    match_value(s_x, tok), min, max, *av);		\
 									\
-	if (val == IP_FW_TABLEARG)					\
+	if (_xval == IP_FW_TABLEARG)					\
 		errx(EX_DATAERR, "%s: illegal argument value: %s",	\
 		    match_value(s_x, tok), *av);			\
-	arg = val;							\
+	arg = _xval;							\
 	}								\
 } while (0)
 
@@ -353,6 +353,7 @@ safe_realloc(void *ptr, size_t size)
 
 /*
  * conditionally runs the command.
+ * Selected options or negative -> getsockopt
  */
 int
 do_cmd(int optname, void *optval, uintptr_t optlen)
@@ -372,11 +373,15 @@ do_cmd(int optname, void *optval, uintptr_t optlen)
 	    optname == IP_FW_ADD || optname == IP_FW_TABLE_LIST ||
 	    optname == IP_FW_TABLE_GETSIZE || 
 	    optname == IP_FW_NAT_GET_CONFIG || 
-	    optname == IP_FW_NAT_GET_LOG)
+	    optname < 0 ||
+	    optname == IP_FW_NAT_GET_LOG) {
+		if (optname < 0)
+			optname = -optname;
 		i = getsockopt(s, IPPROTO_IP, optname, optval,
 			(socklen_t *)optlen);
-	else
+	} else {
 		i = setsockopt(s, IPPROTO_IP, optname, optval, optlen);
+	}
 	return i;
 }
 
@@ -749,7 +754,7 @@ static void
 print_ip(ipfw_insn_ip *cmd, char const *s)
 {
 	struct hostent *he = NULL;
-	int len = F_LEN((ipfw_insn *)cmd);
+	uint32_t len = F_LEN((ipfw_insn *)cmd);
 	uint32_t *a = ((ipfw_insn_u32 *)cmd)->d;
 
 	if (cmd->o.opcode == O_IP_DST_LOOKUP && len > F_INSN_SIZE(ipfw_insn_u32)) {
@@ -1126,9 +1131,11 @@ show_ipfw(struct ip_fw *rule, int pcwidth, int bcwidth)
 		else
 			printf(" log");
 	}
+#ifndef NO_ALTQ
 	if (altqptr) {
 		print_altq_cmd(altqptr);
 	}
+#endif
 	if (tagptr) {
 		if (tagptr->len & F_NOT)
 			PRINT_UINT_ARG(" untag ", tagptr->arg1);
@@ -1606,17 +1613,16 @@ show_dyn_ipfw(ipfw_dyn_rule *d, int pcwidth, int bcwidth)
  * 	ipfw set move rule X to Y
  */
 void
-ipfw_sets_handler(int ac, char *av[])
+ipfw_sets_handler(char *av[])
 {
 	uint32_t set_disable, masks[2];
 	int i, nbytes;
 	uint16_t rulenum;
 	uint8_t cmd, new_set;
 
-	ac--;
 	av++;
 
-	if (!ac)
+	if (av[0] == NULL)
 		errx(EX_USAGE, "set needs command");
 	if (_substrcmp(*av, "show") == 0) {
 		void *data;
@@ -1642,8 +1648,8 @@ ipfw_sets_handler(int ac, char *av[])
 			}
 		printf("\n");
 	} else if (_substrcmp(*av, "swap") == 0) {
-		ac--; av++;
-		if (ac != 2)
+		av++;
+		if ( av[0] == NULL || av[1] == NULL )
 			errx(EX_USAGE, "set swap needs 2 set numbers\n");
 		rulenum = atoi(av[0]);
 		new_set = atoi(av[1]);
@@ -1654,13 +1660,14 @@ ipfw_sets_handler(int ac, char *av[])
 		masks[0] = (4 << 24) | (new_set << 16) | (rulenum);
 		i = do_cmd(IP_FW_DEL, masks, sizeof(uint32_t));
 	} else if (_substrcmp(*av, "move") == 0) {
-		ac--; av++;
-		if (ac && _substrcmp(*av, "rule") == 0) {
+		av++;
+		if (!av[0] && _substrcmp(*av, "rule") == 0) {
 			cmd = 2;
-			ac--; av++;
+			av++;
 		} else
 			cmd = 3;
-		if (ac != 3 || _substrcmp(av[1], "to") != 0)
+		if (av[0] == NULL || av[1] == NULL || av[2] == NULL ||
+				av[3] != NULL ||  _substrcmp(av[1], "to") != 0)
 			errx(EX_USAGE, "syntax: set move [rule] X to Y\n");
 		rulenum = atoi(av[0]);
 		new_set = atoi(av[2]);
@@ -1675,10 +1682,10 @@ ipfw_sets_handler(int ac, char *av[])
 		   _substrcmp(*av, "enable") == 0 ) {
 		int which = _substrcmp(*av, "enable") == 0 ? 1 : 0;
 
-		ac--; av++;
+		av++;
 		masks[0] = masks[1] = 0;
 
-		while (ac) {
+		while (!av[0]) {
 			if (isdigit(**av)) {
 				i = atoi(*av);
 				if (i < 0 || i > RESVD_SET)
@@ -1692,7 +1699,7 @@ ipfw_sets_handler(int ac, char *av[])
 			else
 				errx(EX_DATAERR,
 					"invalid set command %s\n", *av);
-			av++; ac--;
+			av++;
 		}
 		if ( (masks[0] & masks[1]) != 0 )
 			errx(EX_DATAERR,
@@ -1706,12 +1713,11 @@ ipfw_sets_handler(int ac, char *av[])
 }
 
 void
-ipfw_sysctl_handler(int ac, char *av[], int which)
+ipfw_sysctl_handler(char *av[], int which)
 {
-	ac--;
 	av++;
 
-	if (ac == 0) {
+	if (av[0] == NULL) {
 		warnx("missing keyword to enable/disable\n");
 	} else if (_substrcmp(*av, "firewall") == 0) {
 		sysctlbyname("net.inet.ip.fw.enable", NULL, 0,
@@ -1728,8 +1734,10 @@ ipfw_sysctl_handler(int ac, char *av[], int which)
 	} else if (_substrcmp(*av, "dyn_keepalive") == 0) {
 		sysctlbyname("net.inet.ip.fw.dyn_keepalive", NULL, 0,
 		    &which, sizeof(which));
+#ifndef NO_ALTQ
 	} else if (_substrcmp(*av, "altq") == 0) {
 		altq_set_enabled(which);
+#endif
 	} else {
 		warnx("unrecognize enable/disable keyword: %s\n", *av);
 	}
@@ -1762,6 +1770,10 @@ ipfw_list(int ac, char *av[], int show_counters)
 		fprintf(stderr, "Testing only, list disabled\n");
 		return;
 	}
+	if (co.do_pipe) {
+		dummynet_list(ac, av, show_counters);
+		return;
+	}
 
 	ac--;
 	av++;
@@ -1778,11 +1790,6 @@ ipfw_list(int ac, char *av[], int show_counters)
 				co.do_pipe ? "DUMMYNET" : "FW");
 	}
 
-	if (co.do_pipe) {
-		ipfw_list_pipes(data, nbytes, ac, av);
-		goto done;
-	}
-
 	/*
 	 * Count static rules. They have variable size so we
 	 * need to scan the list to count them.
@@ -2130,7 +2137,7 @@ fill_ip(ipfw_insn_ip *cmd, char *av)
 		return;
 	}
 	/* A single IP can be stored in an optimized format */
-	if (d[1] == ~0 && av == NULL && len == 0) {
+	if (d[1] == (uint32_t)~0 && av == NULL && len == 0) {
 		cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32);
 		return;
 	}
@@ -2199,29 +2206,28 @@ fill_flags(ipfw_insn *cmd, enum ipfw_opcodes opcode,
 
 
 void
-ipfw_delete(int ac, char *av[])
+ipfw_delete(char *av[])
 {
 	uint32_t rulenum;
 	int i;
 	int exitval = EX_OK;
 	int do_set = 0;
 
-
-	av++; ac--;
+	av++;
 	NEED1("missing rule specification");
-	if (ac > 0 && _substrcmp(*av, "set") == 0) {
+	if ( *av && _substrcmp(*av, "set") == 0) {
 		/* Do not allow using the following syntax:
 		 *	ipfw set N delete set M
 		 */
 		if (co.use_set)
 			errx(EX_DATAERR, "invalid syntax");
 		do_set = 1;	/* delete set */
-		ac--; av++;
+		av++;
 	}
 
 	/* Rule number */
-	while (ac && isdigit(**av)) {
-		i = atoi(*av); av++; ac--;
+	while (*av && isdigit(**av)) {
+		i = atoi(*av); av++;
 		if (co.do_nat) {
 			exitval = do_cmd(IP_FW_NAT_DEL, &i, sizeof i);
 			if (exitval) {
@@ -2275,7 +2281,8 @@ fill_iface(ipfw_insn_if *cmd, char *arg)
 static void
 get_mac_addr_mask(const char *p, uint8_t *addr, uint8_t *mask)
 {
-	int i, l;
+	int i;
+	size_t l;
 	char *ap, *ptr, *optr;
 	struct ether_addr *mac;
 	const char *macset = "0123456789abcdefABCDEF:";
@@ -2297,11 +2304,11 @@ get_mac_addr_mask(const char *p, uint8_t *addr, uint8_t *mask)
 
 	if (ptr != NULL) { /* we have mask? */
 		if (p[ptr - optr - 1] == '/') { /* mask len */
-			l = strtol(ptr, &ap, 10);
-			if (*ap != 0 || l > ETHER_ADDR_LEN * 8 || l < 0)
+			long ml = strtol(ptr, &ap, 10);
+			if (*ap != 0 || ml > ETHER_ADDR_LEN * 8 || ml < 0)
 				errx(EX_DATAERR, "Incorrect mask length");
-			for (i = 0; l > 0 && i < ETHER_ADDR_LEN; l -= 8, i++)
-				mask[i] = (l >= 8) ? 0xff: (~0) << (8 - l);
+			for (i = 0; ml > 0 && i < ETHER_ADDR_LEN; ml -= 8, i++)
+				mask[i] = (ml >= 8) ? 0xff: (~0) << (8 - ml);
 		} else { /* mask */
 			l = strlen(ptr);
 			if (strspn(ptr, macset) != l ||
@@ -2336,7 +2343,7 @@ next_cmd(ipfw_insn *cmd)
  * Takes arguments and copies them into a comment
  */
 static void
-fill_comment(ipfw_insn *cmd, int ac, char **av)
+fill_comment(ipfw_insn *cmd, char **av)
 {
 	int i, l;
 	char *p = (char *)(cmd + 1);
@@ -2345,7 +2352,7 @@ fill_comment(ipfw_insn *cmd, int ac, char **av)
 	cmd->len =  (cmd->len & (F_NOT | F_OR));
 
 	/* Compute length of comment string. */
-	for (i = 0, l = 0; i < ac; i++)
+	for (i = 0, l = 0; av[i] != NULL; i++)
 		l += strlen(av[i]) + 1;
 	if (l == 0)
 		return;
@@ -2354,7 +2361,7 @@ fill_comment(ipfw_insn *cmd, int ac, char **av)
 		    "comment too long (max 80 chars)");
 	l = 1 + (l+3)/4;
 	cmd->len =  (cmd->len & (F_NOT | F_OR)) | l;
-	for (i = 0; i < ac; i++) {
+	for (i = 0; av[i] != NULL; i++) {
 		strcpy(p, av[i]);
 		p += strlen(av[i]);
 		*p++ = ' ';
@@ -2379,11 +2386,11 @@ fill_cmd(ipfw_insn *cmd, enum ipfw_opcodes opcode, int flags, uint16_t arg)
  * two microinstructions, and returns the pointer to the last one.
  */
 static ipfw_insn *
-add_mac(ipfw_insn *cmd, int ac, char *av[])
+add_mac(ipfw_insn *cmd, char *av[])
 {
 	ipfw_insn_mac *mac;
 
-	if (ac < 2)
+	if ( ( av[0] == NULL ) || ( av[1] == NULL ) )
 		errx(EX_DATAERR, "MAC dst src");
 
 	cmd->opcode = O_MACADDR2;
@@ -2397,9 +2404,9 @@ add_mac(ipfw_insn *cmd, int ac, char *av[])
 }
 
 static ipfw_insn *
-add_mactype(ipfw_insn *cmd, int ac, char *av)
+add_mactype(ipfw_insn *cmd, char *av)
 {
-	if (ac < 1)
+	if (!av)
 		errx(EX_DATAERR, "missing MAC type");
 	if (strcmp(av, "any") != 0) { /* we have a non-null type */
 		fill_newports((ipfw_insn_u16 *)cmd, av, IPPROTO_ETHERTYPE);
@@ -2507,6 +2514,7 @@ add_dstip(ipfw_insn *cmd, char *av)
 static ipfw_insn *
 add_ports(ipfw_insn *cmd, char *av, u_char proto, int opcode)
 {
+	/* XXX "any" is trapped before. Perhaps "to" */
 	if (_substrcmp(av, "any") == 0) {
 		return NULL;
 	} else if (fill_newports((ipfw_insn_u16 *)cmd, av, proto)) {
@@ -2530,11 +2538,11 @@ add_src(ipfw_insn *cmd, char *av, u_char proto)
 		*ch = '\0';
 
 	if (proto == IPPROTO_IPV6  || strcmp(av, "me6") == 0 ||
-	    inet_pton(AF_INET6, host, &a))
+	    inet_pton(AF_INET6, host, &a) == 1)
 		ret = add_srcip6(cmd, av);
 	/* XXX: should check for IPv4, not !IPv6 */
 	if (ret == NULL && (proto == IPPROTO_IP || strcmp(av, "me") == 0 ||
-	    !inet_pton(AF_INET6, host, &a)))
+	    inet_pton(AF_INET6, host, &a) != 1))
 		ret = add_srcip(cmd, av);
 	if (ret == NULL && strcmp(av, "any") != 0)
 		ret = cmd;
@@ -2556,11 +2564,11 @@ add_dst(ipfw_insn *cmd, char *av, u_char proto)
 		*ch = '\0';
 
 	if (proto == IPPROTO_IPV6  || strcmp(av, "me6") == 0 ||
-	    inet_pton(AF_INET6, host, &a))
+	    inet_pton(AF_INET6, host, &a) == 1)
 		ret = add_dstip6(cmd, av);
 	/* XXX: should check for IPv4, not !IPv6 */
 	if (ret == NULL && (proto == IPPROTO_IP || strcmp(av, "me") == 0 ||
-	    !inet_pton(AF_INET6, host, &a)))
+	    inet_pton(AF_INET6, host, &a) != 1))
 		ret = add_dstip(cmd, av);
 	if (ret == NULL && strcmp(av, "any") != 0)
 		ret = cmd;
@@ -2582,7 +2590,7 @@ add_dst(ipfw_insn *cmd, char *av, u_char proto)
  *
  */
 void
-ipfw_add(int ac, char *av[])
+ipfw_add(char *av[])
 {
 	/*
 	 * rules are added into the 'rulebuf' and then copied in
@@ -2621,37 +2629,36 @@ ipfw_add(int ac, char *av[])
 	cmd = (ipfw_insn *)cmdbuf;
 	action = (ipfw_insn *)actbuf;
 
-	av++; ac--;
+	av++;
 
 	/* [rule N]	-- Rule number optional */
-	if (ac && isdigit(**av)) {
+	if (av[0] && isdigit(**av)) {
 		rule->rulenum = atoi(*av);
 		av++;
-		ac--;
 	}
 
 	/* [set N]	-- set number (0..RESVD_SET), optional */
-	if (ac > 1 && _substrcmp(*av, "set") == 0) {
+	if (av[0] && !av[1] && _substrcmp(*av, "set") == 0) {
 		int set = strtoul(av[1], NULL, 10);
 		if (set < 0 || set > RESVD_SET)
 			errx(EX_DATAERR, "illegal set %s", av[1]);
 		rule->set = set;
-		av += 2; ac -= 2;
+		av += 2;
 	}
 
 	/* [prob D]	-- match probability, optional */
-	if (ac > 1 && _substrcmp(*av, "prob") == 0) {
+	if (av[0] && av[1] && _substrcmp(*av, "prob") == 0) {
 		match_prob = strtod(av[1], NULL);
 
 		if (match_prob <= 0 || match_prob > 1)
 			errx(EX_DATAERR, "illegal match prob. %s", av[1]);
-		av += 2; ac -= 2;
+		av += 2;
 	}
 
 	/* action	-- mandatory */
 	NEED1("missing action");
 	i = match_token(rule_actions, *av);
-	ac--; av++;
+	av++;
 	action->len = 1;	/* default */
 	switch(i) {
 	case TOK_CHECKSTATE:
@@ -2687,14 +2694,14 @@ ipfw_add(int ac, char *av[])
 		action->opcode = O_REJECT;
 		NEED1("missing reject code");
 		fill_reject_code(&action->arg1, *av);
-		ac--; av++;
+		av++;
 		break;
 
 	case TOK_UNREACH6:
 		action->opcode = O_UNREACH6;
 		NEED1("missing unreach code");
 		fill_unreach6_code(&action->arg1, *av);
-		ac--; av++;
+		av++;
 		break;
 
 	case TOK_COUNT:
@@ -2727,7 +2734,7 @@ ipfw_add(int ac, char *av[])
 	case TOK_TEE:
 		action->opcode = O_TEE;
 chkarg:	
-		if (!ac)
+		if (!av[0])
 			errx(EX_USAGE, "missing argument for %s", *(av - 1));
 		if (isdigit(**av)) {
 			action->arg1 = strtoul(*av, NULL, 10);
@@ -2746,7 +2753,7 @@ chkarg:
 				errx(EX_DATAERR, "illegal divert/tee port");
 		} else
 			errx(EX_DATAERR, "illegal argument for %s", *(av - 1));
-		ac--; av++;
+		av++;
 		break;
 
 	case TOK_FORWARD: {
@@ -2784,13 +2791,13 @@ chkarg:
 			p->sa.sin_addr.s_addr = INADDR_ANY;
 		else
 			lookup_host(*av, &(p->sa.sin_addr));
-		ac--; av++;
+		av++;
 		break;
 	    }
 	case TOK_COMMENT:
 		/* pretend it is a 'count' rule followed by the comment */
 		action->opcode = O_COUNT;
-		ac++; av--;	/* go back... */
+		av--;		/* go back... */
 		break;
 
 	case TOK_SETFIB:
@@ -2805,7 +2812,7 @@ chkarg:
 			errx(EX_DATAERR, "fibs not suported.\n");
 		if (action->arg1 >= numfibs)  /* Temporary */
 			errx(EX_DATAERR, "fib too large.\n");
- 		ac--; av++;
+ 		av++;
  		break;
 	    }
 
@@ -2825,8 +2832,8 @@ chkarg:
 	 * If they exist, it go first in the cmdbuf, but then it is
 	 * skipped in the copy section to the end of the buffer.
 	 */
-	while (ac != 0 && (i = match_token(rule_action_params, *av)) != -1) {
-		ac--; av++;
+	while (av[0] != NULL && (i = match_token(rule_action_params, *av)) != -1) {
+		av++;
 		switch (i) {
 		case TOK_LOG:
 		    {
@@ -2839,15 +2846,15 @@ chkarg:
 			have_log = (ipfw_insn *)c;
 			cmd->len = F_INSN_SIZE(ipfw_insn_log);
 			cmd->opcode = O_LOG;
-			if (ac && _substrcmp(*av, "logamount") == 0) {
-				ac--; av++;
+			if (av[0] && _substrcmp(*av, "logamount") == 0) {
+				av++;
 				NEED1("logamount requires argument");
 				l = atoi(*av);
 				if (l < 0)
 					errx(EX_DATAERR,
 					    "logamount must be positive");
 				c->max_log = l;
-				ac--; av++;
+				av++;
 			} else {
 				len = sizeof(c->max_log);
 				if (sysctlbyname("net.inet.ip.fw.verbose_limit",
@@ -2858,6 +2865,7 @@ chkarg:
 		    }
 			break;
 
+#ifndef NO_ALTQ
 		case TOK_ALTQ:
 		    {
 			ipfw_insn_altq *a = (ipfw_insn_altq *)cmd;
@@ -2870,9 +2878,10 @@ chkarg:
 			cmd->len = F_INSN_SIZE(ipfw_insn_altq);
 			cmd->opcode = O_ALTQ;
 			a->qid = altq_name_to_qid(*av);
-			ac--; av++;
+			av++;
 		    }
 			break;
+#endif
 
 		case TOK_TAG:
 		case TOK_UNTAG: {
@@ -2885,7 +2894,7 @@ chkarg:
 			   rule_action_params);
 			have_tag = cmd;
 			fill_cmd(cmd, O_TAG, (i == TOK_TAG) ? 0: F_NOT, tag);
-			ac--; av++;
+			av++;
 			break;
 		}
 
@@ -2899,13 +2908,13 @@ chkarg:
 		goto done;
 
 #define OR_START(target)					\
-	if (ac && (*av[0] == '(' || *av[0] == '{')) {		\
+	if (av[0] && (*av[0] == '(' || *av[0] == '{')) { 	\
 		if (open_par)					\
 			errx(EX_USAGE, "nested \"(\" not allowed\n"); \
 		prev = NULL;					\
 		open_par = 1;					\
 		if ( (av[0])[1] == '\0') {			\
-			ac--; av++;				\
+			av++;					\
 		} else						\
 			(*av)++;				\
 	}							\
@@ -2914,30 +2923,30 @@ chkarg:
 
 #define	CLOSE_PAR						\
 	if (open_par) {						\
-		if (ac && (					\
+		if (av[0] && (					\
 		    strcmp(*av, ")") == 0 ||			\
 		    strcmp(*av, "}") == 0)) {			\
 			prev = NULL;				\
 			open_par = 0;				\
-			ac--; av++;				\
+			av++;					\
 		} else						\
 			errx(EX_USAGE, "missing \")\"\n");	\
 	}
 
 #define NOT_BLOCK						\
-	if (ac && _substrcmp(*av, "not") == 0) {		\
+	if (av[0] && _substrcmp(*av, "not") == 0) {		\
 		if (cmd->len & F_NOT)				\
 			errx(EX_USAGE, "double \"not\" not allowed\n"); \
 		cmd->len |= F_NOT;				\
-		ac--; av++;					\
+		av++;						\
 	}
 
 #define OR_BLOCK(target)					\
-	if (ac && _substrcmp(*av, "or") == 0) {		\
+	if (av[0] && _substrcmp(*av, "or") == 0) {		\
 		if (prev == NULL || open_par == 0)		\
 			errx(EX_DATAERR, "invalid OR block");	\
 		prev->len |= F_OR;				\
-		ac--; av++;					\
+		av++;					\
 		goto target;					\
 	}							\
 	CLOSE_PAR;
@@ -2954,15 +2963,15 @@ chkarg:
 	NEED1("missing protocol");
 	if (_substrcmp(*av, "MAC") == 0 ||
 	    _substrcmp(*av, "mac") == 0) {
-		ac--; av++;	/* the "MAC" keyword */
-		add_mac(cmd, ac, av); /* exits in case of errors */
+		av++;			/* the "MAC" keyword */
+		add_mac(cmd, av);	/* exits in case of errors */
 		cmd = next_cmd(cmd);
-		ac -= 2; av += 2;	/* dst-mac and src-mac */
+		av += 2;		/* dst-mac and src-mac */
 		NOT_BLOCK;
 		NEED1("missing mac type");
-		if (add_mactype(cmd, ac, av[0]))
+		if (add_mactype(cmd, av[0]))
 			cmd = next_cmd(cmd);
-		ac--; av++;	/* any or mac-type */
+		av++;			/* any or mac-type */
 		goto read_options;
 	}
 #endif
@@ -2974,7 +2983,7 @@ chkarg:
 	NOT_BLOCK;
 	NEED1("missing protocol");
 	if (add_proto_compat(cmd, *av, &proto)) {
-		av++; ac--;
+		av++;
 		if (F_LEN(cmd) != 0) {
 			prev = cmd;
 			cmd = next_cmd(cmd);
@@ -2988,9 +2997,9 @@ chkarg:
 	/*
 	 * "from", mandatory
 	 */
-	if (!ac || _substrcmp(*av, "from") != 0)
+	if ((av[0] == NULL) || _substrcmp(*av, "from") != 0)
 		errx(EX_USAGE, "missing ``from''");
-	ac--; av++;
+	av++;
 
 	/*
 	 * source IP, mandatory
@@ -2999,7 +3008,7 @@ chkarg:
 	NOT_BLOCK;	/* optional "not" */
 	NEED1("missing source address");
 	if (add_src(cmd, *av, proto)) {
-		ac--; av++;
+		av++;
 		if (F_LEN(cmd) != 0) {	/* ! any */
 			prev = cmd;
 			cmd = next_cmd(cmd);
@@ -3012,10 +3021,10 @@ chkarg:
 	 * source ports, optional
 	 */
 	NOT_BLOCK;	/* optional "not" */
-	if (ac) {
+	if ( av[0] != NULL ) {
 		if (_substrcmp(*av, "any") == 0 ||
 		    add_ports(cmd, *av, proto, O_IP_SRCPORT)) {
-			ac--; av++;
+			av++;
 			if (F_LEN(cmd) != 0)
 				cmd = next_cmd(cmd);
 		}
@@ -3024,9 +3033,9 @@ chkarg:
 	/*
 	 * "to", mandatory
 	 */
-	if (!ac || _substrcmp(*av, "to") != 0)
+	if ( (av[0] == NULL) || _substrcmp(*av, "to") != 0 )
 		errx(EX_USAGE, "missing ``to''");
-	av++; ac--;
+	av++;
 
 	/*
 	 * destination, mandatory
@@ -3035,7 +3044,7 @@ chkarg:
 	NOT_BLOCK;	/* optional "not" */
 	NEED1("missing dst address");
 	if (add_dst(cmd, *av, proto)) {
-		ac--; av++;
+		av++;
 		if (F_LEN(cmd) != 0) {	/* ! any */
 			prev = cmd;
 			cmd = next_cmd(cmd);
@@ -3048,17 +3057,17 @@ chkarg:
 	 * dest. ports, optional
 	 */
 	NOT_BLOCK;	/* optional "not" */
-	if (ac) {
+	if (av[0]) {
 		if (_substrcmp(*av, "any") == 0 ||
 		    add_ports(cmd, *av, proto, O_IP_DSTPORT)) {
-			ac--; av++;
+			av++;
 			if (F_LEN(cmd) != 0)
 				cmd = next_cmd(cmd);
 		}
 	}
 
 read_options:
-	if (ac && first_cmd == cmd) {
+	if (av[0] && first_cmd == cmd) {
 		/*
 		 * nothing specified so far, store in the rule to ease
 		 * printout later.
@@ -3066,7 +3075,7 @@ read_options:
 		 rule->_pad = 1;
 	}
 	prev = NULL;
-	while (ac) {
+	while ( av[0] != NULL ) {
 		char *s;
 		ipfw_insn_u32 *cmd32;	/* alias for cmd */
 
@@ -3080,7 +3089,7 @@ read_options:
 			s++;
 		}
 		i = match_token(rule_options, s);
-		ac--; av++;
+		av++;
 		switch(i) {
 		case TOK_NOT:
 			if (cmd->len & F_NOT)
@@ -3142,7 +3151,7 @@ read_options:
 			NEED1("recv, xmit, via require interface name"
 				" or address");
 			fill_iface((ipfw_insn_if *)cmd, av[0]);
-			ac--; av++;
+			av++;
 			if (F_LEN(cmd) == 0)	/* not a valid address */
 				break;
 			if (i == TOK_XMIT)
@@ -3156,13 +3165,13 @@ read_options:
 		case TOK_ICMPTYPES:
 			NEED1("icmptypes requires list of types");
 			fill_icmptypes((ipfw_insn_u32 *)cmd, *av);
-			av++; ac--;
+			av++;
 			break;
 		
 		case TOK_ICMP6TYPES:
 			NEED1("icmptypes requires list of types");
 			fill_icmp6types((ipfw_insn_icmp6 *)cmd, *av);
-			av++; ac--;
+			av++;
 			break;
 
 		case TOK_IPTTL:
@@ -3172,7 +3181,7 @@ read_options:
 				errx(EX_DATAERR, "invalid ipttl %s", *av);
 			} else
 			    fill_cmd(cmd, O_IPTTL, 0, strtoul(*av, NULL, 0));
-			ac--; av++;
+			av++;
 			break;
 
 		case TOK_IPID:
@@ -3182,7 +3191,7 @@ read_options:
 				errx(EX_DATAERR, "invalid ipid %s", *av);
 			} else
 			    fill_cmd(cmd, O_IPID, 0, strtoul(*av, NULL, 0));
-			ac--; av++;
+			av++;
 			break;
 
 		case TOK_IPLEN:
@@ -3192,32 +3201,32 @@ read_options:
 				errx(EX_DATAERR, "invalid ip len %s", *av);
 			} else
 			    fill_cmd(cmd, O_IPLEN, 0, strtoul(*av, NULL, 0));
-			ac--; av++;
+			av++;
 			break;
 
 		case TOK_IPVER:
 			NEED1("ipver requires version");
 			fill_cmd(cmd, O_IPVER, 0, strtoul(*av, NULL, 0));
-			ac--; av++;
+			av++;
 			break;
 
 		case TOK_IPPRECEDENCE:
 			NEED1("ipprecedence requires value");
 			fill_cmd(cmd, O_IPPRECEDENCE, 0,
 			    (strtoul(*av, NULL, 0) & 7) << 5);
-			ac--; av++;
+			av++;
 			break;
 
 		case TOK_IPOPTS:
 			NEED1("missing argument for ipoptions");
 			fill_flags(cmd, O_IPOPT, f_ipopts, *av);
-			ac--; av++;
+			av++;
 			break;
 
 		case TOK_IPTOS:
 			NEED1("missing argument for iptos");
 			fill_flags(cmd, O_IPTOS, f_iptos, *av);
-			ac--; av++;
+			av++;
 			break;
 
 		case TOK_UID:
@@ -3234,7 +3243,7 @@ read_options:
 				errx(EX_DATAERR, "uid \"%s\" nonexistent", *av);
 			cmd32->d[0] = pwd->pw_uid;
 			cmd->len |= F_INSN_SIZE(ipfw_insn_u32);
-			ac--; av++;
+			av++;
 		    }
 			break;
 
@@ -3252,7 +3261,7 @@ read_options:
 				errx(EX_DATAERR, "gid \"%s\" nonexistent", *av);
 			cmd32->d[0] = grp->gr_gid;
 			cmd->len |= F_INSN_SIZE(ipfw_insn_u32);
-			ac--; av++;
+			av++;
 		    }
 			break;
 
@@ -3268,7 +3277,7 @@ read_options:
 				errx(EX_DATAERR, "jail requires prison ID");
 			cmd32->d[0] = (uint32_t)jid;
 			cmd->len |= F_INSN_SIZE(ipfw_insn_u32);
-			ac--; av++;
+			av++;
 		    }
 			break;
 
@@ -3289,13 +3298,13 @@ read_options:
 			} else
 			    fill_cmd(cmd, O_TCPDATALEN, 0,
 				    strtoul(*av, NULL, 0));
-			ac--; av++;
+			av++;
 			break;
 
 		case TOK_TCPOPTS:
 			NEED1("missing argument for tcpoptions");
 			fill_flags(cmd, O_TCPOPTS, f_tcpopts, *av);
-			ac--; av++;
+			av++;
 			break;
 
 		case TOK_TCPSEQ:
@@ -3304,21 +3313,21 @@ read_options:
 			cmd->len = F_INSN_SIZE(ipfw_insn_u32);
 			cmd->opcode = (i == TOK_TCPSEQ) ? O_TCPSEQ : O_TCPACK;
 			cmd32->d[0] = htonl(strtoul(*av, NULL, 0));
-			ac--; av++;
+			av++;
 			break;
 
 		case TOK_TCPWIN:
 			NEED1("tcpwin requires length");
 			fill_cmd(cmd, O_TCPWIN, 0,
 			    htons(strtoul(*av, NULL, 0)));
-			ac--; av++;
+			av++;
 			break;
 
 		case TOK_TCPFLAGS:
 			NEED1("missing argument for tcpflags");
 			cmd->opcode = O_TCPFLAGS;
 			fill_flags(cmd, O_TCPFLAGS, f_tcpflags, *av);
-			ac--; av++;
+			av++;
 			break;
 
 		case TOK_KEEPSTATE:
@@ -3348,11 +3357,11 @@ read_options:
 			cmd->opcode = O_LIMIT;
 			c->limit_mask = c->conn_limit = 0;
 
-			while (ac > 0) {
+			while ( av[0] != NULL ) {
 				if ((val = match_token(limit_masks, *av)) <= 0)
 					break;
 				c->limit_mask |= val;
-				ac--; av++;
+				av++;
 			}
 
 			if (c->limit_mask == 0)
@@ -3361,14 +3370,14 @@ read_options:
 			GET_UINT_ARG(c->conn_limit, IPFW_ARG_MIN, IPFW_ARG_MAX,
 			    TOK_LIMIT, rule_options);
 
-			ac--; av++;
+			av++;
 			break;
 		}
 
 		case TOK_PROTO:
 			NEED1("missing protocol");
 			if (add_proto(cmd, *av, &proto)) {
-				ac--; av++;
+				av++;
 			} else
 				errx(EX_DATAERR, "invalid protocol ``%s''",
 				    *av);
@@ -3377,28 +3386,28 @@ read_options:
 		case TOK_SRCIP:
 			NEED1("missing source IP");
 			if (add_srcip(cmd, *av)) {
-				ac--; av++;
+				av++;
 			}
 			break;
 
 		case TOK_DSTIP:
 			NEED1("missing destination IP");
 			if (add_dstip(cmd, *av)) {
-				ac--; av++;
+				av++;
 			}
 			break;
 
 		case TOK_SRCIP6:
 			NEED1("missing source IP6");
 			if (add_srcip6(cmd, *av)) {
-				ac--; av++;
+				av++;
 			}
 			break;
 				
 		case TOK_DSTIP6:
 			NEED1("missing destination IP6");
 			if (add_dstip6(cmd, *av)) {
-				ac--; av++;
+				av++;
 			}
 			break;
 
@@ -3406,7 +3415,7 @@ read_options:
 			NEED1("missing source port");
 			if (_substrcmp(*av, "any") == 0 ||
 			    add_ports(cmd, *av, proto, O_IP_SRCPORT)) {
-				ac--; av++;
+				av++;
 			} else
 				errx(EX_DATAERR, "invalid source port %s", *av);
 			break;
@@ -3415,23 +3424,22 @@ read_options:
 			NEED1("missing destination port");
 			if (_substrcmp(*av, "any") == 0 ||
 			    add_ports(cmd, *av, proto, O_IP_DSTPORT)) {
-				ac--; av++;
+				av++;
 			} else
 				errx(EX_DATAERR, "invalid destination port %s",
 				    *av);
 			break;
 
 		case TOK_MAC:
-			if (add_mac(cmd, ac, av)) {
-				ac -= 2; av += 2;
-			}
+			if (add_mac(cmd, av))
+				av += 2;
 			break;
 
 		case TOK_MACTYPE:
 			NEED1("missing mac type");
-			if (!add_mactype(cmd, ac, *av))
+			if (!add_mactype(cmd, *av))
 				errx(EX_DATAERR, "invalid mac type %s", *av);
-			ac--; av++;
+			av++;
 			break;
 
 		case TOK_VERREVPATH:
@@ -3460,7 +3468,7 @@ read_options:
 
 		case TOK_EXT6HDR:
 			fill_ext6hdr( cmd, *av );
-			ac--; av++;
+			av++;
 			break;
 
 		case TOK_FLOWID:
@@ -3468,17 +3476,16 @@ read_options:
 				errx( EX_USAGE, "flow-id filter is active "
 				    "only for ipv6 protocol\n");
 			fill_flow6( (ipfw_insn_u32 *) cmd, *av );
-			ac--; av++;
+			av++;
 			break;
 
 		case TOK_COMMENT:
-			fill_comment(cmd, ac, av);
-			av += ac;
-			ac = 0;
+			fill_comment(cmd, av);
+			av[0]=NULL;
 			break;
 
 		case TOK_TAGGED:
-			if (ac > 0 && strpbrk(*av, "-,")) {
+			if (av[0] && strpbrk(*av, "-,")) {
 				if (!add_ports(cmd, *av, 0, O_TAGGED))
 					errx(EX_DATAERR, "tagged: invalid tag"
 					    " list: %s", *av);
@@ -3490,13 +3497,13 @@ read_options:
 				    TOK_TAGGED, rule_options);
 				fill_cmd(cmd, O_TAGGED, 0, tag);
 			}
-			ac--; av++;
+			av++;
 			break;
 
 		case TOK_FIB:
 			NEED1("fib requires fib number");
 			fill_cmd(cmd, O_FIB, 0, strtoul(*av, NULL, 0));
-			ac--; av++;
+			av++;
 			break;
 
 		case TOK_LOOKUP: {
@@ -3504,7 +3511,7 @@ read_options:
 			char *p;
 			int j;
 
-			if (ac < 2)
+			if (av[0] && av[1])
 				errx(EX_USAGE, "format: lookup argument tablenum");
 			cmd->opcode = O_IP_DST_LOOKUP;
 			cmd->len |= F_INSN_SIZE(ipfw_insn) + 2;
@@ -3516,11 +3523,11 @@ read_options:
 			if (lookup_key[j] <= 0)
 				errx(EX_USAGE, "format: cannot lookup on %s", *av);
 			c->d[1] = j; // i converted to option
-			ac--; av++;
+			av++;
 			cmd->arg1 = strtoul(*av, &p, 0);
 			if (p && *p)
 				errx(EX_USAGE, "format: lookup argument tablenum");
-			ac--; av++;
+			av++;
 		    }
 			break;
 
@@ -3698,6 +3705,10 @@ ipfw_flush(int force)
 		if (c == 'N')	/* user said no */
 			return;
 	}
+	if (co.do_pipe) {
+		dummynet_flush();
+		return;
+	}
 	/* `ipfw set N flush` - is the same that `ipfw delete set N` */
 	if (co.use_set) {
 		uint32_t arg = ((co.use_set - 1) & 0xffff) | (1 << 24);
@@ -3811,14 +3822,14 @@ ipfw_table_handler(int ac, char *av[])
 			}
 		}
 	} else if (_substrcmp(*av, "flush") == 0) {
-		a = is_all ? tables_max : (ent.tbl + 1);
+		a = is_all ? tables_max : (uint32_t)(ent.tbl + 1);
 		do {
 			if (do_cmd(IP_FW_TABLE_FLUSH, &ent.tbl,
 			    sizeof(ent.tbl)) < 0)
 				err(EX_OSERR, "setsockopt(IP_FW_TABLE_FLUSH)");
 		} while (++ent.tbl < a);
 	} else if (_substrcmp(*av, "list") == 0) {
-		a = is_all ? tables_max : (ent.tbl + 1);
+		a = is_all ? tables_max : (uint32_t)(ent.tbl + 1);
 		do {
 			table_list(ent, is_all);
 		} while (++ent.tbl < a);
diff --git a/sbin/ipfw/ipfw2.h b/sbin/ipfw/ipfw2.h
index b393a7d..1dbb42b 100644
--- a/sbin/ipfw/ipfw2.h
+++ b/sbin/ipfw/ipfw2.h
@@ -35,7 +35,7 @@ struct cmdline_opts {
 	int	do_resolv;	/* try to resolve all ip to names */
 	int	do_time;	/* Show time stamps */
 	int	do_quiet;	/* Be quiet in add and flush */
-	int	do_pipe;	/* this cmd refers to a pipe */
+	int	do_pipe;	/* this cmd refers to a pipe/queue/sched */
 	int	do_nat; 	/* this cmd refers to a nat config */
 	int	do_dynamic;	/* display dynamic rules */
 	int	do_expired;	/* display expired dynamic rules */
@@ -82,7 +82,10 @@ enum tokens {
 	TOK_ACCEPT,
 	TOK_COUNT,
 	TOK_PIPE,
+	TOK_LINK,
 	TOK_QUEUE,
+	TOK_FLOWSET,
+	TOK_SCHED,
 	TOK_DIVERT,
 	TOK_TEE,
 	TOK_NETGRAPH,
@@ -151,15 +154,23 @@ enum tokens {
 	TOK_SRCPORT,
 	TOK_ALL,
 	TOK_MASK,
+	TOK_FLOW_MASK,
+	TOK_SCHED_MASK,
 	TOK_BW,
 	TOK_DELAY,
-	TOK_PIPE_PROFILE,
+	TOK_PROFILE,
 	TOK_BURST,
 	TOK_RED,
 	TOK_GRED,
 	TOK_DROPTAIL,
 	TOK_PROTO,
+	/* dummynet tokens */
 	TOK_WEIGHT,
+	TOK_LMAX,
+	TOK_PRI,
+	TOK_TYPE,
+	TOK_SLOTSIZE,
+
 	TOK_IP,
 	TOK_IF,
  	TOK_ALOG,
@@ -192,7 +203,8 @@ enum tokens {
  * the following macro returns an error message if we run out of
  * arguments.
  */
-#define NEED1(msg)      {if (!ac) errx(EX_USAGE, msg);}
+#define NEED(_p, msg)      {if (!_p) errx(EX_USAGE, msg);}
+#define NEED1(msg)      {if (!(*av)) errx(EX_USAGE, msg);}
 
 unsigned long long align_uint64(const uint64_t *pll);
 
@@ -236,14 +248,14 @@ struct _ipfw_insn_icmp6;
 extern int resvd_set_number;
 
 /* first-level command handlers */
-void ipfw_add(int ac, char *av[]);
+void ipfw_add(char *av[]);
 void ipfw_show_nat(int ac, char **av);
 void ipfw_config_pipe(int ac, char **av);
 void ipfw_config_nat(int ac, char **av);
-void ipfw_sets_handler(int ac, char *av[]);
+void ipfw_sets_handler(char *av[]);
 void ipfw_table_handler(int ac, char *av[]);
-void ipfw_sysctl_handler(int ac, char *av[], int which);
-void ipfw_delete(int ac, char *av[]);
+void ipfw_sysctl_handler(char *av[], int which);
+void ipfw_delete(char *av[]);
 void ipfw_flush(int force);
 void ipfw_zero(int ac, char *av[], int optname);
 void ipfw_list(int ac, char *av[], int show_counters);
@@ -255,7 +267,8 @@ u_int32_t altq_name_to_qid(const char *name);
 void print_altq_cmd(struct _ipfw_insn_altq *altqptr);
 
 /* dummynet.c */
-void ipfw_list_pipes(void *data, uint nbytes, int ac, char *av[]);
+void dummynet_list(int ac, char *av[], int show_counters);
+void dummynet_flush(void);
 int ipfw_delete_pipe(int pipe_or_queue, int n);
 
 /* ipv6.c */
diff --git a/sbin/ipfw/main.c b/sbin/ipfw/main.c
index 3916057..3f48178 100644
--- a/sbin/ipfw/main.c
+++ b/sbin/ipfw/main.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2002-2003 Luigi Rizzo
+ * Copyright (c) 2002-2003,2010 Luigi Rizzo
  * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp
  * Copyright (c) 1994 Ugen J.S.Antsilevich
  *
@@ -80,31 +80,27 @@ help(void)
 }
 
 /*
- * Free a the (locally allocated) copy of command line arguments.
- */
-static void
-free_args(int ac, char **av)
-{
-	int i;
-
-	for (i=0; i < ac; i++)
-		free(av[i]);
-	free(av);
-}
-
-/*
  * Called with the arguments, including program name because getopt
  * wants it to be present.
  * Returns 0 if successful, 1 if empty command, errx() in case of errors.
+ * First thing we do is process parameters creating an argv[] array
+ * which includes the program name and a NULL entry at the end.
+ * If we are called with a single string, we split it on whitespace.
+ * Also, arguments with a trailing ',' are joined to the next one.
+ * The pointers (av[]) and data are in a a single chunk of memory.
+ * av[0] points to the original program name, all other entries
+ * point into the allocated chunk.
  */
 static int
 ipfw_main(int oldac, char **oldav)
 {
-	int ch, ac, save_ac;
+	int ch, ac;
 	const char *errstr;
 	char **av, **save_av;
 	int do_acct = 0;		/* Show packet/byte count */
 	int try_next = 0;		/* set if pipe cmd not found */
+	int av_size;			/* compute the av size */
+	char *av_p;			/* used to build the av list */
 
 #define WHITESP		" \t\f\v\n\r"
 	if (oldac < 2)
@@ -112,10 +108,9 @@ ipfw_main(int oldac, char **oldav)
 
 	if (oldac == 2) {
 		/*
-		 * If we are called with a single string, try to split it into
-		 * arguments for subsequent parsing.
-		 * But first, remove spaces after a ',', by copying the string
-		 * in-place.
+		 * If we are called with one argument, try to split it into
+		 * words for subsequent parsing. Spaces after a ',' are
+		 * removed by copying the string in-place.
 		 */
 		char *arg = oldav[1];	/* The string is the first arg. */
 		int l = strlen(arg);
@@ -150,31 +145,59 @@ ipfw_main(int oldac, char **oldav)
 				ac++;
 
 		/*
-		 * Allocate the argument list, including one entry for
-		 * the program name because getopt expects it.
+		 * Allocate the argument list structure as a single block
+		 * of memory, containing pointers and the argument
+		 * strings. We include one entry for the program name
+		 * because getopt expects it, and a NULL at the end
+		 * to simplify further parsing.
 		 */
-		av = safe_calloc(ac + 1, sizeof(char *));
+		ac++;		/* add 1 for the program name */
+		av_size = (ac+1) * sizeof(char *) + l + 1;
+		av = safe_calloc(av_size, 1);
 
 		/*
-		 * Second, copy arguments from arg[] to av[]. For each one,
+		 * Init the argument pointer to the end of the array
+		 * and copy arguments from arg[] to av[]. For each one,
 		 * j is the initial character, i is the one past the end.
 		 */
-		for (ac = 1, i = j = 0; i < l; i++)
+		av_p = (char *)&av[ac+1];
+		for (ac = 1, i = j = 0; i < l; i++) {
 			if (index(WHITESP, arg[i]) != NULL || i == l-1) {
 				if (i == l-1)
 					i++;
-				av[ac] = safe_calloc(i-j+1, 1);
-				bcopy(arg+j, av[ac], i-j);
+				bcopy(arg+j, av_p, i-j);
+				av[ac] = av_p;
+				av_p += i-j;	/* the lenght of the string */
+				*av_p++ = '\0';
 				ac++;
 				j = i + 1;
 			}
+		}
 	} else {
 		/*
 		 * If an argument ends with ',' join with the next one.
 		 */
-		int first, i, l;
+		int first, i, l=0;
+
+		/*
+		 * Allocate the argument list structure as a single block
+		 * of memory, containing both pointers and the argument
+		 * strings. We include some space for the program name
+		 * because getopt expects it.
+		 * We add an extra pointer to the end of the array,
+		 * to make simpler further parsing.
+		 */
+		for (i=0; i<oldac; i++)
+			l += strlen(oldav[i]);
+
+		av_size = (oldac+1) * sizeof(char *) + l + oldac;
+		av = safe_calloc(av_size, 1);
 
-		av = safe_calloc(oldac, sizeof(char *));
+		/*
+		 * Init the argument pointer to the end of the array
+		 * and copy arguments from arg[] to av[]
+		 */
+		av_p = (char *)&av[oldac+1];
 		for (first = i = ac = 1, l = 0; i < oldac; i++) {
 			char *arg = oldav[i];
 			int k = strlen(arg);
@@ -182,11 +205,12 @@ ipfw_main(int oldac, char **oldav)
 			l += k;
 			if (arg[k-1] != ',' || i == oldac-1) {
 				/* Time to copy. */
-				av[ac] = safe_calloc(l+1, 1);
+				av[ac] = av_p;
 				for (l=0; first <= i; first++) {
-					strcat(av[ac]+l, oldav[first]);
-					l += strlen(oldav[first]);
+					strcat(av_p, oldav[first]);
+					av_p += strlen(oldav[first]);
 				}
+				*av_p++ = '\0';
 				ac++;
 				l = 0;
 				first = i+1;
@@ -194,13 +218,47 @@ ipfw_main(int oldac, char **oldav)
 		}
 	}
 
-	av[0] = strdup(oldav[0]);	/* copy progname from the caller */
+	/*
+	 * set the progname pointer to the original string
+	 * and terminate the array with null
+	 */
+	av[0] = oldav[0];
+	av[ac] = NULL;
+
 	/* Set the force flag for non-interactive processes */
 	if (!co.do_force)
 		co.do_force = !isatty(STDIN_FILENO);
 
+#ifdef EMULATE_SYSCTL /* sysctl emulation */
+	if ( ac >= 2 && !strcmp(av[1], "sysctl")) {
+		char *s;
+		int i;
+
+		if (ac != 3) {
+			printf(	"sysctl emulation usage:\n"
+				"	ipfw sysctl name[=value]\n"
+				"	ipfw sysctl -a\n");
+			return 0;
+		}
+		s = index(av[2], '=');
+		if (s == NULL) {
+			s = !strcmp(av[2], "-a") ? NULL : av[2];
+			sysctlbyname(s, NULL, NULL, NULL, 0);
+		} else {	/* ipfw sysctl x.y.z=value */
+			/* assume an INT value, will extend later */
+			if (s[1] == '\0') {
+				printf("ipfw sysctl: missing value\n\n");
+				return 0;
+			}
+			*s = '\0';
+			i = strtol(s+1, NULL, 0);
+			sysctlbyname(av[2], NULL, NULL, &i, sizeof(int));
+		}
+		return 0;
+	}
+#endif
+
 	/* Save arguments for final freeing of memory. */
-	save_ac = ac;
 	save_av = av;
 
 	optind = optreset = 1;	/* restart getopt() */
@@ -232,7 +290,7 @@ ipfw_main(int oldac, char **oldav)
 			break;
 
 		case 'h': /* help */
-			free_args(save_ac, save_av);
+			free(save_av);
 			help();
 			break;	/* NOTREACHED */
 
@@ -273,7 +331,7 @@ ipfw_main(int oldac, char **oldav)
 			break;
 
 		default:
-			free_args(save_ac, save_av);
+			free(save_av);
 			return 1;
 		}
 
@@ -304,6 +362,10 @@ ipfw_main(int oldac, char **oldav)
 		co.do_pipe = 1;
 	else if (_substrcmp(*av, "queue") == 0)
 		co.do_pipe = 2;
+	else if (_substrcmp(*av, "flowset") == 0)
+		co.do_pipe = 2;
+	else if (_substrcmp(*av, "sched") == 0)
+		co.do_pipe = 3;
 	else if (!strncmp(*av, "set", strlen(*av))) {
 		if (ac > 1 && isdigit(av[1][0])) {
 			co.use_set = strtonum(av[1], 0, resvd_set_number,
@@ -335,7 +397,7 @@ ipfw_main(int oldac, char **oldav)
 
 	if (co.use_set == 0) {
 		if (_substrcmp(*av, "add") == 0)
-			ipfw_add(ac, av);
+			ipfw_add(av);
 		else if (co.do_nat && _substrcmp(*av, "show") == 0)
  			ipfw_show_nat(ac, av);
 		else if (co.do_pipe && _substrcmp(*av, "config") == 0)
@@ -343,20 +405,20 @@ ipfw_main(int oldac, char **oldav)
 		else if (co.do_nat && _substrcmp(*av, "config") == 0)
  			ipfw_config_nat(ac, av);
 		else if (_substrcmp(*av, "set") == 0)
-			ipfw_sets_handler(ac, av);
+			ipfw_sets_handler(av);
 		else if (_substrcmp(*av, "table") == 0)
 			ipfw_table_handler(ac, av);
 		else if (_substrcmp(*av, "enable") == 0)
-			ipfw_sysctl_handler(ac, av, 1);
+			ipfw_sysctl_handler(av, 1);
 		else if (_substrcmp(*av, "disable") == 0)
-			ipfw_sysctl_handler(ac, av, 0);
+			ipfw_sysctl_handler(av, 0);
 		else
 			try_next = 1;
 	}
 
 	if (co.use_set || try_next) {
 		if (_substrcmp(*av, "delete") == 0)
-			ipfw_delete(ac, av);
+			ipfw_delete(av);
 		else if (_substrcmp(*av, "flush") == 0)
 			ipfw_flush(co.do_force);
 		else if (_substrcmp(*av, "zero") == 0)
@@ -373,7 +435,7 @@ ipfw_main(int oldac, char **oldav)
 	}
 
 	/* Free memory allocated in the argument parsing. */
-	free_args(save_ac, save_av);
+	free(save_av);
 	return 0;
 }
 
diff --git a/sys/conf/files b/sys/conf/files
index e723775..5434e92 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -2488,7 +2488,14 @@ netinet/in_proto.c		optional inet \
 	compile-with "${NORMAL_C} -I$S/contrib/pf"
 netinet/in_rmx.c		optional inet
 netinet/ip_divert.c		optional inet ipdivert ipfirewall
+netinet/ipfw/dn_heap.c		optional inet dummynet
+netinet/ipfw/dn_sched_fifo.c	optional inet dummynet
+netinet/ipfw/dn_sched_rr.c	optional inet dummynet
+netinet/ipfw/dn_sched_wf2q.c	optional inet dummynet 
+netinet/ipfw/dn_sched_qfq.c	optional inet dummynet 
 netinet/ipfw/ip_dummynet.c	optional inet dummynet
+netinet/ipfw/ip_dn_io.c		optional inet dummynet
+netinet/ipfw/ip_dn_glue.c	optional inet dummynet
 netinet/ip_ecn.c		optional inet | inet6
 netinet/ip_encap.c		optional inet | inet6
 netinet/ip_fastfwd.c		optional inet
diff --git a/sys/net/if_bridge.c b/sys/net/if_bridge.c
index d985e98..5f33dd5 100644
--- a/sys/net/if_bridge.c
+++ b/sys/net/if_bridge.c
@@ -135,7 +135,6 @@ __FBSDID("$FreeBSD$");
 #include <net/route.h>
 #include <netinet/ip_fw.h>
 #include <netinet/ipfw/ip_fw_private.h>
-#include <netinet/ip_dummynet.h>
 
 /*
  * Size of the route hash table.  Must be a power of two.
@@ -3073,15 +3072,15 @@ bridge_pfil(struct mbuf **mp, struct ifnet *bifp, struct ifnet *ifp, int dir)
 		if (mtag == NULL) {
 			args.rule.slot = 0;
 		} else {
-			struct dn_pkt_tag *dn_tag;
+			struct ipfw_rule_ref *r;
 
 			/* XXX can we free the tag after use ? */
 			mtag->m_tag_id = PACKET_TAG_NONE;
-			dn_tag = (struct dn_pkt_tag *)(mtag + 1);
+			r = (struct ipfw_rule_ref *)(mtag + 1);
 			/* packet already partially processed ? */
-			if (dn_tag->rule.slot != 0 && V_fw_one_pass)
+			if (r->info & IPFW_ONEPASS)
 				goto ipfwpass;
-			args.rule = dn_tag->rule;
+			args.rule = *r;
 		}
 
 		args.m = *mp;
diff --git a/sys/net/if_ethersubr.c b/sys/net/if_ethersubr.c
index ac2ab05..bbf9753 100644
--- a/sys/net/if_ethersubr.c
+++ b/sys/net/if_ethersubr.c
@@ -73,7 +73,6 @@
 #include <netinet/ip_var.h>
 #include <netinet/ip_fw.h>
 #include <netinet/ipfw/ip_fw_private.h>
-#include <netinet/ip_dummynet.h>
 #endif
 #ifdef INET6
 #include <netinet6/nd6.h>
@@ -474,15 +473,15 @@ ether_ipfw_chk(struct mbuf **m0, struct ifnet *dst, int shared)
 	if (mtag == NULL) {
 		args.rule.slot = 0;
 	} else {
-		struct dn_pkt_tag *dn_tag;
+		/* dummynet packet, already partially processed */
+		struct ipfw_rule_ref *r;
 
 		/* XXX can we free it after use ? */
 		mtag->m_tag_id = PACKET_TAG_NONE;
-		dn_tag = (struct dn_pkt_tag *)(mtag + 1);
-		if (dn_tag->rule.slot != 0 && V_fw_one_pass)
-			/* dummynet packet, already partially processed */
+		r = (struct ipfw_rule_ref *)(mtag + 1);
+		if (r->info & IPFW_ONEPASS)
 			return (1);
-		args.rule = dn_tag->rule;
+		args.rule = *r;
 	}
 
 	/*
diff --git a/sys/netinet/ip_dummynet.h b/sys/netinet/ip_dummynet.h
index 3a193e9..0bbc326 100644
--- a/sys/netinet/ip_dummynet.h
+++ b/sys/netinet/ip_dummynet.h
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa
+ * Copyright (c) 1998-2010 Luigi Rizzo, Universita` di Pisa
  * Portions Copyright (c) 2000 Akamba Corp.
  * All rights reserved
  *
@@ -31,257 +31,124 @@
 #define _IP_DUMMYNET_H
 
 /*
- * Definition of dummynet data structures. In the structures, I decided
- * not to use the macros in <sys/queue.h> in the hope of making the code
- * easier to port to other architectures. The type of lists and queue we
- * use here is pretty simple anyways.
- */
-
-/*
- * We start with a heap, which is used in the scheduler to decide when
- * to transmit packets etc.
- *
- * The key for the heap is used for two different values:
+ * Definition of the kernel-userland API for dummynet.
  *
- * 1. timer ticks- max 10K/second, so 32 bits are enough;
+ * Setsockopt() and getsockopt() pass a batch of objects, each
+ * of them starting with a "struct dn_id" which should fully identify
+ * the object and its relation with others in the sequence.
+ * The first object in each request should have
+ *	 type= DN_CMD_*, id = DN_API_VERSION.
+ * For other objects, type and subtype specify the object, len indicates
+ * the total length including the header, and 'id' identifies the specific
+ * object.
  *
- * 2. virtual times. These increase in steps of len/x, where len is the
- *    packet length, and x is either the weight of the flow, or the
- *    sum of all weights.
- *    If we limit to max 1000 flows and a max weight of 100, then
- *    x needs 17 bits. The packet size is 16 bits, so we can easily
- *    overflow if we do not allow errors.
- * So we use a key "dn_key" which is 64 bits. Some macros are used to
- * compare key values and handle wraparounds.
- * MAX64 returns the largest of two key values.
- * MY_M is used as a shift count when doing fixed point arithmetic
- * (a better name would be useful...).
+ * Most objects are numbered with an identifier in the range 1..65535.
+ * DN_MAX_ID indicates the first value outside the range.
  */
-typedef u_int64_t dn_key ;      /* sorting key */
-#define DN_KEY_LT(a,b)     ((int64_t)((a)-(b)) < 0)
-#define DN_KEY_LEQ(a,b)    ((int64_t)((a)-(b)) <= 0)
-#define DN_KEY_GT(a,b)     ((int64_t)((a)-(b)) > 0)
-#define DN_KEY_GEQ(a,b)    ((int64_t)((a)-(b)) >= 0)
-#define MAX64(x,y)  (( (int64_t) ( (y)-(x) )) > 0 ) ? (y) : (x)
-#define MY_M	16 /* number of left shift to obtain a larger precision */
 
-/*
- * XXX With this scaling, max 1000 flows, max weight 100, 1Gbit/s, the
- * virtual time wraps every 15 days.
- */
+#define	DN_API_VERSION	12500000
+#define	DN_MAX_ID	0x10000
 
+struct dn_id {
+	uint16_t	len;	/* total obj len including this header */
+	uint8_t		type;
+	uint8_t		subtype;
+	uint32_t	id;	/* generic id */
+};
 
 /*
- * The maximum hash table size for queues.  This value must be a power
- * of 2.
- */
-#define DN_MAX_HASH_SIZE 65536
-
-/*
- * A heap entry is made of a key and a pointer to the actual
- * object stored in the heap.
- * The heap is an array of dn_heap_entry entries, dynamically allocated.
- * Current size is "size", with "elements" actually in use.
- * The heap normally supports only ordered insert and extract from the top.
- * If we want to extract an object from the middle of the heap, we
- * have to know where the object itself is located in the heap (or we
- * need to scan the whole array). To this purpose, an object has a
- * field (int) which contains the index of the object itself into the
- * heap. When the object is moved, the field must also be updated.
- * The offset of the index in the object is stored in the 'offset'
- * field in the heap descriptor. The assumption is that this offset
- * is non-zero if we want to support extract from the middle.
+ * These values are in the type field of struct dn_id.
+ * To preserve the ABI, never rearrange the list or delete
+ * entries with the exception of DN_LAST
  */
-struct dn_heap_entry {
-    dn_key key ;	/* sorting key. Topmost element is smallest one */
-    void *object ;	/* object pointer */
+enum {
+	DN_NONE = 0,
+	DN_LINK = 1,
+	DN_FS,
+	DN_SCH,
+	DN_SCH_I,
+	DN_QUEUE,
+	DN_DELAY_LINE,
+	DN_PROFILE,
+	DN_FLOW,		/* struct dn_flow */
+	DN_TEXT,		/* opaque text is the object */
+
+	DN_CMD_CONFIG = 0x80,	/* objects follow */
+	DN_CMD_DELETE,		/* subtype + list of entries */
+	DN_CMD_GET,		/* subtype + list of entries */
+	DN_CMD_FLUSH,
+	/* for compatibility with FreeBSD 7.2/8 */
+	DN_COMPAT_PIPE,
+	DN_COMPAT_QUEUE,
+	DN_GET_COMPAT,
+
+	/* special commands for emulation of sysctl variables */
+	DN_SYSCTL_GET,
+	DN_SYSCTL_SET,
+
+	DN_LAST,
 } ;
 
-struct dn_heap {
-    int size ;
-    int elements ;
-    int offset ; /* XXX if > 0 this is the offset of direct ptr to obj */
-    struct dn_heap_entry *p ;	/* really an array of "size" entries */
+enum { /* subtype for schedulers, flowset and the like */
+	DN_SCHED_UNKNOWN = 0,
+	DN_SCHED_FIFO = 1,
+	DN_SCHED_WF2QP = 2,
+	/* others are in individual modules */
 } ;
 
-#ifdef _KERNEL
-/*
- * Packets processed by dummynet have an mbuf tag associated with
- * them that carries their dummynet state.  This is used within
- * the dummynet code as well as outside when checking for special
- * processing requirements.
- * Note that the first part is the reinject info and is common to
- * other forms of packet reinjection.
- */
-struct dn_pkt_tag {
-	struct ipfw_rule_ref rule;	/* matching rule */
-
-    /* second part, dummynet specific */
-    int dn_dir;			/* action when packet comes out. */
-				/* see ip_fw_private.h */
-
-    dn_key output_time;		/* when the pkt is due for delivery	*/
-    struct ifnet *ifp;		/* interface, for ip_output		*/
-    struct _ip6dn_args ip6opt;	/* XXX ipv6 options			*/
+enum {	/* user flags */
+	DN_HAVE_MASK	= 0x0001,	/* fs or sched has a mask */
+	DN_NOERROR	= 0x0002,	/* do not report errors */
+	DN_QHT_HASH	= 0x0004,	/* qht is a hash table */
+	DN_QSIZE_BYTES	= 0x0008,	/* queue size is in bytes */
+	DN_HAS_PROFILE	= 0x0010,	/* a link has a profile */
+	DN_IS_RED	= 0x0020,
+	DN_IS_GENTLE_RED= 0x0040,
+	DN_PIPE_CMD	= 0x1000,	/* pipe config... */
 };
-#endif /* _KERNEL */
 
 /*
- * Overall structure of dummynet (with WF2Q+):
-
-In dummynet, packets are selected with the firewall rules, and passed
-to two different objects: PIPE or QUEUE.
-
-A QUEUE is just a queue with configurable size and queue management
-policy. It is also associated with a mask (to discriminate among
-different flows), a weight (used to give different shares of the
-bandwidth to different flows) and a "pipe", which essentially
-supplies the transmit clock for all queues associated with that
-pipe.
-
-A PIPE emulates a fixed-bandwidth link, whose bandwidth is
-configurable.  The "clock" for a pipe can come from either an
-internal timer, or from the transmit interrupt of an interface.
-A pipe is also associated with one (or more, if masks are used)
-queue, where all packets for that pipe are stored.
-
-The bandwidth available on the pipe is shared by the queues
-associated with that pipe (only one in case the packet is sent
-to a PIPE) according to the WF2Q+ scheduling algorithm and the
-configured weights.
-
-In general, incoming packets are stored in the appropriate queue,
-which is then placed into one of a few heaps managed by a scheduler
-to decide when the packet should be extracted.
-The scheduler (a function called dummynet()) is run at every timer
-tick, and grabs queues from the head of the heaps when they are
-ready for processing.
-
-There are three data structures definining a pipe and associated queues:
-
- + dn_pipe, which contains the main configuration parameters related
-   to delay and bandwidth;
- + dn_flow_set, which contains WF2Q+ configuration, flow
-   masks, plr and RED configuration;
- + dn_flow_queue, which is the per-flow queue (containing the packets)
-
-Multiple dn_flow_set can be linked to the same pipe, and multiple
-dn_flow_queue can be linked to the same dn_flow_set.
-All data structures are linked in a linear list which is used for
-housekeeping purposes.
-
-During configuration, we create and initialize the dn_flow_set
-and dn_pipe structures (a dn_pipe also contains a dn_flow_set).
-
-At runtime: packets are sent to the appropriate dn_flow_set (either
-WFQ ones, or the one embedded in the dn_pipe for fixed-rate flows),
-which in turn dispatches them to the appropriate dn_flow_queue
-(created dynamically according to the masks).
-
-The transmit clock for fixed rate flows (ready_event()) selects the
-dn_flow_queue to be used to transmit the next packet. For WF2Q,
-wfq_ready_event() extract a pipe which in turn selects the right
-flow using a number of heaps defined into the pipe itself.
-
- *
+ * link template.
  */
+struct dn_link {
+	struct dn_id oid;
 
-/*
- * per flow queue. This contains the flow identifier, the queue
- * of packets, counters, and parameters used to support both RED and
- * WF2Q+.
- *
- * A dn_flow_queue is created and initialized whenever a packet for
- * a new flow arrives.
- */
-struct dn_flow_queue {
-    struct dn_flow_queue *next ;
-    struct ipfw_flow_id id ;
-
-    struct mbuf *head, *tail ;	/* queue of packets */
-    u_int len ;
-    u_int len_bytes ;
-
-    /*
-     * When we emulate MAC overheads, or channel unavailability due
-     * to other traffic on a shared medium, we augment the packet at
-     * the head of the queue with an 'extra_bits' field representsing
-     * the additional delay the packet will be subject to:
-     *		extra_bits = bw*unavailable_time.
-     * With large bandwidth and large delays, extra_bits (and also numbytes)
-     * can become very large, so better play safe and use 64 bit
-     */
-    uint64_t numbytes ;		/* credit for transmission (dynamic queues) */
-    int64_t extra_bits;		/* extra bits simulating unavailable channel */
-
-    u_int64_t tot_pkts ;	/* statistics counters	*/
-    u_int64_t tot_bytes ;
-    u_int32_t drops ;
-
-    int hash_slot ;		/* debugging/diagnostic */
-
-    /* RED parameters */
-    int avg ;                   /* average queue length est. (scaled) */
-    int count ;                 /* arrivals since last RED drop */
-    int random ;                /* random value (scaled) */
-    dn_key idle_time;		/* start of queue idle time */
-
-    /* WF2Q+ support */
-    struct dn_flow_set *fs ;	/* parent flow set */
-    int heap_pos ;		/* position (index) of struct in heap */
-    dn_key sched_time ;		/* current time when queue enters ready_heap */
-
-    dn_key S,F ;		/* start time, finish time */
     /*
-     * Setting F < S means the timestamp is invalid. We only need
-     * to test this when the queue is empty.
+	 * Userland sets bw and delay in bits/s and milliseconds.
+	 * The kernel converts this back and forth to bits/tick and ticks.
+	 * XXX what about burst ?
      */
+	int32_t		link_nr;
+	int		bandwidth;	/* bit/s or bits/tick.   */
+	int		delay;		/* ms and ticks */
+	uint64_t	burst;		/* scaled. bits*Hz  XXX */
 } ;
 
 /*
- * flow_set descriptor. Contains the "template" parameters for the
- * queue configuration, and pointers to the hash table of dn_flow_queue's.
- *
- * The hash table is an array of lists -- we identify the slot by
- * hashing the flow-id, then scan the list looking for a match.
- * The size of the hash table (buckets) is configurable on a per-queue
- * basis.
- *
- * A dn_flow_set is created whenever a new queue or pipe is created (in the
- * latter case, the structure is located inside the struct dn_pipe).
+ * A flowset, which is a template for flows. Contains parameters
+ * from the command line: id, target scheduler, queue sizes, plr,
+ * flow masks, buckets for the flow hash, and possibly scheduler-
+ * specific parameters (weight, quantum and so on).
  */
-struct dn_flow_set {
-    SLIST_ENTRY(dn_flow_set)	next;	/* linked list in a hash slot */
-
-    u_short fs_nr ;             /* flow_set number       */
-    u_short flags_fs;
-#define DN_HAVE_FLOW_MASK	0x0001
-#define DN_IS_RED		0x0002
-#define DN_IS_GENTLE_RED	0x0004
-#define DN_QSIZE_IS_BYTES	0x0008	/* queue size is measured in bytes */
-#define DN_NOERROR		0x0010	/* do not report ENOBUFS on drops  */
-#define	DN_HAS_PROFILE		0x0020	/* the pipe has a delay profile. */
-#define DN_IS_PIPE		0x4000
-#define DN_IS_QUEUE		0x8000
-
-    struct dn_pipe *pipe ;	/* pointer to parent pipe */
-    u_short parent_nr ;		/* parent pipe#, 0 if local to a pipe */
-
-    int weight ;		/* WFQ queue weight */
+struct dn_fs {
+	struct dn_id oid;
+	uint32_t fs_nr;	/* the flowset number */
+	uint32_t flags;	/* userland flags */
     int qsize ;			/* queue size in slots or bytes */
-    int plr ;			/* pkt loss rate (2^31-1 means 100%) */
+	int32_t plr;	/* PLR, pkt loss rate (2^31-1 means 100%) */
+	uint32_t buckets;	/* buckets used for the queue hash table */
 
     struct ipfw_flow_id flow_mask ;
-
-    /* hash table of queues onto this flow_set */
-    int rq_size ;		/* number of slots */
-    int rq_elements ;		/* active elements */
-    struct dn_flow_queue **rq;	/* array of rq_size entries */
-
-    u_int32_t last_expired ;	/* do not expire too frequently */
-    int backlogged ;		/* #active queues for this flowset */
-
-        /* RED parameters */
+	uint32_t sched_nr;	/* the scheduler we attach to */
+	/* generic scheduler parameters. Leave them at -1 if unset.
+	 * Now we use 0: weight, 1: lmax, 2: priority
+	 */
+	int par[4];
+
+	/* RED/GRED parameters.
+	 * weight and probabilities are in the range 0..1 represented
+	 * in fixed point arithmetic with SCALE_RED decimal bits.
+	 */
 #define SCALE_RED               16
 #define SCALE(x)                ( (x) << SCALE_RED )
 #define SCALE_VAL(x)            ( (x) >> SCALE_RED )
@@ -290,85 +157,107 @@ struct dn_flow_set {
     int max_th ;		/* maximum threshold for queue (scaled) */
     int min_th ;		/* minimum threshold for queue (scaled) */
     int max_p ;			/* maximum value for p_b (scaled) */
-    u_int c_1 ;			/* max_p/(max_th-min_th) (scaled) */
-    u_int c_2 ;			/* max_p*min_th/(max_th-min_th) (scaled) */
-    u_int c_3 ;			/* for GRED, (1-max_p)/max_th (scaled) */
-    u_int c_4 ;			/* for GRED, 1 - 2*max_p (scaled) */
-    u_int * w_q_lookup ;	/* lookup table for computing (1-w_q)^t */
-    u_int lookup_depth ;	/* depth of lookup table */
-    int lookup_step ;		/* granularity inside the lookup table */
-    int lookup_weight ;		/* equal to (1-w_q)^t / (1-w_q)^(t+1) */
-    int avg_pkt_size ;		/* medium packet size */
-    int max_pkt_size ;		/* max packet size */
+
 };
-SLIST_HEAD(dn_flow_set_head, dn_flow_set);
 
 /*
- * Pipe descriptor. Contains global parameters, delay-line queue,
- * and the flow_set used for fixed-rate queues.
- *
- * For WF2Q+ support it also has 3 heaps holding dn_flow_queue:
- *   not_eligible_heap, for queues whose start time is higher
- *	than the virtual time. Sorted by start time.
- *   scheduler_heap, for queues eligible for scheduling. Sorted by
- *	finish time.
- *   idle_heap, all flows that are idle and can be removed. We
- *	do that on each tick so we do not slow down too much
- *	operations during forwarding.
- *
+ * dn_flow collects flow_id and stats for queues and scheduler
+ * instances, and is used to pass these info to userland.
+ * oid.type/oid.subtype describe the object, oid.id is number
+ * of the parent object.
  */
-struct dn_pipe {		/* a pipe */
-    SLIST_ENTRY(dn_pipe)	next;	/* linked list in a hash slot */
-
-    int	pipe_nr ;		/* number	*/
-    int bandwidth;		/* really, bytes/tick.	*/
-    int	delay ;			/* really, ticks	*/
-
-    struct	mbuf *head, *tail ;	/* packets in delay line */
-
-    /* WF2Q+ */
-    struct dn_heap scheduler_heap ; /* top extract - key Finish time*/
-    struct dn_heap not_eligible_heap; /* top extract- key Start time */
-    struct dn_heap idle_heap ; /* random extract - key Start=Finish time */
-
-    dn_key V ;			/* virtual time */
-    int sum;			/* sum of weights of all active sessions */
-
-    /* Same as in dn_flow_queue, numbytes can become large */
-    int64_t numbytes;		/* bits I can transmit (more or less). */
-    uint64_t burst;		/* burst size, scaled: bits * hz */
+struct dn_flow {
+	struct dn_id	oid;
+	struct ipfw_flow_id fid;
+	uint64_t	tot_pkts; /* statistics counters  */
+	uint64_t	tot_bytes;
+	uint32_t	length; /* Queue lenght, in packets */
+	uint32_t	len_bytes; /* Queue lenght, in bytes */
+	uint32_t	drops;
+};
 
-    dn_key sched_time ;		/* time pipe was scheduled in ready_heap */
-    dn_key idle_time;		/* start of pipe idle time */
 
     /*
-     * When the tx clock come from an interface (if_name[0] != '\0'), its name
-     * is stored below, whereas the ifp is filled when the rule is configured.
+ * Scheduler template, mostly indicating the name, number,
+ * sched_mask and buckets.
      */
-    char if_name[IFNAMSIZ];
-    struct ifnet *ifp ;
-    int ready ; /* set if ifp != NULL and we got a signal from it */
+struct dn_sch {
+	struct dn_id	oid;
+	uint32_t	sched_nr; /* N, scheduler number */
+	uint32_t	buckets; /* number of buckets for the instances */
+	uint32_t	flags;	/* have_mask, ... */
+
+	char name[16];	/* null terminated */
+	/* mask to select the appropriate scheduler instance */
+	struct ipfw_flow_id sched_mask; /* M */
+};
 
-    struct dn_flow_set fs ; /* used with fixed-rate flows */
 
+/* A delay profile is attached to a link.
+ * Note that a profile, as any other object, cannot be longer than 2^16
+ */
+#define	ED_MAX_SAMPLES_NO	1024
+struct dn_profile {
+	struct dn_id	oid;
     /* fields to simulate a delay profile */
-
 #define ED_MAX_NAME_LEN		32
     char name[ED_MAX_NAME_LEN];
+	int		link_nr;
     int loss_level;
-    int samples_no;
-    int *samples;
+	int		bandwidth;	// XXX use link bandwidth?
+	int		samples_no;	/* actual length of samples[] */
+	int samples[ED_MAX_SAMPLES_NO]; /* may be shorter */
 };
 
-/* dn_pipe_max is used to pass pipe configuration from userland onto
- * kernel space and back
- */
-#define ED_MAX_SAMPLES_NO	1024
-struct dn_pipe_max {
-	struct dn_pipe pipe;
-	int samples[ED_MAX_SAMPLES_NO];
-};
 
-SLIST_HEAD(dn_pipe_head, dn_pipe);
+
+/*
+ * Overall structure of dummynet
+
+In dummynet, packets are selected with the firewall rules, and passed
+to two different objects: PIPE or QUEUE (bad name).
+
+A QUEUE defines a classifier, which groups packets into flows
+according to a 'mask', puts them into independent queues (one
+per flow) with configurable size and queue management policy,
+and passes flows to a scheduler:
+
+                 (flow_mask|sched_mask)  sched_mask
+	 +---------+   weight Wx  +-------------+
+         |         |->-[flow]-->--|             |-+
+    -->--| QUEUE x |   ...        |             | |
+         |         |->-[flow]-->--| SCHEDuler N | |
+	 +---------+              |             | |
+	     ...                  |             +--[LINK N]-->--
+	 +---------+   weight Wy  |             | +--[LINK N]-->--
+         |         |->-[flow]-->--|             | |
+    -->--| QUEUE y |   ...        |             | |
+         |         |->-[flow]-->--|             | |
+	 +---------+              +-------------+ |
+	                            +-------------+
+
+Many QUEUE objects can connect to the same scheduler, each
+QUEUE object can have its own set of parameters.
+
+In turn, the SCHEDuler 'forks' multiple instances according
+to a 'sched_mask', each instance manages its own set of queues
+and transmits on a private instance of a configurable LINK.
+
+A PIPE is a simplified version of the above, where there
+is no flow_mask, and each scheduler instance handles a single queue.
+
+The following data structures (visible from userland) describe
+the objects used by dummynet:
+
+ + dn_link, contains the main configuration parameters related
+   to delay and bandwidth;
+ + dn_profile describes a delay profile;
+ + dn_flow describes the flow status (flow id, statistics)
+   
+ + dn_sch describes a scheduler
+ + dn_fs describes a flowset (msk, weight, queue parameters)
+
+ *
+ */
 
 #endif /* _IP_DUMMYNET_H */
diff --git a/sys/netinet/ip_fw.h b/sys/netinet/ip_fw.h
index 21a79ec..d357f18 100644
--- a/sys/netinet/ip_fw.h
+++ b/sys/netinet/ip_fw.h
@@ -487,24 +487,26 @@ struct ip_fw {
 #define RULESIZE(rule)  (sizeof(struct ip_fw) + \
 	((struct ip_fw *)(rule))->cmd_len * 4 - 4)
 
+#if 1 // moved to in.h
 /*
  * This structure is used as a flow mask and a flow id for various
  * parts of the code.
  */
 struct ipfw_flow_id {
-	u_int32_t	dst_ip;
-	u_int32_t	src_ip;
-	u_int16_t	dst_port;
-	u_int16_t	src_port;
-	u_int8_t	fib;
-	u_int8_t	proto;
-	u_int8_t	flags;	/* protocol-specific flags */
+	uint32_t	dst_ip;
+	uint32_t	src_ip;
+	uint16_t	dst_port;
+	uint16_t	src_port;
+	uint8_t	fib;
+	uint8_t	proto;
+	uint8_t	flags;	/* protocol-specific flags */
 	uint8_t		addr_type; /* 4 = ipv4, 6 = ipv6, 1=ether ? */
-	struct in6_addr dst_ip6;	/* could also store MAC addr! */
+	struct in6_addr dst_ip6;
 	struct in6_addr src_ip6;
-	u_int32_t	flow_id6;
-	u_int32_t	frag_id6;
+	uint32_t	flow_id6;
+	uint32_t	frag_id6;
 };
+#endif
 
 #define IS_IP6_FLOW_ID(id)	((id)->addr_type == 6)
 
diff --git a/sys/netinet/ipfw/dn_heap.c b/sys/netinet/ipfw/dn_heap.c
new file mode 100644
index 0000000..ec9cb6c
--- /dev/null
+++ b/sys/netinet/ipfw/dn_heap.c
@@ -0,0 +1,550 @@
+/*-
+ * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Binary heap and hash tables, used in dummynet
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#ifdef _KERNEL
+__FBSDID("$FreeBSD$");
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <netinet/ipfw/dn_heap.h>
+#ifndef log
+#define log(x, arg...)
+#endif
+
+#else /* !_KERNEL */
+
+#include <stdio.h>
+#include <dn_test.h>
+#include <strings.h>
+#include <stdlib.h>
+
+#include  "dn_heap.h"
+#define log(x, arg...)	fprintf(stderr, ## arg)
+#define panic(x...)	fprintf(stderr, ## x), exit(1)
+#define MALLOC_DEFINE(a, b, c)
+static void *my_malloc(int s) {	return malloc(s); }
+static void my_free(void *p) {	free(p); }
+#define malloc(s, t, w)	my_malloc(s)
+#define free(p, t)	my_free(p)
+#endif /* !_KERNEL */
+
+MALLOC_DEFINE(M_DN_HEAP, "dummynet", "dummynet heap");
+
+/*
+ * Heap management functions.
+ *
+ * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2.
+ * Some macros help finding parent/children so we can optimize them.
+ *
+ * heap_init() is called to expand the heap when needed.
+ * Increment size in blocks of 16 entries.
+ * Returns 1 on error, 0 on success
+ */
+#define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 )
+#define HEAP_LEFT(x) ( (x)+(x) + 1 )
+#define	HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; }
+#define HEAP_INCREMENT	15
+
+static int
+heap_resize(struct dn_heap *h, unsigned int new_size)
+{
+	struct dn_heap_entry *p;
+
+	if (h->size >= new_size )	/* have enough room */
+		return 0;
+#if 1  /* round to the next power of 2 */
+	new_size |= new_size >> 1;
+	new_size |= new_size >> 2;
+	new_size |= new_size >> 4;
+	new_size |= new_size >> 8;
+	new_size |= new_size >> 16;
+#else
+	new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT;
+#endif
+	p = malloc(new_size * sizeof(*p), M_DN_HEAP, M_NOWAIT);
+	if (p == NULL) {
+		printf("--- %s, resize %d failed\n", __func__, new_size );
+		return 1; /* error */
+	}
+	if (h->size > 0) {
+		bcopy(h->p, p, h->size * sizeof(*p) );
+		free(h->p, M_DN_HEAP);
+	}
+	h->p = p;
+	h->size = new_size;
+	return 0;
+}
+
+int
+heap_init(struct dn_heap *h, int size, int ofs)
+{
+	if (heap_resize(h, size))
+		return 1;
+	h->elements = 0;
+	h->ofs = ofs;
+	return 0;
+}
+
+/*
+ * Insert element in heap. Normally, p != NULL, we insert p in
+ * a new position and bubble up. If p == NULL, then the element is
+ * already in place, and key is the position where to start the
+ * bubble-up.
+ * Returns 1 on failure (cannot allocate new heap entry)
+ *
+ * If ofs > 0 the position (index, int) of the element in the heap is
+ * also stored in the element itself at the given offset in bytes.
+ */
+#define SET_OFFSET(h, i) do {					\
+	if (h->ofs > 0)						\
+	    *((int32_t *)((char *)(h->p[i].object) + h->ofs)) = i;	\
+	} while (0)
+/*
+ * RESET_OFFSET is used for sanity checks. It sets ofs
+ * to an invalid value.
+ */
+#define RESET_OFFSET(h, i) do {					\
+	if (h->ofs > 0)						\
+	    *((int32_t *)((char *)(h->p[i].object) + h->ofs)) = -16;	\
+	} while (0)
+
+int
+heap_insert(struct dn_heap *h, uint64_t key1, void *p)
+{
+	int son = h->elements;
+
+	//log("%s key %llu p %p\n", __FUNCTION__, key1, p);
+	if (p == NULL) { /* data already there, set starting point */
+		son = key1;
+	} else { /* insert new element at the end, possibly resize */
+		son = h->elements;
+		if (son == h->size) /* need resize... */
+			// XXX expand by 16 or so
+			if (heap_resize(h, h->elements+16) )
+				return 1; /* failure... */
+		h->p[son].object = p;
+		h->p[son].key = key1;
+		h->elements++;
+	}
+	/* make sure that son >= father along the path */
+	while (son > 0) {
+		int father = HEAP_FATHER(son);
+		struct dn_heap_entry tmp;
+
+		if (DN_KEY_LT( h->p[father].key, h->p[son].key ) )
+			break; /* found right position */
+		/* son smaller than father, swap and repeat */
+		HEAP_SWAP(h->p[son], h->p[father], tmp);
+		SET_OFFSET(h, son);
+		son = father;
+	}
+	SET_OFFSET(h, son);
+	return 0;
+}
+
+/*
+ * remove top element from heap, or obj if obj != NULL
+ */
+void
+heap_extract(struct dn_heap *h, void *obj)
+{
+	int child, father, max = h->elements - 1;
+
+	if (max < 0) {
+		printf("--- %s: empty heap 0x%p\n", __FUNCTION__, h);
+		return;
+	}
+	if (obj == NULL)
+		father = 0; /* default: move up smallest child */
+	else { /* extract specific element, index is at offset */
+		if (h->ofs <= 0)
+			panic("%s: extract from middle not set on %p\n",
+				__FUNCTION__, h);
+		father = *((int *)((char *)obj + h->ofs));
+		if (father < 0 || father >= h->elements) {
+			panic("%s: father %d out of bound 0..%d\n",
+				__FUNCTION__, father, h->elements);
+		}
+	}
+	/*
+	 * below, father is the index of the empty element, which
+	 * we replace at each step with the smallest child until we
+	 * reach the bottom level.
+	 */
+	// XXX why removing RESET_OFFSET increases runtime by 10% ?
+	RESET_OFFSET(h, father);
+	while ( (child = HEAP_LEFT(father)) <= max ) {
+		if (child != max &&
+		    DN_KEY_LT(h->p[child+1].key, h->p[child].key) )
+			child++; /* take right child, otherwise left */
+		h->p[father] = h->p[child];
+		SET_OFFSET(h, father);
+		father = child;
+	}
+	h->elements--;
+	if (father != max) {
+		/*
+		 * Fill hole with last entry and bubble up,
+		 * reusing the insert code
+		 */
+		h->p[father] = h->p[max];
+		heap_insert(h, father, NULL);
+	}
+}
+
+#if 0
+/*
+ * change object position and update references
+ * XXX this one is never used!
+ */
+static void
+heap_move(struct dn_heap *h, uint64_t new_key, void *object)
+{
+	int temp, i, max = h->elements-1;
+	struct dn_heap_entry *p, buf;
+
+	if (h->ofs <= 0)
+		panic("cannot move items on this heap");
+	p = h->p;	/* shortcut */
+
+	i = *((int *)((char *)object + h->ofs));
+	if (DN_KEY_LT(new_key, p[i].key) ) { /* must move up */
+		p[i].key = new_key;
+		for (; i>0 &&
+		    DN_KEY_LT(new_key, p[(temp = HEAP_FATHER(i))].key);
+		    i = temp ) { /* bubble up */
+			HEAP_SWAP(p[i], p[temp], buf);
+			SET_OFFSET(h, i);
+		}
+	} else {		/* must move down */
+		p[i].key = new_key;
+		while ( (temp = HEAP_LEFT(i)) <= max ) {
+			/* found left child */
+			if (temp != max &&
+			    DN_KEY_LT(p[temp+1].key, p[temp].key))
+				temp++; /* select child with min key */
+			if (DN_KEY_LT(>p[temp].key, new_key)) {
+				/* go down */
+				HEAP_SWAP(p[i], p[temp], buf);
+				SET_OFFSET(h, i);
+			} else
+				break;
+			i = temp;
+		}
+	}
+	SET_OFFSET(h, i);
+}
+#endif /* heap_move, unused */
+
+/*
+ * heapify() will reorganize data inside an array to maintain the
+ * heap property. It is needed when we delete a bunch of entries.
+ */
+static void
+heapify(struct dn_heap *h)
+{
+	int i;
+
+	for (i = 0; i < h->elements; i++ )
+		heap_insert(h, i , NULL);
+}
+
+int
+heap_scan(struct dn_heap *h, int (*fn)(void *, uintptr_t),
+	uintptr_t arg)
+{
+	int i, ret, found;
+
+	for (i = found = 0 ; i < h->elements ;) {
+		ret = fn(h->p[i].object, arg);
+		if (ret & HEAP_SCAN_DEL) {
+			h->elements-- ;
+			h->p[i] = h->p[h->elements] ;
+			found++ ;
+		} else
+			i++ ;
+		if (ret & HEAP_SCAN_END)
+			break;
+	}
+	if (found)
+		heapify(h);
+	return found;
+}
+
+/*
+ * cleanup the heap and free data structure
+ */
+void
+heap_free(struct dn_heap *h)
+{
+	if (h->size >0 )
+		free(h->p, M_DN_HEAP);
+	bzero(h, sizeof(*h) );
+}
+
+/*
+ * hash table support.
+ */
+
+struct dn_ht {
+        int buckets;            /* how many buckets, really buckets - 1*/
+        int entries;            /* how many entries */
+        int ofs;	        /* offset of link field */
+        uint32_t (*hash)(uintptr_t, int, void *arg);
+        int (*match)(void *_el, uintptr_t key, int, void *);
+        void *(*new)(uintptr_t, int, void *);
+        void **ht;              /* bucket heads */
+};
+/*
+ * Initialize, allocating bucket pointers inline.
+ * Recycle previous record if possible.
+ * If the 'new' function is not supplied, we assume that the
+ * key passed to ht_find is the same object to be stored in.
+ */
+struct dn_ht *
+dn_ht_init(struct dn_ht *ht, int buckets, int ofs,
+        uint32_t (*h)(uintptr_t, int, void *),
+        int (*match)(void *, uintptr_t, int, void *),
+	void *(*new)(uintptr_t, int, void *))
+{
+	int l;
+
+	/*
+	 * Notes about rounding bucket size to a power of two.
+	 * Given the original bucket size, we compute the nearest lower and
+	 * higher power of two, minus 1  (respectively b_min and b_max) because
+	 * this value will be used to do an AND with the index returned
+	 * by hash function.
+	 * To choice between these two values, the original bucket size is
+	 * compared with b_min. If the original size is greater than 4/3 b_min,
+	 * we round the bucket size to b_max, else to b_min.
+	 * This ratio try to round to the nearest power of two, advantaging
+	 * the greater size if the different between two power is relatively
+	 * big.
+	 * Rounding the bucket size to a power of two avoid the use of
+	 * module when calculating the correct bucket.
+	 * The ht->buckets variable store the bucket size - 1 to simply
+	 * do an AND between the index returned by hash function and ht->bucket
+	 * instead of a module.
+	 */
+	int b_min; /* min buckets */
+	int b_max; /* max buckets */
+	int b_ori; /* original buckets */
+
+	if (h == NULL || match == NULL) {
+		printf("--- missing hash or match function");
+		return NULL;
+	}
+	if (buckets < 1 || buckets > 65536)
+		return NULL;
+
+	b_ori = buckets;
+	/* calculate next power of 2, - 1*/
+	buckets |= buckets >> 1;
+	buckets |= buckets >> 2;
+	buckets |= buckets >> 4;
+	buckets |= buckets >> 8;
+	buckets |= buckets >> 16;
+
+	b_max = buckets; /* Next power */
+	b_min = buckets >> 1; /* Previous power */
+
+	/* Calculate the 'nearest' bucket size */
+	if (b_min * 4000 / 3000 < b_ori)
+		buckets = b_max;
+	else
+		buckets = b_min;
+
+	if (ht) {	/* see if we can reuse */
+		if (buckets <= ht->buckets) {
+			ht->buckets = buckets;
+		} else {
+			/* free pointers if not allocated inline */
+			if (ht->ht != (void *)(ht + 1))
+				free(ht->ht, M_DN_HEAP);
+			free(ht, M_DN_HEAP);
+			ht = NULL;
+		}
+	}
+	if (ht == NULL) {
+		/* Allocate buckets + 1 entries because buckets is use to
+		 * do the AND with the index returned by hash function
+		 */
+		l = sizeof(*ht) + (buckets + 1) * sizeof(void **);
+		ht = malloc(l, M_DN_HEAP, M_NOWAIT | M_ZERO);
+	}
+	if (ht) {
+		ht->ht = (void **)(ht + 1);
+		ht->buckets = buckets;
+		ht->ofs = ofs;
+		ht->hash = h;
+		ht->match = match;
+		ht->new = new;
+	}
+	return ht;
+}
+
+/* dummy callback for dn_ht_free to unlink all */
+static int
+do_del(void *obj, void *arg)
+{
+	return DNHT_SCAN_DEL;
+}
+
+void
+dn_ht_free(struct dn_ht *ht, int flags)
+{
+	if (ht == NULL)
+		return;
+	if (flags & DNHT_REMOVE) {
+		(void)dn_ht_scan(ht, do_del, NULL);
+	} else {
+		if (ht->ht && ht->ht != (void *)(ht + 1))
+			free(ht->ht, M_DN_HEAP);
+		free(ht, M_DN_HEAP);
+	}
+}
+
+int
+dn_ht_entries(struct dn_ht *ht)
+{
+	return ht ? ht->entries : 0;
+}
+
+/* lookup and optionally create or delete element */
+void *
+dn_ht_find(struct dn_ht *ht, uintptr_t key, int flags, void *arg)
+{
+	int i;
+	void **pp, *p;
+
+	if (ht == NULL)	/* easy on an empty hash */
+		return NULL;
+	i = (ht->buckets == 1) ? 0 :
+		(ht->hash(key, flags, arg) & ht->buckets);
+
+	for (pp = &ht->ht[i]; (p = *pp); pp = (void **)((char *)p + ht->ofs)) {
+		if (flags & DNHT_MATCH_PTR) {
+			if (key == (uintptr_t)p)
+				break;
+		} else if (ht->match(p, key, flags, arg)) /* found match */
+			break;
+	}
+	if (p) {
+		if (flags & DNHT_REMOVE) {
+			/* link in the next element */
+			*pp = *(void **)((char *)p + ht->ofs);
+			*(void **)((char *)p + ht->ofs) = NULL;
+			ht->entries--;
+		}
+	} else if (flags & DNHT_INSERT) {
+		// printf("%s before calling new, bucket %d ofs %d\n",
+		//	__FUNCTION__, i, ht->ofs);
+		p = ht->new ? ht->new(key, flags, arg) : (void *)key;
+		// printf("%s new returns %p\n", __FUNCTION__, p);
+		if (p) {
+			ht->entries++;
+			*(void **)((char *)p + ht->ofs) = ht->ht[i];
+			ht->ht[i] = p;
+		}
+	}
+	return p;
+}
+
+/*
+ * do a scan with the option to delete the object. Extract next before
+ * running the callback because the element may be destroyed there.
+ */
+int
+dn_ht_scan(struct dn_ht *ht, int (*fn)(void *, void *), void *arg)
+{
+	int i, ret, found = 0;
+	void **curp, *cur, *next;
+
+	if (ht == NULL || fn == NULL)
+		return 0;
+	for (i = 0; i <= ht->buckets; i++) {
+		curp = &ht->ht[i];
+		while ( (cur = *curp) != NULL) {
+			next = *(void **)((char *)cur + ht->ofs);
+			ret = fn(cur, arg);
+			if (ret & DNHT_SCAN_DEL) {
+				found++;
+				ht->entries--;
+				*curp = next;
+			} else {
+				curp = (void **)((char *)cur + ht->ofs);
+			}
+			if (ret & DNHT_SCAN_END)
+				return found;
+		}
+	}
+	return found;
+}
+
+/*
+ * Similar to dn_ht_scan(), except thah the scan is performed only
+ * in the bucket 'bucket'. The function returns a correct bucket number if
+ * the original is invalid
+ */
+int
+dn_ht_scan_bucket(struct dn_ht *ht, int *bucket, int (*fn)(void *, void *),
+		 void *arg)
+{
+	int i, ret, found = 0;
+	void **curp, *cur, *next;
+
+	if (ht == NULL || fn == NULL)
+		return 0;
+	if (*bucket > ht->buckets)
+		*bucket = 0;
+	i = *bucket;
+
+	curp = &ht->ht[i];
+	while ( (cur = *curp) != NULL) {
+		next = *(void **)((char *)cur + ht->ofs);
+		ret = fn(cur, arg);
+		if (ret & DNHT_SCAN_DEL) {
+			found++;
+			ht->entries--;
+			*curp = next;
+		} else {
+			curp = (void **)((char *)cur + ht->ofs);
+		}
+		if (ret & DNHT_SCAN_END)
+			return found;
+	}
+	return found;
+}
+
diff --git a/sys/netinet/ipfw/dn_heap.h b/sys/netinet/ipfw/dn_heap.h
new file mode 100644
index 0000000..a663f91
--- /dev/null
+++ b/sys/netinet/ipfw/dn_heap.h
@@ -0,0 +1,191 @@
+/*-
+ * Copyright (c) 1998-2010 Luigi Rizzo, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Binary heap and hash tables, header file
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IP_DN_HEAP_H
+#define _IP_DN_HEAP_H
+
+#define DN_KEY_LT(a,b)     ((int64_t)((a)-(b)) < 0)
+#define DN_KEY_LEQ(a,b)    ((int64_t)((a)-(b)) <= 0)
+
+/*
+ * This module implements a binary heap supporting random extraction.
+ *
+ * A heap entry contains an uint64_t key and a pointer to object.
+ * DN_KEY_LT(a,b) returns true if key 'a' is smaller than 'b'
+ *
+ * The heap is a struct dn_heap plus a dynamically allocated
+ * array of dn_heap_entry entries. 'size' represents the size of
+ * the array, 'elements' count entries in use. The topmost
+ * element has the smallest key.
+ * The heap supports ordered insert, and extract from the top.
+ * To extract an object from the middle of the heap, we the object
+ * must reserve an 'int32_t' to store the position of the object
+ * in the heap itself, and the location of this field must be
+ * passed as an argument to heap_init() -- use -1 if the feature
+ * is not used.
+ */
+struct dn_heap_entry {
+	uint64_t key;	/* sorting key, smallest comes first */
+	void *object;	/* object pointer */
+};
+
+struct dn_heap {
+	int size;	/* the size of the array */
+	int elements;	/* elements in use */
+	int ofs;	/* offset in the object of heap index */
+	struct dn_heap_entry *p;	/* array of "size" entries */
+};
+
+enum {
+	HEAP_SCAN_DEL = 1,
+	HEAP_SCAN_END = 2,
+};
+
+/*
+ * heap_init() reinitializes the heap setting the size and the offset
+ *	of the index for random extraction (use -1 if not used).
+ *	The 'elements' counter is set to 0.
+ *
+ * SET_HEAP_OFS() indicates where, in the object, is stored the index
+ *	for random extractions from the heap.
+ *
+ * heap_free() frees the memory associated to a heap.
+ *
+ * heap_insert() adds a key-pointer pair to the heap
+ *
+ * HEAP_TOP() returns a pointer to the top element of the heap,
+ *	but makes no checks on its existance (XXX should we change ?)
+ *
+ * heap_extract() removes the entry at the top, returing the pointer.
+ *	(the key should have been read before).
+ *
+ * heap_scan() invokes a callback on each entry of the heap.
+ *	The callback can return a combination of HEAP_SCAN_DEL and
+ *	HEAP_SCAN_END. HEAP_SCAN_DEL means the current element must
+ *	be removed, and HEAP_SCAN_END means to terminate the scan.
+ *	heap_scan() returns the number of elements removed.
+ *	Because the order is not guaranteed, we should use heap_scan()
+ *	only as a last resort mechanism.
+ */
+#define HEAP_TOP(h)	((h)->p)
+#define SET_HEAP_OFS(h, n)	do { (h)->ofs = n; } while (0)
+int     heap_init(struct dn_heap *h, int size, int ofs);
+int     heap_insert(struct dn_heap *h, uint64_t key1, void *p);
+void    heap_extract(struct dn_heap *h, void *obj);
+void heap_free(struct dn_heap *h);
+int heap_scan(struct dn_heap *, int (*)(void *, uintptr_t), uintptr_t);
+
+/*------------------------------------------------------
+ * This module implements a generic hash table with support for
+ * running callbacks on the entire table. To avoid allocating
+ * memory during hash table operations, objects must reserve
+ * space for a link field. XXX if the heap is moderately full,
+ * an SLIST suffices, and we can tolerate the cost of a hash
+ * computation on each removal.
+ *
+ * dn_ht_init() initializes the table, setting the number of
+ *	buckets, the offset of the link field, the main callbacks.
+ *	Callbacks are:
+ * 
+ *	hash(key, flags, arg) called to return a bucket index.
+ *	match(obj, key, flags, arg) called to determine if key
+ *		matches the current 'obj' in the heap
+ *	new(key, flags, arg) optional, used to allocate a new
+ *		object during insertions.
+ *
+ * dn_ht_free() frees the heap or unlink elements.
+ *	DNHT_REMOVE unlink elements, 0 frees the heap.
+ *	You need two calls to do both.
+ *
+ * dn_ht_find() is the main lookup function, which can also be
+ *	used to insert or delete elements in the hash table.
+ *	The final 'arg' is passed to all callbacks.
+ *
+ * dn_ht_scan() is used to invoke a callback on all entries of
+ *	the heap, or possibly on just one bucket. The callback
+ *	is invoked with a pointer to the object, and must return
+ *	one of DNHT_SCAN_DEL or DNHT_SCAN_END to request the
+ *	removal of the object from the heap and the end of the
+ *	scan, respectively.
+ *
+ * dn_ht_scan_bucket() is similar to dn_ht_scan(), except that it scans
+ *	only the specific bucket of the table. The bucket is a in-out
+ *	parameter and return a valid bucket number if the original
+ *	is invalid.
+ *
+ * A combination of flags can be used to modify the operation
+ * of the dn_ht_find(), and of the callbacks:
+ *
+ * DNHT_KEY_IS_OBJ	means the key is the object pointer.
+ *	It is usally of interest for the hash and match functions.
+ *
+ * DNHT_MATCH_PTR	during a lookup, match pointers instead
+ *	of calling match(). Normally used when removing specific
+ *	entries. Does not imply KEY_IS_OBJ as the latter _is_ used
+ *	by the match function.
+ *
+ * DNHT_INSERT		insert the element if not found.
+ *	Calls new() to allocates a new object unless
+ *	DNHT_KEY_IS_OBJ is set.
+ *
+ * DNHT_UNIQUE		only insert if object not found.
+ *	XXX should it imply DNHT_INSERT ?
+ *
+ * DNHT_REMOVE		remove objects if we find them.
+ */
+struct dn_ht;	/* should be opaque */
+
+struct dn_ht *dn_ht_init(struct dn_ht *, int buckets, int ofs, 
+        uint32_t (*hash)(uintptr_t, int, void *),
+        int (*match)(void *, uintptr_t, int, void *),
+        void *(*new)(uintptr_t, int, void *));
+void dn_ht_free(struct dn_ht *, int flags);
+
+void *dn_ht_find(struct dn_ht *, uintptr_t, int, void *);
+int dn_ht_scan(struct dn_ht *, int (*)(void *, void *), void *);
+int dn_ht_scan_bucket(struct dn_ht *, int * , int (*)(void *, void *), void *);
+int dn_ht_entries(struct dn_ht *);
+
+enum {  /* flags values.
+	 * first two are returned by the scan callback to indicate
+	 * to delete the matching element or to end the scan
+	 */
+        DNHT_SCAN_DEL	= 0x0001,
+        DNHT_SCAN_END	= 0x0002,
+        DNHT_KEY_IS_OBJ	= 0x0004,	/* key is the obj pointer */
+        DNHT_MATCH_PTR	= 0x0008,	/* match by pointer, not match() */
+        DNHT_INSERT	= 0x0010,	/* insert if not found */
+        DNHT_UNIQUE	= 0x0020,	/* report error if already there */
+        DNHT_REMOVE	= 0x0040,	/* remove on find or dn_ht_free */
+}; 
+
+#endif /* _IP_DN_HEAP_H */
diff --git a/sys/netinet/ipfw/dn_sched.h b/sys/netinet/ipfw/dn_sched.h
new file mode 100644
index 0000000..b1c1c8f
--- /dev/null
+++ b/sys/netinet/ipfw/dn_sched.h
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2010 Riccardo Panicucci, Luigi Rizzo, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * The API to write a packet scheduling algorithm for dummynet.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _DN_SCHED_H
+#define _DN_SCHED_H
+
+#define	DN_MULTIQUEUE	0x01
+/*
+ * Descriptor for a scheduling algorithm.
+ * Contains all function pointers for a given scheduler
+ * This is typically created when a module is loaded, and stored
+ * in a global list of schedulers.
+ */
+struct dn_alg {
+	uint32_t type;           /* the scheduler type */
+	const char *name;   /* scheduler name */
+	uint32_t flags;	/* DN_MULTIQUEUE if supports multiple queues */
+
+	/*
+	 * The following define the size of 3 optional data structures
+	 * that may need to be allocated at runtime, and are appended
+	 * to each of the base data structures: scheduler, sched.inst,
+	 * and queue. We don't have a per-flowset structure.
+	 */
+	/*    + parameters attached to the template, e.g.
+	 *	default queue sizes, weights, quantum size, and so on;
+	 */
+	size_t schk_datalen;
+
+	/*    + per-instance parameters, such as timestamps,
+	 *	containers for queues, etc;
+	 */
+	size_t si_datalen;
+
+	size_t q_datalen;	/* per-queue parameters (e.g. S,F) */
+
+	/*
+	 * Methods implemented by the scheduler:
+	 * enqueue	enqueue packet 'm' on scheduler 's', queue 'q'.
+	 *	q is NULL for !MULTIQUEUE.
+	 *	Return 0 on success, 1 on drop (packet consumed anyways).
+	 *
+	 * dequeue	Called when scheduler instance 's' can
+	 *	dequeue a packet. Return NULL if none are available.
+	 *	XXX what about non work-conserving ?
+	 *
+	 * config	called on 'sched X config ...', normally writes
+	 *	in the area of size sch_arg
+	 *
+	 * destroy	called on 'sched delete', frees everything
+	 *	in sch_arg (other parts are handled by more specific
+	 *	functions)
+	 *
+	 * new_sched    called when a new instance is created, e.g.
+	 *	to create the local queue for !MULTIQUEUE, set V or
+	 *	copy parameters for WFQ, and so on.
+	 *
+	 * free_sched	called when deleting an instance, cleans
+	 *	extra data in the per-instance area.
+	 *
+	 * new_fsk	called when a flowset is linked to a scheduler,
+	 *	e.g. to validate parameters such as weights etc.
+	 * free_fsk	when a flowset is unlinked from a scheduler.
+	 *	(probably unnecessary)
+	 *
+	 * new_queue	called to set the per-queue parameters,
+	 *	e.g. S and F, adjust sum of weights in the parent, etc.
+	 *	If the queue has packets in it, add them to the scheduler
+	 *	as well.
+	 *
+	 * free_queue	actions related to a queue removal, e.g. undo
+	 *	all the above. If the queue has data in it, also remove
+	 *	from the scheduler. This can e.g. happen during a reconfigure.
+	 */
+	int (*enqueue)(struct dn_sch_inst *, struct dn_queue *,
+		struct mbuf *);
+	struct mbuf * (*dequeue)(struct dn_sch_inst *);
+
+	int (*config)(struct dn_schk *);
+	int (*destroy)(struct dn_schk*);
+	int (*new_sched)(struct dn_sch_inst *);
+	int (*free_sched)(struct dn_sch_inst *);
+	int (*new_fsk)(struct dn_fsk *f);
+	int (*free_fsk)(struct dn_fsk *f);
+	int (*new_queue)(struct dn_queue *q);
+	int (*free_queue)(struct dn_queue *q);
+
+	/* run-time fields */
+	int ref_count;      /* XXX number of instances in the system */
+	SLIST_ENTRY(dn_alg) next; /* Next scheduler in the list */
+};
+
+/* MSVC does not support initializers so we need this ugly macro */
+#ifdef _WIN32
+#define _SI(fld)        
+#else
+#define _SI(fld)        fld
+#endif
+
+/*
+ * Additionally, dummynet exports some functions and macros
+ * to be used by schedulers:
+ */
+
+void dn_free_pkts(struct mbuf *mnext);
+int dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop);
+/* bound a variable between min and max */
+int ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg);
+
+/*
+ * Extract the head of a queue, update stats. Must be the very last
+ * thing done on a dequeue as the queue itself may go away.
+ */
+static __inline struct mbuf*
+dn_dequeue(struct dn_queue *q)
+{
+	struct mbuf *m = q->mq.head;
+	if (m == NULL)
+		return NULL;
+	q->mq.head = m->m_nextpkt;
+	q->ni.length--;
+	q->ni.len_bytes -= m->m_pkthdr.len;
+	if (q->_si) {
+		q->_si->ni.length--;
+		q->_si->ni.len_bytes -= m->m_pkthdr.len;
+	}
+	if (q->ni.length == 0) /* queue is now idle */
+		q->q_time = dn_cfg.curr_time;
+	return m;
+}
+
+int dn_sched_modevent(module_t mod, int cmd, void *arg);
+
+#define DECLARE_DNSCHED_MODULE(name, dnsched)			\
+	static moduledata_t name##_mod = {			\
+		#name, dn_sched_modevent, dnsched		\
+	};							\
+	DECLARE_MODULE(name, name##_mod, 			\
+		SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); 	\
+        MODULE_DEPEND(name, dummynet, 3, 3, 3);
+#endif /* _DN_SCHED_H */
diff --git a/sys/netinet/ipfw/dn_sched_fifo.c b/sys/netinet/ipfw/dn_sched_fifo.c
new file mode 100644
index 0000000..0bb3800
--- /dev/null
+++ b/sys/netinet/ipfw/dn_sched_fifo.c
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ */
+
+#ifdef _KERNEL
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/kernel.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <net/if.h>	/* IFNAMSIZ */
+#include <netinet/in.h>
+#include <netinet/ip_var.h>		/* ipfw_rule_ref */
+#include <netinet/ip_fw.h>	/* flow_id */
+#include <netinet/ip_dummynet.h>
+#include <netinet/ipfw/dn_heap.h>
+#include <netinet/ipfw/ip_dn_private.h>
+#include <netinet/ipfw/dn_sched.h>
+#else
+#include <dn_test.h>
+#endif
+
+/*
+ * This file implements a FIFO scheduler for a single queue.
+ * The queue is allocated as part of the scheduler instance,
+ * and there is a single flowset is in the template which stores
+ * queue size and policy.
+ * Enqueue and dequeue use the default library functions.
+ */
+static int 
+fifo_enqueue(struct dn_sch_inst *si, struct dn_queue *q, struct mbuf *m)
+{
+	/* XXX if called with q != NULL and m=NULL, this is a
+	 * re-enqueue from an existing scheduler, which we should
+	 * handle.
+	 */
+	return dn_enqueue((struct dn_queue *)(si+1), m, 0);
+}
+
+static struct mbuf *
+fifo_dequeue(struct dn_sch_inst *si)
+{
+	return dn_dequeue((struct dn_queue *)(si + 1));
+}
+
+static int
+fifo_new_sched(struct dn_sch_inst *si)
+{
+	/* This scheduler instance contains the queue */
+	struct dn_queue *q = (struct dn_queue *)(si + 1);
+
+        set_oid(&q->ni.oid, DN_QUEUE, sizeof(*q));
+	q->_si = si;
+	q->fs = si->sched->fs;
+	return 0;
+}
+
+static int
+fifo_free_sched(struct dn_sch_inst *si)
+{
+	struct dn_queue *q = (struct dn_queue *)(si + 1);
+	dn_free_pkts(q->mq.head);
+	bzero(q, sizeof(*q));
+	return 0;
+}
+
+/*
+ * FIFO scheduler descriptor
+ * contains the type of the scheduler, the name, the size of extra
+ * data structures, and function pointers.
+ */
+static struct dn_alg fifo_desc = {
+	_SI( .type = )  DN_SCHED_FIFO,
+	_SI( .name = )  "FIFO",
+	_SI( .flags = ) 0,
+
+	_SI( .schk_datalen = ) 0,
+	_SI( .si_datalen = )  sizeof(struct dn_queue),
+	_SI( .q_datalen = )  0,
+
+	_SI( .enqueue = )  fifo_enqueue,
+	_SI( .dequeue = )  fifo_dequeue,
+	_SI( .config = )  NULL,
+	_SI( .destroy = )  NULL,
+	_SI( .new_sched = )  fifo_new_sched,
+	_SI( .free_sched = )  fifo_free_sched,
+	_SI( .new_fsk = )  NULL,
+	_SI( .free_fsk = )  NULL,
+	_SI( .new_queue = )  NULL,
+	_SI( .free_queue = )  NULL,
+};
+
+DECLARE_DNSCHED_MODULE(dn_fifo, &fifo_desc);
diff --git a/sys/netinet/ipfw/dn_sched_qfq.c b/sys/netinet/ipfw/dn_sched_qfq.c
new file mode 100644
index 0000000..44555ee
--- /dev/null
+++ b/sys/netinet/ipfw/dn_sched_qfq.c
@@ -0,0 +1,864 @@
+/*
+ * Copyright (c) 2010 Fabio Checconi, Luigi Rizzo, Paolo Valente
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ */
+
+#ifdef _KERNEL
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/kernel.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <net/if.h>	/* IFNAMSIZ */
+#include <netinet/in.h>
+#include <netinet/ip_var.h>		/* ipfw_rule_ref */
+#include <netinet/ip_fw.h>	/* flow_id */
+#include <netinet/ip_dummynet.h>
+#include <netinet/ipfw/dn_heap.h>
+#include <netinet/ipfw/ip_dn_private.h>
+#include <netinet/ipfw/dn_sched.h>
+#else
+#include <dn_test.h>
+#endif
+
+#ifdef QFQ_DEBUG
+struct qfq_sched;
+static void dump_sched(struct qfq_sched *q, const char *msg);
+#define	NO(x)	x
+#else
+#define NO(x)
+#endif
+#define DN_SCHED_QFQ	4 // XXX Where?
+typedef	unsigned long	bitmap;
+
+/*
+ * bitmaps ops are critical. Some linux versions have __fls
+ * and the bitmap ops. Some machines have ffs
+ */
+#if defined(_WIN32)
+int fls(unsigned int n)
+{
+	int i = 0;
+	for (i = 0; n > 0; n >>= 1, i++)
+		;
+	return i;
+}
+#endif
+
+#if !defined(_KERNEL) || defined( __FreeBSD__ ) || defined(_WIN32)
+static inline unsigned long __fls(unsigned long word)
+{
+	return fls(word) - 1;
+}
+#endif
+
+#if !defined(_KERNEL) || !defined(__linux__)
+#ifdef QFQ_DEBUG
+int test_bit(int ix, bitmap *p)
+{
+	if (ix < 0 || ix > 31)
+		D("bad index %d", ix);
+	return *p & (1<<ix);
+}
+void __set_bit(int ix, bitmap *p)
+{
+	if (ix < 0 || ix > 31)
+		D("bad index %d", ix);
+	*p |= (1<<ix);
+}
+void __clear_bit(int ix, bitmap *p)
+{
+	if (ix < 0 || ix > 31)
+		D("bad index %d", ix);
+	*p &= ~(1<<ix);
+}
+#else /* !QFQ_DEBUG */
+/* XXX do we have fast version, or leave it to the compiler ? */
+#define test_bit(ix, pData)	((*pData) & (1<<(ix)))
+#define __set_bit(ix, pData)	(*pData) |= (1<<(ix))
+#define __clear_bit(ix, pData)	(*pData) &= ~(1<<(ix))
+#endif /* !QFQ_DEBUG */
+#endif /* !__linux__ */
+
+#ifdef __MIPSEL__
+#define __clear_bit(ix, pData) (*pData) &= ~(1<<(ix))
+#endif
+
+/*-------------------------------------------*/
+/*
+
+Virtual time computations.
+
+S, F and V are all computed in fixed point arithmetic with
+FRAC_BITS decimal bits.
+
+   QFQ_MAX_INDEX is the maximum index allowed for a group. We need
+  	one bit per index.
+   QFQ_MAX_WSHIFT is the maximum power of two supported as a weight.
+   The layout of the bits is as below:
+  
+                   [ MTU_SHIFT ][      FRAC_BITS    ]
+                   [ MAX_INDEX    ][ MIN_SLOT_SHIFT ]
+  				 ^.__grp->index = 0
+  				 *.__grp->slot_shift
+  
+   where MIN_SLOT_SHIFT is derived by difference from the others.
+
+The max group index corresponds to Lmax/w_min, where
+Lmax=1<<MTU_SHIFT, w_min = 1 .
+From this, and knowing how many groups (MAX_INDEX) we want,
+we can derive the shift corresponding to each group.
+
+Because we often need to compute
+	F = S + len/w_i  and V = V + len/wsum
+instead of storing w_i store the value
+	inv_w = (1<<FRAC_BITS)/w_i
+so we can do F = S + len * inv_w * wsum.
+We use W_TOT in the formulas so we can easily move between
+static and adaptive weight sum.
+
+The per-scheduler-instance data contain all the data structures
+for the scheduler: bitmaps and bucket lists.
+
+ */
+/*
+ * Maximum number of consecutive slots occupied by backlogged classes
+ * inside a group. This is approx lmax/lmin + 5.
+ * XXX check because it poses constraints on MAX_INDEX
+ */
+#define QFQ_MAX_SLOTS	32
+/*
+ * Shifts used for class<->group mapping. Class weights are
+ * in the range [1, QFQ_MAX_WEIGHT], we to map each class i to the
+ * group with the smallest index that can support the L_i / r_i
+ * configured for the class.
+ *
+ * grp->index is the index of the group; and grp->slot_shift
+ * is the shift for the corresponding (scaled) sigma_i.
+ *
+ * When computing the group index, we do (len<<FP_SHIFT)/weight,
+ * then compute an FLS (which is like a log2()), and if the result
+ * is below the MAX_INDEX region we use 0 (which is the same as
+ * using a larger len).
+ */
+#define QFQ_MAX_INDEX		19
+#define QFQ_MAX_WSHIFT		16	/* log2(max_weight) */
+
+#define	QFQ_MAX_WEIGHT		(1<<QFQ_MAX_WSHIFT)
+#define QFQ_MAX_WSUM		(2*QFQ_MAX_WEIGHT)
+//#define IWSUM	(q->i_wsum)
+#define IWSUM	((1<<FRAC_BITS)/QFQ_MAX_WSUM)
+
+#define FRAC_BITS		30	/* fixed point arithmetic */
+#define ONE_FP			(1UL << FRAC_BITS)
+
+#define QFQ_MTU_SHIFT		11	/* log2(max_len) */
+#define QFQ_MIN_SLOT_SHIFT	(FRAC_BITS + QFQ_MTU_SHIFT - QFQ_MAX_INDEX)
+
+/*
+ * Possible group states, also indexes for the bitmaps array in
+ * struct qfq_queue. We rely on ER, IR, EB, IB being numbered 0..3
+ */
+enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE };
+
+struct qfq_group;
+/*
+ * additional queue info. Some of this info should come from
+ * the flowset, we copy them here for faster processing.
+ * This is an overlay of the struct dn_queue
+ */
+struct qfq_class {
+	struct dn_queue _q;
+	uint64_t S, F;		/* flow timestamps (exact) */
+	struct qfq_class *next; /* Link for the slot list. */
+
+	/* group we belong to. In principle we would need the index,
+	 * which is log_2(lmax/weight), but we never reference it
+	 * directly, only the group.
+	 */
+	struct qfq_group *grp;
+
+	/* these are copied from the flowset. */
+	uint32_t	inv_w;	/* ONE_FP/weight */
+	uint32_t 	lmax;	/* Max packet size for this flow. */
+};
+
+/* Group descriptor, see the paper for details.
+ * Basically this contains the bucket lists
+ */
+struct qfq_group {
+	uint64_t S, F;			/* group timestamps (approx). */
+	unsigned int slot_shift;	/* Slot shift. */
+	unsigned int index;		/* Group index. */
+	unsigned int front;		/* Index of the front slot. */
+	bitmap full_slots;		/* non-empty slots */
+
+	/* Array of lists of active classes. */
+	struct qfq_class *slots[QFQ_MAX_SLOTS];
+};
+
+/* scheduler instance descriptor. */
+struct qfq_sched {
+	uint64_t	V;		/* Precise virtual time. */
+	uint32_t	wsum;		/* weight sum */
+	NO(uint32_t	i_wsum;		/* ONE_FP/w_sum */
+	uint32_t	_queued;	/* debugging */
+	uint32_t	loops;	/* debugging */)
+	bitmap bitmaps[QFQ_MAX_STATE];	/* Group bitmaps. */
+	struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */
+};
+
+/*---- support functions ----------------------------*/
+
+/* Generic comparison function, handling wraparound. */
+static inline int qfq_gt(uint64_t a, uint64_t b)
+{
+	return (int64_t)(a - b) > 0;
+}
+
+/* Round a precise timestamp to its slotted value. */
+static inline uint64_t qfq_round_down(uint64_t ts, unsigned int shift)
+{
+	return ts & ~((1ULL << shift) - 1);
+}
+
+/* return the pointer to the group with lowest index in the bitmap */
+static inline struct qfq_group *qfq_ffs(struct qfq_sched *q,
+					unsigned long bitmap)
+{
+	int index = ffs(bitmap) - 1; // zero-based
+	return &q->groups[index];
+}
+
+/*
+ * Calculate a flow index, given its weight and maximum packet length.
+ * index = log_2(maxlen/weight) but we need to apply the scaling.
+ * This is used only once at flow creation.
+ */
+static int qfq_calc_index(uint32_t inv_w, unsigned int maxlen)
+{
+	uint64_t slot_size = (uint64_t)maxlen *inv_w;
+	unsigned long size_map;
+	int index = 0;
+
+	size_map = (unsigned long)(slot_size >> QFQ_MIN_SLOT_SHIFT);
+	if (!size_map)
+		goto out;
+
+	index = __fls(size_map) + 1;	// basically a log_2()
+	index -= !(slot_size - (1ULL << (index + QFQ_MIN_SLOT_SHIFT - 1)));
+
+	if (index < 0)
+		index = 0;
+
+out:
+	ND("W = %d, L = %d, I = %d\n", ONE_FP/inv_w, maxlen, index);
+	return index;
+}
+/*---- end support functions ----*/
+
+/*-------- API calls --------------------------------*/
+/*
+ * Validate and copy parameters from flowset.
+ */
+static int
+qfq_new_queue(struct dn_queue *_q)
+{
+	struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1);
+	struct qfq_class *cl = (struct qfq_class *)_q;
+	int i;
+	uint32_t w;	/* approximated weight */
+
+	/* import parameters from the flowset. They should be correct
+	 * already.
+	 */
+	w = _q->fs->fs.par[0];
+	cl->lmax = _q->fs->fs.par[1];
+	if (!w || w > QFQ_MAX_WEIGHT) {
+		w = 1;
+		D("rounding weight to 1");
+	}
+	cl->inv_w = ONE_FP/w;
+	w = ONE_FP/cl->inv_w;	
+	if (q->wsum + w > QFQ_MAX_WSUM)
+		return EINVAL;
+
+	i = qfq_calc_index(cl->inv_w, cl->lmax);
+	cl->grp = &q->groups[i];
+	q->wsum += w;
+	// XXX cl->S = q->V; ?
+	// XXX compute q->i_wsum
+	return 0;
+}
+
+/* remove an empty queue */
+static int
+qfq_free_queue(struct dn_queue *_q)
+{
+	struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1);
+	struct qfq_class *cl = (struct qfq_class *)_q;
+	if (cl->inv_w) {
+		q->wsum -= ONE_FP/cl->inv_w;
+		cl->inv_w = 0; /* reset weight to avoid run twice */
+	}
+	return 0;
+}
+
+/* Calculate a mask to mimic what would be ffs_from(). */
+static inline unsigned long
+mask_from(unsigned long bitmap, int from)
+{
+	return bitmap & ~((1UL << from) - 1);
+}
+
+/*
+ * The state computation relies on ER=0, IR=1, EB=2, IB=3
+ * First compute eligibility comparing grp->S, q->V,
+ * then check if someone is blocking us and possibly add EB
+ */
+static inline unsigned int
+qfq_calc_state(struct qfq_sched *q, struct qfq_group *grp)
+{
+	/* if S > V we are not eligible */
+	unsigned int state = qfq_gt(grp->S, q->V);
+	unsigned long mask = mask_from(q->bitmaps[ER], grp->index);
+	struct qfq_group *next;
+
+	if (mask) {
+		next = qfq_ffs(q, mask);
+		if (qfq_gt(grp->F, next->F))
+			state |= EB;
+	}
+
+	return state;
+}
+
+/*
+ * In principle
+ *	q->bitmaps[dst] |= q->bitmaps[src] & mask;
+ *	q->bitmaps[src] &= ~mask;
+ * but we should make sure that src != dst
+ */
+static inline void
+qfq_move_groups(struct qfq_sched *q, unsigned long mask, int src, int dst)
+{
+	q->bitmaps[dst] |= q->bitmaps[src] & mask;
+	q->bitmaps[src] &= ~mask;
+}
+
+static inline void
+qfq_unblock_groups(struct qfq_sched *q, int index, uint64_t old_finish)
+{
+	unsigned long mask = mask_from(q->bitmaps[ER], index + 1);
+	struct qfq_group *next;
+
+	if (mask) {
+		next = qfq_ffs(q, mask);
+		if (!qfq_gt(next->F, old_finish))
+			return;
+	}
+
+	mask = (1UL << index) - 1;
+	qfq_move_groups(q, mask, EB, ER);
+	qfq_move_groups(q, mask, IB, IR);
+}
+
+/*
+ * perhaps
+ *
+	old_V ^= q->V;
+	old_V >>= QFQ_MIN_SLOT_SHIFT;
+	if (old_V) {
+		...
+	}
+ *
+ */
+static inline void
+qfq_make_eligible(struct qfq_sched *q, uint64_t old_V)
+{
+	unsigned long mask, vslot, old_vslot;
+
+	vslot = q->V >> QFQ_MIN_SLOT_SHIFT;
+	old_vslot = old_V >> QFQ_MIN_SLOT_SHIFT;
+
+	if (vslot != old_vslot) {
+		mask = (2UL << (__fls(vslot ^ old_vslot))) - 1;
+		qfq_move_groups(q, mask, IR, ER);
+		qfq_move_groups(q, mask, IB, EB);
+	}
+}
+
+/*
+ * XXX we should make sure that slot becomes less than 32.
+ * This is guaranteed by the input values.
+ * roundedS is always cl->S rounded on grp->slot_shift bits.
+ */
+static inline void
+qfq_slot_insert(struct qfq_group *grp, struct qfq_class *cl, uint64_t roundedS)
+{
+	uint64_t slot = (roundedS - grp->S) >> grp->slot_shift;
+	unsigned int i = (grp->front + slot) % QFQ_MAX_SLOTS;
+
+	cl->next = grp->slots[i];
+	grp->slots[i] = cl;
+	__set_bit(slot, &grp->full_slots);
+}
+
+/*
+ * remove the entry from the slot
+ */
+static inline void
+qfq_front_slot_remove(struct qfq_group *grp)
+{
+	struct qfq_class **h = &grp->slots[grp->front];
+
+	*h = (*h)->next;
+	if (!*h)
+		__clear_bit(0, &grp->full_slots);
+}
+
+/*
+ * Returns the first full queue in a group. As a side effect,
+ * adjust the bucket list so the first non-empty bucket is at
+ * position 0 in full_slots.
+ */
+static inline struct qfq_class *
+qfq_slot_scan(struct qfq_group *grp)
+{
+	int i;
+
+	ND("grp %d full %x", grp->index, grp->full_slots);
+	if (!grp->full_slots)
+		return NULL;
+
+	i = ffs(grp->full_slots) - 1; // zero-based
+	if (i > 0) {
+		grp->front = (grp->front + i) % QFQ_MAX_SLOTS;
+		grp->full_slots >>= i;
+	}
+
+	return grp->slots[grp->front];
+}
+
+/*
+ * adjust the bucket list. When the start time of a group decreases,
+ * we move the index down (modulo QFQ_MAX_SLOTS) so we don't need to
+ * move the objects. The mask of occupied slots must be shifted
+ * because we use ffs() to find the first non-empty slot.
+ * This covers decreases in the group's start time, but what about
+ * increases of the start time ?
+ * Here too we should make sure that i is less than 32
+ */
+static inline void
+qfq_slot_rotate(struct qfq_sched *q, struct qfq_group *grp, uint64_t roundedS)
+{
+	unsigned int i = (grp->S - roundedS) >> grp->slot_shift;
+
+	grp->full_slots <<= i;
+	grp->front = (grp->front - i) % QFQ_MAX_SLOTS;
+}
+
+
+static inline void
+qfq_update_eligible(struct qfq_sched *q, uint64_t old_V)
+{
+	bitmap ineligible;
+
+	ineligible = q->bitmaps[IR] | q->bitmaps[IB];
+	if (ineligible) {
+		if (!q->bitmaps[ER]) {
+			struct qfq_group *grp;
+			grp = qfq_ffs(q, ineligible);
+			if (qfq_gt(grp->S, q->V))
+				q->V = grp->S;
+		}
+		qfq_make_eligible(q, old_V);
+	}
+}
+
+/*
+ * Updates the class, returns true if also the group needs to be updated.
+ */
+static inline int
+qfq_update_class(struct qfq_sched *q, struct qfq_group *grp,
+	    struct qfq_class *cl)
+{
+
+	cl->S = cl->F;
+	if (cl->_q.mq.head == NULL)  {
+		qfq_front_slot_remove(grp);
+	} else {
+		unsigned int len;
+		uint64_t roundedS;
+
+		len = cl->_q.mq.head->m_pkthdr.len;
+		cl->F = cl->S + (uint64_t)len * cl->inv_w;
+		roundedS = qfq_round_down(cl->S, grp->slot_shift);
+		if (roundedS == grp->S)
+			return 0;
+
+		qfq_front_slot_remove(grp);
+		qfq_slot_insert(grp, cl, roundedS);
+	}
+	return 1;
+}
+
+static struct mbuf *
+qfq_dequeue(struct dn_sch_inst *si)
+{
+	struct qfq_sched *q = (struct qfq_sched *)(si + 1);
+	struct qfq_group *grp;
+	struct qfq_class *cl;
+	struct mbuf *m;
+	uint64_t old_V;
+
+	NO(q->loops++;)
+	if (!q->bitmaps[ER]) {
+		NO(if (q->queued)
+			dump_sched(q, "start dequeue");)
+		return NULL;
+	}
+
+	grp = qfq_ffs(q, q->bitmaps[ER]);
+
+	cl = grp->slots[grp->front];
+	/* extract from the first bucket in the bucket list */
+	m = dn_dequeue(&cl->_q);
+
+	if (!m) {
+		D("BUG/* non-workconserving leaf */");
+		return NULL;
+	}
+	NO(q->queued--;)
+	old_V = q->V;
+	q->V += (uint64_t)m->m_pkthdr.len * IWSUM;
+	ND("m is %p F 0x%llx V now 0x%llx", m, cl->F, q->V);
+
+	if (qfq_update_class(q, grp, cl)) {
+		uint64_t old_F = grp->F;
+		cl = qfq_slot_scan(grp);
+		if (!cl) { /* group gone, remove from ER */
+			__clear_bit(grp->index, &q->bitmaps[ER]);
+			// grp->S = grp->F + 1; // XXX debugging only
+		} else {
+			uint64_t roundedS = qfq_round_down(cl->S, grp->slot_shift);
+			unsigned int s;
+
+			if (grp->S == roundedS)
+				goto skip_unblock;
+			grp->S = roundedS;
+			grp->F = roundedS + (2ULL << grp->slot_shift);
+			/* remove from ER and put in the new set */
+			__clear_bit(grp->index, &q->bitmaps[ER]);
+			s = qfq_calc_state(q, grp);
+			__set_bit(grp->index, &q->bitmaps[s]);
+		}
+		/* we need to unblock even if the group has gone away */
+		qfq_unblock_groups(q, grp->index, old_F);
+	}
+
+skip_unblock:
+	qfq_update_eligible(q, old_V);
+	NO(if (!q->bitmaps[ER] && q->queued)
+		dump_sched(q, "end dequeue");)
+
+	return m;
+}
+
+/*
+ * Assign a reasonable start time for a new flow k in group i.
+ * Admissible values for \hat(F) are multiples of \sigma_i
+ * no greater than V+\sigma_i . Larger values mean that
+ * we had a wraparound so we consider the timestamp to be stale.
+ *
+ * If F is not stale and F >= V then we set S = F.
+ * Otherwise we should assign S = V, but this may violate
+ * the ordering in ER. So, if we have groups in ER, set S to
+ * the F_j of the first group j which would be blocking us.
+ * We are guaranteed not to move S backward because
+ * otherwise our group i would still be blocked.
+ */
+static inline void
+qfq_update_start(struct qfq_sched *q, struct qfq_class *cl)
+{
+	unsigned long mask;
+	uint32_t limit, roundedF;
+	int slot_shift = cl->grp->slot_shift;
+
+	roundedF = qfq_round_down(cl->F, slot_shift);
+	limit = qfq_round_down(q->V, slot_shift) + (1UL << slot_shift);
+
+	if (!qfq_gt(cl->F, q->V) || qfq_gt(roundedF, limit)) {
+		/* timestamp was stale */
+		mask = mask_from(q->bitmaps[ER], cl->grp->index);
+		if (mask) {
+			struct qfq_group *next = qfq_ffs(q, mask);
+			if (qfq_gt(roundedF, next->F)) {
+				cl->S = next->F;
+				return;
+			}
+		}
+		cl->S = q->V;
+	} else { /* timestamp is not stale */
+		cl->S = cl->F;
+	}
+}
+
+static int
+qfq_enqueue(struct dn_sch_inst *si, struct dn_queue *_q, struct mbuf *m)
+{
+	struct qfq_sched *q = (struct qfq_sched *)(si + 1);
+	struct qfq_group *grp;
+	struct qfq_class *cl = (struct qfq_class *)_q;
+	uint64_t roundedS;
+	int s;
+
+	NO(q->loops++;)
+	DX(4, "len %d flow %p inv_w 0x%x grp %d", m->m_pkthdr.len,
+		_q, cl->inv_w, cl->grp->index);
+	/* XXX verify that the packet obeys the parameters */
+	if (m != _q->mq.head) {
+		if (dn_enqueue(_q, m, 0)) /* packet was dropped */
+			return 1;
+		NO(q->queued++;)
+		if (m != _q->mq.head)
+			return 0;
+	}
+	/* If reach this point, queue q was idle */
+	grp = cl->grp;
+	qfq_update_start(q, cl); /* adjust start time */
+	/* compute new finish time and rounded start. */
+	cl->F = cl->S + (uint64_t)(m->m_pkthdr.len) * cl->inv_w;
+	roundedS = qfq_round_down(cl->S, grp->slot_shift);
+
+	/*
+	 * insert cl in the correct bucket.
+	 * If cl->S >= grp->S we don't need to adjust the
+	 * bucket list and simply go to the insertion phase.
+	 * Otherwise grp->S is decreasing, we must make room
+	 * in the bucket list, and also recompute the group state.
+	 * Finally, if there were no flows in this group and nobody
+	 * was in ER make sure to adjust V.
+	 */
+	if (grp->full_slots) {
+		if (!qfq_gt(grp->S, cl->S))
+			goto skip_update;
+		/* create a slot for this cl->S */
+		qfq_slot_rotate(q, grp, roundedS);
+		/* group was surely ineligible, remove */
+		__clear_bit(grp->index, &q->bitmaps[IR]);
+		__clear_bit(grp->index, &q->bitmaps[IB]);
+	} else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V))
+		q->V = roundedS;
+
+	grp->S = roundedS;
+	grp->F = roundedS + (2ULL << grp->slot_shift); // i.e. 2\sigma_i
+	s = qfq_calc_state(q, grp);
+	__set_bit(grp->index, &q->bitmaps[s]);
+	ND("new state %d 0x%x", s, q->bitmaps[s]);
+	ND("S %llx F %llx V %llx", cl->S, cl->F, q->V);
+skip_update:
+	qfq_slot_insert(grp, cl, roundedS);
+
+	return 0;
+}
+
+
+#if 0
+static inline void
+qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp,
+	struct qfq_class *cl, struct qfq_class **pprev)
+{
+	unsigned int i, offset;
+	uint64_t roundedS;
+
+	roundedS = qfq_round_down(cl->S, grp->slot_shift);
+	offset = (roundedS - grp->S) >> grp->slot_shift;
+	i = (grp->front + offset) % QFQ_MAX_SLOTS;
+
+#ifdef notyet
+	if (!pprev) {
+		pprev = &grp->slots[i];
+		while (*pprev && *pprev != cl)
+			pprev = &(*pprev)->next;
+	}
+#endif
+
+	*pprev = cl->next;
+	if (!grp->slots[i])
+		__clear_bit(offset, &grp->full_slots);
+}
+
+/*
+ * called to forcibly destroy a queue.
+ * If the queue is not in the front bucket, or if it has
+ * other queues in the front bucket, we can simply remove
+ * the queue with no other side effects.
+ * Otherwise we must propagate the event up.
+ * XXX description to be completed.
+ */
+static void
+qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl,
+				 struct qfq_class **pprev)
+{
+	struct qfq_group *grp = &q->groups[cl->index];
+	unsigned long mask;
+	uint64_t roundedS;
+	int s;
+
+	cl->F = cl->S;	// not needed if the class goes away.
+	qfq_slot_remove(q, grp, cl, pprev);
+
+	if (!grp->full_slots) {
+		/* nothing left in the group, remove from all sets.
+		 * Do ER last because if we were blocking other groups
+		 * we must unblock them.
+		 */
+		__clear_bit(grp->index, &q->bitmaps[IR]);
+		__clear_bit(grp->index, &q->bitmaps[EB]);
+		__clear_bit(grp->index, &q->bitmaps[IB]);
+
+		if (test_bit(grp->index, &q->bitmaps[ER]) &&
+		    !(q->bitmaps[ER] & ~((1UL << grp->index) - 1))) {
+			mask = q->bitmaps[ER] & ((1UL << grp->index) - 1);
+			if (mask)
+				mask = ~((1UL << __fls(mask)) - 1);
+			else
+				mask = ~0UL;
+			qfq_move_groups(q, mask, EB, ER);
+			qfq_move_groups(q, mask, IB, IR);
+		}
+		__clear_bit(grp->index, &q->bitmaps[ER]);
+	} else if (!grp->slots[grp->front]) {
+		cl = qfq_slot_scan(grp);
+		roundedS = qfq_round_down(cl->S, grp->slot_shift);
+		if (grp->S != roundedS) {
+			__clear_bit(grp->index, &q->bitmaps[ER]);
+			__clear_bit(grp->index, &q->bitmaps[IR]);
+			__clear_bit(grp->index, &q->bitmaps[EB]);
+			__clear_bit(grp->index, &q->bitmaps[IB]);
+			grp->S = roundedS;
+			grp->F = roundedS + (2ULL << grp->slot_shift);
+			s = qfq_calc_state(q, grp);
+			__set_bit(grp->index, &q->bitmaps[s]);
+		}
+	}
+	qfq_update_eligible(q, q->V);
+}
+#endif
+
+static int
+qfq_new_fsk(struct dn_fsk *f)
+{
+	ipdn_bound_var(&f->fs.par[0], 1, 1, QFQ_MAX_WEIGHT, "qfq weight");
+	ipdn_bound_var(&f->fs.par[1], 1500, 1, 2000, "qfq maxlen");
+	ND("weight %d len %d\n", f->fs.par[0], f->fs.par[1]);
+	return 0;
+}
+
+/*
+ * initialize a new scheduler instance
+ */
+static int
+qfq_new_sched(struct dn_sch_inst *si)
+{
+	struct qfq_sched *q = (struct qfq_sched *)(si + 1);
+	struct qfq_group *grp;
+	int i;
+
+	for (i = 0; i <= QFQ_MAX_INDEX; i++) {
+		grp = &q->groups[i];
+		grp->index = i;
+		grp->slot_shift = QFQ_MTU_SHIFT + FRAC_BITS -
+					(QFQ_MAX_INDEX - i);
+	}
+	return 0;
+}
+
+/*
+ * QFQ scheduler descriptor
+ */
+static struct dn_alg qfq_desc = {
+	_SI( .type = ) DN_SCHED_QFQ,
+	_SI( .name = ) "QFQ",
+	_SI( .flags = ) DN_MULTIQUEUE,
+
+	_SI( .schk_datalen = ) 0,
+	_SI( .si_datalen = ) sizeof(struct qfq_sched),
+	_SI( .q_datalen = ) sizeof(struct qfq_class) - sizeof(struct dn_queue),
+
+	_SI( .enqueue = ) qfq_enqueue,
+	_SI( .dequeue = ) qfq_dequeue,
+
+	_SI( .config = )  NULL,
+	_SI( .destroy = )  NULL,
+	_SI( .new_sched = ) qfq_new_sched,
+	_SI( .free_sched = )  NULL,
+	_SI( .new_fsk = ) qfq_new_fsk,
+	_SI( .free_fsk = )  NULL,
+	_SI( .new_queue = ) qfq_new_queue,
+	_SI( .free_queue = ) qfq_free_queue,
+};
+
+DECLARE_DNSCHED_MODULE(dn_qfq, &qfq_desc);
+
+#ifdef QFQ_DEBUG
+static void
+dump_groups(struct qfq_sched *q, uint32_t mask)
+{
+	int i, j;
+
+	for (i = 0; i < QFQ_MAX_INDEX + 1; i++) {
+		struct qfq_group *g = &q->groups[i];
+
+		if (0 == (mask & (1<<i)))
+			continue;
+		for (j = 0; j < QFQ_MAX_SLOTS; j++) {
+			if (g->slots[j])
+				D("    bucket %d %p", j, g->slots[j]);
+		}
+		D("full_slots 0x%x", g->full_slots);
+		D("        %2d S 0x%20llx F 0x%llx %c", i,
+			g->S, g->F,
+			mask & (1<<i) ? '1' : '0');
+	}
+}
+
+static void
+dump_sched(struct qfq_sched *q, const char *msg)
+{
+	D("--- in %s: ---", msg);
+	ND("loops %d queued %d V 0x%llx", q->loops, q->queued, q->V);
+	D("    ER 0x%08x", q->bitmaps[ER]);
+	D("    EB 0x%08x", q->bitmaps[EB]);
+	D("    IR 0x%08x", q->bitmaps[IR]);
+	D("    IB 0x%08x", q->bitmaps[IB]);
+	dump_groups(q, 0xffffffff);
+};
+#endif /* QFQ_DEBUG */
diff --git a/sys/netinet/ipfw/dn_sched_rr.c b/sys/netinet/ipfw/dn_sched_rr.c
new file mode 100644
index 0000000..fc7be00
--- /dev/null
+++ b/sys/netinet/ipfw/dn_sched_rr.c
@@ -0,0 +1,307 @@
+/*
+ * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ */
+
+#ifdef _KERNEL
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/kernel.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <net/if.h>	/* IFNAMSIZ */
+#include <netinet/in.h>
+#include <netinet/ip_var.h>		/* ipfw_rule_ref */
+#include <netinet/ip_fw.h>	/* flow_id */
+#include <netinet/ip_dummynet.h>
+#include <netinet/ipfw/dn_heap.h>
+#include <netinet/ipfw/ip_dn_private.h>
+#include <netinet/ipfw/dn_sched.h>
+#else
+#include <dn_test.h>
+#endif
+
+#define DN_SCHED_RR	3 // XXX Where?
+
+struct rr_queue {
+	struct dn_queue q;		/* Standard queue */
+	int status;			/* 1: queue is in the list */
+	int credit;			/* Number of bytes to transmit */
+	int quantum;			/* quantum * C */
+	struct rr_queue *qnext;		/* */
+};
+
+/* struct rr_schk contains global config parameters
+ * and is right after dn_schk
+ */
+struct rr_schk {
+	int min_q;		/* Min quantum */
+	int max_q;		/* Max quantum */
+	int q_bytes;		/* Bytes per quantum */
+};
+
+/* per-instance round robin list, right after dn_sch_inst */
+struct rr_si {
+	struct rr_queue *head, *tail;	/* Pointer to current queue */
+};
+
+/* Append a queue to the rr list */
+static inline void
+rr_append(struct rr_queue *q, struct rr_si *si)
+{
+	q->status = 1;		/* mark as in-rr_list */
+	q->credit = q->quantum;	/* initialize credit */
+
+	/* append to the tail */
+	if (si->head == NULL)
+		si->head = q;
+	else
+		si->tail->qnext = q;
+	si->tail = q;		/* advance the tail pointer */
+	q->qnext = si->head;	/* make it circular */
+}
+
+/* Remove the head queue from circular list. */
+static inline void
+rr_remove_head(struct rr_si *si)
+{
+	if (si->head == NULL)
+		return; /* empty queue */
+	si->head->status = 0;
+	
+	if (si->head == si->tail) {
+		si->head = si->tail = NULL;
+		return;
+	}
+
+	si->head = si->head->qnext;
+	si->tail->qnext = si->head;
+}
+
+/* Remove a queue from circular list.
+ * XXX see if ti can be merge with remove_queue()
+ */
+static inline void
+remove_queue_q(struct rr_queue *q, struct rr_si *si)
+{
+	struct rr_queue *prev;
+
+	if (q->status != 1)
+		return;
+	if (q == si->head) {
+		rr_remove_head(si);
+		return;
+	}
+
+	for (prev = si->head; prev; prev = prev->qnext) {
+		if (prev->qnext != q)
+			continue;
+		prev->qnext = q->qnext;
+		if (q == si->tail)
+			si->tail = prev;
+		q->status = 0;
+		break;
+	}
+}
+
+
+static inline void
+next_pointer(struct rr_si *si)
+{
+	if (si->head == NULL)
+		return; /* empty queue */
+
+	si->head = si->head->qnext;
+	si->tail = si->tail->qnext;
+}
+
+static int 
+rr_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m)
+{
+	struct rr_si *si;
+	struct rr_queue *rrq;
+
+	if (m != q->mq.head) {
+		if (dn_enqueue(q, m, 0)) /* packet was dropped */
+			return 1;
+		if (m != q->mq.head)
+			return 0;
+	}
+
+	/* If reach this point, queue q was idle */ 
+	si = (struct rr_si *)(_si + 1);
+	rrq = (struct rr_queue *)q;
+
+	if (rrq->status == 1) /* Queue is already in the queue list */
+		return 0;
+
+	/* Insert the queue in the queue list */
+	rr_append(rrq, si);
+
+	return 0;
+}
+
+static struct mbuf *
+rr_dequeue(struct dn_sch_inst *_si)
+{
+	/* Access scheduler instance private data */
+	struct rr_si *si = (struct rr_si *)(_si + 1);
+	struct rr_queue *rrq;
+	uint64_t len;
+
+	while ( (rrq = si->head) ) {
+		struct mbuf *m = rrq->q.mq.head;
+		if ( m == NULL) {
+			/* empty queue, remove from list */
+			rr_remove_head(si);
+			continue;
+		}
+		len = m->m_pkthdr.len;
+
+		if (len > rrq->credit) {
+			/* Packet too big */
+			rrq->credit += rrq->quantum;
+			/* Try next queue */
+			next_pointer(si);
+		} else {
+			rrq->credit -= len;
+			return dn_dequeue(&rrq->q);
+		}
+	}
+
+	/* no packet to dequeue*/
+	return NULL;
+}
+
+static int
+rr_config(struct dn_schk *_schk)
+{
+	struct rr_schk *schk = (struct rr_schk *)(_schk + 1);
+	ND("called");
+
+	/* use reasonable quantums (64..2k bytes, default 1500) */
+	schk->min_q = 64;
+	schk->max_q = 2048;
+	schk->q_bytes = 1500;	/* quantum */
+
+	return 0;
+}
+
+static int
+rr_new_sched(struct dn_sch_inst *_si)
+{
+	struct rr_si *si = (struct rr_si *)(_si + 1);
+
+	ND("called");
+	si->head = si->tail = NULL;
+
+	return 0;
+}
+
+static int
+rr_free_sched(struct dn_sch_inst *_si)
+{
+	ND("called");
+	/* Nothing to do? */
+	return 0;
+}
+
+static int
+rr_new_fsk(struct dn_fsk *fs)
+{
+	struct rr_schk *schk = (struct rr_schk *)(fs->sched + 1);
+	/* par[0] is the weight, par[1] is the quantum step */
+	ipdn_bound_var(&fs->fs.par[0], 1,
+		1, 65536, "RR weight");
+	ipdn_bound_var(&fs->fs.par[1], schk->q_bytes,
+		schk->min_q, schk->max_q, "RR quantum");
+	return 0;
+}
+
+static int
+rr_new_queue(struct dn_queue *_q)
+{
+	struct rr_queue *q = (struct rr_queue *)_q;
+
+	_q->ni.oid.subtype = DN_SCHED_RR;
+
+	q->quantum = _q->fs->fs.par[0] * _q->fs->fs.par[1];
+	ND("called, q->quantum %d", q->quantum);
+	q->credit = q->quantum;
+	q->status = 0;
+
+	if (_q->mq.head != NULL) {
+		/* Queue NOT empty, insert in the queue list */
+		rr_append(q, (struct rr_si *)(_q->_si + 1));
+	}
+	return 0;
+}
+
+static int
+rr_free_queue(struct dn_queue *_q)
+{
+	struct rr_queue *q = (struct rr_queue *)_q;
+
+	ND("called");
+	if (q->status == 1) {
+		struct rr_si *si = (struct rr_si *)(_q->_si + 1);
+		remove_queue_q(q, si);
+	}
+	return 0;
+}
+
+/*
+ * RR scheduler descriptor
+ * contains the type of the scheduler, the name, the size of the
+ * structures and function pointers.
+ */
+static struct dn_alg rr_desc = {
+	_SI( .type = ) DN_SCHED_RR,
+	_SI( .name = ) "RR",
+	_SI( .flags = ) DN_MULTIQUEUE,
+
+	_SI( .schk_datalen = ) 0,
+	_SI( .si_datalen = ) sizeof(struct rr_si),
+	_SI( .q_datalen = ) sizeof(struct rr_queue) - sizeof(struct dn_queue),
+
+	_SI( .enqueue = ) rr_enqueue,
+	_SI( .dequeue = ) rr_dequeue,
+
+	_SI( .config = ) rr_config,
+	_SI( .destroy = ) NULL,
+	_SI( .new_sched = ) rr_new_sched,
+	_SI( .free_sched = ) rr_free_sched,
+	_SI( .new_fsk = ) rr_new_fsk,
+	_SI( .free_fsk = ) NULL,
+	_SI( .new_queue = ) rr_new_queue,
+	_SI( .free_queue = ) rr_free_queue,
+};
+
+
+DECLARE_DNSCHED_MODULE(dn_rr, &rr_desc);
diff --git a/sys/netinet/ipfw/dn_sched_wf2q.c b/sys/netinet/ipfw/dn_sched_wf2q.c
new file mode 100644
index 0000000..1fbc120
--- /dev/null
+++ b/sys/netinet/ipfw/dn_sched_wf2q.c
@@ -0,0 +1,373 @@
+/*
+ * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
+ * Copyright (c) 2000-2002 Luigi Rizzo, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ */
+
+#ifdef _KERNEL
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/kernel.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <net/if.h>	/* IFNAMSIZ */
+#include <netinet/in.h>
+#include <netinet/ip_var.h>		/* ipfw_rule_ref */
+#include <netinet/ip_fw.h>	/* flow_id */
+#include <netinet/ip_dummynet.h>
+#include <netinet/ipfw/dn_heap.h>
+#include <netinet/ipfw/ip_dn_private.h>
+#include <netinet/ipfw/dn_sched.h>
+#else
+#include <dn_test.h>
+#endif
+
+#ifndef MAX64
+#define MAX64(x,y)  (( (int64_t) ( (y)-(x) )) > 0 ) ? (y) : (x)
+#endif
+
+/*
+ * timestamps are computed on 64 bit using fixed point arithmetic.
+ * LMAX_BITS, WMAX_BITS are the max number of bits for the packet len
+ * and sum of weights, respectively. FRAC_BITS is the number of
+ * fractional bits. We want FRAC_BITS >> WMAX_BITS to avoid too large
+ * errors when computing the inverse, FRAC_BITS < 32 so we can do 1/w
+ * using an unsigned 32-bit division, and to avoid wraparounds we need
+ * LMAX_BITS + WMAX_BITS + FRAC_BITS << 64
+ * As an example
+ * FRAC_BITS = 26, LMAX_BITS=14, WMAX_BITS = 19
+ */
+#ifndef FRAC_BITS
+#define FRAC_BITS    28 /* shift for fixed point arithmetic */
+#define	ONE_FP	(1UL << FRAC_BITS)
+#endif
+
+/*
+ * Private information for the scheduler instance:
+ * sch_heap (key is Finish time) returns the next queue to serve
+ * ne_heap (key is Start time) stores not-eligible queues
+ * idle_heap (key=start/finish time) stores idle flows. It must
+ *	support extract-from-middle.
+ * A flow is only in 1 of the three heaps.
+ * XXX todo: use a more efficient data structure, e.g. a tree sorted
+ * by F with min_subtree(S) in each node
+ */
+struct wf2qp_si {
+    struct dn_heap sch_heap;	/* top extract - key Finish  time */
+    struct dn_heap ne_heap;	/* top extract - key Start   time */
+    struct dn_heap idle_heap;	/* random extract - key Start=Finish time */
+    uint64_t V;			/* virtual time */
+    uint32_t inv_wsum;		/* inverse of sum of weights */
+    uint32_t wsum;		/* sum of weights */
+};
+
+struct wf2qp_queue {
+    struct dn_queue _q;
+    uint64_t S, F;		/* start time, finish time */
+    uint32_t inv_w;		/* ONE_FP / weight */
+    int32_t heap_pos;		/* position (index) of struct in heap */
+};
+
+/*
+ * This file implements a WF2Q+ scheduler as it has been in dummynet
+ * since 2000.
+ * The scheduler supports per-flow queues and has O(log N) complexity.
+ *
+ * WF2Q+ needs to drain entries from the idle heap so that we
+ * can keep the sum of weights up to date. We can do it whenever
+ * we get a chance, or periodically, or following some other
+ * strategy. The function idle_check() drains at most N elements
+ * from the idle heap.
+ */
+static void
+idle_check(struct wf2qp_si *si, int n, int force)
+{
+    struct dn_heap *h = &si->idle_heap;
+    while (n-- > 0 && h->elements > 0 &&
+		(force || DN_KEY_LT(HEAP_TOP(h)->key, si->V))) {
+	struct dn_queue *q = HEAP_TOP(h)->object;
+        struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q;
+
+        heap_extract(h, NULL);
+        /* XXX to let the flowset delete the queue we should
+	 * mark it as 'unused' by the scheduler.
+	 */
+        alg_fq->S = alg_fq->F + 1; /* Mark timestamp as invalid. */
+        si->wsum -= q->fs->fs.par[0];	/* adjust sum of weights */
+	if (si->wsum > 0)
+		si->inv_wsum = ONE_FP/si->wsum;
+    }
+}
+
+static int 
+wf2qp_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m)
+{
+    struct dn_fsk *fs = q->fs;
+    struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
+    struct wf2qp_queue *alg_fq;
+    uint64_t len = m->m_pkthdr.len;
+
+    if (m != q->mq.head) {
+	if (dn_enqueue(q, m, 0)) /* packet was dropped */
+	    return 1;
+	if (m != q->mq.head)	/* queue was already busy */
+	    return 0;
+    }
+
+    /* If reach this point, queue q was idle */ 
+    alg_fq = (struct wf2qp_queue *)q;
+
+    if (DN_KEY_LT(alg_fq->F, alg_fq->S)) {
+        /* F<S means timestamps are invalid ->brand new queue. */
+        alg_fq->S = si->V;		/* init start time */
+        si->wsum += fs->fs.par[0];	/* add weight of new queue. */
+	si->inv_wsum = ONE_FP/si->wsum;
+    } else { /* if it was idle then it was in the idle heap */
+        heap_extract(&si->idle_heap, q);
+        alg_fq->S = MAX64(alg_fq->F, si->V);	/* compute new S */
+    }
+    alg_fq->F = alg_fq->S + len * alg_fq->inv_w;
+
+    /* if nothing is backlogged, make sure this flow is eligible */
+    if (si->ne_heap.elements == 0 && si->sch_heap.elements == 0)
+        si->V = MAX64(alg_fq->S, si->V);
+
+    /*
+     * Look at eligibility. A flow is not eligibile if S>V (when
+     * this happens, it means that there is some other flow already
+     * scheduled for the same pipe, so the sch_heap cannot be
+     * empty). If the flow is not eligible we just store it in the
+     * ne_heap. Otherwise, we store in the sch_heap.
+     * Note that for all flows in sch_heap (SCH), S_i <= V,
+     * and for all flows in ne_heap (NEH), S_i > V.
+     * So when we need to compute max(V, min(S_i)) forall i in
+     * SCH+NEH, we only need to look into NEH.
+     */
+    if (DN_KEY_LT(si->V, alg_fq->S)) {
+        /* S>V means flow Not eligible. */
+        if (si->sch_heap.elements == 0)
+            D("++ ouch! not eligible but empty scheduler!");
+        heap_insert(&si->ne_heap, alg_fq->S, q);
+    } else {
+        heap_insert(&si->sch_heap, alg_fq->F, q);
+    }
+    return 0;
+}
+
+/* XXX invariant: sch > 0 || V >= min(S in neh) */
+static struct mbuf *
+wf2qp_dequeue(struct dn_sch_inst *_si)
+{
+	/* Access scheduler instance private data */
+	struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
+	struct mbuf *m;
+	struct dn_queue *q;
+	struct dn_heap *sch = &si->sch_heap;
+	struct dn_heap *neh = &si->ne_heap;
+	struct wf2qp_queue *alg_fq;
+
+	if (sch->elements == 0 && neh->elements == 0) {
+		/* we have nothing to do. We could kill the idle heap
+		 * altogether and reset V
+		 */
+		idle_check(si, 0x7fffffff, 1);
+		si->V = 0;
+		si->wsum = 0;	/* should be set already */
+		return NULL;	/* quick return if nothing to do */
+	}
+	idle_check(si, 1, 0);	/* drain something from the idle heap */
+
+	/* make sure at least one element is eligible, bumping V
+	 * and moving entries that have become eligible.
+	 * We need to repeat the first part twice, before and
+	 * after extracting the candidate, or enqueue() will
+	 * find the data structure in a wrong state.
+	 */
+  m = NULL;
+  for(;;) {
+	/*
+	 * Compute V = max(V, min(S_i)). Remember that all elements
+	 * in sch have by definition S_i <= V so if sch is not empty,
+	 * V is surely the max and we must not update it. Conversely,
+	 * if sch is empty we only need to look at neh.
+	 * We don't need to move the queues, as it will be done at the
+	 * next enqueue
+	 */
+	if (sch->elements == 0 && neh->elements > 0) {
+		si->V = MAX64(si->V, HEAP_TOP(neh)->key);
+	}
+	while (neh->elements > 0 &&
+		    DN_KEY_LEQ(HEAP_TOP(neh)->key, si->V)) {
+		q = HEAP_TOP(neh)->object;
+		alg_fq = (struct wf2qp_queue *)q;
+		heap_extract(neh, NULL);
+		heap_insert(sch, alg_fq->F, q);
+	}
+	if (m) /* pkt found in previous iteration */
+		break;
+	/* ok we have at least one eligible pkt */
+	q = HEAP_TOP(sch)->object;
+	alg_fq = (struct wf2qp_queue *)q;
+	m = dn_dequeue(q);
+	heap_extract(sch, NULL); /* Remove queue from heap. */
+	si->V += (uint64_t)(m->m_pkthdr.len) * si->inv_wsum;
+	alg_fq->S = alg_fq->F;  /* Update start time. */
+	if (q->mq.head == 0) {	/* not backlogged any more. */
+		heap_insert(&si->idle_heap, alg_fq->F, q);
+	} else {			/* Still backlogged. */
+		/* Update F, store in neh or sch */
+		uint64_t len = q->mq.head->m_pkthdr.len;
+		alg_fq->F += len * alg_fq->inv_w;
+		if (DN_KEY_LEQ(alg_fq->S, si->V)) {
+			heap_insert(sch, alg_fq->F, q);
+		} else {
+			heap_insert(neh, alg_fq->S, q);
+		}
+	}
+    }
+	return m;
+}
+
+static int
+wf2qp_new_sched(struct dn_sch_inst *_si)
+{
+	struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
+	int ofs = offsetof(struct wf2qp_queue, heap_pos);
+
+	/* all heaps support extract from middle */
+	if (heap_init(&si->idle_heap, 16, ofs) ||
+	    heap_init(&si->sch_heap, 16, ofs) ||
+	    heap_init(&si->ne_heap, 16, ofs)) {
+		heap_free(&si->ne_heap);
+		heap_free(&si->sch_heap);
+		heap_free(&si->idle_heap);
+		return ENOMEM;
+	}
+	return 0;
+}
+
+static int
+wf2qp_free_sched(struct dn_sch_inst *_si)
+{
+	struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
+
+	heap_free(&si->sch_heap);
+	heap_free(&si->ne_heap);
+	heap_free(&si->idle_heap);
+
+	return 0;
+}
+
+static int
+wf2qp_new_fsk(struct dn_fsk *fs)
+{
+	ipdn_bound_var(&fs->fs.par[0], 1,
+		1, 100, "WF2Q+ weight");
+	return 0;
+}
+
+static int
+wf2qp_new_queue(struct dn_queue *_q)
+{
+	struct wf2qp_queue *q = (struct wf2qp_queue *)_q;
+
+	_q->ni.oid.subtype = DN_SCHED_WF2QP;
+	q->F = 0;	/* not strictly necessary */
+	q->S = q->F + 1;    /* mark timestamp as invalid. */
+        q->inv_w = ONE_FP / _q->fs->fs.par[0];
+	if (_q->mq.head != NULL) {
+		wf2qp_enqueue(_q->_si, _q, _q->mq.head);
+	}
+	return 0;
+}
+
+/*
+ * Called when the infrastructure removes a queue (e.g. flowset
+ * is reconfigured). Nothing to do if we did not 'own' the queue,
+ * otherwise remove it from the right heap and adjust the sum
+ * of weights.
+ */
+static int
+wf2qp_free_queue(struct dn_queue *q)
+{
+	struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q;
+	struct wf2qp_si *si = (struct wf2qp_si *)(q->_si + 1);
+    
+	if (alg_fq->S >= alg_fq->F + 1)
+		return 0;	/* nothing to do, not in any heap */
+	si->wsum -= q->fs->fs.par[0];
+	if (si->wsum > 0)
+		si->inv_wsum = ONE_FP/si->wsum;
+
+	/* extract from the heap. XXX TODO we may need to adjust V
+	 * to make sure the invariants hold.
+	 */
+	if (q->mq.head == NULL) {
+		heap_extract(&si->idle_heap, q);
+	} else if (DN_KEY_LT(si->V, alg_fq->S)) {
+		heap_extract(&si->ne_heap, q);
+	} else {
+		heap_extract(&si->sch_heap, q);
+	}
+	return 0;
+}
+
+/*
+ * WF2Q+ scheduler descriptor
+ * contains the type of the scheduler, the name, the size of the
+ * structures and function pointers.
+ */
+static struct dn_alg wf2qp_desc = {
+	_SI( .type = ) DN_SCHED_WF2QP,
+	_SI( .name = ) "WF2Q+",
+	_SI( .flags = ) DN_MULTIQUEUE,
+
+	/* we need extra space in the si and the queue */
+	_SI( .schk_datalen = ) 0,
+	_SI( .si_datalen = ) sizeof(struct wf2qp_si),
+	_SI( .q_datalen = ) sizeof(struct wf2qp_queue) -
+				sizeof(struct dn_queue),
+
+	_SI( .enqueue = ) wf2qp_enqueue,
+	_SI( .dequeue = ) wf2qp_dequeue,
+
+	_SI( .config = )  NULL,
+	_SI( .destroy = )  NULL,
+	_SI( .new_sched = ) wf2qp_new_sched,
+	_SI( .free_sched = ) wf2qp_free_sched,
+    
+	_SI( .new_fsk = ) wf2qp_new_fsk,
+	_SI( .free_fsk = )  NULL,
+
+	_SI( .new_queue = ) wf2qp_new_queue,
+	_SI( .free_queue = ) wf2qp_free_queue,
+};
+
+
+DECLARE_DNSCHED_MODULE(dn_wf2qp, &wf2qp_desc);
diff --git a/sys/netinet/ipfw/dummynet.txt b/sys/netinet/ipfw/dummynet.txt
new file mode 100644
index 0000000..0ed6ad1
--- /dev/null
+++ b/sys/netinet/ipfw/dummynet.txt
@@ -0,0 +1,860 @@
+#
+# $FreeBSD$
+#
+
+Notes on the internal structure of dummynet (2010 version)
+by Riccardo Panicucci and Luigi Rizzo
+Work supported by the EC project ONELAB2
+
+
+*********
+* INDEX *
+*********
+Implementation of new dummynet
+    Internal structure
+    Files
+Packet arrival
+    The reconfiguration routine
+dummynet_task()
+Configuration
+    Add a pipe
+    Add a scheduler
+    Add a flowset
+Listing object
+Delete of object
+    Delete a pipe
+    Delete a flowset
+    Delete a scheduler
+Compatibility with FreeBSD7.2 and FreeBSD 8 ipfw binary
+    ip_dummynet_glue.c
+    ip_fw_glue.c
+How to configure dummynet
+How to implement a new scheduler
+
+
+
+OPEN ISSUES
+------------------------------
+20100131 deleting RR causes infinite loop
+	presumably in the rr_free_queue() call -- seems to hang
+	forever when deleting a live flow
+------------------------------
+
+Dummynet is a traffic shaper and network emulator. Packets are
+selected by an external filter such as ipfw, and passed to the emulator
+with a tag such as "pipe 10" or "queue 5" which tells what to
+do with the packet. As an example
+
+	ipfw add queue 5 icmp from 10.0.0.2 to all
+
+All packets with the same tag belong to a "flowset", or a set
+of flows which can be further partitioned according to a mask.
+Flowsets are then passed to a scheduler for processing. The
+association of flowsets and schedulers is configurable e.g.
+
+	ipfw queue 5 config sched 10 weight 3 flow_mask xxxx
+	ipfw queue 8 config sched 10 weight 1 ...
+	ipfw queue 3 config sched 20 weight 1 ...
+
+"sched 10" represents one or more scheduler instances,
+selected through a mask on the 5-tuple itself.
+
+	ipfw sched 20 config type FIFO sched_mask yyy ...
+
+There are in fact two masks applied to each packet:
++ the "sched_mask" sends packets arriving to a scheduler_id to
+  one of many instances.
++ the "flow_mask" together with the flowset_id is used to
+  collect packets into independent flows on each scheduler.
+
+As an example, we can have
+	ipfw queue 5 config sched 10 flow_mask src-ip 0x000000ff
+	ipfw sched 10 config type WF2Q+ sched_mask src-ip 0xffffff00
+
+means that sched 10 will have one instance per /24 source subnet,
+and within that, each individual source will be a flow.
+	
+Internal structure
+-----------------
+Dummynet-related data is split into several data structures,
+part of them constituting the userland-kernel API, and others
+specific to the kernel.
+NOTE: for up-to-date details please look at the relevant source
+	headers (ip_dummynet.h, ip_dn_private.h, dn_sched.h)
+
+USERLAND-KERNEL API	(ip_dummynet.h)
+
+    struct dn_link:
+	contains data about the physical link such as
+	bandwith, delay, burst size;
+
+    struct dn_fs:
+	describes a flowset, i.e. a template for queues.
+	Main parameters are the scheduler we attach to, a flow_mask,
+	buckets, queue size, plr, weight, and other scheduler-specific
+	parameters.
+
+    struct dn_flow
+	contains information on a flow, including masks and
+	statistics
+
+    struct dn_sch:
+	defines a scheduler (and a link attached to it).
+	Parameters include scheduler type, sched_mask, number of
+	buckets, and possibly other scheduler-specific parameters,
+
+    struct dn_profile:
+	fields to simulate a delay profile
+
+
+KERNEL REPRESENTATION	(ip_dn_private.h)
+
+    struct mq
+	a queue of mbufs with head and tail.
+
+    struct dn_queue
+	individual queue of packets, created by a flowset using
+	flow_mask and attached to a scheduler instance selected
+	through sched_mask.
+	A dn_queue has a pointer to the dn_fsk (which in turn counts
+	how many queues point to it), a pointer to the
+	dn_sch_inst it attaches to, and is in a hash table in the
+	flowset. scheduler instances also should store queues in
+	their own containers used for scheduling (lists, trees, etc.)
+	CREATE: done on packet arrivals when a flow matches a flowset.
+	DELETE: done only when deleting the parent dn_sch_inst
+		or draining memory.
+
+    struct dn_fsk
+	includes a dn_fs; a pointer to the dn_schk; a link field
+	for the list of dn_fsk attached to the same scheduler,
+	or for the unlinked list;
+	a refcount for the number of queues pointing to it;
+	The dn_fsk is in a hash table, fshash.
+	CREATE: done on configuration commands.
+	DELETE: on configuration commands.
+
+    struct dn_sch_inst
+	a scheduler instance, created from a dn_schk applying sched_mask.
+	Contains a delay line, a reference to the parent, and scheduler-
+	specific info.  Both dn_sch_inst and its delay line can be in the
+	evheap if they have events to be processed.
+	CREATE: created from a dn_schk applying sched_mask
+	DELETE: configuration command delete a scheduler which in turn
+		sweeps the hash table of instances deleting them
+
+    struct dn_schk
+	includes dn_sch, dn_link, a pointer to dn_profile,
+	a hash table of dn_sch_inst, a list of dn_fsk
+	attached to it.
+	CREATE: configuration command. If there are flowsets that
+		refer to this number, they are attached and moved
+		to the hash table
+	DELETE: manual, see dn_sch_inst
+
+
+	fshash                            schedhash
+      +---------------+   sched        +--------------+
+      |      sched-------------------->|      NEW_SCHK|
+  -<----*sch_chain    |<-----------------*fsk_list    |
+      |NEW_FSK        |<----.          | [dn_link]    |
+      +---------------+     |          +--------------+
+      |qht (hash)     |     |          |  siht(hash)  |
+      |   [dn_queue]  |     |          |  [dn_si]     |
+      |   [dn_queue]  |     |          |  [dn_si]     |
+      |     ...       |     |          |   ...        |
+      |   +--------+  |     |          | +---------+  |
+      |   |dn_queue|  |     |          | |dn_si    |  |
+      |  |    fs *----------'          | |         |  |
+      |  |    si *---------------------->|         |  |
+      |  +---------+  |                | +---------+  |
+      +---------------+                +--------------+
+
+The following global data structures contain all
+schedulers and flowsets.
+
+- schedhash[x]: contains all scheduler templates in the system.
+	Looked up only on manual configurations, where flowsets
+	are attached to matching schedulers.
+	We have one entry per 'sched X config' command
+	(plus one for each 'pipe X config').
+
+- fshash[x]: contains all flowsets.
+	We do a lookup on this for each packet.
+	We have one entry for each 'queue X config'
+	(plus one for each 'pipe X config').
+
+Additionally, a list that contains all unlinked flowset:
+- fsu:  contains flowset that are not linked with any scheduler.
+	Flowset are put in this list when they refer to a non
+	existing scheduler.
+	We don't need an efficient data structure as we never search
+	here on a packet arrivals.
+
+Scheduler instances and the delay lines associated with each scheduler
+instance need to be woken up at certain times. Because we have many
+such objects, we keep them in a priority heap (system_heap).
+
+Almost all objects in this implementation are preceded by a structure
+(struct dn_id) which makes it easier to identify them.
+
+
+Files
+-----
+The dummynet code is split in several files.
+All kernel code is in sys/netinet/ipfw except ip_dummynet.h
+All userland code is in sbin/ipfw.
+Files are
+- sys/netinet/ip_dummynet.h defines the kernel-userland API
+- ip_dn_private.h contains the kernel-specific APIs
+  and data structures
+- dn_sched.h defines the scheduler API
+- ip_dummynet.c cointains module glue and sockopt handlers, with all
+  functions to configure and list objects.
+- ip_dn_io.c contains the functions directly related to packet processing,
+  and run in the critical path. It also contains some functions
+  exported to the schedulers.
+- dn_heap.[ch] implement a binary heap and a generic hash table
+- dn_sched_* implement the various scheduler modules
+  
+- dummynet.c is the file used to implement the user side of dummynet.
+  It contains the function to parsing command line, and functions to
+  show the output of dummynet objects.
+Moreover, there are two new file (ip_dummynet_glue.c and ip_fw_glue.c) that
+are used to allow compatibility with the "ipfw" binary from FreeBSD 7.2 and
+FreeBSD 8.
+
+LOCKING
+=======
+At the moment the entire processing occurs under a single lock
+which is expected to be acquired in exclusive mode
+DN_BH_WLOCK() / DN_BH_WUNLOCK().
+
+In perspective we aim at the following:
+- the 'busy' flag, 'pending' list and all structures modified by packet
+  arrivals and departures are protected by the BH_WLOCK.
+  This is normally acquired in exclusive mode by the packet processing
+  functions for short sections of code (exception -- the timer).
+  If 'busy' is not set, we can do regular packet processing.
+  If 'busy' is set, no pieces can be accessed.
+  We must enqueue the packet on 'pending' and return immediately.
+
+- the 'busy' flag is set/cleared by long sections of code as follows:
+	UH_WLOCK(); KASSERT(busy == 0);
+	BH_WLOCK(); busy=1; BH_WUNLOCK();
+	... do processing ...
+	BH_WLOCK(); busy=0; drain_queue(pending); BH_WUNLOCK();
+	UH_WUNLOCK();
+  this normally happens when the upper half has something heavy
+  to do. The prologue and epilogue are not in the critical path.
+
+- the main containers (fshash, schedhash, ...) are protected by
+  UH_WLOCK.
+  
+Packet processing
+=================
+A packet enters dummynet through dummynet_io(). We first lookup
+the flowset number in fshash using dn_ht_find(), then find the scheduler
+instance using ipdn_si_find(), then possibly identify the correct
+queue with ipdn_q_find().
+If successful, we call the scheduler's enqueue function(), and
+if needed start I/O on the link calling serve_sched().
+If the packet can be returned immediately, this is done by
+leaving *m0 set. Otherwise, the packet is absorbed by dummynet
+and we simply return, possibly with some appropriate error code.
+
+Reconfiguration
+---------------
+Reconfiguration is the complex part of the system because we need to
+keep track of the various objects and containers.
+At the moment we do not use reference counts for objects so all
+processing must be done under a lock.
+
+The main entry points for configuration is the ip_dn_ctl() handler
+for the IP_DUMMYNET3 sockopt (others are provided only for backward
+compatibility). Modifications to the configuration call do_config().
+The argument is a sequence of blocks each starting with a  struct dn_id
+which specifies its content.
+The first dn_id must contain as obj.id the DN_API_VERSION
+The obj.type is DN_CMD_CONFIG (followed by actual objects),
+DN_CMD_DELETE (with the correct subtype and list of objects), or
+DN_CMD_FLUSH.
+
+DN_CMD_CONFIG is followed by objects to add/reconfigure. In general,
+if an object already exists it is reconfigured, otherwise it is
+created in a way that keeps the structure consistent.
+We have the following objects in the system, normally numbered with
+an identifier N between 1 and 65535. For certain objects we have
+"shadow" copies numbered I+NMAX and I+ 2*NMAX which are used to
+implement certain backward compatibility features.
+
+In general we have the following linking
+
+  TRADITIONAL DUMMYNET QUEUES "queue N config ... pipe M ..."
+	corresponds to a dn_fs object numbered N
+
+  TRADITIONAL DUMMYNET PIPES "pipe N config ..."
+	dn_fs N+2*NMAX --> dn_sch N+NMAX type FIFO --> dn_link N+NMAX
+
+  GENERIC SCHEDULER "sched N config ... "
+	[dn_fs N+NMAX] --> dn_sch N --> dn_link N
+	The flowset N+NMAX is created only if the scheduler is not
+	of type MULTIQUEUE.
+
+  DELAY PROFILE	"pipe N config profile ..."
+	it is always attached to an existing dn_link N
+
+Because traditional dummynet pipes actually configure both a
+'standalone' instance and one that can be used by queues,
+we do the following:
+
+    "pipe N config ..." configures:
+	dn_sched N type WF2Q+
+	dn_sched N+NMAX type FIFO
+	dn_fs N+2NMAX attached to dn_sched N+NMAX
+	dn_pipe N
+	dn_pipe N+NMAX
+
+    "queue N config" configures
+	dn_fs N
+
+    "sched N config" configures
+	dn_sched N type as desired
+	dn_fs N+NMAX attached to dn_sched N
+
+
+dummynet_task()
+===============
+The dummynet_task() is the the main dummynet processing function and is
+called every tick. This function first calculate the new current time, then
+it checks if it is the time to wake up object from the system_heap comparing
+the current time and the key of the heap. Two types of object (really the
+heap contains pointer to objects) are in the
+system_heap:
+
+- scheduler instance: if a scheduler instance is waked up, the dequeue()
+  function is called until it has credit. If the dequeue() returns packets,
+  the scheduler instance is inserted in the heap with a new key depending of
+  the data that will be send out. If the scheduler instance remains with
+  some credit, it means that is hasn't other packet to send and so the
+  instance is no longer inserted in the heap.
+
+  If the scheduler instance extracted from the heap has the DELETE flag set,
+  the dequeue() is not called and the instance is destroyed now.
+
+- delay line: when extracting a delay line, the function transmit_event() is
+  called to send out packet from delay line.
+
+  If the scheduler instance associated with this delay line doesn't exists,
+  the delay line will be delete now.
+
+Configuration
+=============
+To create a pipe, queue or scheduler, the user should type commands like:
+"ipfw pipe x config"
+"ipfw queue y config pipe x"
+"ipfw pipe x config sched <type>"
+
+The userland side of dummynet will prepare a buffer contains data to pass to
+kernel side.
+The buffer contains all struct needed to configure an object. In more detail,
+to configure a pipe all three structs (dn_link, dn_sch, dn_fs) are needed,
+plus the delay profile struct if the pipe has a delay profile.
+
+If configuring a scheduler only the struct dn_sch is wrote in the buffer,
+while if configuring a flowset only the dn_fs struct is wrote.
+
+The first struct in the buffer contains the type of command request, that is
+if it is configuring a pipe, a queue, or a scheduler. Then there are structs
+need to configure the object, and finally there is the struct that mark
+the end of the buffer.
+
+To support the insertion of pipe and queue using the old syntax, when adding
+a pipe it's necessary to create a FIFO flowset and a FIFO scheduler, which
+have a number x + DN_PIPEOFFSET.
+
+Add a pipe
+----------
+A pipe is only a template for a link.
+If the pipe already exists, parameters are updated. If a delay profile exists
+it is deleted and a new one is created.
+If the pipe doesn't exist a new one is created. After the creation, the
+flowset unlinked list is scanned to see if there are some flowset that would
+be linked with this pipe. If so, these flowset will be of wf2q+ type (for
+compatibility) and a new wf2q+ scheduler is created now.
+
+Add a scheduler
+---------------
+If the scheduler already exists, and the type and the mask are the same, the
+scheduler is simply reconfigured calling the config_scheduler() scheduler
+function with the RECONFIGURE flag active.
+If the type or the mask differ, it is necessary to delete the old scheduler
+and create a new one.
+If the scheduler doesn't exists, a new one is created. If the scheduler has
+a mask, the hash table is created to store pointers to scheduler instances.
+When a new scheduler is created, it is necessary to scan the unlinked
+flowset list to search eventually flowset that would be linked with this
+scheduler number. If some are found, flowsets became of the type of this
+scheduler and they are configured properly.
+
+Add a flowset
+-------------
+Flowset pointers are store in the system in two list. The unlinked flowset list
+contains all flowset that aren't linked with a scheduler, the flowset list
+contains flowset linked to a scheduler, and so they have a type.
+When adding a new flowset, first it is checked if the flowset exists (that is,
+it is in the flowset list) and if it doesn't exists a new flowset is created
+and added to unlinked flowset list if the scheduler which the flowset would be
+linked doesn't exists, or added in the flowset list and configured properly if
+the scheduler exists. If the flowset (before to be created) was in the
+unlinked flowset list, it is removed and deleted, and then recreated.
+If the flowset exists, to allow reconfiguration of this flowset, the
+scheduler number and types must match with the one in memory. If this isn't
+so, the flowset is deleted and a new one will be created. Really, the flowset
+it isn't deleted now, but it is removed from flowset list and it will be
+deleted later because there could be some queues that are using it.
+
+Listing of object
+=================
+The user can request a list of object present in dummynet through the command
+"ipfw [-v] pipe|queue [x] list|show"
+The kernel side of dummynet send a buffer to user side that contains all
+pipe, all scheduler, all flowset, plus all scheduler instances and all queues.
+The dummynet user land will format the output and show only the relevant
+information.
+The buffer sent start with all pipe from the system. The entire struct dn_link
+is passed, except the delay_profile struct that is useless in user space.
+After pipes, all flowset are wrote in the buffer. The struct contains
+scheduler flowset specific data is linked with the flowset writing the
+'obj' id of the extension into the 'alg_fs' pointer.
+Then schedulers are wrote. If a scheduler has one or more scheduler instance,
+these are linked to the parent scheduler writing the id of the parent in the
+'ptr_sched' pointer. If a scheduler instance has queues, there are wrote in
+the buffer and linked thorugh the 'obj' and 'sched_inst' pointer.
+Finally, flowsets in the unlinked flowset list  are write in the buffer, and
+then a struct gen in saved in the buffer to mark the last struct in the buffer.
+
+
+Delete of object
+================
+An object is usually removed by user through a command like
+"ipfw pipe|queue x delete". XXX sched?
+ipfw pass to the kernel a struct gen that contains the type and the number
+of the object to remove
+
+Delete of pipe x
+----------------
+A pipe can be deleted by the user throught the command 'ipfw pipe x delete'.
+To delete a pipe, the pipe is removed from the pipe list, and then deleted.
+Also the scheduler associated with this pipe should be deleted.
+For compatibility with old dummynet syntax, the associated FIFO scheduler and
+FIFO flowset must be deleted.
+
+Delete of flowset x
+-------------------
+To remove a flowset, we must be sure that is no loger referenced by any object.
+If the flowset to remove is in the unlinked flowset list, there is not any
+issue, the flowset can be safely removed calling a free() (the flowset
+extension is not yet created if the flowset is in this list).
+If the flowset is in the flowset list, first we remove from it so new packet
+are discarded when arrive. Next, the flowset is marked as delete.
+Now we must check if some queue is using this flowset.
+To do this, a counter (active_f) is provided. This counter indicate how many
+queues exist using this flowset.
+The active_f counter is automatically incremented when a queue is created
+and decremented when a queue is deleted.
+If the counter is 0, the flowset can be safely deleted, and the delete_alg_fs()
+scheduler function is called before deallocate memory.
+If the counter is not 0, the flowset remain in memory until the counter become
+zero. When a queue is delete (by dn_delete_queue() function) it is checked if
+the linked flowset is deleting and if so the counter is decrementing. If the
+counter reaches 0, the flowset is deleted.
+The deletion of a queue can be done only by the scheduler, or when the scheduler
+is destroyed.
+
+Delete of scheduler x
+---------------------
+To delete a scheduler we must be sure that any scheduler instance of this type
+are in the system_heap. To do so, a counter (inst_counter) is provided.
+This counter is managed by the system: it is incremented every time it is
+inserted in the system_heap, and decremented every time it is extracted from it.
+To delete the scheduler, first we remove it from the scheduler list, so new
+packet are discarded when they arrive, and mark the scheduler as deleting.
+
+If the counter is 0, we can remove the scheduler safely calling the
+really_deletescheduler() function. This function will scan all scheduler
+instances and call the delete_scheduler_instance() function that will delete
+the instance. When all instance are deleted, the scheduler template is
+deleted calling the delete_scheduler_template(). If the delay line associate
+with the scheduler is empty, it is deleted now, else it will be deleted when
+it will became empy.
+If the counter was not 0, we wait for it. Every time the dummynet_task()
+function extract a scheduler from the system_heap, the counter is decremented.
+If the scheduler has the delete flag enabled the dequeue() is not called and
+delete_scheduler_instance() is called to delete the instance.
+Obviously this scheduler instance is no loger inserted in the system_heap.
+If the counter reaches 0, the delete_scheduler_template() function is called
+all memory is released.
+NOTE: Flowsets that belong to this scheduler are not deleted, so if a new
+      scheduler with the same number is inserted will use these flowsets.
+      To do so, the best approach would be insert these flowset in the
+      unlinked flowset list, but doing this now will be very expensive.
+      So flowsets will remain in memory and linked with a scheduler that no
+      longer exists until a packet belonging to this flowset arrives. When
+      this packet arrives, the reconfigure() function is called because the
+      generation number mismatch with one contains in the flowset and so
+      the flowset will be moved into the flowset unlinked list, or will be
+      linked with the new scheduler if a new one was created.
+
+
+COMPATIBILITY WITH FREEBSD 7.2 AND FREEBSD 8 'IPFW' BINARY
+==========================================================
+Dummynet is not compatible with old ipfw binary because internal structs are
+changed. Moreover, the old ipfw binary is not compatible with new kernels
+because the struct that represents a firewall rule has changed. So, if a user
+install a new kernel on a FreeBSD 7.2, the ipfw (and possibly many other
+commands) will not work.
+New dummynet uses a new socket option: IP_DUMMYNET3, used for both set and get.
+The old option can be used to allow compatibility with the 'ipfw' binary of
+older version (tested with 7.2 and 8.0) of FreeBSD.
+Two file are provided for this purpose:
+- ip_dummynet_glue.c translates old dummynet requests to the new ones,
+- ip_fw_glue.c converts the rule format between 7.2 and 8 versions.
+Let see in detail these two files.
+
+IP_DUMMYNET_GLUE.C
+------------------
+The internal structs of new dummynet are very different from the original.
+Because of there are some difference from between dummynet in FreeBSD 7.2 and
+dummynet in FreeBSD 8 (the FreeBSD 8 version includes support to pipe delay
+profile and burst option), I have to include both header files. I copied
+the revision 191715 (for version 7.2) and the revision 196045 (for version 8)
+and I appended a number to each struct to mark them.
+
+The main function of this file is ip_dummynet_compat() that is called by
+ip_dn_ctl() when it receive a request of old socket option.
+
+A global variabile ('is7') store the version of 'ipfw' that FreeBSD is using.
+This variable is set every time a request of configuration is done, because
+with this request we receive a buffer of which size depending of ipfw version.
+Because of in general the first action is a configuration, this variable is
+usually set accordly. If the first action is a request of listing of pipes
+or queues, the system cannot know the version of ipfw, and we suppose that
+version 7.2 is used. If version is wrong, the output can be senseless, but
+the application should not crash.
+
+There are four request for old dummynet:
+- IP_DUMMYNET_FLUSH: the flush options have no parameter, so simply the
+  dummynet_flush() function is called;
+- IP_DUMMYNET_DEL: the delete option need to be translate.
+  It is only necessary to extract the number and the type of the object
+  (pipe or queue) to delete from the buffer received and build a new struct
+  gen contains the right parameters, then call the delete_object() function;
+- IP_DUMMYNET_CONFIGURE: the configure command receive a buffer depending of
+  the ipfw version. After the properly extraction of all data, that depends
+  by the ipfw version used, new structures are filled and then the dummynet
+  config_link() function is properly called. Note that the 7.2 version does
+  not support some parameter as burst or delay profile.
+- IP_DUMMYNET_GET: The get command should send to the ipfw the correct buffer
+  depending of its version. There are two function that build the
+  corrected buffer, ip_dummynet_get7() and ip_dummynet_get8(). These
+  functions reproduce the buffer exactly as 'ipfw' expect. The only difference
+  is that the weight parameter for a queue is no loger sent by dummynet and so
+  it is set to 0.
+  Moreover, because of the internal structure has changed, the bucket size
+  of a queue could not be correct, because now all flowset share the hash
+  table.
+  If the version of ipfw is wrong, the output could be senseless or truncated,
+  but the application should not crash.
+
+IP_FW_GLUE.C
+------------
+The ipfw binary also is used to add rules to FreeBSD firewall. Because of the
+struct ip_fw is changed from FreeBsd 7.2 to FreeBSD 8, it is necessary
+to write some glue code to allow use ipfw from FreeBSD 7.2 with the kernel
+provided with FreeBSD 8.
+This file contains two functions to convert a rule from FreeBSD 7.2 format to
+FreeBSD 8 format, and viceversa.
+The conversion should be done when a rule passes from userspace to kernel space
+and viceversa.
+I have to modify the ip_fw2.c file to manage these two case, and added a
+variable (is7) to store the ipfw version used, using an approach like the
+previous file:
+- when a new rule is added (option IP_FW_ADD) the is7 variable is set if the
+  size of the rule received corrispond to FreeBSD 7.2 ipfw version. If so, the
+  rule is converted to version 8 calling the function convert_rule_to_8().
+  Moreover, after the insertion of the rule, the rule is now reconverted to
+  version 7 because the ipfw binary will print it.
+- when the user request a list of rules (option IP_FW_GET) the is7 variable
+  should be set correctly because we suppose that a configure command was done,
+  else we suppose that the FreeBSD version is 8. The function ipfw_getrules()
+  in ip_fw2.c file return all rules, eventually converted to version 7 (if
+  the is7 is set) to the ipfw binary.
+The conversion of a rule is quite simple. The only difference between the
+two structures (struct ip_fw) is that in the new there is a new field
+(uint32_t id). So, I copy the entire rule in a buffer and the copy the rule in
+the right position in the new (or old) struct. The size of commands are not
+changed, and the copy is done into a cicle.
+
+How to configure dummynet
+=========================
+It is possible to configure dummynet through two main commands:
+'ipfw pipe' and 'ipfw queue'.
+To allow compatibility with old version, it is possible configure dummynet
+using the old command syntax. Doing so, obviously, it is only possible to
+configure a FIFO scheduler or a wf2q+ scheduler.
+A new command, 'ipfw pipe x config sched <type>' is supported to add a new
+scheduler to the system.
+
+- ipfw pipe x config ...
+  create a new pipe with the link parameters
+  create a new scheduler fifo (x + offset)
+  create a new flowset fifo (x + offset)
+  the mask is eventually stored in the FIFO scheduler
+
+- ipfw queue y config pipe x ...
+  create a new flowset y linked to sched x.
+    The type of flowset depends by the specified scheduler.
+    If the scheduler does not exist, this flowset is inserted in a special
+    list and will be not active.
+    If pipe x exists and sched does not exist, a new wf2q+ scheduler is
+    created and the flowset will be linked to this new scheduler (this is
+    done for compatibility with old syntax).
+
+- ipfw pipe x config sched <type> ...
+  create a new scheduler x of type <type>.
+  Search into the flowset unlinked list if there are some flowset that
+  should be linked with this new scheduler.
+
+- ipfw pipe x delete
+  delete the pipe x
+  delete the scheduler fifo (x + offset)
+  delete the scheduler x
+  delete the flowset fifo (x + offset)
+
+- ipfw queue x delete
+  delete the flowset x
+
+- ipfw sched x delete ///XXX
+  delete the scheduler x
+
+Follow now some examples to how configure dummynet:
+- Ex1:
+  ipfw pipe 10 config bw 1M delay 15 // create a pipe with band and delay
+                                        A FIFO flowset and scheduler is
+                                        also created
+  ipfw queue 5 config pipe 10 weight 56 // create a flowset. This flowset
+                                           will be of wf2q+ because a pipe 10
+                                           exists. Moreover, the wf2q+
+                                           scheduler is created now.
+- Ex2:
+  ipfw queue 5 config pipe 10 weight 56 // Create a flowset. Scheduler 10
+                                           does not exist, so this flowset
+                                           is inserted in the unlinked
+                                           flowset list.
+  ipfw pipe 10 config bw... // Create a pipe, a FIFO flowset and scheduler.
+                               Because of a flowset with 'pipe 10' exists,
+                               a wf2q+ scheduler is created now and that
+                               flowset is linked with this sceduler.
+
+- Ex3:
+  ipfw pipe 10 config bw...    // Create a pipe, a FIFO flowset and scheduler.
+  ipfw pipe 10 config sched rr // Create a scheduler of type RR, linked to
+                                  pipe 10
+  ipfw queue 5 config pipe 10 weight 56 // Create a flowset 5. This flowset
+                                           will belong to scheduler 10 and
+                                           it is of type RR
+
+- Ex4:
+  ipfw pipe 10 config sched rr // Create a scheduler of type RR, linked to
+                                  pipe 10 (not exist yet)
+  ipfw pipe 10 config bw... // Create a pipe, a FIFO flowset and scheduler.
+  ipfw queue 5 config pipe 10 weight 56 // Create a flowset 5.This flowset
+                                           will belong to scheduler 10 and
+                                           it is of type RR
+  ipfw pipe 10 config sched wf2q+ // Modify the type of scheduler 10. It
+                                     becomes a wf2q+ scheduler.
+                                     When a new packet of flowset 5 arrives,
+                                     the flowset 5 becomes to wf2q+ type.
+
+How to implement a new scheduler
+================================
+In dummynet, a scheduler algorithm is represented by two main structs, some
+functions and other minor structs.
+- A struct dn_sch_xyz (where xyz is the 'type' of scheduler algorithm
+  implemented) contains data relative to scheduler, as global parameter that
+  are common to all instances of the scheduler
+- A struct dn_sch_inst_xyz contains data relative to a single scheduler
+  instance, as local status variable depending for example by flows that
+  are linked with the scheduler, and so on.
+To add a scheduler to dummynet, the user should type a command like:
+'ipfw pipe x config sched <type> [mask ... ...]'
+This command creates a new struct dn_sch_xyz of type <type>, and
+store the optional parameter in that struct.
+
+The parameter mask determines how many scheduler instance of this
+scheduler may exist. For example, it is possible to divide traffic
+depending on the source port (or destination, or ip address...),
+so that every scheduler instance act as an independent scheduler.
+If the mask is not set, all traffic goes to the same instance.
+
+When a packet arrives to a scheduler, the system search the corrected
+scheduler instance, and if it does not exist it is created now (the
+struct dn_sch_inst_xyz is allocated by the system, and the scheduler
+fills the field correctly). It is a task of the scheduler to create
+the struct that contains all queues for a scheduler instance.
+Dummynet provides some function to create an hash table to store
+queues, but the schedule algorithm can choice the own struct.
+
+To link a flow to a scheduler, the user should type a command like:
+'ipfw queue z config pipe x [mask... ...]'
+
+This command creates a new 'dn_fs' struct that will be inserted
+in the system.  If the scheduler x exists, this flowset will be
+linked to that scheduler and the flowset type become the same as
+the scheduler type. At this point, the function create_alg_fs_xyz()
+is called to allow store eventually parameter for the flowset that
+depend by scheduler (for example the 'weight' parameter for a wf2q+
+scheduler, or some priority...). A parameter mask can be used for
+a flowset. If the mask parameter is set, the scheduler instance can
+separate packet according to its flow id (src and dst ip, ports...)
+and assign it to a separate queue. This is done by the scheduler,
+so it can ignore the mask if it wants.
+
+See now the two main structs:
+struct dn_sch_xyz {
+    struct gen g; /* important the name g */
+    /* global params */
+};
+struct dn_sch_inst_xyz {
+    struct gen g; /* important the name g */
+    /* params of the instance */
+};
+It is important to embed the struct gen as first parameter. The struct gen
+contains some values that the scheduler instance must fill (the 'type' of
+scheduler, the 'len' of the struct...)
+The function create_scheduler_xyz() should be implemented to initialize global
+parameters in the first struct, and if memory allocation is done it is
+mandatory to implement the delete_scheduler_template() function to free that
+memory.
+The function create_scheduler_instance_xyz() must be implemented even if the
+scheduler instance does not use extra parameters. In this function the struct
+gen fields must be filled with corrected infos. The
+delete_scheduler_instance_xyz() function must bu implemented if the instance
+has allocated some memory in the previous function.
+
+To store data belonging to a flowset the follow struct is used:
+struct alg_fs_xyz {
+    struct gen g;
+    /* fill correctly the gen struct
+     g.subtype = DN_XYZ;
+     g.len = sizeof(struct alg_fs_xyz)
+     ...
+     */
+    /* params for the flow */
+};
+The create_alg_fs_xyz() function is mandatory, because it must fill the struct
+gen, but the delete_alg_fs_xyz() is mandatory only if the previous function
+has allocated some memory.
+
+A struct dn_queue contains packets belonging to a queue and some statistical
+data. The scheduler could have to store data in this struct, so it must define
+a dn_queue_xyz struct:
+struct dn_queue_xyz {
+    struct dn_queue q;
+    /* parameter for a queue */
+}
+
+All structures are allocated by the system. To do so, the scheduler must
+set the size of its structs in the scheduler descriptor:
+scheduler_size:     sizeof(dn_sch_xyz)
+scheduler_i_size:   sizeof(dn_sch_inst_xyz)
+flowset_size:       sizeof(alg_fs_xyz)
+queue_size:         sizeof(dn_queue_xyz);
+The scheduler_size could be 0, but other struct must have at least a struct gen.
+
+
+After the definition of structs, it is necessary to implement the
+scheduler functions.
+
+- int (*config_scheduler)(char *command, void *sch, int reconfigure);
+    Configure a scheduler, or reconfigure if 'reconfigure' == 1.
+    This function performs additional allocation and initialization of global
+    parameter for this scheduler.
+    If memory is allocated here, the delete_scheduler_template() function
+    should be implemented to remove this memory.
+- int (*delete_scheduler_template)(void* sch);
+    Delete a scheduler template. This function is mandatory if the scheduler
+    uses extra data respect the struct dn_sch.
+- int (*create_scheduler_instance)(void *s);
+    Create a new scheduler instance. The system allocate the necessary memory
+    and the schedulet can access it using the 's' pointer.
+    The scheduler instance stores all queues, and to do this can use the
+    hash table provided by the system.
+- int (*delete_scheduler_instance)(void *s);
+    Delete a scheduler instance. It is important to free memory allocated
+    by create_scheduler_instance() function. The memory allocated by system
+    is freed by the system itself. The struct contains all queue also has
+    to be deleted.
+- int (*enqueue)(void *s, struct gen *f, struct mbuf *m,
+                 struct ipfw_flow_id *id);
+    Called when a packet arrives. The packet 'm' belongs to the scheduler
+    instance 's', has a flowset 'f' and the flowid 'id' has already been
+    masked. The enqueue() must call dn_queue_packet(q, m) function to really
+    enqueue packet in the queue q. The queue 'q' is chosen by the scheduler
+    and if it does not exist should be created calling the dn_create_queue()
+    function. If the schedule want to drop the packet, it must call the
+    dn_drop_packet() function and then return 1.
+- struct mbuf * (*dequeue)(void *s);
+    Called when the timer expires (or when a packet arrives and the scheduler
+    instance is idle).
+    This function is called when at least a packet can be send out. The
+    scheduler choices the packet and returns it; if no packet are in the
+    schedulerinstance, the function must return NULL.
+    Before return a packet, it is important to call the function
+    dn_return_packet() to update some statistic of the queue and update the
+    queue counters.
+- int (*drain_queue)(void *s, int flag);
+    The system request to scheduler to delete all queues that is not using
+    to free memory. The flag parameter indicate if a queue must be deleted
+    even if it is active.
+
+- int (*create_alg_fs)(char *command, struct gen *g, int reconfigure);
+    It is called when a flowset is linked with a scheduler. This is done
+    when the scheduler is defined, so we can know the type of flowset.
+    The function initialize the flowset paramenter parsing the command
+    line. The parameter will be stored in the g struct that have the right
+    size allocated by the system. If the reconfigure flag is set, it means
+    that the flowset is reconfiguring
+- int (*delete_alg_fs)(struct gen *f);
+    It is called when a flowset is deleting. Must remove the memory allocate
+    by the create_alg_fs() function.
+
+- int (*create_queue_alg)(struct dn_queue *q, struct gen *f);
+    Called when a queue is created. The function should link the queue
+    to the struct used by the scheduler instance to store all queues.
+- int (*delete_queue_alg)(struct dn_queue *q);
+    Called when a queue is deleting. The function should remove extra data
+    and update the struct contains all queues in the scheduler instance.
+
+The struct scheduler represent the scheduler descriptor that is passed to
+dummynet when a scheduler module is loaded.
+This struct contains the type of scheduler, the lenght of all structs and
+all function pointers.
+If a function is not implemented should be initialize to NULL. Some functions
+are mandatory, other are mandatory if some memory should be freed.
+Mandatory functions:
+- create_scheduler_instance()
+- enqueue()
+- dequeue()
+- create_alg_fs()
+- drain_queue()
+Optional functions:
+- config_scheduler()
+- create_queue_alg()
+Mandatory functions if the corresponding create...() has allocated memory:
+- delete_scheduler_template()
+- delete_scheduler_instance()
+- delete_alg_fs()
+- delete_queue_alg()
+
diff --git a/sys/netinet/ipfw/ip_dn_glue.c b/sys/netinet/ipfw/ip_dn_glue.c
new file mode 100644
index 0000000..69f189c
--- /dev/null
+++ b/sys/netinet/ipfw/ip_dn_glue.c
@@ -0,0 +1,843 @@
+/*-    
+ * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ *
+ * Binary compatibility support for /sbin/ipfw RELENG_7 and RELENG_8
+ */
+
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/time.h>
+#include <sys/taskqueue.h>
+#include <net/if.h>	/* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */
+#include <netinet/in.h>
+#include <netinet/ip_var.h>	/* ip_output(), IP_FORWARDING */
+#include <netinet/ip_fw.h>
+#include <netinet/ipfw/ip_fw_private.h>
+#include <netinet/ipfw/dn_heap.h>
+#include <netinet/ip_dummynet.h>
+#include <netinet/ipfw/ip_dn_private.h>
+#include <netinet/ipfw/dn_sched.h>
+
+/* FREEBSD7.2 ip_dummynet.h r191715*/
+
+struct dn_heap_entry7 {
+	int64_t key;        /* sorting key. Topmost element is smallest one */
+	void *object;      /* object pointer */
+};
+
+struct dn_heap7 {
+	int size;
+	int elements;
+	int offset; /* XXX if > 0 this is the offset of direct ptr to obj */
+	struct dn_heap_entry7 *p;   /* really an array of "size" entries */
+};
+
+/* Common to 7.2 and 8 */
+struct dn_flow_set {
+	SLIST_ENTRY(dn_flow_set)    next;   /* linked list in a hash slot */
+
+	u_short fs_nr ;             /* flow_set number       */
+	u_short flags_fs;
+#define DNOLD_HAVE_FLOW_MASK   0x0001
+#define DNOLD_IS_RED       0x0002
+#define DNOLD_IS_GENTLE_RED    0x0004
+#define DNOLD_QSIZE_IS_BYTES   0x0008  /* queue size is measured in bytes */
+#define DNOLD_NOERROR      0x0010  /* do not report ENOBUFS on drops  */
+#define DNOLD_HAS_PROFILE      0x0020  /* the pipe has a delay profile. */
+#define DNOLD_IS_PIPE      0x4000
+#define DNOLD_IS_QUEUE     0x8000
+
+	struct dn_pipe7 *pipe ;  /* pointer to parent pipe */
+	u_short parent_nr ;     /* parent pipe#, 0 if local to a pipe */
+
+	int weight ;        /* WFQ queue weight */
+	int qsize ;         /* queue size in slots or bytes */
+	int plr ;           /* pkt loss rate (2^31-1 means 100%) */
+
+	struct ipfw_flow_id flow_mask ;
+
+	/* hash table of queues onto this flow_set */
+	int rq_size ;       /* number of slots */
+	int rq_elements ;       /* active elements */
+	struct dn_flow_queue7 **rq;  /* array of rq_size entries */
+
+	u_int32_t last_expired ;    /* do not expire too frequently */
+	int backlogged ;        /* #active queues for this flowset */
+
+        /* RED parameters */
+#define SCALE_RED               16
+#define SCALE(x)                ( (x) << SCALE_RED )
+#define SCALE_VAL(x)            ( (x) >> SCALE_RED )
+#define SCALE_MUL(x,y)          ( ( (x) * (y) ) >> SCALE_RED )
+	int w_q ;           /* queue weight (scaled) */
+	int max_th ;        /* maximum threshold for queue (scaled) */
+	int min_th ;        /* minimum threshold for queue (scaled) */
+	int max_p ;         /* maximum value for p_b (scaled) */
+	u_int c_1 ;         /* max_p/(max_th-min_th) (scaled) */
+	u_int c_2 ;         /* max_p*min_th/(max_th-min_th) (scaled) */
+	u_int c_3 ;         /* for GRED, (1-max_p)/max_th (scaled) */
+	u_int c_4 ;         /* for GRED, 1 - 2*max_p (scaled) */
+	u_int * w_q_lookup ;    /* lookup table for computing (1-w_q)^t */
+	u_int lookup_depth ;    /* depth of lookup table */
+	int lookup_step ;       /* granularity inside the lookup table */
+	int lookup_weight ;     /* equal to (1-w_q)^t / (1-w_q)^(t+1) */
+	int avg_pkt_size ;      /* medium packet size */
+	int max_pkt_size ;      /* max packet size */
+};
+SLIST_HEAD(dn_flow_set_head, dn_flow_set);
+
+#define DN_IS_PIPE		0x4000
+#define DN_IS_QUEUE		0x8000
+struct dn_flow_queue7 {
+	struct dn_flow_queue7 *next ;
+	struct ipfw_flow_id id ;
+
+	struct mbuf *head, *tail ;  /* queue of packets */
+	u_int len ;
+	u_int len_bytes ;
+
+	u_long numbytes;
+
+	u_int64_t tot_pkts ;    /* statistics counters  */
+	u_int64_t tot_bytes ;
+	u_int32_t drops ;
+
+	int hash_slot ;     /* debugging/diagnostic */
+
+	/* RED parameters */
+	int avg ;                   /* average queue length est. (scaled) */
+	int count ;                 /* arrivals since last RED drop */
+	int random ;                /* random value (scaled) */
+	u_int32_t q_time;      /* start of queue idle time */
+
+	/* WF2Q+ support */
+	struct dn_flow_set *fs ;    /* parent flow set */
+	int heap_pos ;      /* position (index) of struct in heap */
+	int64_t sched_time ;     /* current time when queue enters ready_heap */
+
+	int64_t S,F ;        /* start time, finish time */
+};
+
+struct dn_pipe7 {        /* a pipe */
+	SLIST_ENTRY(dn_pipe7)    next;   /* linked list in a hash slot */
+
+	int pipe_nr ;       /* number   */
+	int bandwidth;      /* really, bytes/tick.  */
+	int delay ;         /* really, ticks    */
+
+	struct  mbuf *head, *tail ; /* packets in delay line */
+
+	/* WF2Q+ */
+	struct dn_heap7 scheduler_heap ; /* top extract - key Finish time*/
+	struct dn_heap7 not_eligible_heap; /* top extract- key Start time */
+	struct dn_heap7 idle_heap ; /* random extract - key Start=Finish time */
+
+	int64_t V ;          /* virtual time */
+	int sum;            /* sum of weights of all active sessions */
+
+	int numbytes;
+
+	int64_t sched_time ;     /* time pipe was scheduled in ready_heap */
+
+	/*
+	* When the tx clock come from an interface (if_name[0] != '\0'), its name
+	* is stored below, whereas the ifp is filled when the rule is configured.
+	*/
+	char if_name[IFNAMSIZ];
+	struct ifnet *ifp ;
+	int ready ; /* set if ifp != NULL and we got a signal from it */
+
+	struct dn_flow_set fs ; /* used with fixed-rate flows */
+};
+SLIST_HEAD(dn_pipe_head7, dn_pipe7);
+
+
+/* FREEBSD8 ip_dummynet.h r196045 */
+struct dn_flow_queue8 {
+	struct dn_flow_queue8 *next ;
+	struct ipfw_flow_id id ;
+
+	struct mbuf *head, *tail ;  /* queue of packets */
+	u_int len ;
+	u_int len_bytes ;
+
+	uint64_t numbytes ;     /* credit for transmission (dynamic queues) */
+	int64_t extra_bits;     /* extra bits simulating unavailable channel */
+
+	u_int64_t tot_pkts ;    /* statistics counters  */
+	u_int64_t tot_bytes ;
+	u_int32_t drops ;
+
+	int hash_slot ;     /* debugging/diagnostic */
+
+	/* RED parameters */
+	int avg ;                   /* average queue length est. (scaled) */
+	int count ;                 /* arrivals since last RED drop */
+	int random ;                /* random value (scaled) */
+	int64_t idle_time;       /* start of queue idle time */
+
+	/* WF2Q+ support */
+	struct dn_flow_set *fs ;    /* parent flow set */
+	int heap_pos ;      /* position (index) of struct in heap */
+	int64_t sched_time ;     /* current time when queue enters ready_heap */
+
+	int64_t S,F ;        /* start time, finish time */
+};
+
+struct dn_pipe8 {        /* a pipe */
+	SLIST_ENTRY(dn_pipe8)    next;   /* linked list in a hash slot */
+
+	int pipe_nr ;       /* number   */
+	int bandwidth;      /* really, bytes/tick.  */
+	int delay ;         /* really, ticks    */
+
+	struct  mbuf *head, *tail ; /* packets in delay line */
+
+	/* WF2Q+ */
+	struct dn_heap7 scheduler_heap ; /* top extract - key Finish time*/
+	struct dn_heap7 not_eligible_heap; /* top extract- key Start time */
+	struct dn_heap7 idle_heap ; /* random extract - key Start=Finish time */
+
+	int64_t V ;          /* virtual time */
+	int sum;            /* sum of weights of all active sessions */
+
+	/* Same as in dn_flow_queue, numbytes can become large */
+	int64_t numbytes;       /* bits I can transmit (more or less). */
+	uint64_t burst;     /* burst size, scaled: bits * hz */
+
+	int64_t sched_time ;     /* time pipe was scheduled in ready_heap */
+	int64_t idle_time;       /* start of pipe idle time */
+
+	char if_name[IFNAMSIZ];
+	struct ifnet *ifp ;
+	int ready ; /* set if ifp != NULL and we got a signal from it */
+
+	struct dn_flow_set fs ; /* used with fixed-rate flows */
+
+    /* fields to simulate a delay profile */
+#define ED_MAX_NAME_LEN     32
+	char name[ED_MAX_NAME_LEN];
+	int loss_level;
+	int samples_no;
+	int *samples;
+};
+
+#define ED_MAX_SAMPLES_NO   1024
+struct dn_pipe_max8 {
+	struct dn_pipe8 pipe;
+	int samples[ED_MAX_SAMPLES_NO];
+};
+SLIST_HEAD(dn_pipe_head8, dn_pipe8);
+
+/*
+ * Changes from 7.2 to 8:
+ * dn_pipe:
+ *      numbytes from int to int64_t
+ *      add burst (int64_t)
+ *      add idle_time (int64_t)
+ *      add profile
+ *      add struct dn_pipe_max
+ *      add flag DN_HAS_PROFILE
+ *
+ * dn_flow_queue
+ *      numbytes from u_long to int64_t
+ *      add extra_bits (int64_t)
+ *      q_time from u_int32_t to int64_t and name idle_time
+ *
+ * dn_flow_set unchanged
+ *
+ */
+
+/* NOTE:XXX copied from dummynet.c */
+#define O_NEXT(p, len) ((void *)((char *)p + len))
+static void
+oid_fill(struct dn_id *oid, int len, int type, uintptr_t id)
+{
+	oid->len = len;
+	oid->type = type;
+	oid->subtype = 0;
+	oid->id = id;
+}
+/* make room in the buffer and move the pointer forward */
+static void *
+o_next(struct dn_id **o, int len, int type)
+{
+	struct dn_id *ret = *o;
+	oid_fill(ret, len, type, 0);
+	*o = O_NEXT(*o, len);
+	return ret;
+}
+
+
+static size_t pipesize7 = sizeof(struct dn_pipe7);
+static size_t pipesize8 = sizeof(struct dn_pipe8);
+static size_t pipesizemax8 = sizeof(struct dn_pipe_max8);
+
+/* Indicate 'ipfw' version
+ * 1: from FreeBSD 7.2
+ * 0: from FreeBSD 8
+ * -1: unknow (for now is unused)
+ *
+ * It is update when a IP_DUMMYNET_DEL or IP_DUMMYNET_CONFIGURE request arrives
+ * NOTE: if a IP_DUMMYNET_GET arrives and the 'ipfw' version is unknow,
+ *       it is suppose to be the FreeBSD 8 version.
+ */
+static int is7 = 0;
+
+static int
+convertflags2new(int src)
+{
+	int dst = 0;
+
+	if (src & DNOLD_HAVE_FLOW_MASK)
+		dst |= DN_HAVE_MASK;
+	if (src & DNOLD_QSIZE_IS_BYTES)
+		dst |= DN_QSIZE_BYTES;
+	if (src & DNOLD_NOERROR)
+		dst |= DN_NOERROR;
+	if (src & DNOLD_IS_RED)
+		dst |= DN_IS_RED;
+	if (src & DNOLD_IS_GENTLE_RED)
+		dst |= DN_IS_GENTLE_RED;
+	if (src & DNOLD_HAS_PROFILE)
+		dst |= DN_HAS_PROFILE;
+
+	return dst;
+}
+
+static int
+convertflags2old(int src)
+{
+	int dst = 0;
+
+	if (src & DN_HAVE_MASK)
+		dst |= DNOLD_HAVE_FLOW_MASK;
+	if (src & DN_IS_RED)
+		dst |= DNOLD_IS_RED;
+	if (src & DN_IS_GENTLE_RED)
+		dst |= DNOLD_IS_GENTLE_RED;
+	if (src & DN_NOERROR)
+		dst |= DNOLD_NOERROR;
+	if (src & DN_HAS_PROFILE)
+		dst |= DNOLD_HAS_PROFILE;
+	if (src & DN_QSIZE_BYTES)
+		dst |= DNOLD_QSIZE_IS_BYTES;
+
+	return dst;
+}
+
+static int
+dn_compat_del(void *v)
+{
+	struct dn_pipe7 *p = (struct dn_pipe7 *) v;
+	struct dn_pipe8 *p8 = (struct dn_pipe8 *) v;
+	struct {
+		struct dn_id oid;
+		uintptr_t a[1];	/* add more if we want a list */
+	} cmd;
+
+	/* XXX DN_API_VERSION ??? */
+	oid_fill((void *)&cmd, sizeof(cmd), DN_CMD_DELETE, DN_API_VERSION);
+
+	if (is7) {
+		if (p->pipe_nr == 0 && p->fs.fs_nr == 0)
+			return EINVAL;
+		if (p->pipe_nr != 0 && p->fs.fs_nr != 0)
+			return EINVAL;
+	} else {
+		if (p8->pipe_nr == 0 && p8->fs.fs_nr == 0)
+			return EINVAL;
+		if (p8->pipe_nr != 0 && p8->fs.fs_nr != 0)
+			return EINVAL;
+	}
+
+	if (p->pipe_nr != 0) { /* pipe x delete */
+		cmd.a[0] = p->pipe_nr;
+		cmd.oid.subtype = DN_LINK;
+	} else { /* queue x delete */
+		cmd.oid.subtype = DN_FS;
+		cmd.a[0] = (is7) ? p->fs.fs_nr : p8->fs.fs_nr;
+	}
+
+	return do_config(&cmd, cmd.oid.len);
+}
+
+static int
+dn_compat_config_queue(struct dn_fs *fs, void* v)
+{
+	struct dn_pipe7 *p7 = (struct dn_pipe7 *)v;
+	struct dn_pipe8 *p8 = (struct dn_pipe8 *)v;
+	struct dn_flow_set *f;
+
+	if (is7)
+		f = &p7->fs;
+	else
+		f = &p8->fs;
+
+	fs->fs_nr = f->fs_nr;
+	fs->sched_nr = f->parent_nr;
+	fs->flow_mask = f->flow_mask;
+	fs->buckets = f->rq_size;
+	fs->qsize = f->qsize;
+	fs->plr = f->plr;
+	fs->par[0] = f->weight;
+	fs->flags = convertflags2new(f->flags_fs);
+	if (fs->flags & DN_IS_GENTLE_RED || fs->flags & DN_IS_RED) {
+		fs->w_q = f->w_q;
+		fs->max_th = f->max_th;
+		fs->min_th = f->min_th;
+		fs->max_p = f->max_p;
+	}
+
+	return 0;
+}
+
+static int
+dn_compat_config_pipe(struct dn_sch *sch, struct dn_link *p, 
+		      struct dn_fs *fs, void* v)
+{
+	struct dn_pipe7 *p7 = (struct dn_pipe7 *)v;
+	struct dn_pipe8 *p8 = (struct dn_pipe8 *)v;
+	int i = p7->pipe_nr;
+
+	sch->sched_nr = i;
+	sch->oid.subtype = 0;
+	p->link_nr = i;
+	fs->fs_nr = i + 2*DN_MAX_ID;
+	fs->sched_nr = i + DN_MAX_ID;
+
+	/* Common to 7 and 8 */
+	p->bandwidth = p7->bandwidth;
+	p->delay = p7->delay;
+	if (!is7) {
+		/* FreeBSD 8 has burst  */
+		p->burst = p8->burst;
+	}
+
+	/* fill the fifo flowset */
+	dn_compat_config_queue(fs, v);
+	fs->fs_nr = i + 2*DN_MAX_ID;
+	fs->sched_nr = i + DN_MAX_ID;
+
+	/* Move scheduler related parameter from fs to sch */
+	sch->buckets = fs->buckets; /*XXX*/
+	fs->buckets = 0;
+	if (fs->flags & DN_HAVE_MASK) {
+		sch->flags |= DN_HAVE_MASK;
+		fs->flags &= ~DN_HAVE_MASK;
+		sch->sched_mask = fs->flow_mask;
+		bzero(&fs->flow_mask, sizeof(struct ipfw_flow_id));
+	}
+
+	return 0;
+}
+
+static int
+dn_compat_config_profile(struct dn_profile *pf, struct dn_link *p,
+			 void *v)
+{
+	struct dn_pipe8 *p8 = (struct dn_pipe8 *)v;
+
+	p8->samples = &(((struct dn_pipe_max8 *)p8)->samples[0]);
+	
+	pf->link_nr = p->link_nr;
+	pf->loss_level = p8->loss_level;
+// 	pf->bandwidth = p->bandwidth; //XXX bandwidth redundant?
+	pf->samples_no = p8->samples_no;
+	strncpy(pf->name, p8->name,sizeof(pf->name));
+	bcopy(p8->samples, pf->samples, sizeof(pf->samples));
+
+	return 0;
+}
+
+/*
+ * If p->pipe_nr != 0 the command is 'pipe x config', so need to create
+ * the three main struct, else only a flowset is created
+ */
+static int
+dn_compat_configure(void *v)
+{
+	struct dn_id *buf, *base;
+	struct dn_sch *sch = NULL;
+	struct dn_link *p = NULL;
+	struct dn_fs *fs = NULL;
+	struct dn_profile *pf = NULL;
+	int lmax;
+	int error;
+
+	struct dn_pipe7 *p7 = (struct dn_pipe7 *)v;
+	struct dn_pipe8 *p8 = (struct dn_pipe8 *)v;
+
+	int i; /* number of object to configure */
+
+	lmax = sizeof(struct dn_id);	/* command header */
+	lmax += sizeof(struct dn_sch) + sizeof(struct dn_link) +
+		sizeof(struct dn_fs) + sizeof(struct dn_profile);
+
+	base = buf = malloc(lmax, M_DUMMYNET, M_WAIT|M_ZERO);
+	o_next(&buf, sizeof(struct dn_id), DN_CMD_CONFIG);
+	base->id = DN_API_VERSION;
+
+	/* pipe_nr is the same in p7 and p8 */
+	i = p7->pipe_nr;
+	if (i != 0) { /* pipe config */
+		sch = o_next(&buf, sizeof(*sch), DN_SCH);
+		p = o_next(&buf, sizeof(*p), DN_LINK);
+		fs = o_next(&buf, sizeof(*fs), DN_FS);
+
+		error = dn_compat_config_pipe(sch, p, fs, v);
+		if (error) {
+			free(buf, M_DUMMYNET);
+			return error;
+		}
+		if (!is7 && p8->samples_no > 0) {
+			/* Add profiles*/
+			pf = o_next(&buf, sizeof(*pf), DN_PROFILE);
+			error = dn_compat_config_profile(pf, p, v);
+			if (error) {
+				free(buf, M_DUMMYNET);
+				return error;
+			}
+		}
+	} else { /* queue config */
+		fs = o_next(&buf, sizeof(*fs), DN_FS);
+		error = dn_compat_config_queue(fs, v);
+		if (error) {
+			free(buf, M_DUMMYNET);
+			return error;
+		}
+	}
+	error = do_config(base, (char *)buf - (char *)base);
+
+	return error;
+}
+
+int
+dn_compat_calc_size(struct dn_parms dn_cfg)
+{
+	int need = 0;
+	/* XXX use FreeBSD 8 struct size */
+	/* NOTE:
+	 * - half scheduler: 		schk_count/2
+	 * - all flowset:		fsk_count
+	 * - all flowset queues:	queue_count
+	 * - all pipe queue:		si_count
+	 */
+	need += dn_cfg.schk_count * sizeof(struct dn_pipe8) / 2;
+	need += dn_cfg.fsk_count * sizeof(struct dn_flow_set);
+	need += dn_cfg.si_count * sizeof(struct dn_flow_queue8);
+	need += dn_cfg.queue_count * sizeof(struct dn_flow_queue8);
+
+	return need;
+}
+
+int
+dn_c_copy_q (void *_ni, void *arg)
+{
+	struct copy_args *a = arg;
+	struct dn_flow_queue7 *fq7 = (struct dn_flow_queue7 *)*a->start;
+	struct dn_flow_queue8 *fq8 = (struct dn_flow_queue8 *)*a->start;
+	struct dn_flow *ni = (struct dn_flow *)_ni;
+	int size = 0;
+
+	/* XXX hash slot not set */
+	/* No difference between 7.2/8 */
+	fq7->len = ni->length;
+	fq7->len_bytes = ni->len_bytes;
+	fq7->id = ni->fid;
+
+	if (is7) {
+		size = sizeof(struct dn_flow_queue7);
+		fq7->tot_pkts = ni->tot_pkts;
+		fq7->tot_bytes = ni->tot_bytes;
+		fq7->drops = ni->drops;
+	} else {
+		size = sizeof(struct dn_flow_queue8);
+		fq8->tot_pkts = ni->tot_pkts;
+		fq8->tot_bytes = ni->tot_bytes;
+		fq8->drops = ni->drops;
+	}
+
+	*a->start += size;
+	return 0;
+}
+
+int
+dn_c_copy_pipe(struct dn_schk *s, struct copy_args *a, int nq)
+{
+	struct dn_link *l = &s->link;
+	struct dn_fsk *f = s->fs;
+
+	struct dn_pipe7 *pipe7 = (struct dn_pipe7 *)*a->start;
+	struct dn_pipe8 *pipe8 = (struct dn_pipe8 *)*a->start;
+	struct dn_flow_set *fs;
+	int size = 0;
+
+	if (is7) {
+		fs = &pipe7->fs;
+		size = sizeof(struct dn_pipe7);
+	} else {
+		fs = &pipe8->fs;
+		size = sizeof(struct dn_pipe8);
+	}
+
+	/* These 4 field are the same in pipe7 and pipe8 */
+	pipe7->next.sle_next = (struct dn_pipe7 *)DN_IS_PIPE;
+	pipe7->bandwidth = l->bandwidth;
+	pipe7->delay = l->delay;
+	pipe7->pipe_nr = l->link_nr - DN_MAX_ID;
+
+	if (!is7) {
+		if (s->profile) {
+			struct dn_profile *pf = s->profile;
+			strncpy(pipe8->name, pf->name, sizeof(pf->name));
+			pipe8->loss_level = pf->loss_level;
+			pipe8->samples_no = pf->samples_no;
+		}
+		pipe8->burst = div64(l->burst , 8 * hz);
+	}
+
+	fs->flow_mask = s->sch.sched_mask;
+	fs->rq_size = s->sch.buckets ? s->sch.buckets : 1;
+
+	fs->parent_nr = l->link_nr - DN_MAX_ID;
+	fs->qsize = f->fs.qsize;
+	fs->plr = f->fs.plr;
+	fs->w_q = f->fs.w_q;
+	fs->max_th = f->max_th;
+	fs->min_th = f->min_th;
+	fs->max_p = f->fs.max_p;
+	fs->rq_elements = nq;
+
+	fs->flags_fs = convertflags2old(f->fs.flags);
+
+	*a->start += size;
+	return 0;
+}
+
+
+int
+dn_compat_copy_pipe(struct copy_args *a, void *_o)
+{
+	int have = a->end - *a->start;
+	int need = 0;
+	int pipe_size = sizeof(struct dn_pipe8);
+	int queue_size = sizeof(struct dn_flow_queue8);
+	int n_queue = 0; /* number of queues */
+
+	struct dn_schk *s = (struct dn_schk *)_o;
+	/* calculate needed space:
+	 * - struct dn_pipe
+	 * - if there are instances, dn_queue * n_instances
+	 */
+	n_queue = (s->sch.flags & DN_HAVE_MASK ? dn_ht_entries(s->siht) :
+						(s->siht ? 1 : 0));
+	need = pipe_size + queue_size * n_queue;
+	if (have < need) {
+		D("have %d < need %d", have, need);
+		return 1;
+	}
+	/* copy pipe */
+	dn_c_copy_pipe(s, a, n_queue);
+
+	/* copy queues */
+	if (s->sch.flags & DN_HAVE_MASK)
+		dn_ht_scan(s->siht, dn_c_copy_q, a);
+	else if (s->siht)
+		dn_c_copy_q(s->siht, a);
+	return 0;
+}
+
+int
+dn_c_copy_fs(struct dn_fsk *f, struct copy_args *a, int nq)
+{
+	struct dn_flow_set *fs = (struct dn_flow_set *)*a->start;
+
+	fs->next.sle_next = (struct dn_flow_set *)DN_IS_QUEUE;
+	fs->fs_nr = f->fs.fs_nr;
+	fs->qsize = f->fs.qsize;
+	fs->plr = f->fs.plr;
+	fs->w_q = f->fs.w_q;
+	fs->max_th = f->max_th;
+	fs->min_th = f->min_th;
+	fs->max_p = f->fs.max_p;
+	fs->flow_mask = f->fs.flow_mask;
+	fs->rq_elements = nq;
+	fs->rq_size = (f->fs.buckets ? f->fs.buckets : 1);
+	fs->parent_nr = f->fs.sched_nr;
+	fs->weight = f->fs.par[0];
+
+	fs->flags_fs = convertflags2old(f->fs.flags);
+	*a->start += sizeof(struct dn_flow_set);
+	return 0;
+}
+
+int
+dn_compat_copy_queue(struct copy_args *a, void *_o)
+{
+	int have = a->end - *a->start;
+	int need = 0;
+	int fs_size = sizeof(struct dn_flow_set);
+	int queue_size = sizeof(struct dn_flow_queue8);
+
+	struct dn_fsk *fs = (struct dn_fsk *)_o;
+	int n_queue = 0; /* number of queues */
+
+	n_queue = (fs->fs.flags & DN_HAVE_MASK ? dn_ht_entries(fs->qht) :
+						(fs->qht ? 1 : 0));
+
+	need = fs_size + queue_size * n_queue;
+	if (have < need) {
+		D("have < need");
+		return 1;
+	}
+
+	/* copy flowset */
+	dn_c_copy_fs(fs, a, n_queue);
+
+	/* copy queues */
+	if (fs->fs.flags & DN_HAVE_MASK)
+		dn_ht_scan(fs->qht, dn_c_copy_q, a);
+	else if (fs->qht)
+		dn_c_copy_q(fs->qht, a);
+
+	return 0;
+}
+
+int
+copy_data_helper_compat(void *_o, void *_arg)
+{
+	struct copy_args *a = _arg;
+
+	if (a->type == DN_COMPAT_PIPE) {
+		struct dn_schk *s = _o;
+		if (s->sch.oid.subtype != 1 || s->sch.sched_nr <= DN_MAX_ID) {
+			return 0;	/* not old type */
+		}
+		/* copy pipe parameters, and if instance exists, copy
+		 * other parameters and eventually queues.
+		 */
+		if(dn_compat_copy_pipe(a, _o))
+			return DNHT_SCAN_END;
+	} else if (a->type == DN_COMPAT_QUEUE) {
+		struct dn_fsk *fs = _o;
+		if (fs->fs.fs_nr >= DN_MAX_ID)
+			return 0;
+		if (dn_compat_copy_queue(a, _o))
+			return DNHT_SCAN_END;
+	}
+	return 0;
+}
+
+/* Main function to manage old requests */
+int
+ip_dummynet_compat(struct sockopt *sopt)
+{
+	int error=0;
+	void *v = NULL;
+	struct dn_id oid;
+
+	/* Lenght of data, used to found ipfw version... */
+	int len = sopt->sopt_valsize;
+
+	/* len can be 0 if command was dummynet_flush */
+	if (len == pipesize7) {
+		D("setting compatibility with FreeBSD 7.2");
+		is7 = 1;
+	}
+	else if (len == pipesize8 || len == pipesizemax8) {
+		D("setting compatibility with FreeBSD 8");
+		is7 = 0;
+	}
+
+	switch (sopt->sopt_name) {
+	default:
+		printf("dummynet: -- unknown option %d", sopt->sopt_name);
+		error = EINVAL;
+		break;
+
+	case IP_DUMMYNET_FLUSH:
+		oid_fill(&oid, sizeof(oid), DN_CMD_FLUSH, DN_API_VERSION);
+		do_config(&oid, oid.len);
+		break;
+
+	case IP_DUMMYNET_DEL:
+		v = malloc(len, M_TEMP, M_WAITOK);
+		error = sooptcopyin(sopt, v, len, len);
+		if (error)
+			break;
+		error = dn_compat_del(v);
+		free(v, M_DUMMYNET);
+		break;
+
+	case IP_DUMMYNET_CONFIGURE:
+		v = malloc(len, M_TEMP, M_WAITOK);
+		error = sooptcopyin(sopt, v, len, len);
+		if (error)
+			break;
+		error = dn_compat_configure(v);
+		free(v, M_DUMMYNET);
+		break;
+
+	case IP_DUMMYNET_GET: {
+		void *buf;
+		int ret;
+		int original_size = sopt->sopt_valsize;
+		int size;
+
+		ret = dummynet_get(sopt, &buf);
+		if (ret)
+			return 0;//XXX ?
+		size = sopt->sopt_valsize;
+		sopt->sopt_valsize = original_size;
+		D("size=%d, buf=%p", size, buf);
+		ret = sooptcopyout(sopt, buf, size);
+		if (ret)
+			printf("  %s ERROR sooptcopyout\n", __FUNCTION__);
+		if (buf)
+			free(buf, M_DUMMYNET);
+	    }
+	}
+
+	return error;
+}
+
+
diff --git a/sys/netinet/ipfw/ip_dn_io.c b/sys/netinet/ipfw/ip_dn_io.c
new file mode 100644
index 0000000..4219433
--- /dev/null
+++ b/sys/netinet/ipfw/ip_dn_io.c
@@ -0,0 +1,781 @@
+/*-
+ * Copyright (c) 2010 Luigi Rizzo, Riccardo Panicucci, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Dummynet portions related to packet handling.
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <sys/sysctl.h>
+#include <net/if.h>	/* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */
+#include <net/netisr.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>		/* ip_len, ip_off */
+#include <netinet/ip_var.h>	/* ip_output(), IP_FORWARDING */
+#include <netinet/ip_fw.h>
+#include <netinet/ipfw/ip_fw_private.h>
+#include <netinet/ipfw/dn_heap.h>
+#include <netinet/ip_dummynet.h>
+#include <netinet/ipfw/ip_dn_private.h>
+#include <netinet/ipfw/dn_sched.h>
+
+#include <netinet/if_ether.h> /* various ether_* routines */
+
+#include <netinet/ip6.h>       /* for ip6_input, ip6_output prototypes */
+#include <netinet6/ip6_var.h>
+
+/*
+ * We keep a private variable for the simulation time, but we could
+ * probably use an existing one ("softticks" in sys/kern/kern_timeout.c)
+ * instead of dn_cfg.curr_time
+ */
+
+struct dn_parms dn_cfg;
+
+static long tick_last;		/* Last tick duration (usec). */
+static long tick_delta;		/* Last vs standard tick diff (usec). */
+static long tick_delta_sum;	/* Accumulated tick difference (usec).*/
+static long tick_adjustment;	/* Tick adjustments done. */
+static long tick_lost;		/* Lost(coalesced) ticks number. */
+/* Adjusted vs non-adjusted curr_time difference (ticks). */
+static long tick_diff;
+
+static unsigned long	io_pkt;
+static unsigned long	io_pkt_fast;
+static unsigned long	io_pkt_drop;
+
+/*
+ * We use a heap to store entities for which we have pending timer events.
+ * The heap is checked at every tick and all entities with expired events
+ * are extracted.
+ */
+  
+MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap");
+
+extern	void (*bridge_dn_p)(struct mbuf *, struct ifnet *);
+
+#ifdef SYSCTL_NODE
+
+SYSBEGIN(f4)
+
+SYSCTL_DECL(_net_inet);
+SYSCTL_DECL(_net_inet_ip);
+SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet");
+
+/* parameters */
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size,
+    CTLFLAG_RW, &dn_cfg.hash_size, 0, "Default hash table size");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_slot_limit,
+    CTLFLAG_RW, &dn_cfg.slot_limit, 0,
+    "Upper limit in slots for pipe queue.");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_byte_limit,
+    CTLFLAG_RW, &dn_cfg.byte_limit, 0,
+    "Upper limit in bytes for pipe queue.");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, io_fast,
+    CTLFLAG_RW, &dn_cfg.io_fast, 0, "Enable fast dummynet io.");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug,
+    CTLFLAG_RW, &dn_cfg.debug, 0, "Dummynet debug level");
+
+/* RED parameters */
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth,
+    CTLFLAG_RD, &dn_cfg.red_lookup_depth, 0, "Depth of RED lookup table");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size,
+    CTLFLAG_RD, &dn_cfg.red_avg_pkt_size, 0, "RED Medium packet size");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size,
+    CTLFLAG_RD, &dn_cfg.red_max_pkt_size, 0, "RED Max packet size");
+
+/* time adjustment */
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta,
+    CTLFLAG_RD, &tick_delta, 0, "Last vs standard tick difference (usec).");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta_sum,
+    CTLFLAG_RD, &tick_delta_sum, 0, "Accumulated tick difference (usec).");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_adjustment,
+    CTLFLAG_RD, &tick_adjustment, 0, "Tick adjustments done.");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_diff,
+    CTLFLAG_RD, &tick_diff, 0,
+    "Adjusted vs non-adjusted curr_time difference (ticks).");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_lost,
+    CTLFLAG_RD, &tick_lost, 0,
+    "Number of ticks coalesced by dummynet taskqueue.");
+
+/* statistics */
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, schk_count,
+    CTLFLAG_RD, &dn_cfg.schk_count, 0, "Number of schedulers");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, si_count,
+    CTLFLAG_RD, &dn_cfg.si_count, 0, "Number of scheduler instances");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, fsk_count,
+    CTLFLAG_RD, &dn_cfg.fsk_count, 0, "Number of flowsets");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, queue_count,
+    CTLFLAG_RD, &dn_cfg.queue_count, 0, "Number of queues");
+SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt,
+    CTLFLAG_RD, &io_pkt, 0,
+    "Number of packets passed to dummynet.");
+SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_fast,
+    CTLFLAG_RD, &io_pkt_fast, 0,
+    "Number of packets bypassed dummynet scheduler.");
+SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_drop,
+    CTLFLAG_RD, &io_pkt_drop, 0,
+    "Number of packets dropped by dummynet.");
+
+SYSEND
+
+#endif
+
+static void	dummynet_send(struct mbuf *);
+
+/*
+ * Packets processed by dummynet have an mbuf tag associated with
+ * them that carries their dummynet state.
+ * Outside dummynet, only the 'rule' field is relevant, and it must
+ * be at the beginning of the structure.
+ */
+struct dn_pkt_tag {
+	struct ipfw_rule_ref rule;	/* matching rule	*/
+
+	/* second part, dummynet specific */
+	int dn_dir;		/* action when packet comes out.*/
+				/* see ip_fw_private.h		*/
+	uint64_t output_time;	/* when the pkt is due for delivery*/
+	struct ifnet *ifp;	/* interface, for ip_output	*/
+	struct _ip6dn_args ip6opt;	/* XXX ipv6 options	*/
+};
+
+/*
+ * Return the mbuf tag holding the dummynet state (it should
+ * be the first one on the list).
+ */
+static struct dn_pkt_tag *
+dn_tag_get(struct mbuf *m)
+{
+	struct m_tag *mtag = m_tag_first(m);
+	KASSERT(mtag != NULL &&
+	    mtag->m_tag_cookie == MTAG_ABI_COMPAT &&
+	    mtag->m_tag_id == PACKET_TAG_DUMMYNET,
+	    ("packet on dummynet queue w/o dummynet tag!"));
+	return (struct dn_pkt_tag *)(mtag+1);
+}
+
+static inline void
+mq_append(struct mq *q, struct mbuf *m)
+{
+	if (q->head == NULL)
+		q->head = m;
+	else
+		q->tail->m_nextpkt = m;
+	q->tail = m;
+	m->m_nextpkt = NULL;
+}
+
+/*
+ * Dispose a list of packet. Use a functions so if we need to do
+ * more work, this is a central point to do it.
+ */
+void dn_free_pkts(struct mbuf *mnext)
+{
+        struct mbuf *m;
+    
+        while ((m = mnext) != NULL) {
+                mnext = m->m_nextpkt;
+                FREE_PKT(m);
+        }
+}
+
+static int
+red_drops (struct dn_queue *q, int len)
+{
+	/*
+	 * RED algorithm
+	 *
+	 * RED calculates the average queue size (avg) using a low-pass filter
+	 * with an exponential weighted (w_q) moving average:
+	 * 	avg  <-  (1-w_q) * avg + w_q * q_size
+	 * where q_size is the queue length (measured in bytes or * packets).
+	 *
+	 * If q_size == 0, we compute the idle time for the link, and set
+	 *	avg = (1 - w_q)^(idle/s)
+	 * where s is the time needed for transmitting a medium-sized packet.
+	 *
+	 * Now, if avg < min_th the packet is enqueued.
+	 * If avg > max_th the packet is dropped. Otherwise, the packet is
+	 * dropped with probability P function of avg.
+	 */
+
+	struct dn_fsk *fs = q->fs;
+	int64_t p_b = 0;
+
+	/* Queue in bytes or packets? */
+	uint32_t q_size = (fs->fs.flags & DN_QSIZE_BYTES) ?
+	    q->ni.len_bytes : q->ni.length;
+
+	/* Average queue size estimation. */
+	if (q_size != 0) {
+		/* Queue is not empty, avg <- avg + (q_size - avg) * w_q */
+		int diff = SCALE(q_size) - q->avg;
+		int64_t v = SCALE_MUL((int64_t)diff, (int64_t)fs->w_q);
+
+		q->avg += (int)v;
+	} else {
+		/*
+		 * Queue is empty, find for how long the queue has been
+		 * empty and use a lookup table for computing
+		 * (1 - * w_q)^(idle_time/s) where s is the time to send a
+		 * (small) packet.
+		 * XXX check wraps...
+		 */
+		if (q->avg) {
+			u_int t = div64((dn_cfg.curr_time - q->q_time), fs->lookup_step);
+
+			q->avg = (t < fs->lookup_depth) ?
+			    SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0;
+		}
+	}
+
+	/* Should i drop? */
+	if (q->avg < fs->min_th) {
+		q->count = -1;
+		return (0);	/* accept packet */
+	}
+	if (q->avg >= fs->max_th) {	/* average queue >=  max threshold */
+		if (fs->fs.flags & DN_IS_GENTLE_RED) {
+			/*
+			 * According to Gentle-RED, if avg is greater than
+			 * max_th the packet is dropped with a probability
+			 *	 p_b = c_3 * avg - c_4
+			 * where c_3 = (1 - max_p) / max_th
+			 *       c_4 = 1 - 2 * max_p
+			 */
+			p_b = SCALE_MUL((int64_t)fs->c_3, (int64_t)q->avg) -
+			    fs->c_4;
+		} else {
+			q->count = -1;
+			return (1);
+		}
+	} else if (q->avg > fs->min_th) {
+		/*
+		 * We compute p_b using the linear dropping function
+		 *	 p_b = c_1 * avg - c_2
+		 * where c_1 = max_p / (max_th - min_th)
+		 * 	 c_2 = max_p * min_th / (max_th - min_th)
+		 */
+		p_b = SCALE_MUL((int64_t)fs->c_1, (int64_t)q->avg) - fs->c_2;
+	}
+
+	if (fs->fs.flags & DN_QSIZE_BYTES)
+		p_b = div64((p_b * len) , fs->max_pkt_size);
+	if (++q->count == 0)
+		q->random = random() & 0xffff;
+	else {
+		/*
+		 * q->count counts packets arrived since last drop, so a greater
+		 * value of q->count means a greater packet drop probability.
+		 */
+		if (SCALE_MUL(p_b, SCALE((int64_t)q->count)) > q->random) {
+			q->count = 0;
+			/* After a drop we calculate a new random value. */
+			q->random = random() & 0xffff;
+			return (1);	/* drop */
+		}
+	}
+	/* End of RED algorithm. */
+
+	return (0);	/* accept */
+
+}
+
+/*
+ * Enqueue a packet in q, subject to space and queue management policy
+ * (whose parameters are in q->fs).
+ * Update stats for the queue and the scheduler.
+ * Return 0 on success, 1 on drop. The packet is consumed anyways.
+ */
+int
+dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop)
+{   
+	struct dn_fs *f;
+	struct dn_flow *ni;	/* stats for scheduler instance */
+	uint64_t len;
+
+	if (q->fs == NULL || q->_si == NULL) {
+		printf("%s fs %p si %p, dropping\n",
+			__FUNCTION__, q->fs, q->_si);
+		FREE_PKT(m);
+		return 1;
+	}
+	f = &(q->fs->fs);
+	ni = &q->_si->ni;
+	len = m->m_pkthdr.len;
+	/* Update statistics, then check reasons to drop pkt. */
+	q->ni.tot_bytes += len;
+	q->ni.tot_pkts++;
+	ni->tot_bytes += len;
+	ni->tot_pkts++;
+	if (drop)
+		goto drop;
+	if (f->plr && random() < f->plr)
+		goto drop;
+	if (f->flags & DN_IS_RED && red_drops(q, m->m_pkthdr.len))
+		goto drop;
+	if (f->flags & DN_QSIZE_BYTES) {
+		if (q->ni.len_bytes > f->qsize)
+			goto drop;
+	} else if (q->ni.length >= f->qsize) {
+		goto drop;
+	}
+	mq_append(&q->mq, m);
+	q->ni.length++;
+	q->ni.len_bytes += len;
+	ni->length++;
+	ni->len_bytes += len;
+	return 0;
+
+drop:
+	io_pkt_drop++;
+	q->ni.drops++;
+	ni->drops++;
+	FREE_PKT(m);
+	return 1;
+}
+
+/*
+ * Fetch packets from the delay line which are due now. If there are
+ * leftover packets, reinsert the delay line in the heap.
+ * Runs under scheduler lock.
+ */
+static void
+transmit_event(struct mq *q, struct delay_line *dline, uint64_t now)
+{
+	struct mbuf *m;
+	struct dn_pkt_tag *pkt = NULL;
+
+	dline->oid.subtype = 0; /* not in heap */
+	while ((m = dline->mq.head) != NULL) {
+		pkt = dn_tag_get(m);
+		if (!DN_KEY_LEQ(pkt->output_time, now))
+			break;
+		dline->mq.head = m->m_nextpkt;
+		mq_append(q, m);
+	}
+	if (m != NULL) {
+		dline->oid.subtype = 1; /* in heap */
+		heap_insert(&dn_cfg.evheap, pkt->output_time, dline);
+	}
+}
+
+/*
+ * Convert the additional MAC overheads/delays into an equivalent
+ * number of bits for the given data rate. The samples are
+ * in milliseconds so we need to divide by 1000.
+ */
+static uint64_t
+extra_bits(struct mbuf *m, struct dn_schk *s)
+{
+	int index;
+	uint64_t bits;
+	struct dn_profile *pf = s->profile;
+
+	if (!pf || pf->samples_no == 0)
+		return 0;
+	index  = random() % pf->samples_no;
+	bits = div64((uint64_t)pf->samples[index] * s->link.bandwidth, 1000);
+	if (index >= pf->loss_level) {
+		struct dn_pkt_tag *dt = dn_tag_get(m);
+		if (dt)
+			dt->dn_dir = DIR_DROP;
+	}
+	return bits;
+}
+
+/*
+ * Send traffic from a scheduler instance due by 'now'.
+ * Return a pointer to the head of the queue.
+ */
+static struct mbuf *
+serve_sched(struct mq *q, struct dn_sch_inst *si, uint64_t now)
+{
+	struct mq def_q;
+	struct dn_schk *s = si->sched;
+	struct mbuf *m = NULL;
+	int delay_line_idle = (si->dline.mq.head == NULL);
+	int done, bw;
+
+	if (q == NULL) {
+		q = &def_q;
+		q->head = NULL;
+	}
+
+	bw = s->link.bandwidth;
+	si->kflags &= ~DN_ACTIVE;
+
+	if (bw > 0)
+		si->credit += (now - si->sched_time) * bw;
+	else
+		si->credit = 0;
+	si->sched_time = now;
+	done = 0;
+	while (si->credit >= 0 && (m = s->fp->dequeue(si)) != NULL) {
+		uint64_t len_scaled;
+		done++;
+		len_scaled = (bw == 0) ? 0 : hz *
+		    (m->m_pkthdr.len * 8 + extra_bits(m, s));
+		si->credit -= len_scaled;
+		/* Move packet in the delay line */
+		dn_tag_get(m)->output_time += s->link.delay ;
+		mq_append(&si->dline.mq, m);
+	}
+	/*
+	 * If credit >= 0 the instance is idle, mark time.
+	 * Otherwise put back in the heap, and adjust the output
+	 * time of the last inserted packet, m, which was too early.
+	 */
+	if (si->credit >= 0) {
+		si->idle_time = now;
+	} else {
+		uint64_t t;
+		KASSERT (bw > 0, ("bw=0 and credit<0 ?"));
+		t = div64(bw - 1 - si->credit, bw);
+		if (m)
+			dn_tag_get(m)->output_time += t;
+		si->kflags |= DN_ACTIVE;
+		heap_insert(&dn_cfg.evheap, now + t, si);
+	}
+	if (delay_line_idle && done)
+		transmit_event(q, &si->dline, now);
+	return q->head;
+}
+
+/*
+ * The timer handler for dummynet. Time is computed in ticks, but
+ * but the code is tolerant to the actual rate at which this is called.
+ * Once complete, the function reschedules itself for the next tick.
+ */
+void
+dummynet_task(void *context, int pending)
+{
+	struct timeval t;
+	struct mq q = { NULL, NULL }; /* queue to accumulate results */
+
+	DN_BH_WLOCK();
+
+	/* Update number of lost(coalesced) ticks. */
+	tick_lost += pending - 1;
+
+	getmicrouptime(&t);
+	/* Last tick duration (usec). */
+	tick_last = (t.tv_sec - dn_cfg.prev_t.tv_sec) * 1000000 +
+	(t.tv_usec - dn_cfg.prev_t.tv_usec);
+	/* Last tick vs standard tick difference (usec). */
+	tick_delta = (tick_last * hz - 1000000) / hz;
+	/* Accumulated tick difference (usec). */
+	tick_delta_sum += tick_delta;
+
+	dn_cfg.prev_t = t;
+
+	/*
+	* Adjust curr_time if the accumulated tick difference is
+	* greater than the 'standard' tick. Since curr_time should
+	* be monotonically increasing, we do positive adjustments
+	* as required, and throttle curr_time in case of negative
+	* adjustment.
+	*/
+	dn_cfg.curr_time++;
+	if (tick_delta_sum - tick >= 0) {
+		int diff = tick_delta_sum / tick;
+
+		dn_cfg.curr_time += diff;
+		tick_diff += diff;
+		tick_delta_sum %= tick;
+		tick_adjustment++;
+	} else if (tick_delta_sum + tick <= 0) {
+		dn_cfg.curr_time--;
+		tick_diff--;
+		tick_delta_sum += tick;
+		tick_adjustment++;
+	}
+
+	/* serve pending events, accumulate in q */
+	for (;;) {
+		struct dn_id *p;    /* generic parameter to handler */
+
+		if (dn_cfg.evheap.elements == 0 ||
+		    DN_KEY_LT(dn_cfg.curr_time, HEAP_TOP(&dn_cfg.evheap)->key))
+			break;
+		p = HEAP_TOP(&dn_cfg.evheap)->object;
+		heap_extract(&dn_cfg.evheap, NULL);
+
+		if (p->type == DN_SCH_I) {
+			serve_sched(&q, (struct dn_sch_inst *)p, dn_cfg.curr_time);
+		} else { /* extracted a delay line */
+			transmit_event(&q, (struct delay_line *)p, dn_cfg.curr_time);
+		}
+	}
+	dn_drain_scheduler();
+	dn_drain_queue();
+
+	DN_BH_WUNLOCK();
+	dn_reschedule();
+	if (q.head != NULL)
+		dummynet_send(q.head);
+}
+
+/*
+ * forward a chain of packets to the proper destination.
+ * This runs outside the dummynet lock.
+ */
+static void
+dummynet_send(struct mbuf *m)
+{
+	struct mbuf *n;
+
+	for (; m != NULL; m = n) {
+		struct ifnet *ifp = NULL;	/* gcc 3.4.6 complains */
+        	struct m_tag *tag;
+		int dst;
+
+		n = m->m_nextpkt;
+		m->m_nextpkt = NULL;
+		tag = m_tag_first(m);
+		if (tag == NULL) { /* should not happen */
+			dst = DIR_DROP;
+		} else {
+			struct dn_pkt_tag *pkt = dn_tag_get(m);
+			/* extract the dummynet info, rename the tag
+			 * to carry reinject info.
+			 */
+			dst = pkt->dn_dir;
+			ifp = pkt->ifp;
+			tag->m_tag_cookie = MTAG_IPFW_RULE;
+			tag->m_tag_id = 0;
+		}
+
+		switch (dst) {
+		case DIR_OUT:
+			SET_HOST_IPLEN(mtod(m, struct ip *));
+			ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL);
+			break ;
+
+		case DIR_IN :
+			/* put header in network format for ip_input() */
+			//SET_NET_IPLEN(mtod(m, struct ip *));
+			netisr_dispatch(NETISR_IP, m);
+			break;
+
+#ifdef INET6
+		case DIR_IN | PROTO_IPV6:
+			netisr_dispatch(NETISR_IPV6, m);
+			break;
+
+		case DIR_OUT | PROTO_IPV6:
+			SET_HOST_IPLEN(mtod(m, struct ip *));
+			ip6_output(m, NULL, NULL, IPV6_FORWARDING, NULL, NULL, NULL);
+			break;
+#endif
+
+		case DIR_FWD | PROTO_IFB: /* DN_TO_IFB_FWD: */
+			if (bridge_dn_p != NULL)
+				((*bridge_dn_p)(m, ifp));
+			else
+				printf("dummynet: if_bridge not loaded\n");
+
+			break;
+
+		case DIR_IN | PROTO_LAYER2: /* DN_TO_ETH_DEMUX: */
+			/*
+			 * The Ethernet code assumes the Ethernet header is
+			 * contiguous in the first mbuf header.
+			 * Insure this is true.
+			 */
+			if (m->m_len < ETHER_HDR_LEN &&
+			    (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) {
+				printf("dummynet/ether: pullup failed, "
+				    "dropping packet\n");
+				break;
+			}
+			ether_demux(m->m_pkthdr.rcvif, m);
+			break;
+
+		case DIR_OUT | PROTO_LAYER2: /* N_TO_ETH_OUT: */
+			ether_output_frame(ifp, m);
+			break;
+
+		case DIR_DROP:
+			/* drop the packet after some time */
+			FREE_PKT(m);
+			break;
+
+		default:
+			printf("dummynet: bad switch %d!\n", dst);
+			FREE_PKT(m);
+			break;
+		}
+	}
+}
+
+static inline int
+tag_mbuf(struct mbuf *m, int dir, struct ip_fw_args *fwa)
+{
+	struct dn_pkt_tag *dt;
+	struct m_tag *mtag;
+
+	mtag = m_tag_get(PACKET_TAG_DUMMYNET,
+		    sizeof(*dt), M_NOWAIT | M_ZERO);
+	if (mtag == NULL)
+		return 1;		/* Cannot allocate packet header. */
+	m_tag_prepend(m, mtag);		/* Attach to mbuf chain. */
+	dt = (struct dn_pkt_tag *)(mtag + 1);
+	dt->rule = fwa->rule;
+	dt->rule.info &= IPFW_ONEPASS;	/* only keep this info */
+	dt->dn_dir = dir;
+	dt->ifp = fwa->oif;
+	/* dt->output tame is updated as we move through */
+	dt->output_time = dn_cfg.curr_time;
+	return 0;
+}
+
+
+/*
+ * dummynet hook for packets.
+ * We use the argument to locate the flowset fs and the sched_set sch
+ * associated to it. The we apply flow_mask and sched_mask to
+ * determine the queue and scheduler instances.
+ *
+ * dir		where shall we send the packet after dummynet.
+ * *m0		the mbuf with the packet
+ * ifp		the 'ifp' parameter from the caller.
+ *		NULL in ip_input, destination interface in ip_output,
+ */
+int
+dummynet_io(struct mbuf **m0, int dir, struct ip_fw_args *fwa)
+{
+	struct mbuf *m = *m0;
+	struct dn_fsk *fs = NULL;
+	struct dn_sch_inst *si;
+	struct dn_queue *q = NULL;	/* default */
+
+	int fs_id = (fwa->rule.info & IPFW_INFO_MASK) +
+		((fwa->rule.info & IPFW_IS_PIPE) ? 2*DN_MAX_ID : 0);
+	DN_BH_WLOCK();
+	io_pkt++;
+	/* we could actually tag outside the lock, but who cares... */
+	if (tag_mbuf(m, dir, fwa))
+		goto dropit;
+	if (dn_cfg.busy) {
+		/* if the upper half is busy doing something expensive,
+		 * lets queue the packet and move forward
+		 */
+		mq_append(&dn_cfg.pending, m);
+		m = *m0 = NULL; /* consumed */
+		goto done; /* already active, nothing to do */
+	}
+	/* XXX locate_flowset could be optimised with a direct ref. */
+	fs = dn_ht_find(dn_cfg.fshash, fs_id, 0, NULL);
+	if (fs == NULL)
+		goto dropit;	/* This queue/pipe does not exist! */
+	if (fs->sched == NULL)	/* should not happen */
+		goto dropit;
+	/* find scheduler instance, possibly applying sched_mask */
+	si = ipdn_si_find(fs->sched, &(fwa->f_id));
+	if (si == NULL)
+		goto dropit;
+	/*
+	 * If the scheduler supports multiple queues, find the right one
+	 * (otherwise it will be ignored by enqueue).
+	 */
+	if (fs->sched->fp->flags & DN_MULTIQUEUE) {
+		q = ipdn_q_find(fs, si, &(fwa->f_id));
+		if (q == NULL)
+			goto dropit;
+	}
+	if (fs->sched->fp->enqueue(si, q, m)) {
+		printf("%s dropped by enqueue\n", __FUNCTION__);
+		/* packet was dropped by enqueue() */
+		m = *m0 = NULL;
+		goto dropit;
+	}
+
+	if (si->kflags & DN_ACTIVE) {
+		m = *m0 = NULL; /* consumed */
+		goto done; /* already active, nothing to do */
+	}
+
+	/* compute the initial allowance */
+	{
+	    struct dn_link *p = &fs->sched->link;
+	    si->credit = dn_cfg.io_fast ? p->bandwidth : 0;
+	    if (p->burst) {
+		uint64_t burst = (dn_cfg.curr_time - si->idle_time) * p->bandwidth;
+		if (burst > p->burst)
+			burst = p->burst;
+		si->credit += burst;
+	    }
+	}
+	/* pass through scheduler and delay line */
+	m = serve_sched(NULL, si, dn_cfg.curr_time);
+
+	/* optimization -- pass it back to ipfw for immediate send */
+	/* XXX Don't call dummynet_send() if scheduler return the packet
+	 *     just enqueued. This avoid a lock order reversal.
+	 *     
+	 */
+	if (/*dn_cfg.io_fast &&*/ m == *m0 && (dir & PROTO_LAYER2) == 0 ) {
+		/* fast io */
+		io_pkt_fast++;
+		if (m->m_nextpkt != NULL) {
+			printf("dummynet: fast io: pkt chain detected!\n");
+			m->m_nextpkt = NULL;
+		}
+		m = NULL;
+	} else {
+		*m0 = NULL;
+	}
+done:
+	DN_BH_WUNLOCK();
+	if (m)
+		dummynet_send(m);
+	return 0;
+
+dropit:
+	io_pkt_drop++;
+	DN_BH_WUNLOCK();
+	if (m)
+		FREE_PKT(m);
+	*m0 = NULL;
+	return (fs && (fs->fs.flags & DN_NOERROR)) ? 0 : ENOBUFS;
+}
diff --git a/sys/netinet/ipfw/ip_dn_private.h b/sys/netinet/ipfw/ip_dn_private.h
new file mode 100644
index 0000000..0f66fef
--- /dev/null
+++ b/sys/netinet/ipfw/ip_dn_private.h
@@ -0,0 +1,387 @@
+/*-
+ * Copyright (c) 2010 Luigi Rizzo, Riccardo Panicucci, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * internal dummynet APIs.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IP_DN_PRIVATE_H
+#define _IP_DN_PRIVATE_H
+
+/* debugging support
+ * use ND() to remove debugging, D() to print a line,
+ * DX(level, ...) to print above a certain level
+ * If you redefine D() you are expected to redefine all.
+ */
+#ifndef D
+#define ND(fmt, ...) do {} while (0)
+#define D1(fmt, ...) do {} while (0)
+#define D(fmt, ...) printf("%-10s " fmt "\n",      \
+        __FUNCTION__, ## __VA_ARGS__)
+#define DX(lev, fmt, ...) do {              \
+        if (dn_cfg.debug > lev) D(fmt, ## __VA_ARGS__); } while (0)
+#endif
+
+MALLOC_DECLARE(M_DUMMYNET);
+
+#ifndef FREE_PKT
+#define	FREE_PKT(m)	m_freem(m)
+#endif
+
+#ifndef __linux__
+#define div64(a, b)  ((int64_t)(a) / (int64_t)(b))
+#endif
+
+#define DN_LOCK_INIT() do {				\
+	mtx_init(&dn_cfg.uh_mtx, "dn_uh", NULL, MTX_DEF);	\
+	mtx_init(&dn_cfg.bh_mtx, "dn_bh", NULL, MTX_DEF);	\
+	} while (0)
+#define DN_LOCK_DESTROY() do {				\
+	mtx_destroy(&dn_cfg.uh_mtx);			\
+	mtx_destroy(&dn_cfg.bh_mtx);			\
+	} while (0)
+#if 0 /* not used yet */
+#define DN_UH_RLOCK()		mtx_lock(&dn_cfg.uh_mtx)
+#define DN_UH_RUNLOCK()		mtx_unlock(&dn_cfg.uh_mtx)
+#define DN_UH_WLOCK()		mtx_lock(&dn_cfg.uh_mtx)
+#define DN_UH_WUNLOCK()		mtx_unlock(&dn_cfg.uh_mtx)
+#define DN_UH_LOCK_ASSERT()	mtx_assert(&dn_cfg.uh_mtx, MA_OWNED)
+#endif
+
+#define DN_BH_RLOCK()		mtx_lock(&dn_cfg.uh_mtx)
+#define DN_BH_RUNLOCK()		mtx_unlock(&dn_cfg.uh_mtx)
+#define DN_BH_WLOCK()		mtx_lock(&dn_cfg.uh_mtx)
+#define DN_BH_WUNLOCK()		mtx_unlock(&dn_cfg.uh_mtx)
+#define DN_BH_LOCK_ASSERT()	mtx_assert(&dn_cfg.uh_mtx, MA_OWNED)
+
+SLIST_HEAD(dn_schk_head, dn_schk);
+SLIST_HEAD(dn_sch_inst_head, dn_sch_inst);
+SLIST_HEAD(dn_fsk_head, dn_fsk);
+SLIST_HEAD(dn_queue_head, dn_queue);
+SLIST_HEAD(dn_alg_head, dn_alg);
+
+struct mq {	/* a basic queue of packets*/
+        struct mbuf *head, *tail;
+};
+
+static inline void
+set_oid(struct dn_id *o, int type, int len)
+{
+        o->type = type;
+        o->len = len;
+        o->subtype = 0;
+};
+
+/*
+ * configuration and global data for a dummynet instance
+ *
+ * When a configuration is modified from userland, 'id' is incremented
+ * so we can use the value to check for stale pointers.
+ */
+struct dn_parms {
+	uint32_t	id;		/* configuration version */
+
+	/* defaults (sysctl-accessible) */
+	int	red_lookup_depth;
+	int	red_avg_pkt_size;
+	int	red_max_pkt_size;
+	int	hash_size;
+	int	max_hash_size;
+	long	byte_limit;		/* max queue sizes */
+	long	slot_limit;
+
+	int	io_fast;
+	int	debug;
+
+	/* timekeeping */
+	struct timeval prev_t;		/* last time dummynet_tick ran */
+	struct dn_heap	evheap;		/* scheduled events */
+
+	/* counters of objects -- used for reporting space */
+	int	schk_count;
+	int	si_count;
+	int	fsk_count;
+	int	queue_count;
+
+	/* ticks and other stuff */
+	uint64_t	curr_time;
+	/* flowsets and schedulers are in hash tables, with 'hash_size'
+	 * buckets. fshash is looked up at every packet arrival
+	 * so better be generous if we expect many entries.
+	 */
+	struct dn_ht	*fshash;
+	struct dn_ht	*schedhash;
+	/* list of flowsets without a scheduler -- use sch_chain */
+	struct dn_fsk_head	fsu;	/* list of unlinked flowsets */
+	struct dn_alg_head	schedlist;	/* list of algorithms */
+
+	/* Store the fs/sch to scan when draining. The value is the
+	 * bucket number of the hash table 
+	 **/
+	int drain_fs;
+	int drain_sch;
+	
+	/* if the upper half is busy doing something long,
+	 * can set the busy flag and we will enqueue packets in
+	 * a queue for later processing.
+	 */
+	int	busy;
+	struct	mq	pending;
+
+#ifdef _KERNEL
+	/*
+	 * This file is normally used in the kernel, unless we do
+	 * some userland tests, in which case we do not need a mtx.
+	 * uh_mtx arbitrates between system calls and also
+	 * protects fshash, schedhash and fsunlinked.
+	 * These structures are readonly for the lower half.
+	 * bh_mtx protects all other structures which may be
+	 * modified upon packet arrivals
+	 */
+#if defined( __linux__ ) || defined( _WIN32 )
+	spinlock_t uh_mtx;
+	spinlock_t bh_mtx;
+#else
+	struct mtx uh_mtx;
+	struct mtx bh_mtx;
+#endif
+
+#endif /* _KERNEL */
+};
+
+/*
+ * Delay line, contains all packets on output from a link.
+ * Every scheduler instance has one.
+ */
+struct delay_line {
+	struct dn_id oid;
+	struct dn_sch_inst *si;
+	struct mq mq;
+};
+
+/*
+ * The kernel side of a flowset. It is linked in a hash table
+ * of flowsets, and in a list of children of their parent scheduler.
+ * qht is either the queue or (if HAVE_MASK) a hash table queues.
+ * Note that the mask to use is the (flow_mask|sched_mask), which
+ * changes as we attach/detach schedulers. So we store it here.
+ *
+ * XXX If we want to add scheduler-specific parameters, we need to
+ * put them in external storage because the scheduler may not be
+ * available when the fsk is created.
+ */
+struct dn_fsk { /* kernel side of a flowset */
+	struct dn_fs fs;
+	SLIST_ENTRY(dn_fsk) fsk_next;	/* hash chain for fshash */
+
+	struct ipfw_flow_id fsk_mask;
+
+	/* qht is a hash table of queues, or just a single queue
+	 * a bit in fs.flags tells us which one
+	 */
+	struct dn_ht	*qht;
+	struct dn_schk *sched;		/* Sched we are linked to */
+	SLIST_ENTRY(dn_fsk) sch_chain;	/* list of fsk attached to sched */
+
+	/* bucket index used by drain routine to drain queues for this
+	 * flowset
+	 */
+	int drain_bucket;
+	/* Parameter realted to RED / GRED */
+	/* original values are in dn_fs*/
+	int w_q ;		/* queue weight (scaled) */
+	int max_th ;		/* maximum threshold for queue (scaled) */
+	int min_th ;		/* minimum threshold for queue (scaled) */
+	int max_p ;		/* maximum value for p_b (scaled) */
+
+	u_int c_1 ;		/* max_p/(max_th-min_th) (scaled) */
+	u_int c_2 ;		/* max_p*min_th/(max_th-min_th) (scaled) */
+	u_int c_3 ;		/* for GRED, (1-max_p)/max_th (scaled) */
+	u_int c_4 ;		/* for GRED, 1 - 2*max_p (scaled) */
+	u_int * w_q_lookup ;	/* lookup table for computing (1-w_q)^t */
+	u_int lookup_depth ;	/* depth of lookup table */
+	int lookup_step ;	/* granularity inside the lookup table */
+	int lookup_weight ;	/* equal to (1-w_q)^t / (1-w_q)^(t+1) */
+	int avg_pkt_size ;	/* medium packet size */
+	int max_pkt_size ;	/* max packet size */
+};
+
+/*
+ * A queue is created as a child of a flowset unless it belongs to
+ * a !MULTIQUEUE scheduler. It is normally in a hash table in the
+ * flowset. fs always points to the parent flowset.
+ * si normally points to the sch_inst, unless the flowset has been
+ * detached from the scheduler -- in this case si == NULL and we
+ * should not enqueue.
+ */
+struct dn_queue {
+	struct dn_flow ni;	/* oid, flow_id, stats */
+	struct mq mq;	/* packets queue */
+	struct dn_sch_inst *_si;	/* owner scheduler instance */
+	SLIST_ENTRY(dn_queue) q_next; /* hash chain list for qht */
+	struct dn_fsk *fs;		/* parent flowset. */
+
+	/* RED parameters */
+	int avg;		/* average queue length est. (scaled) */
+	int count;		/* arrivals since last RED drop */
+	int random;		/* random value (scaled) */
+	uint64_t q_time;	/* start of queue idle time */
+
+};
+
+/*
+ * The kernel side of a scheduler. Contains the userland config,
+ * a link, pointer to extra config arguments from command line,
+ * kernel flags, and a pointer to the scheduler methods.
+ * It is stored in a hash table, and holds a list of all
+ * flowsets and scheduler instances.
+ * XXX sch must be at the beginning, see schk_hash().
+ */
+struct dn_schk {
+	struct dn_sch sch;
+	struct dn_alg *fp;	/* Pointer to scheduler functions */
+	struct dn_link link;	/* The link, embedded */
+	struct dn_profile *profile; /* delay profile, if any */
+	struct dn_id *cfg;	/* extra config arguments */
+
+	SLIST_ENTRY(dn_schk) schk_next;  /* hash chain for schedhash */
+
+	struct dn_fsk_head fsk_list;  /* all fsk linked to me */
+	struct dn_fsk *fs;	/* Flowset for !MULTIQUEUE */
+
+	/* bucket index used by the drain routine to drain the scheduler
+	 * instance for this flowset.
+	 */
+	int drain_bucket;
+
+	/* Hash table of all instances (through sch.sched_mask)
+	 * or single instance if no mask. Always valid.
+	 */
+	struct dn_ht	*siht;
+};
+
+
+/*
+ * Scheduler instance.
+ * Contains variables and all queues relative to a this instance.
+ * This struct is created a runtime.
+ */
+struct dn_sch_inst {
+	struct dn_flow	ni;	/* oid, flowid and stats */
+	SLIST_ENTRY(dn_sch_inst) si_next; /* hash chain for siht */
+	struct delay_line dline;
+	struct dn_schk *sched;	/* the template */
+	int		kflags;	/* DN_ACTIVE */
+
+	int64_t	credit;		/* bits I can transmit (more or less). */
+	uint64_t sched_time;	/* time link was scheduled in ready_heap */
+	uint64_t idle_time;	/* start of scheduler instance idle time */
+
+	/* q_count is the number of queues that this instance is using.
+	 * The counter is incremented or decremented when
+	 * a reference from the queue is created or deleted.
+	 * It is used to make sure that a scheduler instance can be safely
+	 * deleted by the drain routine. See notes below.
+	 */
+	int q_count;
+
+};
+
+/*
+ * NOTE about object drain.
+ * The system will automatically (XXX check when) drain queues and
+ * scheduler instances when they are idle.
+ * A queue is idle when it has no packets; an instance is idle when
+ * it is not in the evheap heap, and the corresponding delay line is empty.
+ * A queue can be safely deleted when it is idle because of the scheduler
+ * function xxx_free_queue() will remove any references to it.
+ * An instance can be only deleted when no queues reference it. To be sure
+ * of that, a counter (q_count) stores the number of queues that are pointing
+ * to the instance.
+ *
+ * XXX
+ * Order of scan:
+ * - take all flowset in a bucket for the flowset hash table
+ * - take all queues in a bucket for the flowset
+ * - increment the queue bucket
+ * - scan next flowset bucket
+ * Nothing is done if a bucket contains no entries.
+ *
+ * The same schema is used for sceduler instances
+ */
+
+
+/* kernel-side flags. Linux has DN_DELETE in fcntl.h
+ */
+enum {
+	/* 1 and 2 are reserved for the SCAN flags */
+	DN_DESTROY	= 0x0004, /* destroy */
+	DN_DELETE_FS	= 0x0008, /* destroy flowset */
+	DN_DETACH	= 0x0010,
+	DN_ACTIVE	= 0x0020, /* object is in evheap */
+	DN_F_DLINE	= 0x0040, /* object is a delay line */
+	DN_F_SCHI	= 0x00C0, /* object is a sched.instance */
+	DN_QHT_IS_Q	= 0x0100, /* in flowset, qht is a single queue */
+};
+
+extern struct dn_parms dn_cfg;
+
+int dummynet_io(struct mbuf **, int , struct ip_fw_args *);
+void dummynet_task(void *context, int pending);
+void dn_reschedule(void);
+
+struct dn_queue *ipdn_q_find(struct dn_fsk *, struct dn_sch_inst *,
+        struct ipfw_flow_id *);
+struct dn_sch_inst *ipdn_si_find(struct dn_schk *, struct ipfw_flow_id *);
+
+/* helper structure to copy objects returned to userland */
+struct copy_args {
+	char **start;
+	char *end;
+	int flags;
+	int type;
+	int extra;	/* extra filtering */
+};
+
+struct sockopt;
+int ip_dummynet_compat(struct sockopt *sopt);
+int dummynet_get(struct sockopt *sopt, void **compat);
+int dn_c_copy_q (void *_ni, void *arg);
+int dn_c_copy_pipe(struct dn_schk *s, struct copy_args *a, int nq);
+int dn_c_copy_fs(struct dn_fsk *f, struct copy_args *a, int nq);
+int dn_compat_copy_queue(struct copy_args *a, void *_o);
+int dn_compat_copy_pipe(struct copy_args *a, void *_o);
+int copy_data_helper_compat(void *_o, void *_arg);
+int dn_compat_calc_size(struct dn_parms dn_cfg);
+int do_config(void *p, int l);
+
+/* function to drain idle object */
+void dn_drain_scheduler(void);
+void dn_drain_queue(void);
+
+#endif /* _IP_DN_PRIVATE_H */
diff --git a/sys/netinet/ipfw/ip_dummynet.c b/sys/netinet/ipfw/ip_dummynet.c
index 267776f..20b776f 100644
--- a/sys/netinet/ipfw/ip_dummynet.c
+++ b/sys/netinet/ipfw/ip_dummynet.c
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa
+ * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa
  * Portions Copyright (c) 2000 Akamba Corp.
  * All rights reserved
  *
@@ -28,32 +28,12 @@
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#define	DUMMYNET_DEBUG
-
-#include "opt_inet6.h"
-
 /*
- * This module implements IP dummynet, a bandwidth limiter/delay emulator
- * used in conjunction with the ipfw package.
- * Description of the data structures used is in ip_dummynet.h
- * Here you mainly find the following blocks of code:
- *  + variable declarations;
- *  + heap management functions;
- *  + scheduler and dummynet functions;
- *  + configuration and initialization.
- *
- * NOTA BENE: critical sections are protected by the "dummynet lock".
- *
- * Most important Changes:
- *
- * 011004: KLDable
- * 010124: Fixed WF2Q behaviour
- * 010122: Fixed spl protection.
- * 000601: WF2Q support
- * 000106: large rewrite, use heaps to handle very many pipes.
- * 980513:	initial release
+ * Configuration and internal object management for dummynet.
  */
 
+#include "opt_inet6.h"
+
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
@@ -67,2197 +47,2012 @@ __FBSDID("$FreeBSD$");
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/time.h>
-#include <sys/sysctl.h>
 #include <sys/taskqueue.h>
 #include <net/if.h>	/* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */
-#include <net/netisr.h>
 #include <netinet/in.h>
-#include <netinet/ip.h>		/* ip_len, ip_off */
 #include <netinet/ip_var.h>	/* ip_output(), IP_FORWARDING */
 #include <netinet/ip_fw.h>
 #include <netinet/ipfw/ip_fw_private.h>
+#include <netinet/ipfw/dn_heap.h>
 #include <netinet/ip_dummynet.h>
+#include <netinet/ipfw/ip_dn_private.h>
+#include <netinet/ipfw/dn_sched.h>
+
+/* which objects to copy */
+#define DN_C_LINK 	0x01
+#define DN_C_SCH	0x02
+#define DN_C_FLOW	0x04
+#define DN_C_FS		0x08
+#define DN_C_QUEUE	0x10
+
+/* we use this argument in case of a schk_new */
+struct schk_new_arg {
+	struct dn_alg *fp;
+	struct dn_sch *sch;
+};
 
-#include <netinet/if_ether.h> /* various ether_* routines */
-
-#include <netinet/ip6.h>       /* for ip6_input, ip6_output prototypes */
-#include <netinet6/ip6_var.h>
-
-/*
- * We keep a private variable for the simulation time, but we could
- * probably use an existing one ("softticks" in sys/kern/kern_timeout.c)
- */
-static dn_key curr_time = 0 ; /* current simulation time */
-
-static int dn_hash_size = 64 ;	/* default hash size */
-
-/* statistics on number of queue searches and search steps */
-static long searches, search_steps ;
-static int pipe_expire = 1 ;   /* expire queue if empty */
-static int dn_max_ratio = 16 ; /* max queues/buckets ratio */
-
-static long pipe_slot_limit = 100; /* Foot shooting limit for pipe queues. */
-static long pipe_byte_limit = 1024 * 1024;
-
-static int red_lookup_depth = 256;	/* RED - default lookup table depth */
-static int red_avg_pkt_size = 512;      /* RED - default medium packet size */
-static int red_max_pkt_size = 1500;     /* RED - default max packet size */
-
-static struct timeval prev_t;
-static long tick_last;			/* Last tick duration (usec). */
-static long tick_delta;			/* Last vs standard tick diff (usec). */
-static long tick_delta_sum;		/* Accumulated tick difference (usec).*/
-static long tick_adjustment;		/* Tick adjustments done. */
-static long tick_lost;			/* Lost(coalesced) ticks number. */
-/* Adjusted vs non-adjusted curr_time difference (ticks). */
-static long tick_diff;
-
-static int		io_fast;
-static unsigned long	io_pkt;
-static unsigned long	io_pkt_fast;
-static unsigned long	io_pkt_drop;
-
-/*
- * Three heaps contain queues and pipes that the scheduler handles:
- *
- * ready_heap contains all dn_flow_queue related to fixed-rate pipes.
- *
- * wfq_ready_heap contains the pipes associated with WF2Q flows
- *
- * extract_heap contains pipes associated with delay lines.
- *
- */
-
-MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap");
-
-static struct dn_heap ready_heap, extract_heap, wfq_ready_heap ;
-
-static int	heap_init(struct dn_heap *h, int size);
-static int	heap_insert (struct dn_heap *h, dn_key key1, void *p);
-static void	heap_extract(struct dn_heap *h, void *obj);
-static void	transmit_event(struct dn_pipe *pipe, struct mbuf **head,
-		    struct mbuf **tail);
-static void	ready_event(struct dn_flow_queue *q, struct mbuf **head,
-		    struct mbuf **tail);
-static void	ready_event_wfq(struct dn_pipe *p, struct mbuf **head,
-		    struct mbuf **tail);
-
-#define	HASHSIZE	16
-#define	HASH(num)	((((num) >> 8) ^ ((num) >> 4) ^ (num)) & 0x0f)
-static struct dn_pipe_head	pipehash[HASHSIZE];	/* all pipes */
-static struct dn_flow_set_head	flowsethash[HASHSIZE];	/* all flowsets */
-
+/*---- callout hooks. ----*/
 static struct callout dn_timeout;
+static struct task	dn_task;
+static struct taskqueue	*dn_tq = NULL;
 
-extern	void (*bridge_dn_p)(struct mbuf *, struct ifnet *);
+static void
+dummynet(void * __unused unused)
+{
 
-#ifdef SYSCTL_NODE
-SYSCTL_DECL(_net_inet);
-SYSCTL_DECL(_net_inet_ip);
+	taskqueue_enqueue(dn_tq, &dn_task);
+}
 
-SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet");
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size,
-    CTLFLAG_RW, &dn_hash_size, 0, "Default hash table size");
-#if 0	/* curr_time is 64 bit */
-SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, curr_time,
-    CTLFLAG_RD, &curr_time, 0, "Current tick");
-#endif
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, ready_heap,
-    CTLFLAG_RD, &ready_heap.size, 0, "Size of ready heap");
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, extract_heap,
-    CTLFLAG_RD, &extract_heap.size, 0, "Size of extract heap");
-SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, searches,
-    CTLFLAG_RD, &searches, 0, "Number of queue searches");
-SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, search_steps,
-    CTLFLAG_RD, &search_steps, 0, "Number of queue search steps");
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire,
-    CTLFLAG_RW, &pipe_expire, 0, "Expire queue if empty");
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, max_chain_len,
-    CTLFLAG_RW, &dn_max_ratio, 0,
-    "Max ratio between dynamic queues and buckets");
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth,
-    CTLFLAG_RD, &red_lookup_depth, 0, "Depth of RED lookup table");
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size,
-    CTLFLAG_RD, &red_avg_pkt_size, 0, "RED Medium packet size");
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size,
-    CTLFLAG_RD, &red_max_pkt_size, 0, "RED Max packet size");
-SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta,
-    CTLFLAG_RD, &tick_delta, 0, "Last vs standard tick difference (usec).");
-SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta_sum,
-    CTLFLAG_RD, &tick_delta_sum, 0, "Accumulated tick difference (usec).");
-SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_adjustment,
-    CTLFLAG_RD, &tick_adjustment, 0, "Tick adjustments done.");
-SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_diff,
-    CTLFLAG_RD, &tick_diff, 0,
-    "Adjusted vs non-adjusted curr_time difference (ticks).");
-SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_lost,
-    CTLFLAG_RD, &tick_lost, 0,
-    "Number of ticks coalesced by dummynet taskqueue.");
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, io_fast,
-    CTLFLAG_RW, &io_fast, 0, "Enable fast dummynet io.");
-SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt,
-    CTLFLAG_RD, &io_pkt, 0,
-    "Number of packets passed to dummynet.");
-SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_fast,
-    CTLFLAG_RD, &io_pkt_fast, 0,
-    "Number of packets bypassed dummynet scheduler.");
-SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_drop,
-    CTLFLAG_RD, &io_pkt_drop, 0,
-    "Number of packets dropped by dummynet.");
-SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_slot_limit,
-    CTLFLAG_RW, &pipe_slot_limit, 0, "Upper limit in slots for pipe queue.");
-SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_byte_limit,
-    CTLFLAG_RW, &pipe_byte_limit, 0, "Upper limit in bytes for pipe queue.");
-#endif
+void
+dn_reschedule(void)
+{
+	callout_reset(&dn_timeout, 1, dummynet, NULL);
+}
+/*----- end of callout hooks -----*/
 
-#ifdef DUMMYNET_DEBUG
-int	dummynet_debug = 0;
-#ifdef SYSCTL_NODE
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, CTLFLAG_RW, &dummynet_debug,
-	    0, "control debugging printfs");
-#endif
-#define	DPRINTF(X)	if (dummynet_debug) printf X
-#else
-#define	DPRINTF(X)
-#endif
+/* Return a scheduler descriptor given the type or name. */
+static struct dn_alg *
+find_sched_type(int type, char *name)
+{
+	struct dn_alg *d;
 
-static struct task	dn_task;
-static struct taskqueue	*dn_tq = NULL;
-static void dummynet_task(void *, int);
+	SLIST_FOREACH(d, &dn_cfg.schedlist, next) {
+		if (d->type == type || (name && !strcmp(d->name, name)))
+			return d;
+	}
+	return NULL; /* not found */
+}
 
-static struct mtx dummynet_mtx;
-#define	DUMMYNET_LOCK_INIT() \
-	mtx_init(&dummynet_mtx, "dummynet", NULL, MTX_DEF)
-#define	DUMMYNET_LOCK_DESTROY()	mtx_destroy(&dummynet_mtx)
-#define	DUMMYNET_LOCK()		mtx_lock(&dummynet_mtx)
-#define	DUMMYNET_UNLOCK()	mtx_unlock(&dummynet_mtx)
-#define	DUMMYNET_LOCK_ASSERT()	mtx_assert(&dummynet_mtx, MA_OWNED)
+int
+ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg)
+{
+	int oldv = *v;
+	const char *op = NULL;
+	if (oldv < lo) {
+		*v = dflt;
+		op = "Bump";
+	} else if (oldv > hi) {
+		*v = hi;
+		op = "Clamp";
+	} else
+		return *v;
+	if (op && msg)
+		printf("%s %s to %d (was %d)\n", op, msg, *v, oldv);
+	return *v;
+}
 
-static int	config_pipe(struct dn_pipe *p);
-static int	ip_dn_ctl(struct sockopt *sopt);
+/*---- flow_id mask, hash and compare functions ---*/
+static struct ipfw_flow_id *
+flow_id_mask(struct ipfw_flow_id *mask, struct ipfw_flow_id *id)
+{
+	int is_v6 = IS_IP6_FLOW_ID(id);
 
-static void	dummynet(void *);
-static void	dummynet_flush(void);
-static void	dummynet_send(struct mbuf *);
-static int	dummynet_io(struct mbuf **, int , struct ip_fw_args *);
+	id->dst_port &= mask->dst_port;
+	id->src_port &= mask->src_port;
+	id->proto &= mask->proto;
+	id->flags = 0; /* we don't care about this one */
+	if (is_v6) {
+		APPLY_MASK(&id->dst_ip6, &mask->dst_ip6);
+		APPLY_MASK(&id->src_ip6, &mask->src_ip6);
+		id->flow_id6 &= mask->flow_id6;
+	} else {
+		id->dst_ip &= mask->dst_ip;
+		id->src_ip &= mask->src_ip;
+	}
+	return id;
+}
 
-/*
- * Flow queue is idle if:
- *   1) it's empty for at least 1 tick
- *   2) it has invalid timestamp (WF2Q case)
- *   3) parent pipe has no 'exhausted' burst.
- */
-#define QUEUE_IS_IDLE(q) ((q)->head == NULL && (q)->S == (q)->F + 1 && \
-	curr_time > (q)->idle_time + 1 && \
-	((q)->numbytes + (curr_time - (q)->idle_time - 1) * \
-	(q)->fs->pipe->bandwidth >= (q)->fs->pipe->burst))
+/* computes an OR of two masks, result in dst and also returned */
+static struct ipfw_flow_id *
+flow_id_or(struct ipfw_flow_id *src, struct ipfw_flow_id *dst)
+{
+	int is_v6 = IS_IP6_FLOW_ID(dst);
 
-/*
- * Heap management functions.
- *
- * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2.
- * Some macros help finding parent/children so we can optimize them.
- *
- * heap_init() is called to expand the heap when needed.
- * Increment size in blocks of 16 entries.
- * XXX failure to allocate a new element is a pretty bad failure
- * as we basically stall a whole queue forever!!
- * Returns 1 on error, 0 on success
- */
-#define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 )
-#define HEAP_LEFT(x) ( 2*(x) + 1 )
-#define HEAP_IS_LEFT(x) ( (x) & 1 )
-#define HEAP_RIGHT(x) ( 2*(x) + 2 )
-#define	HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; }
-#define HEAP_INCREMENT	15
+	dst->dst_port |= src->dst_port;
+	dst->src_port |= src->src_port;
+	dst->proto |= src->proto;
+	dst->flags = 0; /* we don't care about this one */
+	if (is_v6) {
+#define OR_MASK(_d, _s)                          \
+    (_d)->__u6_addr.__u6_addr32[0] |= (_s)->__u6_addr.__u6_addr32[0]; \
+    (_d)->__u6_addr.__u6_addr32[1] |= (_s)->__u6_addr.__u6_addr32[1]; \
+    (_d)->__u6_addr.__u6_addr32[2] |= (_s)->__u6_addr.__u6_addr32[2]; \
+    (_d)->__u6_addr.__u6_addr32[3] |= (_s)->__u6_addr.__u6_addr32[3];
+		OR_MASK(&dst->dst_ip6, &src->dst_ip6);
+		OR_MASK(&dst->src_ip6, &src->src_ip6);
+#undef OR_MASK
+		dst->flow_id6 |= src->flow_id6;
+	} else {
+		dst->dst_ip |= src->dst_ip;
+		dst->src_ip |= src->src_ip;
+	}
+	return dst;
+}
 
 static int
-heap_init(struct dn_heap *h, int new_size)
+nonzero_mask(struct ipfw_flow_id *m)
 {
-    struct dn_heap_entry *p;
+	if (m->dst_port || m->src_port || m->proto)
+		return 1;
+	if (IS_IP6_FLOW_ID(m)) {
+		return
+			m->dst_ip6.__u6_addr.__u6_addr32[0] ||
+			m->dst_ip6.__u6_addr.__u6_addr32[1] ||
+			m->dst_ip6.__u6_addr.__u6_addr32[2] ||
+			m->dst_ip6.__u6_addr.__u6_addr32[3] ||
+			m->src_ip6.__u6_addr.__u6_addr32[0] ||
+			m->src_ip6.__u6_addr.__u6_addr32[1] ||
+			m->src_ip6.__u6_addr.__u6_addr32[2] ||
+			m->src_ip6.__u6_addr.__u6_addr32[3] ||
+			m->flow_id6;
+	} else {
+		return m->dst_ip || m->src_ip;
+	}
+}
 
-    if (h->size >= new_size ) {
-	printf("dummynet: %s, Bogus call, have %d want %d\n", __func__,
-		h->size, new_size);
-	return 0 ;
-    }
-    new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT ;
-    p = malloc(new_size * sizeof(*p), M_DUMMYNET, M_NOWAIT);
-    if (p == NULL) {
-	printf("dummynet: %s, resize %d failed\n", __func__, new_size );
-	return 1 ; /* error */
-    }
-    if (h->size > 0) {
-	bcopy(h->p, p, h->size * sizeof(*p) );
-	free(h->p, M_DUMMYNET);
+/* XXX we may want a better hash function */
+static uint32_t
+flow_id_hash(struct ipfw_flow_id *id)
+{
+    uint32_t i;
+
+    if (IS_IP6_FLOW_ID(id)) {
+	uint32_t *d = (uint32_t *)&id->dst_ip6;
+	uint32_t *s = (uint32_t *)&id->src_ip6;
+        i = (d[0]      ) ^ (d[1])       ^
+            (d[2]      ) ^ (d[3])       ^
+            (d[0] >> 15) ^ (d[1] >> 15) ^
+            (d[2] >> 15) ^ (d[3] >> 15) ^
+            (s[0] <<  1) ^ (s[1] <<  1) ^
+            (s[2] <<  1) ^ (s[3] <<  1) ^
+            (s[0] << 16) ^ (s[1] << 16) ^
+            (s[2] << 16) ^ (s[3] << 16) ^
+            (id->dst_port << 1) ^ (id->src_port) ^
+            (id->proto ) ^ (id->flow_id6);
+    } else {
+        i = (id->dst_ip)        ^ (id->dst_ip >> 15) ^
+            (id->src_ip << 1)   ^ (id->src_ip >> 16) ^
+            (id->dst_port << 1) ^ (id->src_port)     ^ (id->proto);
     }
-    h->p = p ;
-    h->size = new_size ;
-    return 0 ;
+    return i;
 }
 
-/*
- * Insert element in heap. Normally, p != NULL, we insert p in
- * a new position and bubble up. If p == NULL, then the element is
- * already in place, and key is the position where to start the
- * bubble-up.
- * Returns 1 on failure (cannot allocate new heap entry)
- *
- * If offset > 0 the position (index, int) of the element in the heap is
- * also stored in the element itself at the given offset in bytes.
- */
-#define SET_OFFSET(heap, node) \
-    if (heap->offset > 0) \
-	    *((int *)((char *)(heap->p[node].object) + heap->offset)) = node ;
-/*
- * RESET_OFFSET is used for sanity checks. It sets offset to an invalid value.
- */
-#define RESET_OFFSET(heap, node) \
-    if (heap->offset > 0) \
-	    *((int *)((char *)(heap->p[node].object) + heap->offset)) = -1 ;
+/* Like bcmp, returns 0 if ids match, 1 otherwise. */
 static int
-heap_insert(struct dn_heap *h, dn_key key1, void *p)
-{
-    int son = h->elements ;
-
-    if (p == NULL)	/* data already there, set starting point */
-	son = key1 ;
-    else {		/* insert new element at the end, possibly resize */
-	son = h->elements ;
-	if (son == h->size) /* need resize... */
-	    if (heap_init(h, h->elements+1) )
-		return 1 ; /* failure... */
-	h->p[son].object = p ;
-	h->p[son].key = key1 ;
-	h->elements++ ;
-    }
-    while (son > 0) {				/* bubble up */
-	int father = HEAP_FATHER(son) ;
-	struct dn_heap_entry tmp  ;
-
-	if (DN_KEY_LT( h->p[father].key, h->p[son].key ) )
-	    break ; /* found right position */
-	/* son smaller than father, swap and repeat */
-	HEAP_SWAP(h->p[son], h->p[father], tmp) ;
-	SET_OFFSET(h, son);
-	son = father ;
-    }
-    SET_OFFSET(h, son);
-    return 0 ;
+flow_id_cmp(struct ipfw_flow_id *id1, struct ipfw_flow_id *id2)
+{
+	int is_v6 = IS_IP6_FLOW_ID(id1);
+
+	if (is_v6 != IS_IP6_FLOW_ID(id2))
+		return 1; /* a ipv4 and a ipv6 flow */
+
+	if (!is_v6 && id1->dst_ip == id2->dst_ip &&
+	    id1->src_ip == id2->src_ip &&
+	    id1->dst_port == id2->dst_port &&
+	    id1->src_port == id2->src_port &&
+	    id1->proto == id2->proto &&
+	    id1->flags == id2->flags)
+		return 0;
+	    
+	if (is_v6 &&
+	    !bcmp(&id1->dst_ip6,&id2->dst_ip6, sizeof(id1->dst_ip6)) &&
+	    !bcmp(&id1->src_ip6,&id2->src_ip6, sizeof(id1->src_ip6)) &&
+	    id1->dst_port == id2->dst_port &&
+	    id1->src_port == id2->src_port &&
+	    id1->proto == id2->proto &&
+	    id1->flags == id2->flags &&
+	    id1->flow_id6 == id2->flow_id6)
+		return 0;
+     
+	/* Masks differ */
+	return 1;
 }
+/*--------- end of flow-id mask, hash and compare ---------*/
 
-/*
- * remove top element from heap, or obj if obj != NULL
+/*--- support functions for the qht hashtable ----
+ * Entries are hashed by flow-id
  */
-static void
-heap_extract(struct dn_heap *h, void *obj)
+static uint32_t
+q_hash(uintptr_t key, int flags, void *arg)
 {
-    int child, father, max = h->elements - 1 ;
+	/* compute the hash slot from the flow id */
+	struct ipfw_flow_id *id = (flags & DNHT_KEY_IS_OBJ) ?
+		&((struct dn_queue *)key)->ni.fid :
+		(struct ipfw_flow_id *)key;
 
-    if (max < 0) {
-	printf("dummynet: warning, extract from empty heap 0x%p\n", h);
-	return ;
-    }
-    father = 0 ; /* default: move up smallest child */
-    if (obj != NULL) { /* extract specific element, index is at offset */
-	if (h->offset <= 0)
-	    panic("dummynet: heap_extract from middle not supported on this heap!!!\n");
-	father = *((int *)((char *)obj + h->offset)) ;
-	if (father < 0 || father >= h->elements) {
-	    printf("dummynet: heap_extract, father %d out of bound 0..%d\n",
-		father, h->elements);
-	    panic("dummynet: heap_extract");
+	return flow_id_hash(id);
+}
+
+static int
+q_match(void *obj, uintptr_t key, int flags, void *arg)
+{
+	struct dn_queue *o = (struct dn_queue *)obj;
+	struct ipfw_flow_id *id2;
+
+	if (flags & DNHT_KEY_IS_OBJ) {
+		/* compare pointers */
+		id2 = &((struct dn_queue *)key)->ni.fid;
+	} else {
+		id2 = (struct ipfw_flow_id *)key;
 	}
-    }
-    RESET_OFFSET(h, father);
-    child = HEAP_LEFT(father) ;		/* left child */
-    while (child <= max) {		/* valid entry */
-	if (child != max && DN_KEY_LT(h->p[child+1].key, h->p[child].key) )
-	    child = child+1 ;		/* take right child, otherwise left */
-	h->p[father] = h->p[child] ;
-	SET_OFFSET(h, father);
-	father = child ;
-	child = HEAP_LEFT(child) ;   /* left child for next loop */
-    }
-    h->elements-- ;
-    if (father != max) {
-	/*
-	 * Fill hole with last entry and bubble up, reusing the insert code
-	 */
-	h->p[father] = h->p[max] ;
-	heap_insert(h, father, NULL); /* this one cannot fail */
-    }
+	return (0 == flow_id_cmp(&o->ni.fid,  id2));
 }
 
-#if 0
 /*
- * change object position and update references
- * XXX this one is never used!
+ * create a new queue instance for the given 'key'.
  */
-static void
-heap_move(struct dn_heap *h, dn_key new_key, void *object)
-{
-    int temp;
-    int i ;
-    int max = h->elements-1 ;
-    struct dn_heap_entry buf ;
-
-    if (h->offset <= 0)
-	panic("cannot move items on this heap");
-
-    i = *((int *)((char *)object + h->offset));
-    if (DN_KEY_LT(new_key, h->p[i].key) ) { /* must move up */
-	h->p[i].key = new_key ;
-	for (; i>0 && DN_KEY_LT(new_key, h->p[(temp = HEAP_FATHER(i))].key) ;
-		 i = temp ) { /* bubble up */
-	    HEAP_SWAP(h->p[i], h->p[temp], buf) ;
-	    SET_OFFSET(h, i);
-	}
-    } else {		/* must move down */
-	h->p[i].key = new_key ;
-	while ( (temp = HEAP_LEFT(i)) <= max ) { /* found left child */
-	    if ((temp != max) && DN_KEY_GT(h->p[temp].key, h->p[temp+1].key))
-		temp++ ; /* select child with min key */
-	    if (DN_KEY_GT(new_key, h->p[temp].key)) { /* go down */
-		HEAP_SWAP(h->p[i], h->p[temp], buf) ;
-		SET_OFFSET(h, i);
-	    } else
-		break ;
-	    i = temp ;
+static void *
+q_new(uintptr_t key, int flags, void *arg)
+{   
+	struct dn_queue *q, *template = arg;
+	struct dn_fsk *fs = template->fs;
+	int size = sizeof(*q) + fs->sched->fp->q_datalen;
+
+	q = malloc(size, M_DUMMYNET, M_NOWAIT | M_ZERO);
+	if (q == NULL) {
+		D("no memory for new queue");
+		return NULL;
 	}
-    }
-    SET_OFFSET(h, i);
+
+	set_oid(&q->ni.oid, DN_QUEUE, size);
+	if (fs->fs.flags & DN_QHT_HASH)
+		q->ni.fid = *(struct ipfw_flow_id *)key;
+	q->fs = fs;
+	q->_si = template->_si;
+	q->_si->q_count++;
+
+	if (fs->sched->fp->new_queue)
+		fs->sched->fp->new_queue(q);
+	dn_cfg.queue_count++;
+	return q;
 }
-#endif /* heap_move, unused */
 
 /*
- * heapify() will reorganize data inside an array to maintain the
- * heap property. It is needed when we delete a bunch of entries.
+ * Notify schedulers that a queue is going away.
+ * If (flags & DN_DESTROY), also free the packets.
+ * The version for callbacks is called q_delete_cb().
  */
 static void
-heapify(struct dn_heap *h)
+dn_delete_queue(struct dn_queue *q, int flags)
 {
-    int i ;
+	struct dn_fsk *fs = q->fs;
+
+	// D("fs %p si %p\n", fs, q->_si);
+	/* notify the parent scheduler that the queue is going away */
+	if (fs && fs->sched->fp->free_queue)
+		fs->sched->fp->free_queue(q);
+	q->_si->q_count--;
+	q->_si = NULL;
+	if (flags & DN_DESTROY) {
+		if (q->mq.head)
+			dn_free_pkts(q->mq.head);
+		bzero(q, sizeof(*q));	// safety
+		free(q, M_DUMMYNET);
+		dn_cfg.queue_count--;
+	}
+}
 
-    for (i = 0 ; i < h->elements ; i++ )
-	heap_insert(h, i , NULL) ;
+static int
+q_delete_cb(void *q, void *arg)
+{
+	int flags = (int)(uintptr_t)arg;
+	dn_delete_queue(q, flags);
+	return (flags & DN_DESTROY) ? DNHT_SCAN_DEL : 0;
 }
 
 /*
- * cleanup the heap and free data structure
+ * calls dn_delete_queue/q_delete_cb on all queues,
+ * which notifies the parent scheduler and possibly drains packets.
+ * flags & DN_DESTROY: drains queues and destroy qht;
  */
 static void
-heap_free(struct dn_heap *h)
+qht_delete(struct dn_fsk *fs, int flags)
 {
-    if (h->size >0 )
-	free(h->p, M_DUMMYNET);
-    bzero(h, sizeof(*h) );
+	ND("fs %d start flags %d qht %p",
+		fs->fs.fs_nr, flags, fs->qht);
+	if (!fs->qht)
+		return;
+	if (fs->fs.flags & DN_QHT_HASH) {
+		dn_ht_scan(fs->qht, q_delete_cb, (void *)(uintptr_t)flags);
+		if (flags & DN_DESTROY) {
+			dn_ht_free(fs->qht, 0);
+			fs->qht = NULL;
+		}
+	} else {
+		dn_delete_queue((struct dn_queue *)(fs->qht), flags);
+		if (flags & DN_DESTROY)
+			fs->qht = NULL;
+	}
 }
 
 /*
- * --- end of heap management functions ---
+ * Find and possibly create the queue for a MULTIQUEUE scheduler.
+ * We never call it for !MULTIQUEUE (the queue is in the sch_inst).
  */
+struct dn_queue *
+ipdn_q_find(struct dn_fsk *fs, struct dn_sch_inst *si,
+	struct ipfw_flow_id *id)
+{
+	struct dn_queue template;
+
+	template._si = si;
+	template.fs = fs;
+
+	if (fs->fs.flags & DN_QHT_HASH) {
+		struct ipfw_flow_id masked_id;
+		if (fs->qht == NULL) {
+			fs->qht = dn_ht_init(NULL, fs->fs.buckets,
+				offsetof(struct dn_queue, q_next),
+				q_hash, q_match, q_new);
+			if (fs->qht == NULL)
+				return NULL;
+		}
+		masked_id = *id;
+		flow_id_mask(&fs->fsk_mask, &masked_id);
+		return dn_ht_find(fs->qht, (uintptr_t)&masked_id,
+			DNHT_INSERT, &template);
+	} else {
+		if (fs->qht == NULL)
+			fs->qht = q_new(0, 0, &template);
+		return (struct dn_queue *)fs->qht;
+	}
+}
+/*--- end of queue hash table ---*/
 
-/*
- * Dispose a list of packet. Use an inline functions so if we
- * need to free extra state associated to a packet, this is a
- * central point to do it.
+/*--- support functions for the sch_inst hashtable ----
+ *
+ * These are hashed by flow-id
  */
-
-static __inline void dn_free_pkts(struct mbuf *mnext)
+static uint32_t
+si_hash(uintptr_t key, int flags, void *arg)
 {
-	struct mbuf *m;
+	/* compute the hash slot from the flow id */
+	struct ipfw_flow_id *id = (flags & DNHT_KEY_IS_OBJ) ?
+		&((struct dn_sch_inst *)key)->ni.fid :
+		(struct ipfw_flow_id *)key;
 
-	while ((m = mnext) != NULL) {
-		mnext = m->m_nextpkt;
-		FREE_PKT(m);
-	}
+	return flow_id_hash(id);
 }
 
-/*
- * Return the mbuf tag holding the dummynet state.  As an optimization
- * this is assumed to be the first tag on the list.  If this turns out
- * wrong we'll need to search the list.
- */
-static struct dn_pkt_tag *
-dn_tag_get(struct mbuf *m)
+static int
+si_match(void *obj, uintptr_t key, int flags, void *arg)
 {
-    struct m_tag *mtag = m_tag_first(m);
-    KASSERT(mtag != NULL &&
-	    mtag->m_tag_cookie == MTAG_ABI_COMPAT &&
-	    mtag->m_tag_id == PACKET_TAG_DUMMYNET,
-	    ("packet on dummynet queue w/o dummynet tag!"));
-    return (struct dn_pkt_tag *)(mtag+1);
+	struct dn_sch_inst *o = obj;
+	struct ipfw_flow_id *id2;
+
+	id2 = (flags & DNHT_KEY_IS_OBJ) ?
+		&((struct dn_sch_inst *)key)->ni.fid :
+		(struct ipfw_flow_id *)key;
+	return flow_id_cmp(&o->ni.fid,  id2) == 0;
 }
 
 /*
- * Scheduler functions:
- *
- * transmit_event() is called when the delay-line needs to enter
- * the scheduler, either because of existing pkts getting ready,
- * or new packets entering the queue. The event handled is the delivery
- * time of the packet.
- *
- * ready_event() does something similar with fixed-rate queues, and the
- * event handled is the finish time of the head pkt.
- *
- * wfq_ready_event() does something similar with WF2Q queues, and the
- * event handled is the start time of the head pkt.
- *
- * In all cases, we make sure that the data structures are consistent
- * before passing pkts out, because this might trigger recursive
- * invocations of the procedures.
+ * create a new instance for the given 'key'
+ * Allocate memory for instance, delay line and scheduler private data.
  */
-static void
-transmit_event(struct dn_pipe *pipe, struct mbuf **head, struct mbuf **tail)
+static void *
+si_new(uintptr_t key, int flags, void *arg)
 {
-	struct mbuf *m;
-	struct dn_pkt_tag *pkt;
-
-	DUMMYNET_LOCK_ASSERT();
-
-	while ((m = pipe->head) != NULL) {
-		pkt = dn_tag_get(m);
-		if (!DN_KEY_LEQ(pkt->output_time, curr_time))
-			break;
-
-		pipe->head = m->m_nextpkt;
-		if (*tail != NULL)
-			(*tail)->m_nextpkt = m;
-		else
-			*head = m;
-		*tail = m;
+	struct dn_schk *s = arg;
+	struct dn_sch_inst *si;
+	int l = sizeof(*si) + s->fp->si_datalen;
+
+	si = malloc(l, M_DUMMYNET, M_NOWAIT | M_ZERO);
+	if (si == NULL)
+		goto error;
+	/* Set length only for the part passed up to userland. */
+	set_oid(&si->ni.oid, DN_SCH_I, sizeof(struct dn_flow));
+	set_oid(&(si->dline.oid), DN_DELAY_LINE,
+		sizeof(struct delay_line));
+	/* mark si and dline as outside the event queue */
+	si->ni.oid.id = si->dline.oid.id = -1;
+
+	si->sched = s;
+	si->dline.si = si;
+
+	if (s->fp->new_sched && s->fp->new_sched(si)) {
+		D("new_sched error");
+		goto error;
 	}
-	if (*tail != NULL)
-		(*tail)->m_nextpkt = NULL;
+	if (s->sch.flags & DN_HAVE_MASK)
+		si->ni.fid = *(struct ipfw_flow_id *)key;
 
-	/* If there are leftover packets, put into the heap for next event. */
-	if ((m = pipe->head) != NULL) {
-		pkt = dn_tag_get(m);
-		/*
-		 * XXX Should check errors on heap_insert, by draining the
-		 * whole pipe p and hoping in the future we are more successful.
-		 */
-		heap_insert(&extract_heap, pkt->output_time, pipe);
+	dn_cfg.si_count++;
+	return si;
+
+error:
+	if (si) {
+		bzero(si, sizeof(*si)); // safety
+		free(si, M_DUMMYNET);
 	}
+        return NULL;
 }
 
-#define div64(a, b)	((int64_t)(a) / (int64_t)(b))
 /*
- * Compute how many ticks we have to wait before being able to send
- * a packet. This is computed as the "wire time" for the packet
- * (length + extra bits), minus the credit available, scaled to ticks.
- * Check that the result is not be negative (it could be if we have
- * too much leftover credit in q->numbytes).
+ * Callback from siht to delete all scheduler instances. Remove
+ * si and delay line from the system heap, destroy all queues.
+ * We assume that all flowset have been notified and do not
+ * point to us anymore.
  */
-static inline dn_key
-set_ticks(struct mbuf *m, struct dn_flow_queue *q, struct dn_pipe *p)
+static int
+si_destroy(void *_si, void *arg)
 {
-	int64_t ret;
-
-	ret = div64( (m->m_pkthdr.len * 8 + q->extra_bits) * hz
-		- q->numbytes + p->bandwidth - 1 , p->bandwidth);
-	if (ret < 0)
-		ret = 0;
-	return ret;
+	struct dn_sch_inst *si = _si;
+	struct dn_schk *s = si->sched;
+	struct delay_line *dl = &si->dline;
+
+	if (dl->oid.subtype) /* remove delay line from event heap */
+		heap_extract(&dn_cfg.evheap, dl);
+	dn_free_pkts(dl->mq.head);	/* drain delay line */
+	if (si->kflags & DN_ACTIVE) /* remove si from event heap */
+		heap_extract(&dn_cfg.evheap, si);
+	if (s->fp->free_sched)
+		s->fp->free_sched(si);
+	bzero(si, sizeof(*si));	/* safety */
+	free(si, M_DUMMYNET);
+	dn_cfg.si_count--;
+	return DNHT_SCAN_DEL;
 }
 
 /*
- * Convert the additional MAC overheads/delays into an equivalent
- * number of bits for the given data rate. The samples are in milliseconds
- * so we need to divide by 1000.
+ * Find the scheduler instance for this packet. If we need to apply
+ * a mask, do on a local copy of the flow_id to preserve the original.
+ * Assume siht is always initialized if we have a mask.
  */
-static dn_key
-compute_extra_bits(struct mbuf *pkt, struct dn_pipe *p)
+struct dn_sch_inst *
+ipdn_si_find(struct dn_schk *s, struct ipfw_flow_id *id)
 {
-	int index;
-	dn_key extra_bits;
 
-	if (!p->samples || p->samples_no == 0)
-		return 0;
-	index  = random() % p->samples_no;
-	extra_bits = div64((dn_key)p->samples[index] * p->bandwidth, 1000);
-	if (index >= p->loss_level) {
-		struct dn_pkt_tag *dt = dn_tag_get(pkt);
-		if (dt)
-			dt->dn_dir = DIR_DROP;
+	if (s->sch.flags & DN_HAVE_MASK) {
+		struct ipfw_flow_id id_t = *id;
+		flow_id_mask(&s->sch.sched_mask, &id_t);
+		return dn_ht_find(s->siht, (uintptr_t)&id_t,
+			DNHT_INSERT, s);
 	}
-	return extra_bits;
+	if (!s->siht)
+		s->siht = si_new(0, 0, s);
+	return (struct dn_sch_inst *)s->siht;
 }
 
-static void
-free_pipe(struct dn_pipe *p)
+/* callback to flush credit for the scheduler instance */
+static int
+si_reset_credit(void *_si, void *arg)
 {
-	if (p->samples)
-		free(p->samples, M_DUMMYNET);
-	free(p, M_DUMMYNET);
+	struct dn_sch_inst *si = _si;
+	struct dn_link *p = &si->sched->link;
+
+	si->credit = p->burst + (dn_cfg.io_fast ?  p->bandwidth : 0);
+	return 0;
 }
 
-/*
- * extract pkt from queue, compute output time (could be now)
- * and put into delay line (p_queue)
- */
 static void
-move_pkt(struct mbuf *pkt, struct dn_flow_queue *q, struct dn_pipe *p,
-    int len)
+schk_reset_credit(struct dn_schk *s)
 {
-    struct dn_pkt_tag *dt = dn_tag_get(pkt);
-
-    q->head = pkt->m_nextpkt ;
-    q->len-- ;
-    q->len_bytes -= len ;
-
-    dt->output_time = curr_time + p->delay ;
-
-    if (p->head == NULL)
-	p->head = pkt;
-    else
-	p->tail->m_nextpkt = pkt;
-    p->tail = pkt;
-    p->tail->m_nextpkt = NULL;
+	if (s->sch.flags & DN_HAVE_MASK)
+		dn_ht_scan(s->siht, si_reset_credit, NULL);
+	else if (s->siht)
+		si_reset_credit(s->siht, NULL);
 }
+/*---- end of sch_inst hashtable ---------------------*/
 
-/*
- * ready_event() is invoked every time the queue must enter the
- * scheduler, either because the first packet arrives, or because
- * a previously scheduled event fired.
- * On invokation, drain as many pkts as possible (could be 0) and then
- * if there are leftover packets reinsert the pkt in the scheduler.
+/*-------------------------------------------------------
+ * flowset hash (fshash) support. Entries are hashed by fs_nr.
+ * New allocations are put in the fsunlinked list, from which
+ * they are removed when they point to a specific scheduler.
  */
-static void
-ready_event(struct dn_flow_queue *q, struct mbuf **head, struct mbuf **tail)
+static uint32_t
+fsk_hash(uintptr_t key, int flags, void *arg)
 {
-	struct mbuf *pkt;
-	struct dn_pipe *p = q->fs->pipe;
-	int p_was_empty;
-
-	DUMMYNET_LOCK_ASSERT();
+	uint32_t i = !(flags & DNHT_KEY_IS_OBJ) ? key :
+		((struct dn_fsk *)key)->fs.fs_nr;
 
-	if (p == NULL) {
-		printf("dummynet: ready_event- pipe is gone\n");
-		return;
-	}
-	p_was_empty = (p->head == NULL);
+	return ( (i>>8)^(i>>4)^i );
+}
 
-	/*
-	 * Schedule fixed-rate queues linked to this pipe:
-	 * account for the bw accumulated since last scheduling, then
-	 * drain as many pkts as allowed by q->numbytes and move to
-	 * the delay line (in p) computing output time.
-	 * bandwidth==0 (no limit) means we can drain the whole queue,
-	 * setting len_scaled = 0 does the job.
-	 */
-	q->numbytes += (curr_time - q->sched_time) * p->bandwidth;
-	while ((pkt = q->head) != NULL) {
-		int len = pkt->m_pkthdr.len;
-		dn_key len_scaled = p->bandwidth ? len*8*hz
-			+ q->extra_bits*hz
-			: 0;
-
-		if (DN_KEY_GT(len_scaled, q->numbytes))
-			break;
-		q->numbytes -= len_scaled;
-		move_pkt(pkt, q, p, len);
-		if (q->head)
-			q->extra_bits = compute_extra_bits(q->head, p);
-	}
-	/*
-	 * If we have more packets queued, schedule next ready event
-	 * (can only occur when bandwidth != 0, otherwise we would have
-	 * flushed the whole queue in the previous loop).
-	 * To this purpose we record the current time and compute how many
-	 * ticks to go for the finish time of the packet.
-	 */
-	if ((pkt = q->head) != NULL) {	/* this implies bandwidth != 0 */
-		dn_key t = set_ticks(pkt, q, p); /* ticks i have to wait */
+static int
+fsk_match(void *obj, uintptr_t key, int flags, void *arg)
+{
+	struct dn_fsk *fs = obj;
+	int i = !(flags & DNHT_KEY_IS_OBJ) ? key :
+		((struct dn_fsk *)key)->fs.fs_nr;
 
-		q->sched_time = curr_time;
-		heap_insert(&ready_heap, curr_time + t, (void *)q);
-		/*
-		 * XXX Should check errors on heap_insert, and drain the whole
-		 * queue on error hoping next time we are luckier.
-		 */
-	} else		/* RED needs to know when the queue becomes empty. */
-		q->idle_time = curr_time;
+	return (fs->fs.fs_nr == i);
+}
 
-	/*
-	 * If the delay line was empty call transmit_event() now.
-	 * Otherwise, the scheduler will take care of it.
-	 */
-	if (p_was_empty)
-		transmit_event(p, head, tail);
+static void *
+fsk_new(uintptr_t key, int flags, void *arg)
+{
+	struct dn_fsk *fs;
+
+	fs = malloc(sizeof(*fs), M_DUMMYNET, M_NOWAIT | M_ZERO);
+	if (fs) {
+		set_oid(&fs->fs.oid, DN_FS, sizeof(fs->fs));
+		dn_cfg.fsk_count++;
+		fs->drain_bucket = 0;
+		SLIST_INSERT_HEAD(&dn_cfg.fsu, fs, sch_chain);
+	}
+	return fs;
 }
 
 /*
- * Called when we can transmit packets on WF2Q queues. Take pkts out of
- * the queues at their start time, and enqueue into the delay line.
- * Packets are drained until p->numbytes < 0. As long as
- * len_scaled >= p->numbytes, the packet goes into the delay line
- * with a deadline p->delay. For the last packet, if p->numbytes < 0,
- * there is an additional delay.
+ * detach flowset from its current scheduler. Flags as follows:
+ * DN_DETACH removes from the fsk_list
+ * DN_DESTROY deletes individual queues
+ * DN_DELETE_FS destroys the flowset (otherwise goes in unlinked).
  */
 static void
-ready_event_wfq(struct dn_pipe *p, struct mbuf **head, struct mbuf **tail)
+fsk_detach(struct dn_fsk *fs, int flags)
 {
-	int p_was_empty = (p->head == NULL);
-	struct dn_heap *sch = &(p->scheduler_heap);
-	struct dn_heap *neh = &(p->not_eligible_heap);
-	int64_t p_numbytes = p->numbytes;
-
-	/*
-	 * p->numbytes is only 32bits in FBSD7, but we might need 64 bits.
-	 * Use a local variable for the computations, and write back the
-	 * results when done, saturating if needed.
-	 * The local variable has no impact on performance and helps
-	 * reducing diffs between the various branches.
-	 */
-
-	DUMMYNET_LOCK_ASSERT();
-
-	if (p->if_name[0] == 0)		/* tx clock is simulated */
-		p_numbytes += (curr_time - p->sched_time) * p->bandwidth;
-	else {	/*
-		 * tx clock is for real,
-		 * the ifq must be empty or this is a NOP.
-		 */
-		if (p->ifp && p->ifp->if_snd.ifq_head != NULL)
-			return;
-		else {
-			DPRINTF(("dummynet: pipe %d ready from %s --\n",
-			    p->pipe_nr, p->if_name));
-		}
-	}
-
-	/*
-	 * While we have backlogged traffic AND credit, we need to do
-	 * something on the queue.
-	 */
-	while (p_numbytes >= 0 && (sch->elements > 0 || neh->elements > 0)) {
-		if (sch->elements > 0) {
-			/* Have some eligible pkts to send out. */
-			struct dn_flow_queue *q = sch->p[0].object;
-			struct mbuf *pkt = q->head;
-			struct dn_flow_set *fs = q->fs;
-			uint64_t len = pkt->m_pkthdr.len;
-			int len_scaled = p->bandwidth ? len * 8 * hz : 0;
-
-			heap_extract(sch, NULL); /* Remove queue from heap. */
-			p_numbytes -= len_scaled;
-			move_pkt(pkt, q, p, len);
-
-			p->V += div64((len << MY_M), p->sum);	/* Update V. */
-			q->S = q->F;			/* Update start time. */
-			if (q->len == 0) {
-				/* Flow not backlogged any more. */
-				fs->backlogged--;
-				heap_insert(&(p->idle_heap), q->F, q);
-			} else {
-				/* Still backlogged. */
-
-				/*
-				 * Update F and position in backlogged queue,
-				 * then put flow in not_eligible_heap
-				 * (we will fix this later).
-				 */
-				len = (q->head)->m_pkthdr.len;
-				q->F += div64((len << MY_M), fs->weight);
-				if (DN_KEY_LEQ(q->S, p->V))
-					heap_insert(neh, q->S, q);
-				else
-					heap_insert(sch, q->F, q);
-			}
-		}
-		/*
-		 * Now compute V = max(V, min(S_i)). Remember that all elements
-		 * in sch have by definition S_i <= V so if sch is not empty,
-		 * V is surely the max and we must not update it. Conversely,
-		 * if sch is empty we only need to look at neh.
-		 */
-		if (sch->elements == 0 && neh->elements > 0)
-			p->V = MAX64(p->V, neh->p[0].key);
-		/* Move from neh to sch any packets that have become eligible */
-		while (neh->elements > 0 && DN_KEY_LEQ(neh->p[0].key, p->V)) {
-			struct dn_flow_queue *q = neh->p[0].object;
-			heap_extract(neh, NULL);
-			heap_insert(sch, q->F, q);
-		}
-
-		if (p->if_name[0] != '\0') { /* Tx clock is from a real thing */
-			p_numbytes = -1;	/* Mark not ready for I/O. */
-			break;
-		}
-	}
-	if (sch->elements == 0 && neh->elements == 0 && p_numbytes >= 0) {
-		p->idle_time = curr_time;
-		/*
-		 * No traffic and no events scheduled.
-		 * We can get rid of idle-heap.
-		 */
-		if (p->idle_heap.elements > 0) {
-			int i;
-
-			for (i = 0; i < p->idle_heap.elements; i++) {
-				struct dn_flow_queue *q;
-				
-				q = p->idle_heap.p[i].object;
-				q->F = 0;
-				q->S = q->F + 1;
-			}
-			p->sum = 0;
-			p->V = 0;
-			p->idle_heap.elements = 0;
-		}
+	if (flags & DN_DELETE_FS)
+		flags |= DN_DESTROY;
+	ND("fs %d from sched %d flags %s %s %s",
+		fs->fs.fs_nr, fs->fs.sched_nr,
+		(flags & DN_DELETE_FS) ? "DEL_FS":"",
+		(flags & DN_DESTROY) ? "DEL":"",
+		(flags & DN_DETACH) ? "DET":"");
+	if (flags & DN_DETACH) { /* detach from the list */
+		struct dn_fsk_head *h;
+		h = fs->sched ? &fs->sched->fsk_list : &dn_cfg.fsu;
+		SLIST_REMOVE(h, fs, dn_fsk, sch_chain);
 	}
-	/*
-	 * If we are getting clocks from dummynet (not a real interface) and
-	 * If we are under credit, schedule the next ready event.
-	 * Also fix the delivery time of the last packet.
-	 */
-	if (p->if_name[0]==0 && p_numbytes < 0) { /* This implies bw > 0. */
-		dn_key t = 0;		/* Number of ticks i have to wait. */
-
-		if (p->bandwidth > 0)
-			t = div64(p->bandwidth - 1 - p_numbytes, p->bandwidth);
-		dn_tag_get(p->tail)->output_time += t;
-		p->sched_time = curr_time;
-		heap_insert(&wfq_ready_heap, curr_time + t, (void *)p);
-		/*
-		 * XXX Should check errors on heap_insert, and drain the whole
-		 * queue on error hoping next time we are luckier.
-		 */
+	qht_delete(fs, flags);
+	if (fs->sched && fs->sched->fp->free_fsk)
+		fs->sched->fp->free_fsk(fs);
+	fs->sched = NULL;
+	if (flags & DN_DELETE_FS) {
+		bzero(fs, sizeof(fs));	/* safety */
+		free(fs, M_DUMMYNET);
+		dn_cfg.fsk_count--;
+	} else {
+		SLIST_INSERT_HEAD(&dn_cfg.fsu, fs, sch_chain);
 	}
-
-	/* Write back p_numbytes (adjust 64->32bit if necessary). */
-	p->numbytes = p_numbytes;
-
-	/*
-	 * If the delay line was empty call transmit_event() now.
-	 * Otherwise, the scheduler will take care of it.
-	 */
-	if (p_was_empty)
-		transmit_event(p, head, tail);
 }
 
 /*
- * This is called one tick, after previous run. It is used to
- * schedule next run.
+ * Detach or destroy all flowsets in a list.
+ * flags specifies what to do:
+ * DN_DESTROY:	flush all queues
+ * DN_DELETE_FS:	DN_DESTROY + destroy flowset
+ *	DN_DELETE_FS implies DN_DESTROY
  */
 static void
-dummynet(void * __unused unused)
+fsk_detach_list(struct dn_fsk_head *h, int flags)
 {
-
-	taskqueue_enqueue(dn_tq, &dn_task);
+	struct dn_fsk *fs;
+	int n = 0; /* only for stats */
+
+	ND("head %p flags %x", h, flags);
+	while ((fs = SLIST_FIRST(h))) {
+		SLIST_REMOVE_HEAD(h, sch_chain);
+		n++;
+		fsk_detach(fs, flags);
+	}
+	ND("done %d flowsets", n);
 }
 
 /*
- * The timer handler for dummynet. Time is computed in ticks, but
- * but the code is tolerant to the actual rate at which this is called.
- * Once complete, the function reschedules itself for the next tick.
+ * called on 'queue X delete' -- removes the flowset from fshash,
+ * deletes all queues for the flowset, and removes the flowset.
  */
-static void
-dummynet_task(void *context, int pending)
+static int
+delete_fs(int i, int locked)
 {
-	struct mbuf *head = NULL, *tail = NULL;
-	struct dn_pipe *pipe;
-	struct dn_heap *heaps[3];
-	struct dn_heap *h;
-	void *p;	/* generic parameter to handler */
-	int i;
-	struct timeval t;
-
-	DUMMYNET_LOCK();
-
-	heaps[0] = &ready_heap;			/* fixed-rate queues */
-	heaps[1] = &wfq_ready_heap;		/* wfq queues */
-	heaps[2] = &extract_heap;		/* delay line */
-
- 	/* Update number of lost(coalesced) ticks. */
- 	tick_lost += pending - 1;
- 
- 	getmicrouptime(&t);
- 	/* Last tick duration (usec). */
- 	tick_last = (t.tv_sec - prev_t.tv_sec) * 1000000 +
- 	    (t.tv_usec - prev_t.tv_usec);
- 	/* Last tick vs standard tick difference (usec). */
- 	tick_delta = (tick_last * hz - 1000000) / hz;
- 	/* Accumulated tick difference (usec). */
- 	tick_delta_sum += tick_delta;
- 
- 	prev_t = t;
- 
- 	/*
- 	 * Adjust curr_time if accumulated tick difference greater than
- 	 * 'standard' tick. Since curr_time should be monotonically increasing,
- 	 * we do positive adjustment as required and throttle curr_time in
- 	 * case of negative adjustment.
- 	 */
-  	curr_time++;
- 	if (tick_delta_sum - tick >= 0) {
- 		int diff = tick_delta_sum / tick;
- 
- 		curr_time += diff;
- 		tick_diff += diff;
- 		tick_delta_sum %= tick;
- 		tick_adjustment++;
- 	} else if (tick_delta_sum + tick <= 0) {
- 		curr_time--;
- 		tick_diff--;
- 		tick_delta_sum += tick;
- 		tick_adjustment++;
- 	}
-
-	for (i = 0; i < 3; i++) {
-		h = heaps[i];
-		while (h->elements > 0 && DN_KEY_LEQ(h->p[0].key, curr_time)) {
-			if (h->p[0].key > curr_time)
-				printf("dummynet: warning, "
-				    "heap %d is %d ticks late\n",
-				    i, (int)(curr_time - h->p[0].key));
-			/* store a copy before heap_extract */
-			p = h->p[0].object;
-			/* need to extract before processing */
-			heap_extract(h, NULL);
-			if (i == 0)
-				ready_event(p, &head, &tail);
-			else if (i == 1) {
-				struct dn_pipe *pipe = p;
-				if (pipe->if_name[0] != '\0')
-					printf("dummynet: bad ready_event_wfq "
-					    "for pipe %s\n", pipe->if_name);
-				else
-					ready_event_wfq(p, &head, &tail);
-			} else
-				transmit_event(p, &head, &tail);
-		}
-	}
-
-	/* Sweep pipes trying to expire idle flow_queues. */
-	for (i = 0; i < HASHSIZE; i++) {
-		SLIST_FOREACH(pipe, &pipehash[i], next) {
-			if (pipe->idle_heap.elements > 0 &&
-			    DN_KEY_LT(pipe->idle_heap.p[0].key, pipe->V)) {
-				struct dn_flow_queue *q =
-				    pipe->idle_heap.p[0].object;
-
-				heap_extract(&(pipe->idle_heap), NULL);
-				/* Mark timestamp as invalid. */
-				q->S = q->F + 1;
-				pipe->sum -= q->fs->weight;
-			}
-		}
-	}
-
-	DUMMYNET_UNLOCK();
+	struct dn_fsk *fs;
+	int err = 0;
+
+	if (!locked)
+		DN_BH_WLOCK();
+	fs = dn_ht_find(dn_cfg.fshash, i, DNHT_REMOVE, NULL);
+	ND("fs %d found %p", i, fs);
+	if (fs) {
+		fsk_detach(fs, DN_DETACH | DN_DELETE_FS);
+		err = 0;
+	} else
+		err = EINVAL;
+	if (!locked)
+		DN_BH_WUNLOCK();
+	return err;
+}
 
-	if (head != NULL)
-		dummynet_send(head);
+/*----- end of flowset hashtable support -------------*/
 
-	callout_reset(&dn_timeout, 1, dummynet, NULL);
+/*------------------------------------------------------------
+ * Scheduler hash. When searching by index we pass sched_nr,
+ * otherwise we pass struct dn_sch * which is the first field in
+ * struct dn_schk so we can cast between the two. We use this trick
+ * because in the create phase (but it should be fixed).
+ */
+static uint32_t
+schk_hash(uintptr_t key, int flags, void *_arg)
+{
+	uint32_t i = !(flags & DNHT_KEY_IS_OBJ) ? key :
+		((struct dn_schk *)key)->sch.sched_nr;
+	return ( (i>>8)^(i>>4)^i );
 }
 
-static void
-dummynet_send(struct mbuf *m)
+static int
+schk_match(void *obj, uintptr_t key, int flags, void *_arg)
 {
-	struct mbuf *n;
-
-	for (; m != NULL; m = n) {
-		struct ifnet *ifp;
-		int dst;
-        	struct m_tag *tag;
-
-		n = m->m_nextpkt;
-		m->m_nextpkt = NULL;
-		tag = m_tag_first(m);
-		if (tag == NULL) {
-			dst = DIR_DROP;
-		} else {
-			struct dn_pkt_tag *pkt = dn_tag_get(m);
-			/* extract the dummynet info, rename the tag */
-			dst = pkt->dn_dir;
-			ifp = pkt->ifp;
-			/* rename the tag so it carries reinject info */
-			tag->m_tag_cookie = MTAG_IPFW_RULE;
-			tag->m_tag_id = 0;
-		}
-
-		switch (dst) {
-		case DIR_OUT:
-			SET_HOST_IPLEN(mtod(m, struct ip *));
-			ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL);
-			break ;
-		case DIR_IN :
-			/* put header in network format for ip_input() */
-			//SET_NET_IPLEN(mtod(m, struct ip *));
-			netisr_dispatch(NETISR_IP, m);
-			break;
-#ifdef INET6
-		case DIR_IN | PROTO_IPV6:
-			netisr_dispatch(NETISR_IPV6, m);
-			break;
-
-		case DIR_OUT | PROTO_IPV6:
-			SET_HOST_IPLEN(mtod(m, struct ip *));
-			ip6_output(m, NULL, NULL, IPV6_FORWARDING, NULL, NULL, NULL);
-			break;
-#endif
-		case DIR_FWD | PROTO_IFB: /* DN_TO_IFB_FWD: */
-			if (bridge_dn_p != NULL)
-				((*bridge_dn_p)(m, ifp));
-			else
-				printf("dummynet: if_bridge not loaded\n");
-
-			break;
-		case DIR_IN | PROTO_LAYER2: /* DN_TO_ETH_DEMUX: */
-			/*
-			 * The Ethernet code assumes the Ethernet header is
-			 * contiguous in the first mbuf header.
-			 * Insure this is true.
-			 */
-			if (m->m_len < ETHER_HDR_LEN &&
-			    (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) {
-				printf("dummynet/ether: pullup failed, "
-				    "dropping packet\n");
-				break;
-			}
-			ether_demux(m->m_pkthdr.rcvif, m);
-			break;
-		case DIR_OUT | PROTO_LAYER2: /* N_TO_ETH_OUT: */
-			ether_output_frame(ifp, m);
-			break;
-
-		case DIR_DROP:
-			/* drop the packet after some time */
-			FREE_PKT(m);
-			break;
-
-		default:
-			printf("dummynet: bad switch %d!\n", dst);
-			FREE_PKT(m);
-			break;
-		}
-	}
+	struct dn_schk *s = (struct dn_schk *)obj;
+	int i = !(flags & DNHT_KEY_IS_OBJ) ? key :
+		((struct dn_schk *)key)->sch.sched_nr;
+	return (s->sch.sched_nr == i);
 }
 
 /*
- * Unconditionally expire empty queues in case of shortage.
- * Returns the number of queues freed.
+ * Create the entry and intialize with the sched hash if needed.
+ * Leave s->fp unset so we can tell whether a dn_ht_find() returns
+ * a new object or a previously existing one.
  */
-static int
-expire_queues(struct dn_flow_set *fs)
-{
-    struct dn_flow_queue *q, *prev ;
-    int i, initial_elements = fs->rq_elements ;
-
-    if (fs->last_expired == time_uptime)
-	return 0 ;
-    fs->last_expired = time_uptime ;
-    for (i = 0 ; i <= fs->rq_size ; i++) { /* last one is overflow */
-	for (prev=NULL, q = fs->rq[i] ; q != NULL ; ) {
-	    if (!QUEUE_IS_IDLE(q)) {
-  		prev = q ;
-  	        q = q->next ;
-  	    } else { /* entry is idle, expire it */
-		struct dn_flow_queue *old_q = q ;
-
-		if (prev != NULL)
-		    prev->next = q = q->next ;
-		else
-		    fs->rq[i] = q = q->next ;
-		fs->rq_elements-- ;
-		free(old_q, M_DUMMYNET);
-	    }
+static void *
+schk_new(uintptr_t key, int flags, void *arg)
+{
+	struct schk_new_arg *a = arg;
+	struct dn_schk *s;
+	int l = sizeof(*s) +a->fp->schk_datalen;
+
+	s = malloc(l, M_DUMMYNET, M_NOWAIT | M_ZERO);
+	if (s == NULL)
+		return NULL;
+	set_oid(&s->link.oid, DN_LINK, sizeof(s->link));
+	s->sch = *a->sch; // copy initial values
+	s->link.link_nr = s->sch.sched_nr;
+	SLIST_INIT(&s->fsk_list);
+	/* initialize the hash table or create the single instance */
+	s->fp = a->fp;	/* si_new needs this */
+	s->drain_bucket = 0;
+	if (s->sch.flags & DN_HAVE_MASK) {
+		s->siht = dn_ht_init(NULL, s->sch.buckets,
+			offsetof(struct dn_sch_inst, si_next),
+			si_hash, si_match, si_new);
+		if (s->siht == NULL) {
+			free(s, M_DUMMYNET);
+			return NULL;
+		}
 	}
-    }
-    return initial_elements - fs->rq_elements ;
+	s->fp = NULL;	/* mark as a new scheduler */
+	dn_cfg.schk_count++;
+	return s;
 }
 
 /*
- * If room, create a new queue and put at head of slot i;
- * otherwise, create or use the default queue.
+ * Callback for sched delete. Notify all attached flowsets to
+ * detach from the scheduler, destroy the internal flowset, and
+ * all instances. The scheduler goes away too.
+ * arg is 0 (only detach flowsets and destroy instances)
+ * DN_DESTROY (detach & delete queues, delete schk)
+ * or DN_DELETE_FS (delete queues and flowsets, delete schk)
  */
-static struct dn_flow_queue *
-create_queue(struct dn_flow_set *fs, int i)
+static int
+schk_delete_cb(void *obj, void *arg)
 {
-	struct dn_flow_queue *q;
-
-	if (fs->rq_elements > fs->rq_size * dn_max_ratio &&
-	    expire_queues(fs) == 0) {
-		/* No way to get room, use or create overflow queue. */
-		i = fs->rq_size;
-		if (fs->rq[i] != NULL)
-		    return fs->rq[i];
-	}
-	q = malloc(sizeof(*q), M_DUMMYNET, M_NOWAIT | M_ZERO);
-	if (q == NULL) {
-		printf("dummynet: sorry, cannot allocate queue for new flow\n");
-		return (NULL);
+	struct dn_schk *s = obj;
+#if 0
+	int a = (int)arg;
+	ND("sched %d arg %s%s",
+		s->sch.sched_nr,
+		a&DN_DESTROY ? "DEL ":"",
+		a&DN_DELETE_FS ? "DEL_FS":"");
+#endif
+	fsk_detach_list(&s->fsk_list, arg ? DN_DESTROY : 0);
+	/* no more flowset pointing to us now */
+	if (s->sch.flags & DN_HAVE_MASK)
+		dn_ht_scan(s->siht, si_destroy, NULL);
+	else if (s->siht)
+		si_destroy(s->siht, NULL);
+	if (s->profile) {
+		free(s->profile, M_DUMMYNET);
+		s->profile = NULL;
 	}
-	q->fs = fs;
-	q->hash_slot = i;
-	q->next = fs->rq[i];
-	q->S = q->F + 1;	/* hack - mark timestamp as invalid. */
-	q->numbytes = fs->pipe->burst + (io_fast ? fs->pipe->bandwidth : 0);
-	fs->rq[i] = q;
-	fs->rq_elements++;
-	return (q);
+	s->siht = NULL;
+	if (s->fp->destroy)
+		s->fp->destroy(s);
+	bzero(s, sizeof(*s));	// safety
+	free(obj, M_DUMMYNET);
+	dn_cfg.schk_count--;
+	return DNHT_SCAN_DEL;
 }
 
 /*
- * Given a flow_set and a pkt in last_pkt, find a matching queue
- * after appropriate masking. The queue is moved to front
- * so that further searches take less time.
+ * called on a 'sched X delete' command. Deletes a single scheduler.
+ * This is done by removing from the schedhash, unlinking all
+ * flowsets and deleting their traffic.
  */
-static struct dn_flow_queue *
-find_queue(struct dn_flow_set *fs, struct ipfw_flow_id *id)
-{
-    int i = 0 ; /* we need i and q for new allocations */
-    struct dn_flow_queue *q, *prev;
-    int is_v6 = IS_IP6_FLOW_ID(id);
-
-    if ( !(fs->flags_fs & DN_HAVE_FLOW_MASK) )
-	q = fs->rq[0] ;
-    else {
-	/* first, do the masking, then hash */
-	id->dst_port &= fs->flow_mask.dst_port ;
-	id->src_port &= fs->flow_mask.src_port ;
-	id->proto &= fs->flow_mask.proto ;
-	id->flags = 0 ; /* we don't care about this one */
-	if (is_v6) {
-	    APPLY_MASK(&id->dst_ip6, &fs->flow_mask.dst_ip6);
-	    APPLY_MASK(&id->src_ip6, &fs->flow_mask.src_ip6);
-	    id->flow_id6 &= fs->flow_mask.flow_id6;
-
-	    i = ((id->dst_ip6.__u6_addr.__u6_addr32[0]) & 0xffff)^
-		((id->dst_ip6.__u6_addr.__u6_addr32[1]) & 0xffff)^
-		((id->dst_ip6.__u6_addr.__u6_addr32[2]) & 0xffff)^
-		((id->dst_ip6.__u6_addr.__u6_addr32[3]) & 0xffff)^
-
-		((id->dst_ip6.__u6_addr.__u6_addr32[0] >> 15) & 0xffff)^
-		((id->dst_ip6.__u6_addr.__u6_addr32[1] >> 15) & 0xffff)^
-		((id->dst_ip6.__u6_addr.__u6_addr32[2] >> 15) & 0xffff)^
-		((id->dst_ip6.__u6_addr.__u6_addr32[3] >> 15) & 0xffff)^
-
-		((id->src_ip6.__u6_addr.__u6_addr32[0] << 1) & 0xfffff)^
-		((id->src_ip6.__u6_addr.__u6_addr32[1] << 1) & 0xfffff)^
-		((id->src_ip6.__u6_addr.__u6_addr32[2] << 1) & 0xfffff)^
-		((id->src_ip6.__u6_addr.__u6_addr32[3] << 1) & 0xfffff)^
-
-		((id->src_ip6.__u6_addr.__u6_addr32[0] << 16) & 0xffff)^
-		((id->src_ip6.__u6_addr.__u6_addr32[1] << 16) & 0xffff)^
-		((id->src_ip6.__u6_addr.__u6_addr32[2] << 16) & 0xffff)^
-		((id->src_ip6.__u6_addr.__u6_addr32[3] << 16) & 0xffff)^
-
-		(id->dst_port << 1) ^ (id->src_port) ^
-		(id->proto ) ^
-		(id->flow_id6);
-	} else {
-	    id->dst_ip &= fs->flow_mask.dst_ip ;
-	    id->src_ip &= fs->flow_mask.src_ip ;
-
-	    i = ( (id->dst_ip) & 0xffff ) ^
-		( (id->dst_ip >> 15) & 0xffff ) ^
-		( (id->src_ip << 1) & 0xffff ) ^
-		( (id->src_ip >> 16 ) & 0xffff ) ^
-		(id->dst_port << 1) ^ (id->src_port) ^
-		(id->proto );
-	}
-	i = i % fs->rq_size ;
-	/* finally, scan the current list for a match */
-	searches++ ;
-	for (prev=NULL, q = fs->rq[i] ; q ; ) {
-	    search_steps++;
-	    if (is_v6 &&
-		    IN6_ARE_ADDR_EQUAL(&id->dst_ip6,&q->id.dst_ip6) &&  
-		    IN6_ARE_ADDR_EQUAL(&id->src_ip6,&q->id.src_ip6) &&  
-		    id->dst_port == q->id.dst_port &&
-		    id->src_port == q->id.src_port &&
-		    id->proto == q->id.proto &&
-		    id->flags == q->id.flags &&
-		    id->flow_id6 == q->id.flow_id6)
-		break ; /* found */
-
-	    if (!is_v6 && id->dst_ip == q->id.dst_ip &&
-		    id->src_ip == q->id.src_ip &&
-		    id->dst_port == q->id.dst_port &&
-		    id->src_port == q->id.src_port &&
-		    id->proto == q->id.proto &&
-		    id->flags == q->id.flags)
-		break ; /* found */
-
-	    /* No match. Check if we can expire the entry */
-	    if (pipe_expire && QUEUE_IS_IDLE(q)) {
-		/* entry is idle and not in any heap, expire it */
-		struct dn_flow_queue *old_q = q ;
-
-		if (prev != NULL)
-		    prev->next = q = q->next ;
-		else
-		    fs->rq[i] = q = q->next ;
-		fs->rq_elements-- ;
-		free(old_q, M_DUMMYNET);
-		continue ;
-	    }
-	    prev = q ;
-	    q = q->next ;
-	}
-	if (q && prev != NULL) { /* found and not in front */
-	    prev->next = q->next ;
-	    q->next = fs->rq[i] ;
-	    fs->rq[i] = q ;
-	}
-    }
-    if (q == NULL) { /* no match, need to allocate a new entry */
-	q = create_queue(fs, i);
-	if (q != NULL)
-	q->id = *id ;
-    }
-    return q ;
+static int
+delete_schk(int i)
+{
+	struct dn_schk *s;
+
+	s = dn_ht_find(dn_cfg.schedhash, i, DNHT_REMOVE, NULL);
+	ND("%d %p", i, s);
+	if (!s)
+		return EINVAL;
+	delete_fs(i + DN_MAX_ID, 1); /* first delete internal fs */
+	/* then detach flowsets, delete traffic */
+	schk_delete_cb(s, (void*)(uintptr_t)DN_DESTROY);
+	return 0;
 }
+/*--- end of schk hashtable support ---*/
 
 static int
-red_drops(struct dn_flow_set *fs, struct dn_flow_queue *q, int len)
+copy_obj(char **start, char *end, void *_o, const char *msg, int i)
 {
-	/*
-	 * RED algorithm
-	 *
-	 * RED calculates the average queue size (avg) using a low-pass filter
-	 * with an exponential weighted (w_q) moving average:
-	 * 	avg  <-  (1-w_q) * avg + w_q * q_size
-	 * where q_size is the queue length (measured in bytes or * packets).
-	 *
-	 * If q_size == 0, we compute the idle time for the link, and set
-	 *	avg = (1 - w_q)^(idle/s)
-	 * where s is the time needed for transmitting a medium-sized packet.
-	 *
-	 * Now, if avg < min_th the packet is enqueued.
-	 * If avg > max_th the packet is dropped. Otherwise, the packet is
-	 * dropped with probability P function of avg.
-	 */
+	struct dn_id *o = _o;
+	int have = end - *start;
 
-	int64_t p_b = 0;
-
-	/* Queue in bytes or packets? */
-	u_int q_size = (fs->flags_fs & DN_QSIZE_IS_BYTES) ?
-	    q->len_bytes : q->len;
-
-	DPRINTF(("\ndummynet: %d q: %2u ", (int)curr_time, q_size));
-
-	/* Average queue size estimation. */
-	if (q_size != 0) {
-		/* Queue is not empty, avg <- avg + (q_size - avg) * w_q */
-		int diff = SCALE(q_size) - q->avg;
-		int64_t v = SCALE_MUL((int64_t)diff, (int64_t)fs->w_q);
-
-		q->avg += (int)v;
-	} else {
-		/*
-		 * Queue is empty, find for how long the queue has been
-		 * empty and use a lookup table for computing
-		 * (1 - * w_q)^(idle_time/s) where s is the time to send a
-		 * (small) packet.
-		 * XXX check wraps...
-		 */
-		if (q->avg) {
-			u_int t = div64(curr_time - q->idle_time,
-			    fs->lookup_step);
-
-			q->avg = (t < fs->lookup_depth) ?
-			    SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0;
-		}
-	}
-	DPRINTF(("dummynet: avg: %u ", SCALE_VAL(q->avg)));
-
-	/* Should i drop? */
-	if (q->avg < fs->min_th) {
-		q->count = -1;
-		return (0);	/* accept packet */
-	}
-	if (q->avg >= fs->max_th) {	/* average queue >=  max threshold */
-		if (fs->flags_fs & DN_IS_GENTLE_RED) {
-			/*
-			 * According to Gentle-RED, if avg is greater than
-			 * max_th the packet is dropped with a probability
-			 *	 p_b = c_3 * avg - c_4
-			 * where c_3 = (1 - max_p) / max_th
-			 *       c_4 = 1 - 2 * max_p
-			 */
-			p_b = SCALE_MUL((int64_t)fs->c_3, (int64_t)q->avg) -
-			    fs->c_4;
-		} else {
-			q->count = -1;
-			DPRINTF(("dummynet: - drop"));
-			return (1);
-		}
-	} else if (q->avg > fs->min_th) {
-		/*
-		 * We compute p_b using the linear dropping function
-		 *	 p_b = c_1 * avg - c_2
-		 * where c_1 = max_p / (max_th - min_th)
-		 * 	 c_2 = max_p * min_th / (max_th - min_th)
-		 */
-		p_b = SCALE_MUL((int64_t)fs->c_1, (int64_t)q->avg) - fs->c_2;
+	if (have < o->len || o->len == 0 || o->type == 0) {
+		D("ERROR type %d %s %d have %d need %d",
+			o->type, msg, i, have, o->len);
+		return 1;
 	}
-
-	if (fs->flags_fs & DN_QSIZE_IS_BYTES)
-		p_b = div64(p_b * len, fs->max_pkt_size);
-	if (++q->count == 0)
-		q->random = random() & 0xffff;
-	else {
-		/*
-		 * q->count counts packets arrived since last drop, so a greater
-		 * value of q->count means a greater packet drop probability.
-		 */
-		if (SCALE_MUL(p_b, SCALE((int64_t)q->count)) > q->random) {
-			q->count = 0;
-			DPRINTF(("dummynet: - red drop"));
-			/* After a drop we calculate a new random value. */
-			q->random = random() & 0xffff;
-			return (1);	/* drop */
-		}
+	ND("type %d %s %d len %d", o->type, msg, i, o->len);
+	bcopy(_o, *start, o->len);
+	if (o->type == DN_LINK) {
+		/* Adjust burst parameter for link */
+		struct dn_link *l = (struct dn_link *)*start;
+		l->burst =  div64(l->burst, 8 * hz);
+	} else if (o->type == DN_SCH) {
+		/* Set id->id to the number of instances */
+		struct dn_schk *s = _o;
+		struct dn_id *id = (struct dn_id *)(*start);
+		id->id = (s->sch.flags & DN_HAVE_MASK) ?
+			dn_ht_entries(s->siht) : (s->siht ? 1 : 0);
 	}
-	/* End of RED algorithm. */
-
-	return (0);	/* accept */
+	*start += o->len;
+	return 0;
 }
 
-static __inline struct dn_flow_set *
-locate_flowset(int fs_nr)
+/* Specific function to copy a queue.
+ * It copies only the common part of a queue, and correctly set
+ * the length
+ */
+static int
+copy_obj_q(char **start, char *end, void *_o, const char *msg, int i)
 {
-	struct dn_flow_set *fs;
-
-	SLIST_FOREACH(fs, &flowsethash[HASH(fs_nr)], next)
-		if (fs->fs_nr == fs_nr)
-			return (fs);
-
-	return (NULL);
+	struct dn_id *o = _o;
+	int have = end - *start;
+	int len = sizeof(struct dn_queue);
+
+	if (have < len || o->len == 0 || o->type != DN_QUEUE) {
+		D("ERROR type %d %s %d have %d need %d",
+			o->type, msg, i, have, len);
+		return 1;
+	}
+	ND("type %d %s %d len %d", o->type, msg, i, len);
+	bcopy(_o, *start, len);
+	((struct dn_id*)(*start))->len = len;
+	*start += len;
+	return 0;
 }
 
-static __inline struct dn_pipe *
-locate_pipe(int pipe_nr)
+static int
+copy_q_cb(void *obj, void *arg)
 {
-	struct dn_pipe *pipe;
-
-	SLIST_FOREACH(pipe, &pipehash[HASH(pipe_nr)], next)
-		if (pipe->pipe_nr == pipe_nr)
-			return (pipe);
-
-	return (NULL);
+	struct dn_queue *q = obj;
+	struct copy_args *a = arg;
+	struct dn_flow *ni = (struct dn_flow *)(*a->start);
+        if (copy_obj_q(a->start, a->end, &q->ni, "queue", -1))
+                return DNHT_SCAN_END;
+        ni->oid.type = DN_FLOW; /* override the DN_QUEUE */
+        ni->oid.id = si_hash((uintptr_t)&ni->fid, 0, NULL);
+        return 0;
 }
 
-/*
- * dummynet hook for packets. Below 'pipe' is a pipe or a queue
- * depending on whether WF2Q or fixed bw is used.
- *
- * pipe_nr	pipe or queue the packet is destined for.
- * dir		where shall we send the packet after dummynet.
- * m		the mbuf with the packet
- * ifp		the 'ifp' parameter from the caller.
- *		NULL in ip_input, destination interface in ip_output,
- * rule		matching rule, in case of multiple passes
- */
 static int
-dummynet_io(struct mbuf **m0, int dir, struct ip_fw_args *fwa)
-{
-	struct mbuf *m = *m0, *head = NULL, *tail = NULL;
-	struct dn_pkt_tag *pkt;
-	struct m_tag *mtag;
-	struct dn_flow_set *fs = NULL;
-	struct dn_pipe *pipe;
-	uint64_t len = m->m_pkthdr.len;
-	struct dn_flow_queue *q = NULL;
-	int is_pipe = fwa->rule.info & IPFW_IS_PIPE;
-
-	KASSERT(m->m_nextpkt == NULL,
-	    ("dummynet_io: mbuf queue passed to dummynet"));
-
-	DUMMYNET_LOCK();
-	io_pkt++;
-	/*
-	 * This is a dummynet rule, so we expect an O_PIPE or O_QUEUE rule.
-	 */
-	if (is_pipe) {
-		pipe = locate_pipe(fwa->rule.info & IPFW_INFO_MASK);
-		if (pipe != NULL)
-			fs = &(pipe->fs);
-	} else
-		fs = locate_flowset(fwa->rule.info & IPFW_INFO_MASK);
-
-	if (fs == NULL)
-		goto dropit;	/* This queue/pipe does not exist! */
-	pipe = fs->pipe;
-	if (pipe == NULL) {	/* Must be a queue, try find a matching pipe. */
-		pipe = locate_pipe(fs->parent_nr);
-		if (pipe != NULL)
-			fs->pipe = pipe;
-		else {
-			printf("dummynet: no pipe %d for queue %d, drop pkt\n",
-			    fs->parent_nr, fs->fs_nr);
-			goto dropit;
-		}
-	}
-	q = find_queue(fs, &(fwa->f_id));
-	if (q == NULL)
-		goto dropit;		/* Cannot allocate queue. */
-
-	/* Update statistics, then check reasons to drop pkt. */
-	q->tot_bytes += len;
-	q->tot_pkts++;
-	if (fs->plr && random() < fs->plr)
-		goto dropit;		/* Random pkt drop. */
-	if (fs->flags_fs & DN_QSIZE_IS_BYTES) {
-		if (q->len_bytes > fs->qsize)
-			goto dropit;	/* Queue size overflow. */
-	} else {
-		if (q->len >= fs->qsize)
-			goto dropit;	/* Queue count overflow. */
-	}
-	if (fs->flags_fs & DN_IS_RED && red_drops(fs, q, len))
-		goto dropit;
-
-	/* XXX expensive to zero, see if we can remove it. */
-	mtag = m_tag_get(PACKET_TAG_DUMMYNET,
-	    sizeof(struct dn_pkt_tag), M_NOWAIT | M_ZERO);
-	if (mtag == NULL)
-		goto dropit;		/* Cannot allocate packet header. */
-	m_tag_prepend(m, mtag);		/* Attach to mbuf chain. */
-
-	pkt = (struct dn_pkt_tag *)(mtag + 1);
-	/*
-	 * Ok, i can handle the pkt now...
-	 * Build and enqueue packet + parameters.
-	 */
-	pkt->rule = fwa->rule;
-	pkt->rule.info &= IPFW_ONEPASS;	/* only keep this info */
-	pkt->dn_dir = dir;
-	pkt->ifp = fwa->oif;
-
-	if (q->head == NULL)
-		q->head = m;
+copy_q(struct copy_args *a, struct dn_fsk *fs, int flags)
+{
+	if (!fs->qht)
+		return 0;
+	if (fs->fs.flags & DN_QHT_HASH)
+		dn_ht_scan(fs->qht, copy_q_cb, a);
 	else
-		q->tail->m_nextpkt = m;
-	q->tail = m;
-	q->len++;
-	q->len_bytes += len;
-
-	if (q->head != m)		/* Flow was not idle, we are done. */
-		goto done;
-
-	if (is_pipe) {			/* Fixed rate queues. */
-		if (q->idle_time < curr_time) {
-			/* Calculate available burst size. */
-			q->numbytes +=
-			    (curr_time - q->idle_time - 1) * pipe->bandwidth;
-			if (q->numbytes > pipe->burst)
-				q->numbytes = pipe->burst;
-			if (io_fast)
-				q->numbytes += pipe->bandwidth;
-		}
-	} else {			/* WF2Q. */
-		if (pipe->idle_time < curr_time &&
-		    pipe->scheduler_heap.elements == 0 &&
-		    pipe->not_eligible_heap.elements == 0) {
-			/* Calculate available burst size. */
-			pipe->numbytes +=
-			    (curr_time - pipe->idle_time - 1) * pipe->bandwidth;
-			if (pipe->numbytes > 0 && pipe->numbytes > pipe->burst)
-				pipe->numbytes = pipe->burst;
-			if (io_fast)
-				pipe->numbytes += pipe->bandwidth;
-		}
-		pipe->idle_time = curr_time;
-	}
-	/* Necessary for both: fixed rate & WF2Q queues. */
-	q->idle_time = curr_time;
-
-	/*
-	 * If we reach this point the flow was previously idle, so we need
-	 * to schedule it. This involves different actions for fixed-rate or
-	 * WF2Q queues.
-	 */
-	if (is_pipe) {
-		/* Fixed-rate queue: just insert into the ready_heap. */
-		dn_key t = 0;
-
-		if (pipe->bandwidth) {
-			q->extra_bits = compute_extra_bits(m, pipe);
-			t = set_ticks(m, q, pipe);
-		}
-		q->sched_time = curr_time;
-		if (t == 0)		/* Must process it now. */
-			ready_event(q, &head, &tail);
-		else
-			heap_insert(&ready_heap, curr_time + t , q);
-	} else {
-		/*
-		 * WF2Q. First, compute start time S: if the flow was
-		 * idle (S = F + 1) set S to the virtual time V for the
-		 * controlling pipe, and update the sum of weights for the pipe;
-		 * otherwise, remove flow from idle_heap and set S to max(F,V).
-		 * Second, compute finish time F = S + len / weight.
-		 * Third, if pipe was idle, update V = max(S, V).
-		 * Fourth, count one more backlogged flow.
-		 */
-		if (DN_KEY_GT(q->S, q->F)) { /* Means timestamps are invalid. */
-			q->S = pipe->V;
-			pipe->sum += fs->weight; /* Add weight of new queue. */
-		} else {
-			heap_extract(&(pipe->idle_heap), q);
-			q->S = MAX64(q->F, pipe->V);
-		}
-		q->F = q->S + div64(len << MY_M, fs->weight);
-
-		if (pipe->not_eligible_heap.elements == 0 &&
-		    pipe->scheduler_heap.elements == 0)
-			pipe->V = MAX64(q->S, pipe->V);
-		fs->backlogged++;
-		/*
-		 * Look at eligibility. A flow is not eligibile if S>V (when
-		 * this happens, it means that there is some other flow already
-		 * scheduled for the same pipe, so the scheduler_heap cannot be
-		 * empty). If the flow is not eligible we just store it in the
-		 * not_eligible_heap. Otherwise, we store in the scheduler_heap
-		 * and possibly invoke ready_event_wfq() right now if there is
-		 * leftover credit.
-		 * Note that for all flows in scheduler_heap (SCH), S_i <= V,
-		 * and for all flows in not_eligible_heap (NEH), S_i > V.
-		 * So when we need to compute max(V, min(S_i)) forall i in
-		 * SCH+NEH, we only need to look into NEH.
-		 */
-		if (DN_KEY_GT(q->S, pipe->V)) {		/* Not eligible. */
-			if (pipe->scheduler_heap.elements == 0)
-				printf("dummynet: ++ ouch! not eligible but empty scheduler!\n");
-			heap_insert(&(pipe->not_eligible_heap), q->S, q);
-		} else {
-			heap_insert(&(pipe->scheduler_heap), q->F, q);
-			if (pipe->numbytes >= 0) {	 /* Pipe is idle. */
-				if (pipe->scheduler_heap.elements != 1)
-					printf("dummynet: OUCH! pipe should have been idle!\n");
-				DPRINTF(("dummynet: waking up pipe %d at %d\n",
-				    pipe->pipe_nr, (int)(q->F >> MY_M)));
-				pipe->sched_time = curr_time;
-				ready_event_wfq(pipe, &head, &tail);
-			}
-		}
-	}
-done:
-	if (head == m && (dir & PROTO_LAYER2) == 0 ) {
-		/* Fast io. */
-		io_pkt_fast++;
-		if (m->m_nextpkt != NULL)
-			printf("dummynet: fast io: pkt chain detected!\n");
-		head = m->m_nextpkt = NULL;
-	} else
-		*m0 = NULL;		/* Normal io. */
-
-	DUMMYNET_UNLOCK();
-	if (head != NULL)
-		dummynet_send(head);
-	return (0);
-
-dropit:
-	io_pkt_drop++;
-	if (q)
-		q->drops++;
-	DUMMYNET_UNLOCK();
-	FREE_PKT(m);
-	*m0 = NULL;
-	return ((fs && (fs->flags_fs & DN_NOERROR)) ? 0 : ENOBUFS);
+		copy_q_cb(fs->qht, a);
+	return 0;
 }
 
 /*
- * Dispose all packets and flow_queues on a flow_set.
- * If all=1, also remove red lookup table and other storage,
- * including the descriptor itself.
- * For the one in dn_pipe MUST also cleanup ready_heap...
+ * This routine only copies the initial part of a profile ? XXX
  */
-static void
-purge_flow_set(struct dn_flow_set *fs, int all)
+static int
+copy_profile(struct copy_args *a, struct dn_profile *p)
 {
-	struct dn_flow_queue *q, *qn;
-	int i;
+	int have = a->end - *a->start;
+	/* XXX here we check for max length */
+	int profile_len = sizeof(struct dn_profile) - 
+		ED_MAX_SAMPLES_NO*sizeof(int);
 
-	DUMMYNET_LOCK_ASSERT();
-
-	for (i = 0; i <= fs->rq_size; i++) {
-		for (q = fs->rq[i]; q != NULL; q = qn) {
-			dn_free_pkts(q->head);
-			qn = q->next;
-			free(q, M_DUMMYNET);
-		}
-		fs->rq[i] = NULL;
+	if (p == NULL)
+		return 0;
+	if (have < profile_len) {
+		D("error have %d need %d", have, profile_len);
+		return 1;
 	}
+	bcopy(p, *a->start, profile_len);
+	((struct dn_id *)(*a->start))->len = profile_len;
+	*a->start += profile_len;
+	return 0;
+}
 
-	fs->rq_elements = 0;
-	if (all) {
-		/* RED - free lookup table. */
-		if (fs->w_q_lookup != NULL)
-			free(fs->w_q_lookup, M_DUMMYNET);
-		if (fs->rq != NULL)
-			free(fs->rq, M_DUMMYNET);
-		/* If this fs is not part of a pipe, free it. */
-		if (fs->pipe == NULL || fs != &(fs->pipe->fs))
-			free(fs, M_DUMMYNET);
+static int
+copy_flowset(struct copy_args *a, struct dn_fsk *fs, int flags)
+{
+	struct dn_fs *ufs = (struct dn_fs *)(*a->start);
+	if (!fs)
+		return 0;
+	ND("flowset %d", fs->fs.fs_nr);
+	if (copy_obj(a->start, a->end, &fs->fs, "flowset", fs->fs.fs_nr))
+		return DNHT_SCAN_END;
+	ufs->oid.id = (fs->fs.flags & DN_QHT_HASH) ?
+		dn_ht_entries(fs->qht) : (fs->qht ? 1 : 0);
+	if (flags) {	/* copy queues */
+		copy_q(a, fs, 0);
 	}
+	return 0;
 }
 
-/*
- * Dispose all packets queued on a pipe (not a flow_set).
- * Also free all resources associated to a pipe, which is about
- * to be deleted.
- */
-static void
-purge_pipe(struct dn_pipe *pipe)
+static int
+copy_si_cb(void *obj, void *arg)
 {
+	struct dn_sch_inst *si = obj;
+	struct copy_args *a = arg;
+	struct dn_flow *ni = (struct dn_flow *)(*a->start);
+	if (copy_obj(a->start, a->end, &si->ni, "inst",
+			si->sched->sch.sched_nr))
+		return DNHT_SCAN_END;
+	ni->oid.type = DN_FLOW; /* override the DN_SCH_I */
+	ni->oid.id = si_hash((uintptr_t)si, DNHT_KEY_IS_OBJ, NULL);
+	return 0;
+}
 
-    purge_flow_set( &(pipe->fs), 1 );
-
-    dn_free_pkts(pipe->head);
-
-    heap_free( &(pipe->scheduler_heap) );
-    heap_free( &(pipe->not_eligible_heap) );
-    heap_free( &(pipe->idle_heap) );
+static int
+copy_si(struct copy_args *a, struct dn_schk *s, int flags)
+{
+	if (s->sch.flags & DN_HAVE_MASK)
+		dn_ht_scan(s->siht, copy_si_cb, a);
+	else if (s->siht)
+		copy_si_cb(s->siht, a);
+	return 0;
 }
 
 /*
- * Delete all pipes and heaps returning memory. Must also
- * remove references from all ipfw rules to all pipes.
+ * compute a list of children of a scheduler and copy up
  */
-static void
-dummynet_flush(void)
+static int
+copy_fsk_list(struct copy_args *a, struct dn_schk *s, int flags)
 {
-	struct dn_pipe *pipe, *pipe1;
-	struct dn_flow_set *fs, *fs1;
-	int i;
-
-	DUMMYNET_LOCK();
-	/* Free heaps so we don't have unwanted events. */
-	heap_free(&ready_heap);
-	heap_free(&wfq_ready_heap);
-	heap_free(&extract_heap);
+	struct dn_fsk *fs;
+	struct dn_id *o;
+	uint32_t *p;
+
+	int n = 0, space = sizeof(*o);
+	SLIST_FOREACH(fs, &s->fsk_list, sch_chain) {
+		if (fs->fs.fs_nr < DN_MAX_ID)
+			n++;
+	}
+	space += n * sizeof(uint32_t);
+	DX(3, "sched %d has %d flowsets", s->sch.sched_nr, n);
+	if (a->end - *(a->start) < space)
+		return DNHT_SCAN_END;
+	o = (struct dn_id *)(*(a->start));
+	o->len = space;
+	*a->start += o->len;
+	o->type = DN_TEXT;
+	p = (uint32_t *)(o+1);
+	SLIST_FOREACH(fs, &s->fsk_list, sch_chain)
+		if (fs->fs.fs_nr < DN_MAX_ID)
+			*p++ = fs->fs.fs_nr;
+	return 0;
+}
 
-	/*
-	 * Now purge all queued pkts and delete all pipes.
-	 *
-	 * XXXGL: can we merge the for(;;) cycles into one or not?
-	 */
-	for (i = 0; i < HASHSIZE; i++)
-		SLIST_FOREACH_SAFE(fs, &flowsethash[i], next, fs1) {
-			SLIST_REMOVE(&flowsethash[i], fs, dn_flow_set, next);
-			purge_flow_set(fs, 1);
+static int
+copy_data_helper(void *_o, void *_arg)
+{
+	struct copy_args *a = _arg;
+
+	if (a->type == DN_LINK ||	/* pipe show */
+	    a->type == DN_SCH) {	/* sched show */
+		struct dn_schk *s = _o; /* we get only schedulers */
+		if (a->type == DN_SCH && s->sch.sched_nr >= DN_MAX_ID)
+			return 0;	/* not valid scheduler */
+		if (a->type == DN_LINK && s->sch.sched_nr <= DN_MAX_ID)
+			return 0;	/* not valid pipe */
+		if (a->flags & DN_C_LINK) {
+			if (copy_obj(a->start, a->end, &s->link,
+					"link", s->sch.sched_nr))
+				return DNHT_SCAN_END;
+			if (copy_profile(a, s->profile))
+				return DNHT_SCAN_END;
+			if (copy_flowset(a, s->fs, 0))
+				return DNHT_SCAN_END;
 		}
-	for (i = 0; i < HASHSIZE; i++)
-		SLIST_FOREACH_SAFE(pipe, &pipehash[i], next, pipe1) {
-			SLIST_REMOVE(&pipehash[i], pipe, dn_pipe, next);
-			purge_pipe(pipe);
-			free_pipe(pipe);
+		if (a->flags & DN_C_SCH) {
+			if (copy_obj(a->start, a->end, &s->sch,
+					"sched", s->sch.sched_nr))
+				return DNHT_SCAN_END;
+
+			/* list all attached flowsets */
+			if (copy_fsk_list(a, s, 0))
+				return DNHT_SCAN_END;
 		}
-	DUMMYNET_UNLOCK();
+		if (a->flags & DN_C_FLOW) {
+			copy_si(a, s, 0);
+		}
+	}
+	if (a->type == DN_FS) {	/* queue show, skip internal flowsets */
+		struct dn_fsk *fs = _o;
+		if (fs->fs.fs_nr >= DN_MAX_ID)
+			return 0;
+		if (copy_flowset(a, fs, 0))
+			return DNHT_SCAN_END;
+		copy_q(a, fs, 0);
+	}
+	return 0;
+}
+
+static inline struct dn_schk *
+locate_scheduler(int i)
+{
+	return dn_ht_find(dn_cfg.schedhash, i, 0, NULL);
 }
 
 /*
- * setup RED parameters
+ * red parameters are in fixed point arithmetic.
  */
 static int
-config_red(struct dn_flow_set *p, struct dn_flow_set *x)
+config_red(struct dn_fsk *fs)
 {
-	int i;
-
-	x->w_q = p->w_q;
-	x->min_th = SCALE(p->min_th);
-	x->max_th = SCALE(p->max_th);
-	x->max_p = p->max_p;
-
-	x->c_1 = p->max_p / (p->max_th - p->min_th);
-	x->c_2 = SCALE_MUL(x->c_1, SCALE(p->min_th));
-
-	if (x->flags_fs & DN_IS_GENTLE_RED) {
-		x->c_3 = (SCALE(1) - p->max_p) / p->max_th;
-		x->c_4 = SCALE(1) - 2 * p->max_p;
+	int64_t s, idle, weight, w0;
+	int t, i;
+
+	fs->w_q = fs->fs.w_q;
+	fs->max_p = fs->fs.max_p;
+	D("called");
+	/* Doing stuff that was in userland */
+	i = fs->sched->link.bandwidth;
+	s = (i <= 0) ? 0 :
+		hz * dn_cfg.red_avg_pkt_size * 8 * SCALE(1) / i;
+
+	idle = div64((s * 3) , fs->w_q); /* s, fs->w_q scaled; idle not scaled */
+	fs->lookup_step = div64(idle , dn_cfg.red_lookup_depth);
+	/* fs->lookup_step not scaled, */
+	if (!fs->lookup_step)
+		fs->lookup_step = 1;
+	w0 = weight = SCALE(1) - fs->w_q; //fs->w_q scaled
+
+	for (t = fs->lookup_step; t > 1; --t)
+		weight = SCALE_MUL(weight, w0);
+	fs->lookup_weight = (int)(weight); // scaled
+
+	/* Now doing stuff that was in kerneland */
+	fs->min_th = SCALE(fs->fs.min_th);
+	fs->max_th = SCALE(fs->fs.max_th);
+
+	fs->c_1 = fs->max_p / (fs->fs.max_th - fs->fs.min_th);
+	fs->c_2 = SCALE_MUL(fs->c_1, SCALE(fs->fs.min_th));
+
+	if (fs->fs.flags & DN_IS_GENTLE_RED) {
+		fs->c_3 = (SCALE(1) - fs->max_p) / fs->fs.max_th;
+		fs->c_4 = SCALE(1) - 2 * fs->max_p;
 	}
 
 	/* If the lookup table already exist, free and create it again. */
-	if (x->w_q_lookup) {
-		free(x->w_q_lookup, M_DUMMYNET);
-		x->w_q_lookup = NULL;
+	if (fs->w_q_lookup) {
+		free(fs->w_q_lookup, M_DUMMYNET);
+		fs->w_q_lookup = NULL;
 	}
-	if (red_lookup_depth == 0) {
+	if (dn_cfg.red_lookup_depth == 0) {
 		printf("\ndummynet: net.inet.ip.dummynet.red_lookup_depth"
 		    "must be > 0\n");
-		free(x, M_DUMMYNET);
+		fs->fs.flags &= ~DN_IS_RED;
+		fs->fs.flags &= ~DN_IS_GENTLE_RED;
 		return (EINVAL);
 	}
-	x->lookup_depth = red_lookup_depth;
-	x->w_q_lookup = (u_int *)malloc(x->lookup_depth * sizeof(int),
+	fs->lookup_depth = dn_cfg.red_lookup_depth;
+	fs->w_q_lookup = (u_int *)malloc(fs->lookup_depth * sizeof(int),
 	    M_DUMMYNET, M_NOWAIT);
-	if (x->w_q_lookup == NULL) {
+	if (fs->w_q_lookup == NULL) {
 		printf("dummynet: sorry, cannot allocate red lookup table\n");
-		free(x, M_DUMMYNET);
+		fs->fs.flags &= ~DN_IS_RED;
+		fs->fs.flags &= ~DN_IS_GENTLE_RED;
 		return(ENOSPC);
 	}
 
 	/* Fill the lookup table with (1 - w_q)^x */
-	x->lookup_step = p->lookup_step;
-	x->lookup_weight = p->lookup_weight;
-	x->w_q_lookup[0] = SCALE(1) - x->w_q;
-
-	for (i = 1; i < x->lookup_depth; i++)
-		x->w_q_lookup[i] =
-		    SCALE_MUL(x->w_q_lookup[i - 1], x->lookup_weight);
+	fs->w_q_lookup[0] = SCALE(1) - fs->w_q;
+
+	for (i = 1; i < fs->lookup_depth; i++)
+		fs->w_q_lookup[i] =
+		    SCALE_MUL(fs->w_q_lookup[i - 1], fs->lookup_weight);
+
+	if (dn_cfg.red_avg_pkt_size < 1)
+		dn_cfg.red_avg_pkt_size = 512;
+	fs->avg_pkt_size = dn_cfg.red_avg_pkt_size;
+	if (dn_cfg.red_max_pkt_size < 1)
+		dn_cfg.red_max_pkt_size = 1500;
+	fs->max_pkt_size = dn_cfg.red_max_pkt_size;
+	D("exit");
+	return 0;
+}
 
-	if (red_avg_pkt_size < 1)
-		red_avg_pkt_size = 512;
-	x->avg_pkt_size = red_avg_pkt_size;
-	if (red_max_pkt_size < 1)
-		red_max_pkt_size = 1500;
-	x->max_pkt_size = red_max_pkt_size;
-	return (0);
+/* Scan all flowset attached to this scheduler and update red */
+static void
+update_red(struct dn_schk *s)
+{
+	struct dn_fsk *fs;
+	SLIST_FOREACH(fs, &s->fsk_list, sch_chain) {
+		if (fs && (fs->fs.flags & DN_IS_RED))
+			config_red(fs);
+	}
 }
 
-static int
-alloc_hash(struct dn_flow_set *x, struct dn_flow_set *pfs)
-{
-    if (x->flags_fs & DN_HAVE_FLOW_MASK) {     /* allocate some slots */
-	int l = pfs->rq_size;
-
-	if (l == 0)
-	    l = dn_hash_size;
-	if (l < 4)
-	    l = 4;
-	else if (l > DN_MAX_HASH_SIZE)
-	    l = DN_MAX_HASH_SIZE;
-	x->rq_size = l;
-    } else                  /* one is enough for null mask */
-	x->rq_size = 1;
-    x->rq = malloc((1 + x->rq_size) * sizeof(struct dn_flow_queue *),
-	    M_DUMMYNET, M_NOWAIT | M_ZERO);
-    if (x->rq == NULL) {
-	printf("dummynet: sorry, cannot allocate queue\n");
-	return (ENOMEM);
-    }
-    x->rq_elements = 0;
-    return 0 ;
+/* attach flowset to scheduler s, possibly requeue */
+static void
+fsk_attach(struct dn_fsk *fs, struct dn_schk *s)
+{
+	ND("remove fs %d from fsunlinked, link to sched %d",
+		fs->fs.fs_nr, s->sch.sched_nr);
+	SLIST_REMOVE(&dn_cfg.fsu, fs, dn_fsk, sch_chain);
+	fs->sched = s;
+	SLIST_INSERT_HEAD(&s->fsk_list, fs, sch_chain);
+	if (s->fp->new_fsk)
+		s->fp->new_fsk(fs);
+	/* XXX compute fsk_mask */
+	fs->fsk_mask = fs->fs.flow_mask;
+	if (fs->sched->sch.flags & DN_HAVE_MASK)
+		flow_id_or(&fs->sched->sch.sched_mask, &fs->fsk_mask);
+	if (fs->qht) {
+		/*
+		 * we must drain qht according to the old
+		 * type, and reinsert according to the new one.
+		 * The requeue is complex -- in general we need to
+		 * reclassify every single packet.
+		 * For the time being, let's hope qht is never set
+		 * when we reach this point.
+		 */
+		D("XXX TODO requeue from fs %d to sch %d",
+			fs->fs.fs_nr, s->sch.sched_nr);
+		fs->qht = NULL;
+	}
+	/* set the new type for qht */
+	if (nonzero_mask(&fs->fsk_mask))
+		fs->fs.flags |= DN_QHT_HASH;
+	else
+		fs->fs.flags &= ~DN_QHT_HASH;
+
+	/* XXX config_red() can fail... */
+	if (fs->fs.flags & DN_IS_RED)
+		config_red(fs);
 }
 
+/* update all flowsets which may refer to this scheduler */
 static void
-set_fs_parms(struct dn_flow_set *x, struct dn_flow_set *src)
-{
-	x->flags_fs = src->flags_fs;
-	x->qsize = src->qsize;
-	x->plr = src->plr;
-	x->flow_mask = src->flow_mask;
-	if (x->flags_fs & DN_QSIZE_IS_BYTES) {
-		if (x->qsize > pipe_byte_limit)
-			x->qsize = 1024 * 1024;
-	} else {
-		if (x->qsize == 0)
-			x->qsize = 50;
-		if (x->qsize > pipe_slot_limit)
-			x->qsize = 50;
+update_fs(struct dn_schk *s)
+{
+	struct dn_fsk *fs, *tmp;
+
+	SLIST_FOREACH_SAFE(fs, &dn_cfg.fsu, sch_chain, tmp) {
+		if (s->sch.sched_nr != fs->fs.sched_nr) {
+			D("fs %d for sch %d not %d still unlinked",
+				fs->fs.fs_nr, fs->fs.sched_nr,
+				s->sch.sched_nr);
+			continue;
+		}
+		fsk_attach(fs, s);
 	}
-	/* Configuring RED. */
-	if (x->flags_fs & DN_IS_RED)
-		config_red(src, x);	/* XXX should check errors */
 }
 
 /*
- * Setup pipe or queue parameters.
+ * Configuration -- to preserve backward compatibility we use
+ * the following scheme (N is 65536)
+ *	NUMBER		SCHED	LINK	FLOWSET
+ *	   1 ..  N-1	(1)WFQ	(2)WFQ	(3)queue
+ *	 N+1 .. 2N-1	(4)FIFO (5)FIFO	(6)FIFO for sched 1..N-1
+ *	2N+1 .. 3N-1	--	--	(7)FIFO for sched N+1..2N-1
+ *
+ * "pipe i config" configures #1, #2 and #3
+ * "sched i config" configures #1 and possibly #6
+ * "queue i config" configures #3
+ * #1 is configured with 'pipe i config' or 'sched i config'
+ * #2 is configured with 'pipe i config', and created if not
+ *	existing with 'sched i config'
+ * #3 is configured with 'queue i config'
+ * #4 is automatically configured after #1, can only be FIFO
+ * #5 is automatically configured after #2
+ * #6 is automatically created when #1 is !MULTIQUEUE,
+ *	and can be updated.
+ * #7 is automatically configured after #2
+ */
+
+/*
+ * configure a link (and its FIFO instance)
  */
 static int
-config_pipe(struct dn_pipe *p)
+config_link(struct dn_link *p, struct dn_id *arg)
 {
-	struct dn_flow_set *pfs = &(p->fs);
-	struct dn_flow_queue *q;
-	int i, error;
+	int i;
 
+	if (p->oid.len != sizeof(*p)) {
+		D("invalid pipe len %d", p->oid.len);
+		return EINVAL;
+	}
+	i = p->link_nr;
+	if (i <= 0 || i >= DN_MAX_ID)
+		return EINVAL;
 	/*
 	 * The config program passes parameters as follows:
 	 * bw = bits/second (0 means no limits),
 	 * delay = ms, must be translated into ticks.
 	 * qsize = slots/bytes
+	 * burst ???
 	 */
 	p->delay = (p->delay * hz) / 1000;
 	/* Scale burst size: bytes -> bits * hz */
 	p->burst *= 8 * hz;
-	/* We need either a pipe number or a flow_set number. */
-	if (p->pipe_nr == 0 && pfs->fs_nr == 0)
-		return (EINVAL);
-	if (p->pipe_nr != 0 && pfs->fs_nr != 0)
-		return (EINVAL);
-	if (p->pipe_nr != 0) {			/* this is a pipe */
-		struct dn_pipe *pipe;
-
-		DUMMYNET_LOCK();
-		pipe = locate_pipe(p->pipe_nr);	/* locate pipe */
-
-		if (pipe == NULL) {		/* new pipe */
-			pipe = malloc(sizeof(struct dn_pipe), M_DUMMYNET,
-			    M_NOWAIT | M_ZERO);
-			if (pipe == NULL) {
-				DUMMYNET_UNLOCK();
-				printf("dummynet: no memory for new pipe\n");
-				return (ENOMEM);
-			}
-			pipe->pipe_nr = p->pipe_nr;
-			pipe->fs.pipe = pipe;
-			/*
-			 * idle_heap is the only one from which
-			 * we extract from the middle.
-			 */
-			pipe->idle_heap.size = pipe->idle_heap.elements = 0;
-			pipe->idle_heap.offset =
-			    offsetof(struct dn_flow_queue, heap_pos);
-		} else {
-			/* Flush accumulated credit for all queues. */
-			for (i = 0; i <= pipe->fs.rq_size; i++) {
-				for (q = pipe->fs.rq[i]; q; q = q->next) {
-					q->numbytes = p->burst +
-					    (io_fast ? p->bandwidth : 0);
-				}
-			}
-		}
 
-		pipe->bandwidth = p->bandwidth;
-		pipe->burst = p->burst;
-		pipe->numbytes = pipe->burst + (io_fast ? pipe->bandwidth : 0);
-		bcopy(p->if_name, pipe->if_name, sizeof(p->if_name));
-		pipe->ifp = NULL;		/* reset interface ptr */
-		pipe->delay = p->delay;
-		set_fs_parms(&(pipe->fs), pfs);
-
-		/* Handle changes in the delay profile. */
-		if (p->samples_no > 0) {
-			if (pipe->samples_no != p->samples_no) {
-				if (pipe->samples != NULL)
-					free(pipe->samples, M_DUMMYNET);
-				pipe->samples =
-				    malloc(p->samples_no*sizeof(dn_key),
-					M_DUMMYNET, M_NOWAIT | M_ZERO);
-				if (pipe->samples == NULL) {
-					DUMMYNET_UNLOCK();
-					printf("dummynet: no memory "
-						"for new samples\n");
-					return (ENOMEM);
-				}
-				pipe->samples_no = p->samples_no;
-			}
+	DN_BH_WLOCK();
+	/* do it twice, base link and FIFO link */
+	for (; i < 2*DN_MAX_ID; i += DN_MAX_ID) {
+	    struct dn_schk *s = locate_scheduler(i);
+	    if (s == NULL) {
+		DN_BH_WUNLOCK();
+		D("sched %d not found", i);
+		return EINVAL;
+	    }
+	    /* remove profile if exists */
+	    if (s->profile) {
+		free(s->profile, M_DUMMYNET);
+		s->profile = NULL;
+	    }
+	    /* copy all parameters */
+	    s->link.oid = p->oid;
+	    s->link.link_nr = i;
+	    s->link.delay = p->delay;
+	    if (s->link.bandwidth != p->bandwidth) {
+		/* XXX bandwidth changes, need to update red params */
+	    s->link.bandwidth = p->bandwidth;
+		update_red(s);
+	    }
+	    s->link.burst = p->burst;
+	    schk_reset_credit(s);
+	}
+	dn_cfg.id++;
+	DN_BH_WUNLOCK();
+	return 0;
+}
 
-			strncpy(pipe->name,p->name,sizeof(pipe->name));
-			pipe->loss_level = p->loss_level;
-			for (i = 0; i<pipe->samples_no; ++i)
-				pipe->samples[i] = p->samples[i];
-		} else if (pipe->samples != NULL) {
-			free(pipe->samples, M_DUMMYNET);
-			pipe->samples = NULL;
-			pipe->samples_no = 0;
-		}
+/*
+ * configure a flowset. Can be called from inside with locked=1,
+ */
+static struct dn_fsk *
+config_fs(struct dn_fs *nfs, struct dn_id *arg, int locked)
+{
+	int i;
+	struct dn_fsk *fs;
 
-		if (pipe->fs.rq == NULL) {	/* a new pipe */
-			error = alloc_hash(&(pipe->fs), pfs);
-			if (error) {
-				DUMMYNET_UNLOCK();
-				free_pipe(pipe);
-				return (error);
-			}
-			SLIST_INSERT_HEAD(&pipehash[HASH(pipe->pipe_nr)],
-			    pipe, next);
-		}
-		DUMMYNET_UNLOCK();
-	} else {				/* config queue */
-		struct dn_flow_set *fs;
+	if (nfs->oid.len != sizeof(*nfs)) {
+		D("invalid flowset len %d", nfs->oid.len);
+		return NULL;
+	}
+	i = nfs->fs_nr;
+	if (i <= 0 || i >= 3*DN_MAX_ID)
+		return NULL;
+	ND("flowset %d", i);
+	/* XXX other sanity checks */
+        if (nfs->flags & DN_QSIZE_BYTES) {
+		ipdn_bound_var(&nfs->qsize, 16384,
+		    1500, dn_cfg.byte_limit, NULL); // "queue byte size");
+        } else {
+		ipdn_bound_var(&nfs->qsize, 50,
+		    1, dn_cfg.slot_limit, NULL); // "queue slot size");
+        }
+	if (nfs->flags & DN_HAVE_MASK) {
+		/* make sure we have some buckets */
+		ipdn_bound_var(&nfs->buckets, dn_cfg.hash_size,
+			1, dn_cfg.max_hash_size, "flowset buckets");
+	} else {
+		nfs->buckets = 1;	/* we only need 1 */
+	}
+	if (!locked)
+		DN_BH_WLOCK();
+	do { /* exit with break when done */
+	    struct dn_schk *s;
+	    int flags = nfs->sched_nr ? DNHT_INSERT : 0;
+	    int j;
+	    int oldc = dn_cfg.fsk_count;
+	    fs = dn_ht_find(dn_cfg.fshash, i, flags, NULL);
+	    if (fs == NULL) {
+		D("missing sched for flowset %d", i);
+	        break;
+	    }
+	    /* grab some defaults from the existing one */
+	    if (nfs->sched_nr == 0) /* reuse */
+		nfs->sched_nr = fs->fs.sched_nr;
+	    for (j = 0; j < sizeof(nfs->par)/sizeof(nfs->par[0]); j++) {
+		if (nfs->par[j] == -1) /* reuse */
+		    nfs->par[j] = fs->fs.par[j];
+	    }
+	    if (bcmp(&fs->fs, nfs, sizeof(*nfs)) == 0) {
+		ND("flowset %d unchanged", i);
+		break; /* no change, nothing to do */
+	    }
+	    if (oldc != dn_cfg.fsk_count)	/* new item */
+		dn_cfg.id++;
+	    s = locate_scheduler(nfs->sched_nr);
+	    /* detach from old scheduler if needed, preserving
+	     * queues if we need to reattach. Then update the
+	     * configuration, and possibly attach to the new sched.
+	     */
+	    DX(2, "fs %d changed sched %d@%p to %d@%p",
+		fs->fs.fs_nr,
+		fs->fs.sched_nr, fs->sched, nfs->sched_nr, s);
+	    if (fs->sched) {
+		int flags = s ? DN_DETACH : (DN_DETACH | DN_DESTROY);
+		flags |= DN_DESTROY; /* XXX temporary */
+		fsk_detach(fs, flags);
+	    }
+	    fs->fs = *nfs; /* copy configuration */
+	    if (s != NULL)
+		fsk_attach(fs, s);
+	} while (0);
+	if (!locked)
+		DN_BH_WUNLOCK();
+	return fs;
+}
 
-		DUMMYNET_LOCK();
-		fs = locate_flowset(pfs->fs_nr); /* locate flow_set */
+/*
+ * config/reconfig a scheduler and its FIFO variant.
+ * For !MULTIQUEUE schedulers, also set up the flowset.
+ *
+ * On reconfigurations (detected because s->fp is set),
+ * detach existing flowsets preserving traffic, preserve link,
+ * and delete the old scheduler creating a new one.
+ */
+static int
+config_sched(struct dn_sch *_nsch, struct dn_id *arg)
+{
+	struct dn_schk *s;
+	struct schk_new_arg a; /* argument for schk_new */
+	int i;
+	struct dn_link p;	/* copy of oldlink */
+	struct dn_profile *pf;	/* copy of old link profile */
+	/* Used to preserv mask parameter */
+	struct ipfw_flow_id new_mask;
+	int new_buckets = 0;
+	int new_flags = 0;
+	int pipe_cmd;
+
+	a.sch = _nsch;
+	if (a.sch->oid.len != sizeof(*a.sch)) {
+		D("bad sched len %d", a.sch->oid.len);
+		return EINVAL;
+	}
+	i = a.sch->sched_nr;
+	if (i <= 0 || i >= DN_MAX_ID)
+		return EINVAL;
+	/* make sure we have some buckets */
+	if (a.sch->flags & DN_HAVE_MASK)
+		ipdn_bound_var(&a.sch->buckets, dn_cfg.hash_size,
+			1, dn_cfg.max_hash_size, "sched buckets");
+	/* XXX other sanity checks */
+	bzero(&p, sizeof(p));
+	pf = malloc(sizeof(struct dn_profile), M_DUMMYNET, M_NOWAIT | M_ZERO);
+	if (pf == NULL) {
+		D("Error allocating profile");
+		return ENOMEM;
+	}
 
-		if (fs == NULL) {		/* new */
-			if (pfs->parent_nr == 0) { /* need link to a pipe */
-				DUMMYNET_UNLOCK();
-				return (EINVAL);
-			}
-			fs = malloc(sizeof(struct dn_flow_set), M_DUMMYNET,
-			    M_NOWAIT | M_ZERO);
-			if (fs == NULL) {
-				DUMMYNET_UNLOCK();
-				printf(
-				    "dummynet: no memory for new flow_set\n");
-				return (ENOMEM);
-			}
-			fs->fs_nr = pfs->fs_nr;
-			fs->parent_nr = pfs->parent_nr;
-			fs->weight = pfs->weight;
-			if (fs->weight == 0)
-				fs->weight = 1;
-			else if (fs->weight > 100)
-				fs->weight = 100;
+	pipe_cmd = a.sch->flags & DN_PIPE_CMD;
+	a.sch->flags &= ~DN_PIPE_CMD; //XXX do it even if is not set?
+	if (pipe_cmd) {
+		/* Copy mask parameter */
+		new_mask = a.sch->sched_mask;
+		new_buckets = a.sch->buckets;
+		new_flags = a.sch->flags;
+	}
+	DN_BH_WLOCK();
+again: /* run twice, for wfq and fifo */
+	/*
+	 * lookup the type. If not supplied, use the previous one
+	 * or default to WF2Q+. Otherwise, return an error.
+	 */
+	dn_cfg.id++;
+	a.fp = find_sched_type(a.sch->oid.subtype, a.sch->name);
+	if (a.fp != NULL) {
+		/* found. Lookup or create entry */
+		s = dn_ht_find(dn_cfg.schedhash, i, DNHT_INSERT, &a);
+	} else if (a.sch->oid.subtype == 0 && !a.sch->name[0]) {
+		/* No type. search existing s* or retry with WF2Q+ */
+		s = dn_ht_find(dn_cfg.schedhash, i, 0, &a);
+		if (s != NULL) {
+			a.fp = s->fp;
+			/* Scheduler exists, skip to FIFO scheduler 
+			 * if command was pipe config...
+			 */
+			if (pipe_cmd)
+				goto next;
 		} else {
-			/*
-			 * Change parent pipe not allowed;
-			 * must delete and recreate.
+			/* New scheduler, create a wf2q+ with no mask
+			 * if command was pipe config...
 			 */
-			if (pfs->parent_nr != 0 &&
-			    fs->parent_nr != pfs->parent_nr) {
-				DUMMYNET_UNLOCK();
-				return (EINVAL);
+			if (pipe_cmd) {
+				/* clear mask parameter */
+				bzero(&a.sch->sched_mask, sizeof(new_mask));
+				a.sch->buckets = 0;
+				a.sch->flags &= ~DN_HAVE_MASK;
 			}
+			a.sch->oid.subtype = DN_SCHED_WF2QP;
+			goto again;
 		}
-
-		set_fs_parms(fs, pfs);
-
-		if (fs->rq == NULL) {		/* a new flow_set */
-			error = alloc_hash(fs, pfs);
-			if (error) {
-				DUMMYNET_UNLOCK();
-				free(fs, M_DUMMYNET);
-				return (error);
+	} else {
+		DN_BH_WUNLOCK();
+		D("invalid scheduler type %d %s",
+			a.sch->oid.subtype, a.sch->name);
+		return EINVAL;
+	}
+	/* normalize name and subtype */
+	a.sch->oid.subtype = a.fp->type;
+	bzero(a.sch->name, sizeof(a.sch->name));
+	strlcpy(a.sch->name, a.fp->name, sizeof(a.sch->name));
+	if (s == NULL) {
+		DN_BH_WUNLOCK();
+		D("cannot allocate scheduler %d", i);
+		return ENOMEM;
+	}
+	/* restore existing link if any */
+	if (p.link_nr) {
+		s->link = p;
+		if (pf->link_nr == p.link_nr) /* Restore profile */
+			s->profile = pf;
+		else
+			s->profile = NULL; /* XXX maybe not needed */
+	}
+	p.link_nr = 0;
+	if (s->fp == NULL) {
+		DX(2, "sched %d new type %s", i, a.fp->name);
+	} else if (s->fp != a.fp ||
+			bcmp(a.sch, &s->sch, sizeof(*a.sch)) ) {
+		/* already existing. */
+		DX(2, "sched %d type changed from %s to %s",
+			i, s->fp->name, a.fp->name);
+		DX(4, "   type/sub %d/%d -> %d/%d",
+			s->sch.oid.type, s->sch.oid.subtype, 
+			a.sch->oid.type, a.sch->oid.subtype);
+		if (s->link.link_nr == 0)
+			D("XXX WARNING link 0 for sched %d", i);
+		p = s->link;	/* preserve link */
+		if (s->profile) /* preserve profile */
+			bcopy(s->profile, pf, sizeof(struct dn_profile));
+		/* remove from the hash */
+		dn_ht_find(dn_cfg.schedhash, i, DNHT_REMOVE, NULL);
+		/* Detach flowsets, preserve queues. */
+		// schk_delete_cb(s, NULL);
+		// XXX temporarily, kill queues
+		schk_delete_cb(s, (void *)DN_DESTROY);
+		goto again;
+	} else {
+		DX(4, "sched %d unchanged type %s", i, a.fp->name);
+	}
+	/* complete initialization */
+	s->sch = *a.sch;
+	s->fp = a.fp;
+	s->cfg = arg;
+	// XXX schk_reset_credit(s);
+	/* create the internal flowset if needed,
+	 * trying to reuse existing ones if available
+	 */
+	if (!(s->fp->flags & DN_MULTIQUEUE) && !s->fs) {
+	        s->fs = dn_ht_find(dn_cfg.fshash, i, 0, NULL);
+		if (!s->fs) {
+			struct dn_fs fs;
+			bzero(&fs, sizeof(fs));
+			set_oid(&fs.oid, DN_FS, sizeof(fs));
+			fs.fs_nr = i + DN_MAX_ID;
+			fs.sched_nr = i;
+			s->fs = config_fs(&fs, NULL, 1 /* locked */);
+		}
+		if (!s->fs) {
+			schk_delete_cb(s, (void *)DN_DESTROY);
+			D("error creating internal fs for %d", i);
+			DN_BH_WUNLOCK();
+			return ENOMEM;
+		}
+	}
+	/* call init function after the flowset is created */
+	if (s->fp->config)
+		s->fp->config(s);
+	update_fs(s);
+next:
+	if (i < DN_MAX_ID) { /* now configure the FIFO instance */
+		i += DN_MAX_ID;
+		if (pipe_cmd) {
+			/* Restore mask parameter for FIFO */
+			a.sch->sched_mask = new_mask;
+			a.sch->buckets = new_buckets;
+			a.sch->flags = new_flags;
+		} else {
+			/* sched config shouldn't modify the FIFO scheduler */
+			if (dn_ht_find(dn_cfg.schedhash, i, 0, &a) != NULL) {
+				/* FIFO already exist, don't touch it */
+				DN_BH_WUNLOCK();
+				return 0;
 			}
-			SLIST_INSERT_HEAD(&flowsethash[HASH(fs->fs_nr)],
-			    fs, next);
 		}
-		DUMMYNET_UNLOCK();
+		a.sch->sched_nr = i;
+		a.sch->oid.subtype = DN_SCHED_FIFO;
+		bzero(a.sch->name, sizeof(a.sch->name));
+		goto again;
 	}
-	return (0);
+	DN_BH_WUNLOCK();
+	return 0;
 }
 
 /*
- * Helper function to remove from a heap queues which are linked to
- * a flow_set about to be deleted.
+ * attach a profile to a link
  */
-static void
-fs_remove_from_heap(struct dn_heap *h, struct dn_flow_set *fs)
+static int
+config_profile(struct dn_profile *pf, struct dn_id *arg)
 {
-    int i, found;
+	struct dn_schk *s;
+	int i, olen, err = 0;
 
-    for (i = found = 0 ; i < h->elements ;) {
-	if ( ((struct dn_flow_queue *)h->p[i].object)->fs == fs) {
-	    h->elements-- ;
-	    h->p[i] = h->p[h->elements] ;
-	    found++ ;
-	} else
-	    i++ ;
-    }
-    if (found)
-	heapify(h);
+	if (pf->oid.len < sizeof(*pf)) {
+		D("short profile len %d", pf->oid.len);
+		return EINVAL;
+	}
+	i = pf->link_nr;
+	if (i <= 0 || i >= DN_MAX_ID)
+		return EINVAL;
+	/* XXX other sanity checks */
+	DN_BH_WLOCK();
+	for (; i < 2*DN_MAX_ID; i += DN_MAX_ID) {
+	s = locate_scheduler(i);
+
+	if (s == NULL) {
+			err = EINVAL;
+			break;
+	}
+	dn_cfg.id++;
+	/*
+	 * If we had a profile and the new one does not fit,
+	 * or it is deleted, then we need to free memory.
+	 */
+	if (s->profile && (pf->samples_no == 0 ||
+			s->profile->oid.len < pf->oid.len)) {
+		free(s->profile, M_DUMMYNET);
+		s->profile = NULL;
+	}
+		if (pf->samples_no == 0)
+			continue;
+	/*
+		 * new profile, possibly allocate memory
+	 * and copy data.
+	 */
+		if (s->profile == NULL)
+			s->profile = malloc(pf->oid.len,
+			    M_DUMMYNET, M_NOWAIT | M_ZERO);
+		if (s->profile == NULL) {
+			D("no memory for profile %d", i);
+			err = ENOMEM;
+			break;
+		}
+		/* preserve larger length XXX double check */
+		olen = s->profile->oid.len;
+		if (olen < pf->oid.len)
+			olen = pf->oid.len;
+		bcopy(pf, s->profile, pf->oid.len);
+		s->profile->oid.len = olen;
+	}
+	DN_BH_WUNLOCK();
+	return err;
 }
 
 /*
- * helper function to remove a pipe from a heap (can be there at most once)
+ * Delete all objects:
  */
 static void
-pipe_remove_from_heap(struct dn_heap *h, struct dn_pipe *p)
+dummynet_flush(void)
 {
-	int i;
 
-	for (i=0; i < h->elements ; i++ ) {
-		if (h->p[i].object == p) { /* found it */
-			h->elements-- ;
-			h->p[i] = h->p[h->elements] ;
-			heapify(h);
-			break ;
-		}
-	}
+	/* delete all schedulers and related links/queues/flowsets */
+	dn_ht_scan(dn_cfg.schedhash, schk_delete_cb,
+		(void *)(uintptr_t)DN_DELETE_FS);
+	/* delete all remaining (unlinked) flowsets */
+	DX(4, "still %d unlinked fs", dn_cfg.fsk_count);
+	dn_ht_free(dn_cfg.fshash, DNHT_REMOVE);
+	fsk_detach_list(&dn_cfg.fsu, DN_DELETE_FS);
+	/* Reinitialize system heap... */
+	heap_init(&dn_cfg.evheap, 16, offsetof(struct dn_id, id));
 }
 
 /*
- * Fully delete a pipe or a queue, cleaning up associated info.
+ * Main handler for configuration. We are guaranteed to be called
+ * with an oid which is at least a dn_id.
+ * - the first object is the command (config, delete, flush, ...)
+ * - config_link must be issued after the corresponding config_sched
+ * - parameters (DN_TXT) for an object must preceed the object
+ *   processed on a config_sched.
  */
-static int
-delete_pipe(struct dn_pipe *p)
+int
+do_config(void *p, int l)
 {
+	struct dn_id *next, *o;
+	int err = 0, err2 = 0;
+	struct dn_id *arg = NULL;
+	uintptr_t *a;
+
+	o = p;
+	if (o->id != DN_API_VERSION) {
+		D("invalid api version got %d need %d",
+			o->id, DN_API_VERSION);
+		return EINVAL;
+	}
+	for (; l >= sizeof(*o); o = next) {
+		struct dn_id *prev = arg;
+		if (o->len < sizeof(*o) || l < o->len) {
+			D("bad len o->len %d len %d", o->len, l);
+			err = EINVAL;
+			break;
+		}
+		l -= o->len;
+		next = (struct dn_id *)((char *)o + o->len);
+		err = 0;
+		switch (o->type) {
+		default:
+			D("cmd %d not implemented", o->type);
+			break;
+#ifdef EMULATE_SYSCTL		
+		/* sysctl emulation.
+		 * if we recognize the command, jump to the correct
+		 * handler and return
+		 */
+		case DN_SYSCTL_SET:
+			err = kesysctl_emu_set(p, l);
+			return err;
+#endif
+		case DN_CMD_CONFIG: /* simply a header */
+			break;
 
-    if (p->pipe_nr == 0 && p->fs.fs_nr == 0)
-	return EINVAL ;
-    if (p->pipe_nr != 0 && p->fs.fs_nr != 0)
-	return EINVAL ;
-    if (p->pipe_nr != 0) { /* this is an old-style pipe */
-	struct dn_pipe *pipe;
-	struct dn_flow_set *fs;
-	int i;
+		case DN_CMD_DELETE:
+			/* the argument is in the first uintptr_t after o */
+			a = (uintptr_t *)(o+1);
+			if (o->len < sizeof(*o) + sizeof(*a)) {
+				err = EINVAL;
+				break;
+			}
+			switch (o->subtype) {
+			case DN_LINK:
+				/* delete base and derived schedulers */
+				DN_BH_WLOCK();
+				err = delete_schk(*a);
+				err2 = delete_schk(*a + DN_MAX_ID);
+				DN_BH_WUNLOCK();
+				if (!err)
+					err = err2;
+				break;
 
-	DUMMYNET_LOCK();
-	pipe = locate_pipe(p->pipe_nr);	/* locate pipe */
+			default:
+				D("invalid delete type %d",
+					o->subtype);
+				err = EINVAL;
+				break;
+
+			case DN_FS:
+				err = (*a <1 || *a >= DN_MAX_ID) ?
+					EINVAL : delete_fs(*a, 0) ;
+				break;
+			}
+			break;
 
-	if (pipe == NULL) {
-	    DUMMYNET_UNLOCK();
-	    return (ENOENT);	/* not found */
+		case DN_CMD_FLUSH:
+			DN_BH_WLOCK();
+			dummynet_flush();
+			DN_BH_WUNLOCK();
+			break;
+		case DN_TEXT:	/* store argument the next block */
+			prev = NULL;
+			arg = o;
+			break;
+		case DN_LINK:
+			err = config_link((struct dn_link *)o, arg);
+			break;
+		case DN_PROFILE:
+			err = config_profile((struct dn_profile *)o, arg);
+			break;
+		case DN_SCH:
+			err = config_sched((struct dn_sch *)o, arg);
+			break;
+		case DN_FS:
+			err = (NULL==config_fs((struct dn_fs *)o, arg, 0));
+			break;
+		}
+		if (prev)
+			arg = NULL;
+		if (err != 0)
+			break;
 	}
+	return err;
+}
 
-	/* Unlink from list of pipes. */
-	SLIST_REMOVE(&pipehash[HASH(pipe->pipe_nr)], pipe, dn_pipe, next);
+static int
+compute_space(struct dn_id *cmd, int *to_copy)
+{
+	int x = 0, need = 0;
+	int profile_size = sizeof(struct dn_profile) - 
+		ED_MAX_SAMPLES_NO*sizeof(int);
+
+	/* NOTE about compute space:
+	 * NP 	= dn_cfg.schk_count
+	 * NSI 	= dn_cfg.si_count
+	 * NF 	= dn_cfg.fsk_count
+	 * NQ 	= dn_cfg.queue_count
+	 * - ipfw pipe show
+	 *   (NP/2)*(dn_link + dn_sch + dn_id + dn_fs) only half scheduler
+	 *                             link, scheduler template, flowset
+	 *                             integrated in scheduler and header
+	 *                             for flowset list
+	 *   (NSI)*(dn_flow + dn_queue) all scheduler instance + one
+	 *                              queue per instance
+	 * - ipfw sched show
+	 *   (NP/2)*(dn_link + dn_sch + dn_id + dn_fs) only half scheduler
+	 *                             link, scheduler template, flowset
+	 *                             integrated in scheduler and header
+	 *                             for flowset list
+	 *   (NSI * dn_flow) all scheduler instances
+	 *   (NF * sizeof(uint_32)) space for flowset list linked to scheduler
+	 *   (NQ * dn_queue) all queue [XXXfor now not listed]
+	 * - ipfw queue show
+	 *   (NF * dn_fs) all flowset
+	 *   (NQ * dn_queue) all queues
+	 */
+	switch (cmd->subtype) {
+	default:
+		return -1;
+	/* XXX where do LINK and SCH differ ? */
+	case DN_LINK:	/* pipe show */
+		x = DN_C_LINK | DN_C_SCH | DN_C_FLOW;
+		need += dn_cfg.schk_count *
+			(sizeof(struct dn_fs) + profile_size) / 2;
+		need += dn_cfg.si_count * sizeof(struct dn_queue);
+		need += dn_cfg.fsk_count * sizeof(uint32_t);
+		break;
+	case DN_SCH:	/* sched show */
+		need += dn_cfg.schk_count *
+			(sizeof(struct dn_fs) + profile_size) / 2;
+		need += dn_cfg.fsk_count * sizeof(uint32_t);
+		x = DN_C_SCH | DN_C_LINK | DN_C_FLOW;
+		break;
+	case DN_FS:	/* queue show */
+		x = DN_C_FS | DN_C_QUEUE;
+		break;
+	case DN_GET_COMPAT:	/* compatibility mode */
+		need =  dn_compat_calc_size(dn_cfg); 
+		break;
+	}
+	*to_copy = x;
+	if (x & DN_C_SCH) {
+		need += dn_cfg.schk_count * sizeof(struct dn_sch) / 2;
+		/* NOT also, each fs might be attached to a sched */
+		need += dn_cfg.schk_count * sizeof(struct dn_id) / 2;
+	}
+	if (x & DN_C_FS)
+		need += dn_cfg.fsk_count * sizeof(struct dn_fs);
+	if (x & DN_C_LINK) {
+		need += dn_cfg.schk_count * sizeof(struct dn_link) / 2;
+	}
+	/* XXX queue space might be variable */
+	if (x & DN_C_QUEUE)
+		need += dn_cfg.queue_count * sizeof(struct dn_queue);
+	if (x & DN_C_FLOW)
+		need += dn_cfg.si_count * (sizeof(struct dn_flow));
+	return need;
+}
 
-	/* Remove all references to this pipe from flow_sets. */
-	for (i = 0; i < HASHSIZE; i++) {
-	    SLIST_FOREACH(fs, &flowsethash[i], next) {
-		if (fs->pipe == pipe) {
-			printf("dummynet: ++ ref to pipe %d from fs %d\n",
-			    p->pipe_nr, fs->fs_nr);
-			fs->pipe = NULL ;
-			purge_flow_set(fs, 0);
+/*
+ * If compat != NULL dummynet_get is called in compatibility mode.
+ * *compat will be the pointer to the buffer to pass to ipfw
+ */
+int
+dummynet_get(struct sockopt *sopt, void **compat)
+{
+	int have, i, need, error;
+	char *start = NULL, *buf;
+	size_t sopt_valsize;
+	struct dn_id cmd;
+	struct copy_args a;
+
+	/* save and restore original sopt_valsize around copyin */
+	sopt_valsize = sopt->sopt_valsize;
+	if (!compat) {
+		error = sooptcopyin(sopt, &cmd, sizeof(cmd), sizeof(cmd));
+		if (error)
+			return error;
+		sopt->sopt_valsize = sopt_valsize;
+#ifdef EMULATE_SYSCTL
+		/* sysctl emulation. */
+		if (cmd.type == DN_SYSCTL_GET)
+			return kesysctl_emu_get(sopt);
+#endif
+	} else {
+		error = 0;
+		cmd.type = DN_CMD_GET;
+		cmd.len = sizeof(struct dn_id);
+		cmd.subtype = DN_GET_COMPAT;
+		// cmd.id = sopt_valsize;
+		D("compatibility mode");
+	}
+	/* Count space (under lock) and allocate (outside lock).
+	 * Exit with lock held if we manage to get enough buffer.
+	 * Try a few times then give up.
+	 */
+	for (have = 0, i = 0; i < 10; i++) {
+		DN_BH_WLOCK();
+		need = compute_space(&cmd, &a.flags);
+		if (need < 0) {
+			DN_BH_WUNLOCK();
+			return EINVAL;
 		}
-	    }
+		need += sizeof(cmd);
+		cmd.id = need;
+		if (have >= need)
+			break;
+		DN_BH_WUNLOCK();
+		if (start)
+			free(start, M_DUMMYNET);
+		start = NULL;
+		if (need > sopt_valsize)
+			break;
+		have = need;
+		start = malloc(have, M_DUMMYNET, M_WAITOK | M_ZERO);
+		if (start == NULL)
+			return ENOMEM;
+	}
+	if (start == NULL) {
+		if (compat) {
+			*compat = NULL;
+			return 1; // XXX
+		}
+		return sooptcopyout(sopt, &cmd, sizeof(cmd));
 	}
-	fs_remove_from_heap(&ready_heap, &(pipe->fs));
-	purge_pipe(pipe); /* remove all data associated to this pipe */
-	/* remove reference to here from extract_heap and wfq_ready_heap */
-	pipe_remove_from_heap(&extract_heap, pipe);
-	pipe_remove_from_heap(&wfq_ready_heap, pipe);
-	DUMMYNET_UNLOCK();
+	ND("have %d:%d sched %d, %d:%d links %d, %d:%d flowsets %d, "
+		"%d:%d si %d, %d:%d queues %d",
+		dn_cfg.schk_count, sizeof(struct dn_sch), DN_SCH,
+		dn_cfg.schk_count, sizeof(struct dn_link), DN_LINK,
+		dn_cfg.fsk_count, sizeof(struct dn_fs), DN_FS,
+		dn_cfg.si_count, sizeof(struct dn_flow), DN_SCH_I,
+		dn_cfg.queue_count, sizeof(struct dn_queue), DN_QUEUE);
+	sopt->sopt_valsize = sopt_valsize;
+	a.type = cmd.subtype;
+	if (compat == NULL) {
+		bcopy(&cmd, start, sizeof(cmd));
+		buf = start + sizeof(cmd);
+	} else
+		buf = start;
+	a.start = &buf;
+	a.end = start + have;
+	/* start copying other objects */
+	if (compat) {
+		a.type = DN_COMPAT_PIPE;
+		dn_ht_scan(dn_cfg.schedhash, copy_data_helper_compat, &a);
+		a.type = DN_COMPAT_QUEUE;
+		dn_ht_scan(dn_cfg.fshash, copy_data_helper_compat, &a);
+	} else if (a.type == DN_FS)
+		dn_ht_scan(dn_cfg.fshash, copy_data_helper, &a);
+	else
+		dn_ht_scan(dn_cfg.schedhash, copy_data_helper, &a);
+	DN_BH_WUNLOCK();
+	if (compat) {
+		*compat = start;
+		sopt->sopt_valsize = buf - start;
+		/* free() is done by ip_dummynet_compat() */
+	} else {
+		error = sooptcopyout(sopt, start, buf - start);
+		free(start, M_DUMMYNET);
+	}
+	return error;
+}
 
-	free_pipe(pipe);
-    } else { /* this is a WF2Q queue (dn_flow_set) */
-	struct dn_flow_set *fs;
+/* Callback called on scheduler instance to delete it if idle */
+static int
+drain_scheduler_cb(void *_si, void *arg)
+{
+	struct dn_sch_inst *si = _si;
 
-	DUMMYNET_LOCK();
-	fs = locate_flowset(p->fs.fs_nr); /* locate set */
+	if ((si->kflags & DN_ACTIVE) || si->dline.mq.head != NULL)
+		return 0;
 
-	if (fs == NULL) {
-	    DUMMYNET_UNLOCK();
-	    return (ENOENT); /* not found */
+	if (si->sched->fp->flags & DN_MULTIQUEUE) {
+		if (si->q_count == 0)
+			return si_destroy(si, NULL);
+		else
+			return 0;
+	} else { /* !DN_MULTIQUEUE */
+		if ((si+1)->ni.length == 0)
+			return si_destroy(si, NULL);
+		else
+			return 0;
 	}
+	return 0; /* unreachable */
+}
 
-	/* Unlink from list of flowsets. */
-	SLIST_REMOVE( &flowsethash[HASH(fs->fs_nr)], fs, dn_flow_set, next);
+/* Callback called on scheduler to check if it has instances */
+static int
+drain_scheduler_sch_cb(void *_s, void *arg)
+{
+	struct dn_schk *s = _s;
 
-	if (fs->pipe != NULL) {
-	    /* Update total weight on parent pipe and cleanup parent heaps. */
-	    fs->pipe->sum -= fs->weight * fs->backlogged ;
-	    fs_remove_from_heap(&(fs->pipe->not_eligible_heap), fs);
-	    fs_remove_from_heap(&(fs->pipe->scheduler_heap), fs);
-#if 1	/* XXX should i remove from idle_heap as well ? */
-	    fs_remove_from_heap(&(fs->pipe->idle_heap), fs);
-#endif
+	if (s->sch.flags & DN_HAVE_MASK) {
+		dn_ht_scan_bucket(s->siht, &s->drain_bucket,
+				drain_scheduler_cb, NULL);
+		s->drain_bucket++;
+	} else {
+		if (s->siht) {
+			if (drain_scheduler_cb(s->siht, NULL) == DNHT_SCAN_DEL)
+				s->siht = NULL;
+		}
 	}
-	purge_flow_set(fs, 1);
-	DUMMYNET_UNLOCK();
-    }
-    return 0 ;
+	return 0;
 }
 
-/*
- * helper function used to copy data from kernel in DUMMYNET_GET
- */
-static char *
-dn_copy_set(struct dn_flow_set *set, char *bp)
-{
-    int i, copied = 0 ;
-    struct dn_flow_queue *q, *qp = (struct dn_flow_queue *)bp;
-
-    DUMMYNET_LOCK_ASSERT();
-
-    for (i = 0 ; i <= set->rq_size ; i++) {
-	for (q = set->rq[i] ; q ; q = q->next, qp++ ) {
-	    if (q->hash_slot != i)
-		printf("dummynet: ++ at %d: wrong slot (have %d, "
-		    "should be %d)\n", copied, q->hash_slot, i);
-	    if (q->fs != set)
-		printf("dummynet: ++ at %d: wrong fs ptr (have %p, should be %p)\n",
-			i, q->fs, set);
-	    copied++ ;
-	    bcopy(q, qp, sizeof( *q ) );
-	    /* cleanup pointers */
-	    qp->next = NULL ;
-	    qp->head = qp->tail = NULL ;
-	    qp->fs = NULL ;
-	}
-    }
-    if (copied != set->rq_elements)
-	printf("dummynet: ++ wrong count, have %d should be %d\n",
-	    copied, set->rq_elements);
-    return (char *)qp ;
-}
-
-static size_t
-dn_calc_size(void)
-{
-    struct dn_flow_set *fs;
-    struct dn_pipe *pipe;
-    size_t size = 0;
-    int i;
-
-    DUMMYNET_LOCK_ASSERT();
-    /*
-     * Compute size of data structures: list of pipes and flow_sets.
-     */
-    for (i = 0; i < HASHSIZE; i++) {
-	SLIST_FOREACH(pipe, &pipehash[i], next)
-		size += sizeof(*pipe) +
-		    pipe->fs.rq_elements * sizeof(struct dn_flow_queue);
-	SLIST_FOREACH(fs, &flowsethash[i], next)
-		size += sizeof (*fs) +
-		    fs->rq_elements * sizeof(struct dn_flow_queue);
-    }
-    return size;
+/* Called every tick, try to delete a 'bucket' of scheduler */
+void
+dn_drain_scheduler(void)
+{
+	dn_ht_scan_bucket(dn_cfg.schedhash, &dn_cfg.drain_sch,
+			   drain_scheduler_sch_cb, NULL);
+	dn_cfg.drain_sch++;
 }
 
+/* Callback called on queue to delete if it is idle */
 static int
-dummynet_get(struct sockopt *sopt)
-{
-    char *buf, *bp ; /* bp is the "copy-pointer" */
-    size_t size ;
-    struct dn_flow_set *fs;
-    struct dn_pipe *pipe;
-    int error=0, i ;
-
-    /* XXX lock held too long */
-    DUMMYNET_LOCK();
-    /*
-     * XXX: Ugly, but we need to allocate memory with M_WAITOK flag and we
-     *      cannot use this flag while holding a mutex.
-     */
-    for (i = 0; i < 10; i++) {
-	size = dn_calc_size();
-	DUMMYNET_UNLOCK();
-	buf = malloc(size, M_TEMP, M_WAITOK);
-	DUMMYNET_LOCK();
-	if (size >= dn_calc_size())
-		break;
-	free(buf, M_TEMP);
-	buf = NULL;
-    }
-    if (buf == NULL) {
-	DUMMYNET_UNLOCK();
-	return ENOBUFS ;
-    }
-    bp = buf;
-    for (i = 0; i < HASHSIZE; i++) {
-	SLIST_FOREACH(pipe, &pipehash[i], next) {
-		struct dn_pipe *pipe_bp = (struct dn_pipe *)bp;
-
-		/*
-		 * Copy pipe descriptor into *bp, convert delay back to ms,
-		 * then copy the flow_set descriptor(s) one at a time.
-		 * After each flow_set, copy the queue descriptor it owns.
-		 */
-		bcopy(pipe, bp, sizeof(*pipe));
-		pipe_bp->delay = (pipe_bp->delay * 1000) / hz;
-		pipe_bp->burst = div64(pipe_bp->burst, 8 * hz);
-		/*
-		 * XXX the following is a hack based on ->next being the
-		 * first field in dn_pipe and dn_flow_set. The correct
-		 * solution would be to move the dn_flow_set to the beginning
-		 * of struct dn_pipe.
-		 */
-		pipe_bp->next.sle_next = (struct dn_pipe *)DN_IS_PIPE;
-		/* Clean pointers. */
-		pipe_bp->head = pipe_bp->tail = NULL;
-		pipe_bp->fs.next.sle_next = NULL;
-		pipe_bp->fs.pipe = NULL;
-		pipe_bp->fs.rq = NULL;
-		pipe_bp->samples = NULL;
+drain_queue_cb(void *_q, void *arg)
+{
+	struct dn_queue *q = _q;
 
-		bp += sizeof(*pipe) ;
-		bp = dn_copy_set(&(pipe->fs), bp);
+	if (q->ni.length == 0) {
+		dn_delete_queue(q, DN_DESTROY);
+		return DNHT_SCAN_DEL; /* queue is deleted */
 	}
-    }
 
-    for (i = 0; i < HASHSIZE; i++) {
-	SLIST_FOREACH(fs, &flowsethash[i], next) {
-		struct dn_flow_set *fs_bp = (struct dn_flow_set *)bp;
+	return 0; /* queue isn't deleted */
+}
 
-		bcopy(fs, bp, sizeof(*fs));
-		/* XXX same hack as above */
-		fs_bp->next.sle_next = (struct dn_flow_set *)DN_IS_QUEUE;
-		fs_bp->pipe = NULL;
-		fs_bp->rq = NULL;
-		bp += sizeof(*fs);
-		bp = dn_copy_set(fs, bp);
-	}
-    }
+/* Callback called on flowset used to check if it has queues */
+static int
+drain_queue_fs_cb(void *_fs, void *arg)
+{
+	struct dn_fsk *fs = _fs;
 
-    DUMMYNET_UNLOCK();
+	if (fs->fs.flags & DN_QHT_HASH) {
+		/* Flowset has a hash table for queues */
+		dn_ht_scan_bucket(fs->qht, &fs->drain_bucket,
+				drain_queue_cb, NULL);
+		fs->drain_bucket++;
+	}
+	else {
+		/* No hash table for this flowset, null the pointer 
+		 * if the queue is deleted
+		 */
+		if (fs->qht) {
+			if (drain_queue_cb(fs->qht, NULL) == DNHT_SCAN_DEL)
+				fs->qht = NULL;
+		}
+	}
+	return 0;
+}
 
-    error = sooptcopyout(sopt, buf, size);
-    free(buf, M_TEMP);
-    return error ;
+/* Called every tick, try to delete a 'bucket' of queue */
+void
+dn_drain_queue(void)
+{
+	/* scan a bucket of flowset */
+	dn_ht_scan_bucket(dn_cfg.fshash, &dn_cfg.drain_fs,
+                               drain_queue_fs_cb, NULL);
+	dn_cfg.drain_fs++;
 }
 
 /*
- * Handler for the various dummynet socket options (get, flush, config, del)
+ * Handler for the various dummynet socket options
  */
 static int
 ip_dn_ctl(struct sockopt *sopt)
 {
-    int error;
-    struct dn_pipe *p = NULL;
+	void *p = NULL;
+	int error, l;
 
-    error = priv_check(sopt->sopt_td, PRIV_NETINET_DUMMYNET);
-    if (error)
-	return (error);
-
-    /* Disallow sets in really-really secure mode. */
-    if (sopt->sopt_dir == SOPT_SET) {
-#if __FreeBSD_version >= 500034
-	error =  securelevel_ge(sopt->sopt_td->td_ucred, 3);
+	error = priv_check(sopt->sopt_td, PRIV_NETINET_DUMMYNET);
 	if (error)
-	    return (error);
-#else
-	if (securelevel >= 3)
-	    return (EPERM);
-#endif
-    }
-
-    switch (sopt->sopt_name) {
-    default :
-	printf("dummynet: -- unknown option %d", sopt->sopt_name);
-	error = EINVAL ;
-	break;
+		return (error);
 
-    case IP_DUMMYNET_GET :
-	error = dummynet_get(sopt);
-	break ;
-
-    case IP_DUMMYNET_FLUSH :
-	dummynet_flush() ;
-	break ;
-
-    case IP_DUMMYNET_CONFIGURE :
-	p = malloc(sizeof(struct dn_pipe_max), M_TEMP, M_WAITOK);
-	error = sooptcopyin(sopt, p, sizeof(struct dn_pipe_max), sizeof *p);
-	if (error)
-	    break ;
-	if (p->samples_no > 0)
-	    p->samples = &(((struct dn_pipe_max *)p)->samples[0]);
+	/* Disallow sets in really-really secure mode. */
+	if (sopt->sopt_dir == SOPT_SET) {
+		error =  securelevel_ge(sopt->sopt_td->td_ucred, 3);
+		if (error)
+			return (error);
+	}
 
-	error = config_pipe(p);
-	break ;
+	switch (sopt->sopt_name) {
+	default :
+		D("dummynet: unknown option %d", sopt->sopt_name);
+		error = EINVAL;
+		break;
 
-    case IP_DUMMYNET_DEL :	/* remove a pipe or queue */
-	p = malloc(sizeof(struct dn_pipe), M_TEMP, M_WAITOK);
-	error = sooptcopyin(sopt, p, sizeof(struct dn_pipe), sizeof *p);
-	if (error)
-	    break ;
+	case IP_DUMMYNET_FLUSH:
+	case IP_DUMMYNET_CONFIGURE:
+	case IP_DUMMYNET_DEL:	/* remove a pipe or queue */
+	case IP_DUMMYNET_GET:
+		D("dummynet: compat option %d", sopt->sopt_name);
+		error = ip_dummynet_compat(sopt);
+		break;
 
-	error = delete_pipe(p);
-	break ;
-    }
+	case IP_DUMMYNET3 :
+		if (sopt->sopt_dir == SOPT_GET) {
+			error = dummynet_get(sopt, NULL);
+			break;
+		}
+		l = sopt->sopt_valsize;
+		if (l < sizeof(struct dn_id) || l > 12000) {
+			D("argument len %d invalid", l);
+			break;
+		}
+		p = malloc(l, M_TEMP, M_WAITOK); // XXX can it fail ?
+		error = sooptcopyin(sopt, p, l, l);
+		if (error)
+			break ;
+		error = do_config(p, l);
+		break;
+	}
 
-    if (p != NULL)
-	free(p, M_TEMP);
+	if (p != NULL)
+		free(p, M_TEMP);
 
-    return error ;
+	return error ;
 }
 
+
 static void
 ip_dn_init(void)
 {
-	int i;
+	static int init_done = 0;
 
+	if (init_done)
+		return;
+	init_done = 1;
 	if (bootverbose)
-		printf("DUMMYNET with IPv6 initialized (040826)\n");
+		printf("DUMMYNET with IPv6 initialized (100131)\n");
 
-	DUMMYNET_LOCK_INIT();
-
-	for (i = 0; i < HASHSIZE; i++) {
-		SLIST_INIT(&pipehash[i]);
-		SLIST_INIT(&flowsethash[i]);
-	}
-	ready_heap.size = ready_heap.elements = 0;
-	ready_heap.offset = 0;
+	/* init defaults here, MSVC does not accept initializers */
+	/* queue limits */
+	dn_cfg.slot_limit = 100; /* Foot shooting limit for queues. */
+	dn_cfg.byte_limit = 1024 * 1024;
 
-	wfq_ready_heap.size = wfq_ready_heap.elements = 0;
-	wfq_ready_heap.offset = 0;
+	/* RED parameters */
+	dn_cfg.red_lookup_depth = 256;	/* default lookup table depth */
+	dn_cfg.red_avg_pkt_size = 512;	/* default medium packet size */
+	dn_cfg.red_max_pkt_size = 1500;	/* default max packet size */
 
-	extract_heap.size = extract_heap.elements = 0;
-	extract_heap.offset = 0;
+	/* hash tables */
+	dn_cfg.max_hash_size = 1024;	/* max in the hash tables */
+	dn_cfg.hash_size = 64;		/* default hash size */
 
+	/* create hash tables for schedulers and flowsets.
+	 * In both we search by key and by pointer.
+	 */
+	dn_cfg.schedhash = dn_ht_init(NULL, dn_cfg.hash_size,
+		offsetof(struct dn_schk, schk_next),
+		schk_hash, schk_match, schk_new);
+	dn_cfg.fshash = dn_ht_init(NULL, dn_cfg.hash_size,
+		offsetof(struct dn_fsk, fsk_next),
+		fsk_hash, fsk_match, fsk_new);
+
+	/* bucket index to drain object */
+	dn_cfg.drain_fs = 0;
+	dn_cfg.drain_sch = 0;
+
+	heap_init(&dn_cfg.evheap, 16, offsetof(struct dn_id, id));
+	SLIST_INIT(&dn_cfg.fsu);
+	SLIST_INIT(&dn_cfg.schedlist);
+
+	DN_LOCK_INIT();
 	ip_dn_ctl_ptr = ip_dn_ctl;
 	ip_dn_io_ptr = dummynet_io;
 
@@ -2270,25 +2065,28 @@ ip_dn_init(void)
 	callout_reset(&dn_timeout, 1, dummynet, NULL);
 
 	/* Initialize curr_time adjustment mechanics. */
-	getmicrouptime(&prev_t);
+	getmicrouptime(&dn_cfg.prev_t);
 }
 
 #ifdef KLD_MODULE
 static void
 ip_dn_destroy(void)
 {
+	DN_BH_WLOCK();
 	ip_dn_ctl_ptr = NULL;
 	ip_dn_io_ptr = NULL;
 
-	DUMMYNET_LOCK();
 	callout_stop(&dn_timeout);
-	DUMMYNET_UNLOCK();
+	dummynet_flush();
+	DN_BH_WUNLOCK();
 	taskqueue_drain(dn_tq, &dn_task);
 	taskqueue_free(dn_tq);
 
-	dummynet_flush();
+	dn_ht_free(dn_cfg.schedhash, 0);
+	dn_ht_free(dn_cfg.fshash, 0);
+	heap_free(&dn_cfg.evheap);
 
-	DUMMYNET_LOCK_DESTROY();
+	DN_LOCK_DESTROY();
 }
 #endif /* KLD_MODULE */
 
@@ -2296,36 +2094,98 @@ static int
 dummynet_modevent(module_t mod, int type, void *data)
 {
 
-	switch (type) {
-	case MOD_LOAD:
+	if (type == MOD_LOAD) {
 		if (ip_dn_io_ptr) {
-		    printf("DUMMYNET already loaded\n");
-		    return EEXIST ;
+			printf("DUMMYNET already loaded\n");
+			return EEXIST ;
 		}
 		ip_dn_init();
-		break;
-
-	case MOD_UNLOAD:
+		return 0;
+	} else if (type == MOD_UNLOAD) {
 #if !defined(KLD_MODULE)
 		printf("dummynet statically compiled, cannot unload\n");
 		return EINVAL ;
 #else
 		ip_dn_destroy();
+		return 0;
 #endif
-		break ;
-	default:
+	} else
 		return EOPNOTSUPP;
-		break ;
+}
+
+/* modevent helpers for the modules */
+static int
+load_dn_sched(struct dn_alg *d)
+{
+	struct dn_alg *s;
+
+	if (d == NULL)
+		return 1; /* error */
+	ip_dn_init();	/* just in case, we need the lock */
+
+	/* Check that mandatory funcs exists */
+	if (d->enqueue == NULL || d->dequeue == NULL) {
+		D("missing enqueue or dequeue for %s", d->name);
+		return 1;
+	}
+
+	/* Search if scheduler already exists */
+	DN_BH_WLOCK();
+	SLIST_FOREACH(s, &dn_cfg.schedlist, next) {
+		if (strcmp(s->name, d->name) == 0) {
+			D("%s already loaded", d->name);
+			break; /* scheduler already exists */
+		}
+	}
+	if (s == NULL)
+		SLIST_INSERT_HEAD(&dn_cfg.schedlist, d, next);
+	DN_BH_WUNLOCK();
+	D("dn_sched %s %sloaded", d->name, s ? "not ":"");
+	return s ? 1 : 0;
+}
+
+static int
+unload_dn_sched(struct dn_alg *s)
+{
+	struct dn_alg *tmp, *r;
+	int err = EINVAL;
+
+	D("called for %s", s->name);
+
+	DN_BH_WLOCK();
+	SLIST_FOREACH_SAFE(r, &dn_cfg.schedlist, next, tmp) {
+		if (strcmp(s->name, r->name) != 0)
+			continue;
+		D("ref_count = %d", r->ref_count);
+		err = (r->ref_count != 0) ? EBUSY : 0;
+		if (err == 0)
+			SLIST_REMOVE(&dn_cfg.schedlist, r, dn_alg, next);
+		break;
 	}
-	return 0 ;
+	DN_BH_WUNLOCK();
+	D("dn_sched %s %sunloaded", s->name, err ? "not ":"");
+	return err;
+}
+
+int
+dn_sched_modevent(module_t mod, int cmd, void *arg)
+{
+	struct dn_alg *sch = arg;
+
+	if (cmd == MOD_LOAD)
+		return load_dn_sched(sch);
+	else if (cmd == MOD_UNLOAD)
+		return unload_dn_sched(sch);
+	else
+		return EINVAL;
 }
 
 static moduledata_t dummynet_mod = {
-	"dummynet",
-	dummynet_modevent,
-	NULL
+	"dummynet", dummynet_modevent, NULL
 };
-DECLARE_MODULE(dummynet, dummynet_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY);
+
+DECLARE_MODULE(dummynet, dummynet_mod,
+	SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY-1);
 MODULE_DEPEND(dummynet, ipfw, 2, 2, 2);
 MODULE_VERSION(dummynet, 1);
 /* end of file */
diff --git a/sys/netinet/ipfw/ip_fw2.c b/sys/netinet/ipfw/ip_fw2.c
index 724536c..e7ad107 100644
--- a/sys/netinet/ipfw/ip_fw2.c
+++ b/sys/netinet/ipfw/ip_fw2.c
@@ -142,6 +142,11 @@ ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr;
 ipfw_nat_cfg_t *ipfw_nat_get_log_ptr;
 
 #ifdef SYSCTL_NODE
+uint32_t dummy_def = IPFW_DEFAULT_RULE;
+uint32_t dummy_tables_max = IPFW_TABLES_MAX;
+
+SYSBEGIN(f3)
+
 SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
 SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, one_pass,
     CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_one_pass), 0,
@@ -156,10 +161,10 @@ SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit,
     CTLFLAG_RW, &VNET_NAME(verbose_limit), 0,
     "Set upper limit of matches of ipfw rules logged");
 SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, default_rule, CTLFLAG_RD,
-    NULL, IPFW_DEFAULT_RULE,
+    &dummy_def, 0,
     "The default/max possible rule number.");
 SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, tables_max, CTLFLAG_RD,
-    NULL, IPFW_TABLES_MAX,
+    &dummy_tables_max, 0,
     "The maximum number of tables.");
 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, default_to_accept, CTLFLAG_RDTUN,
     &default_to_accept, 0,
@@ -177,6 +182,8 @@ SYSCTL_VNET_INT(_net_inet6_ip6_fw, OID_AUTO, deny_unknown_exthdrs,
     "Deny packets with unknown IPv6 Extension Headers");
 #endif /* INET6 */
 
+SYSEND
+
 #endif /* SYSCTL_NODE */
 
 
@@ -344,6 +351,7 @@ iface_match(struct ifnet *ifp, ipfw_insn_if *cmd)
 				return(1);
 		}
 	} else {
+#ifdef	__FreeBSD__	/* and OSX too ? */
 		struct ifaddr *ia;
 
 		if_addr_rlock(ifp);
@@ -357,6 +365,7 @@ iface_match(struct ifnet *ifp, ipfw_insn_if *cmd)
 			}
 		}
 		if_addr_runlock(ifp);
+#endif /* __FreeBSD__ */
 	}
 	return(0);	/* no match, fail ... */
 }
@@ -385,6 +394,9 @@ iface_match(struct ifnet *ifp, ipfw_insn_if *cmd)
 static int
 verify_path(struct in_addr src, struct ifnet *ifp, u_int fib)
 {
+#ifndef __FreeBSD__
+	return 0;
+#else
 	struct route ro;
 	struct sockaddr_in *dst;
 
@@ -427,6 +439,7 @@ verify_path(struct in_addr src, struct ifnet *ifp, u_int fib)
 	/* found valid route */
 	RTFREE(ro.ro_rt);
 	return 1;
+#endif /* __FreeBSD__ */
 }
 
 #ifdef INET6
@@ -634,9 +647,14 @@ send_reject(struct ip_fw_args *args, int code, int iplen, struct ip *ip)
 static int
 check_uidgid(ipfw_insn_u32 *insn, int proto, struct ifnet *oif,
     struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip,
-    u_int16_t src_port, struct ucred **uc, int *ugid_lookupp,
-    struct inpcb *inp)
+    u_int16_t src_port, int *ugid_lookupp,
+    struct ucred **uc, struct inpcb *inp)
 {
+#ifndef __FreeBSD__
+	return cred_check(insn, proto, oif,
+	    dst_ip, dst_port, src_ip, src_port,
+	    (struct bsd_ucred *)uc, ugid_lookupp, ((struct mbuf *)inp)->m_skb);
+#else  /* FreeBSD */
 	struct inpcbinfo *pi;
 	int wildcard;
 	struct inpcb *pcb;
@@ -703,6 +721,7 @@ check_uidgid(ipfw_insn_u32 *insn, int proto, struct ifnet *oif,
 	else if (insn->o.opcode == O_JAIL)
 		match = ((*uc)->cr_prison->pr_id == (int)insn->d[0]);
 	return match;
+#endif /* __FreeBSD__ */
 }
 
 /*
@@ -794,7 +813,11 @@ ipfw_chk(struct ip_fw_args *args)
 	 * these types of constraints, as well as decrease contention
 	 * on pcb related locks.
 	 */
+#ifndef __FreeBSD__
+	struct bsd_ucred ucred_cache;
+#else
 	struct ucred *ucred_cache = NULL;
+#endif
 	int ucred_lookup = 0;
 
 	/*
@@ -1233,8 +1256,13 @@ do {								\
 						    (ipfw_insn_u32 *)cmd,
 						    proto, oif,
 						    dst_ip, dst_port,
-						    src_ip, src_port, &ucred_cache,
-						    &ucred_lookup, args->inp);
+						    src_ip, src_port, &ucred_lookup,
+#ifdef __FreeBSD__
+						    &ucred_cache, args->inp);
+#else
+						    (void *)&ucred_cache,
+						    (struct inpcb *)args->m);
+#endif
 				break;
 
 			case O_RECV:
@@ -1348,12 +1376,21 @@ do {								\
 						(ipfw_insn_u32 *)cmd,
 						proto, oif,
 						dst_ip, dst_port,
-						src_ip, src_port, &ucred_cache,
-						&ucred_lookup, args->inp);
+						src_ip, src_port, &ucred_lookup,
+#ifdef __FreeBSD__
+						&ucred_cache, args->inp);
 					    if (v == 4 /* O_UID */)
 						key = ucred_cache->cr_uid;
 					    else if (v == 5 /* O_JAIL */)
 						key = ucred_cache->cr_prison->pr_id;
+#else /* !__FreeBSD__ */
+						(void *)&ucred_cache,
+						(struct inpcb *)args->m);
+					    if (v ==4 /* O_UID */)
+						key = ucred_cache.uid;
+					    else if (v == 5 /* O_JAIL */)
+						key = ucred_cache.xid;
+#endif /* !__FreeBSD__ */
 					    key = htonl(key);
 					} else
 					    break;
@@ -1392,11 +1429,10 @@ do {								\
 					match = (tif != NULL);
 					break;
 				}
-				/* FALLTHROUGH */
 #ifdef INET6
+				/* FALLTHROUGH */
 			case O_IP6_SRC_ME:
-				match = is_ipv6 &&
-				    search_ip6_addr_net(&args->f_id.src_ip6);
+				match= is_ipv6 && search_ip6_addr_net(&args->f_id.src_ip6);
 #endif
 				break;
 
@@ -1432,14 +1468,14 @@ do {								\
 					match = (tif != NULL);
 					break;
 				}
-				/* FALLTHROUGH */
 #ifdef INET6
+				/* FALLTHROUGH */
 			case O_IP6_DST_ME:
-				match = is_ipv6 &&
-				    search_ip6_addr_net(&args->f_id.dst_ip6);
+				match= is_ipv6 && search_ip6_addr_net(&args->f_id.dst_ip6);
 #endif
 				break;
 
+
 			case O_IP_SRCPORT:
 			case O_IP_DSTPORT:
 				/*
@@ -2164,8 +2200,10 @@ do {								\
 		printf("ipfw: ouch!, skip past end of rules, denying packet\n");
 	}
 	IPFW_RUNLOCK(chain);
+#ifdef __FreeBSD__
 	if (ucred_cache != NULL)
 		crfree(ucred_cache);
+#endif
 	return (retval);
 
 pullup_failed:
diff --git a/sys/netinet/ipfw/ip_fw_dynamic.c b/sys/netinet/ipfw/ip_fw_dynamic.c
index ad5599a..3aa8f20 100644
--- a/sys/netinet/ipfw/ip_fw_dynamic.c
+++ b/sys/netinet/ipfw/ip_fw_dynamic.c
@@ -128,7 +128,11 @@ static VNET_DEFINE(struct callout, ipfw_timeout);
 #define V_ipfw_timeout                  VNET(ipfw_timeout)
 
 static uma_zone_t ipfw_dyn_rule_zone;
+#ifndef __FreeBSD__
+DEFINE_SPINLOCK(ipfw_dyn_mtx);
+#else
 static struct mtx ipfw_dyn_mtx;		/* mutex guarding dynamic rules */
+#endif
 
 #define	IPFW_DYN_LOCK_INIT() \
 	mtx_init(&ipfw_dyn_mtx, "IPFW dynamic rules", NULL, MTX_DEF)
@@ -183,6 +187,9 @@ static VNET_DEFINE(u_int32_t, dyn_max);		/* max # of dynamic rules */
 #define	V_dyn_max			VNET(dyn_max)
 
 #ifdef SYSCTL_NODE
+
+SYSBEGIN(f2)
+
 SYSCTL_DECL(_net_inet_ip_fw);
 SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_buckets,
     CTLFLAG_RW, &VNET_NAME(dyn_buckets), 0,
@@ -217,6 +224,9 @@ SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime,
 SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive,
     CTLFLAG_RW, &VNET_NAME(dyn_keepalive), 0,
     "Enable keepalives for dyn. rules");
+
+SYSEND
+
 #endif /* SYSCTL_NODE */
 
 
@@ -884,6 +894,9 @@ struct mbuf *
 ipfw_send_pkt(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t seq,
     u_int32_t ack, int flags)
 {
+#ifndef __FreeBSD__
+	return NULL;
+#else
 	struct mbuf *m;
 	int len, dir;
 	struct ip *h = NULL;		/* stupid compiler */
@@ -1020,6 +1033,7 @@ ipfw_send_pkt(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t seq,
 	}
 
 	return (m);
+#endif /* __FreeBSD__ */
 }
 
 /*
diff --git a/sys/netinet/ipfw/ip_fw_log.c b/sys/netinet/ipfw/ip_fw_log.c
index a5178db..23a1737 100644
--- a/sys/netinet/ipfw/ip_fw_log.c
+++ b/sys/netinet/ipfw/ip_fw_log.c
@@ -413,6 +413,7 @@ ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args,
 				    (ipoff & IP_MF) ? "+" : "");
 		}
 	}
+#ifdef __FreeBSD__
 	if (oif || m->m_pkthdr.rcvif)
 		log(LOG_SECURITY | LOG_INFO,
 		    "ipfw: %d %s %s %s via %s%s\n",
@@ -421,6 +422,7 @@ ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args,
 		    oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname,
 		    fragment);
 	else
+#endif
 		log(LOG_SECURITY | LOG_INFO,
 		    "ipfw: %d %s %s [no if info]%s\n",
 		    f ? f->rulenum : -1,
diff --git a/sys/netinet/ipfw/ip_fw_pfil.c b/sys/netinet/ipfw/ip_fw_pfil.c
index a7aa5aa..b4e31d4 100644
--- a/sys/netinet/ipfw/ip_fw_pfil.c
+++ b/sys/netinet/ipfw/ip_fw_pfil.c
@@ -77,6 +77,9 @@ int ipfw_chg_hook(SYSCTL_HANDLER_ARGS);
 static int ipfw_divert(struct mbuf **, int, struct ipfw_rule_ref *, int);
 
 #ifdef SYSCTL_NODE
+
+SYSBEGIN(f1)
+
 SYSCTL_DECL(_net_inet_ip_fw);
 SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, enable,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_enable), 0,
@@ -87,6 +90,9 @@ SYSCTL_VNET_PROC(_net_inet6_ip6_fw, OID_AUTO, enable,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw6_enable), 0,
     ipfw_chg_hook, "I", "Enable ipfw+6");
 #endif /* INET6 */
+
+SYSEND
+
 #endif /* SYSCTL_NODE */
 
 /*
@@ -94,7 +100,7 @@ SYSCTL_VNET_PROC(_net_inet6_ip6_fw, OID_AUTO, enable,
  * dummynet, divert, netgraph or other modules.
  * The packet may be consumed.
  */
-static int
+int
 ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir,
     struct inpcb *inp)
 {
diff --git a/sys/netinet/ipfw/ip_fw_private.h b/sys/netinet/ipfw/ip_fw_private.h
index 92508f1..094c22f 100644
--- a/sys/netinet/ipfw/ip_fw_private.h
+++ b/sys/netinet/ipfw/ip_fw_private.h
@@ -35,6 +35,18 @@
 
 #ifdef _KERNEL
 
+/*
+ * For platforms that do not have SYSCTL support, we wrap the
+ * SYSCTL_* into a function (one per file) to collect the values
+ * into an array at module initialization. The wrapping macros,
+ * SYSBEGIN() and SYSEND, are empty in the default case.
+ */
+#ifndef SYSBEGIN
+#define SYSBEGIN(x)
+#endif
+#ifndef SYSEND
+#define SYSEND
+#endif
 
 /* Return values from ipfw_chk() */
 enum {
@@ -119,7 +131,13 @@ enum {
 };
 
 /* wrapper for freeing a packet, in case we need to do more work */
+#ifndef FREE_PKT
+#if defined(__linux__) || defined(_WIN32)
+#define FREE_PKT(m)	netisr_dispatch(-1, m)
+#else
 #define FREE_PKT(m)	m_freem(m)
+#endif
+#endif /* !FREE_PKT */
 
 /*
  * Function definitions.
@@ -199,8 +217,13 @@ struct ip_fw_chain {
 	struct ip_fw    **map;	/* array of rule ptrs to ease lookup */
 	LIST_HEAD(nat_list, cfg_nat) nat;       /* list of nat entries */
 	struct radix_node_head *tables[IPFW_TABLES_MAX];
+#if defined( __linux__ ) || defined( _WIN32 )
+	spinlock_t rwmtx;
+	spinlock_t uh_lock;
+#else
 	struct rwlock	rwmtx;
 	struct rwlock	uh_lock;	/* lock for upper half */
+#endif
 	uint32_t	id;		/* ruleset id */
 };
 
@@ -240,6 +263,10 @@ int ipfw_ctl(struct sockopt *sopt);
 int ipfw_chk(struct ip_fw_args *args);
 void ipfw_reap_rules(struct ip_fw *head);
 
+/* In ip_fw_pfil */
+int ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir,
+     struct inpcb *inp);
+
 /* In ip_fw_table.c */
 struct radix_node;
 int ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
diff --git a/sys/netinet/ipfw/ip_fw_sockopt.c b/sys/netinet/ipfw/ip_fw_sockopt.c
index 3d0fc55..f61c126 100644
--- a/sys/netinet/ipfw/ip_fw_sockopt.c
+++ b/sys/netinet/ipfw/ip_fw_sockopt.c
@@ -115,7 +115,8 @@ get_map(struct ip_fw_chain *chain, int extra, int locked)
 		int i;
 
 		i = chain->n_rules + extra;
-		map = malloc(i * sizeof(struct ip_fw *), M_IPFW, M_WAITOK);
+		map = malloc(i * sizeof(struct ip_fw *), M_IPFW,
+			locked ? M_NOWAIT : M_WAITOK);
 		if (map == NULL) {
 			printf("%s: cannot allocate map\n", __FUNCTION__);
 			return NULL;
@@ -771,6 +772,44 @@ bad_size:
 	return EINVAL;
 }
 
+
+/*
+ * Translation of requests for compatibility with FreeBSD 7.2/8.
+ * a static variable tells us if we have an old client from userland,
+ * and if necessary we translate requests and responses between the
+ * two formats.
+ */
+static int is7 = 0;
+
+struct ip_fw7 {
+	struct ip_fw7	*next;		/* linked list of rules     */
+	struct ip_fw7	*next_rule;	/* ptr to next [skipto] rule    */
+	/* 'next_rule' is used to pass up 'set_disable' status      */
+
+	uint16_t	act_ofs;	/* offset of action in 32-bit units */
+	uint16_t	cmd_len;	/* # of 32-bit words in cmd */
+	uint16_t	rulenum;	/* rule number          */
+	uint8_t		set;		/* rule set (0..31)     */
+	// #define RESVD_SET   31  /* set for default and persistent rules */
+	uint8_t		_pad;		/* padding          */
+	// uint32_t        id;             /* rule id, only in v.8 */
+	/* These fields are present in all rules.           */
+	uint64_t	pcnt;		/* Packet counter       */
+	uint64_t	bcnt;		/* Byte counter         */
+	uint32_t	timestamp;	/* tv_sec of last match     */
+
+	ipfw_insn	cmd[1];		/* storage for commands     */
+};
+
+	int convert_rule_to_7(struct ip_fw *rule);
+int convert_rule_to_8(struct ip_fw *rule);
+
+#ifndef RULESIZE7
+#define RULESIZE7(rule)  (sizeof(struct ip_fw7) + \
+	((struct ip_fw7 *)(rule))->cmd_len * 4 - 4)
+#endif
+
+
 /*
  * Copy the static and dynamic rules to the supplied buffer
  * and return the amount of space actually used.
@@ -788,6 +827,32 @@ ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space)
         boot_seconds = boottime.tv_sec;
 	for (i = 0; i < chain->n_rules; i++) {
 		rule = chain->map[i];
+
+		if (is7) {
+		    /* Convert rule to FreeBSd 7.2 format */
+		    l = RULESIZE7(rule);
+		    if (bp + l + sizeof(uint32_t) <= ep) {
+			int error;
+			bcopy(rule, bp, l + sizeof(uint32_t));
+			error = convert_rule_to_7((struct ip_fw *) bp);
+			if (error)
+				return 0; /*XXX correct? */
+			/*
+			 * XXX HACK. Store the disable mask in the "next"
+			 * pointer in a wild attempt to keep the ABI the same.
+			 * Why do we do this on EVERY rule?
+			 */
+			bcopy(&V_set_disable,
+				&(((struct ip_fw7 *)bp)->next_rule),
+				sizeof(V_set_disable));
+			if (((struct ip_fw7 *)bp)->timestamp)
+			    ((struct ip_fw7 *)bp)->timestamp += boot_seconds;
+			bp += l;
+		    }
+		    continue; /* go to next rule */
+		}
+
+		/* normal mode, don't touch rules */
 		l = RULESIZE(rule);
 		if (bp + l > ep) { /* should not happen */
 			printf("overflow dumping static rules\n");
@@ -887,15 +952,42 @@ ipfw_ctl(struct sockopt *sopt)
 		rule = malloc(RULE_MAXSIZE, M_TEMP, M_WAITOK);
 		error = sooptcopyin(sopt, rule, RULE_MAXSIZE,
 			sizeof(struct ip_fw) );
+
+		/*
+		 * If the size of commands equals RULESIZE7 then we assume
+		 * a FreeBSD7.2 binary is talking to us (set is7=1).
+		 * is7 is persistent so the next 'ipfw list' command
+		 * will use this format.
+		 * NOTE: If wrong version is guessed (this can happen if
+		 *       the first ipfw command is 'ipfw [pipe] list')
+		 *       the ipfw binary may crash or loop infinitly...
+		 */
+		if (sopt->sopt_valsize == RULESIZE7(rule)) {
+		    is7 = 1;
+		    error = convert_rule_to_8(rule);
+		    if (error)
+			return error;
+		    if (error == 0)
+			error = check_ipfw_struct(rule, RULESIZE(rule));
+		} else {
+		    is7 = 0;
 		if (error == 0)
 			error = check_ipfw_struct(rule, sopt->sopt_valsize);
+		}
 		if (error == 0) {
 			/* locking is done within ipfw_add_rule() */
 			error = ipfw_add_rule(chain, rule);
 			size = RULESIZE(rule);
-			if (!error && sopt->sopt_dir == SOPT_GET)
+			if (!error && sopt->sopt_dir == SOPT_GET) {
+				if (is7) {
+					error = convert_rule_to_7(rule);
+					size = RULESIZE7(rule);
+					if (error)
+						return error;
+				}
 				error = sooptcopyout(sopt, rule, size);
 		}
+		}
 		free(rule, M_TEMP);
 		break;
 
@@ -1078,4 +1170,104 @@ ipfw_ctl(struct sockopt *sopt)
 	return (error);
 #undef RULE_MAXSIZE
 }
+
+
+#define	RULE_MAXSIZE	(256*sizeof(u_int32_t))
+
+/* Functions to convert rules 7.2 <==> 8.0 */
+int
+convert_rule_to_7(struct ip_fw *rule)
+{
+	/* Used to modify original rule */
+	struct ip_fw7 *rule7 = (struct ip_fw7 *)rule;
+	/* copy of original rule, version 8 */
+	struct ip_fw *tmp;
+
+	/* Used to copy commands */
+	ipfw_insn *ccmd, *dst;
+	int ll = 0, ccmdlen = 0;
+
+	tmp = malloc(RULE_MAXSIZE, M_TEMP, M_NOWAIT | M_ZERO);
+	if (tmp == NULL) {
+		return 1; //XXX error
+	}
+	bcopy(rule, tmp, RULE_MAXSIZE);
+
+	/* Copy fields */
+	rule7->_pad = tmp->_pad;
+	rule7->set = tmp->set;
+	rule7->rulenum = tmp->rulenum;
+	rule7->cmd_len = tmp->cmd_len;
+	rule7->act_ofs = tmp->act_ofs;
+	rule7->next_rule = (struct ip_fw7 *)tmp->next_rule;
+	rule7->next = (struct ip_fw7 *)tmp->x_next;
+	rule7->cmd_len = tmp->cmd_len;
+	rule7->pcnt = tmp->pcnt;
+	rule7->bcnt = tmp->bcnt;
+	rule7->timestamp = tmp->timestamp;
+
+	/* Copy commands */
+	for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule7->cmd ;
+			ll > 0 ; ll -= ccmdlen, ccmd += ccmdlen, dst += ccmdlen) {
+		ccmdlen = F_LEN(ccmd);
+
+		bcopy(ccmd, dst, F_LEN(ccmd)*sizeof(uint32_t));
+		if (ccmdlen > ll) {
+			printf("ipfw: opcode %d size truncated\n",
+				ccmd->opcode);
+			return EINVAL;
+		}
+	}
+	free(tmp, M_TEMP);
+
+	return 0;
+}
+
+int
+convert_rule_to_8(struct ip_fw *rule)
+{
+	/* Used to modify original rule */
+	struct ip_fw7 *rule7 = (struct ip_fw7 *) rule;
+
+	/* Used to copy commands */
+	ipfw_insn *ccmd, *dst;
+	int ll = 0, ccmdlen = 0;
+
+	/* Copy of original rule */
+	struct ip_fw7 *tmp = malloc(RULE_MAXSIZE, M_TEMP, M_NOWAIT | M_ZERO);
+	if (tmp == NULL) {
+		return 1; //XXX error
+	}
+
+	bcopy(rule7, tmp, RULE_MAXSIZE);
+
+	for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule->cmd ;
+			ll > 0 ; ll -= ccmdlen, ccmd += ccmdlen, dst += ccmdlen) {
+		ccmdlen = F_LEN(ccmd);
+		
+		bcopy(ccmd, dst, F_LEN(ccmd)*sizeof(uint32_t));
+		if (ccmdlen > ll) {
+			printf("ipfw: opcode %d size truncated\n",
+			    ccmd->opcode);
+			return EINVAL;
+		}
+	}
+
+	rule->_pad = tmp->_pad;
+	rule->set = tmp->set;
+	rule->rulenum = tmp->rulenum;
+	rule->cmd_len = tmp->cmd_len;
+	rule->act_ofs = tmp->act_ofs;
+	rule->next_rule = (struct ip_fw *)tmp->next_rule;
+	rule->x_next = (struct ip_fw *)tmp->next;
+	rule->cmd_len = tmp->cmd_len;
+	rule->id = 0; /* XXX see if is ok = 0 */
+	rule->pcnt = tmp->pcnt;
+	rule->bcnt = tmp->bcnt;
+	rule->timestamp = tmp->timestamp;
+
+	free (tmp, M_TEMP);
+	return 0;
+}
+
 /* end of file */
diff --git a/sys/netinet/ipfw/ip_fw_table.c b/sys/netinet/ipfw/ip_fw_table.c
index 0d8625a..5854e97 100644
--- a/sys/netinet/ipfw/ip_fw_table.c
+++ b/sys/netinet/ipfw/ip_fw_table.c
@@ -66,6 +66,7 @@ __FBSDID("$FreeBSD$");
 #include <netinet/in.h>
 #include <netinet/ip_var.h>	/* struct ipfw_rule_ref */
 #include <netinet/ip_fw.h>
+#include <sys/queue.h> /* LIST_HEAD */
 #include <netinet/ipfw/ip_fw_private.h>
 
 #ifdef MAC
diff --git a/sys/netinet/ipfw/test/Makefile b/sys/netinet/ipfw/test/Makefile
new file mode 100644
index 0000000..bbeb942
--- /dev/null
+++ b/sys/netinet/ipfw/test/Makefile
@@ -0,0 +1,50 @@
+#
+# $FreeBSD$
+#
+# Makefile for building userland tests
+# this is written in a form compatible with gmake
+
+SCHED_SRCS = test_dn_sched.c
+SCHED_SRCS += dn_sched_fifo.c
+SCHED_SRCS += dn_sched_wf2q.c
+SCHED_SRCS += dn_sched_qfq.c
+SCHED_SRCS += dn_sched_rr.c
+SCHED_SRCS += dn_heap.c
+SCHED_SRCS += main.c
+
+SCHED_OBJS=$(SCHED_SRCS:.c=.o)
+
+HEAP_SRCS = dn_heap.c test_dn_heap.c
+HEAP_OBJS=$(HEAP_SRCS:.c=.o)
+
+VPATH=	.:..
+
+CFLAGS = -I.. -I. -Wall -Werror -O3 -DIPFW
+TARGETS= test_sched # no test_heap by default
+
+all:	$(TARGETS)
+
+test_heap : $(HEAP_OBJS)
+	$(CC) -o $@ $(HEAP_OBJS)
+
+test_sched : $(SCHED_OBJS)
+	$(CC) -o $@ $(SCHED_OBJS)
+
+$(SCHED_OBJS): dn_test.h
+main.o: mylist.h
+
+clean:
+	- rm *.o $(TARGETS) *.core
+
+ALLSRCS = $(SCHED_SRCS) dn_test.h mylist.h \
+	dn_sched.h dn_heap.h ip_dn_private.h Makefile
+TMPBASE = /tmp/testXYZ
+TMPDIR = $(TMPBASE)/test
+
+tgz:
+	-rm -rf $(TMPDIR)
+	mkdir -p $(TMPDIR)
+	-cp -p $(ALLSRCS) $(TMPDIR)
+	-(cd ..; cp -p $(ALLSRCS) $(TMPDIR))
+	ls -la  $(TMPDIR)
+	(cd $(TMPBASE); tar cvzf /tmp/test.tgz test)
diff --git a/sys/netinet/ipfw/test/dn_test.h b/sys/netinet/ipfw/test/dn_test.h
new file mode 100644
index 0000000..c56aa6a
--- /dev/null
+++ b/sys/netinet/ipfw/test/dn_test.h
@@ -0,0 +1,155 @@
+/*
+ * $FreeBSD$
+ *
+ * userspace compatibility code for dummynet schedulers
+ */
+
+#ifndef _DN_TEST_H
+#define _DN_TEST_H
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>	/* bzero, ffs, ... */
+#include <string.h>	/* strcmp */
+#include <errno.h>
+#include <inttypes.h>
+#include <sys/queue.h>
+#include <sys/time.h>
+
+
+extern int debug;
+#define ND(fmt, args...) do {} while (0)
+#define D1(fmt, args...) do {} while (0)
+#define D(fmt, args...) fprintf(stderr, "%-8s " fmt "\n",      \
+        __FUNCTION__, ## args)
+#define DX(lev, fmt, args...) do {              \
+        if (debug > lev) D(fmt, ## args); } while (0)
+
+
+#define offsetof(t,m) (int)((&((t *)0L)->m))
+
+#include <mylist.h>
+
+/* prevent include of other system headers */
+#define	_NETINET_IP_VAR_H_	/* ip_fw_args */
+#define _IPFW2_H
+#define _SYS_MBUF_H_
+
+enum	{
+	DN_QUEUE,
+};
+
+enum	{
+	DN_SCHED_FIFO,
+	DN_SCHED_WF2QP,
+};
+
+struct dn_id {
+	int type, subtype, len, id;
+};
+struct dn_fs {
+	int par[4];	/* flowset parameters */
+
+	/* simulation entries.
+	 * 'index' is not strictly necessary
+	 * y is used for the inverse mapping ,
+	 */
+	int index;
+	int y;	/* inverse mapping */
+	int base_y;	/* inverse mapping */
+	int next_y;	/* inverse mapping */
+	int n_flows;
+	int first_flow;
+	int next_flow;	/* first_flow + n_flows */
+	/*
+	 * when generating, let 'cur' go from 0 to n_flows-1,
+	 * then point to flow first_flow + cur
+	 */
+	int	cur;
+};
+struct dn_sch {
+};
+struct dn_flow {
+	struct dn_id oid;
+	int length;
+	int len_bytes;
+	int drops;
+	uint64_t tot_bytes;
+	uint32_t flow_id;
+	struct list_head h;	/* used by the generator */
+};
+struct dn_link {
+};
+
+struct ip_fw_args {
+};
+
+struct mbuf {
+        struct {
+                int len;
+        } m_pkthdr;
+        struct mbuf *m_nextpkt;
+	int flow_id;	/* for testing, index of a flow */
+	//int flowset_id;	/* for testing, index of a flowset */
+	void *cfg;	/* config args */
+};
+
+#define MALLOC_DECLARE(x)
+#define KASSERT(x, y)	do { if (!(x)) printf y ; exit(0); } while (0)
+struct ipfw_flow_id {
+};
+
+typedef void * module_t;
+struct _md_t {
+	const char *name;
+	int (*f)(module_t, int, void *);
+	void *p;
+};
+typedef struct _md_t moduledata_t;
+#define DECLARE_MODULE(name, b, c, d)	\
+	moduledata_t *_g_##name = & b
+#define MODULE_DEPEND(a, b, c, d, e)
+
+#ifdef IPFW
+#include <dn_heap.h>
+#include <ip_dn_private.h>
+#include <dn_sched.h>
+#else
+struct dn_queue {
+        struct dn_fsk *fs;             /* parent flowset. */
+        struct dn_sch_inst *_si;	/* parent sched instance. */
+};
+struct dn_schk {
+};
+struct dn_fsk {
+	struct dn_fs fs;
+	struct dn_schk *sched;
+};
+struct dn_sch_inst {
+	struct dn_schk *sched;
+};
+struct dn_alg {
+	int type;
+	const char *name;
+	void *enqueue, *dequeue;
+	int q_datalen, si_datalen, schk_datalen;
+	int (*config)(struct dn_schk *);
+	int (*new_sched)(struct dn_sch_inst *);
+	int (*new_fsk)(struct dn_fsk *);
+        int (*new_queue)(struct dn_queue *q);
+};
+
+#endif
+
+static inline void
+mq_append(struct mq *q, struct mbuf *m)
+{
+        if (q->head == NULL)
+                q->head = m;
+        else
+                q->tail->m_nextpkt = m;
+        q->tail = m;
+        m->m_nextpkt = NULL;
+}
+
+#endif /* _DN_TEST_H */
diff --git a/sys/netinet/ipfw/test/main.c b/sys/netinet/ipfw/test/main.c
new file mode 100644
index 0000000..be9fdf5
--- /dev/null
+++ b/sys/netinet/ipfw/test/main.c
@@ -0,0 +1,636 @@
+/*
+ * $FreeBSD$
+ *
+ * Testing program for schedulers
+ *
+ * The framework include a simple controller which, at each
+ * iteration, decides whether we can enqueue and/or dequeue.
+ * Then the mainloop runs the required number of tests,
+ * keeping track of statistics.
+ */
+
+#include "dn_test.h"
+
+struct q_list {
+	struct list_head h;
+};
+
+struct cfg_s {
+	int ac;
+	char * const *av;
+
+	const char *name;
+	int loops;
+	struct timeval time;
+
+	/* running counters */
+	uint32_t	_enqueue;
+	uint32_t	drop;
+	uint32_t	pending;
+	uint32_t	dequeue;
+
+	/* generator parameters */
+	int th_min, th_max;
+	int maxburst;
+	int lmin, lmax;	/* packet len */
+	int flows;	/* number of flows */
+	int flowsets;	/* number of flowsets */
+	int wsum;	/* sum of weights of all flows */
+	int max_y;	/* max random number in the generation */
+	int cur_y, cur_fs;	/* used in generation, between 0 and max_y - 1 */
+	const char *fs_config; /* flowset config */
+	int can_dequeue;
+	int burst;	/* count of packets sent in a burst */
+	struct mbuf *tosend;	/* packet to send -- also flag to enqueue */
+
+	struct mbuf *freelist;
+
+	struct mbuf *head, *tail;	/* a simple tailq */
+
+	/* scheduler hooks */
+	int (*enq)(struct dn_sch_inst *, struct dn_queue *,
+		struct mbuf *);
+	struct mbuf * (*deq)(struct dn_sch_inst *);
+	/* size of the three fields including sched-specific areas */
+	int schk_len;
+	int q_len; /* size of a queue including sched-fields */
+	int si_len; /* size of a sch_inst including sched-fields */
+	char *q;	/* array of flow queues */
+		/* use a char* because size is variable */
+	struct dn_fsk *fs;	/* array of flowsets */
+	struct dn_sch_inst *si;
+	struct dn_schk *sched;
+
+	/* generator state */
+	int state;		/* 0 = going up, 1: going down */
+
+	/*
+	 * We keep lists for each backlog level, and always serve
+	 * the one with shortest backlog. llmask contains a bitmap
+	 * of lists, and ll are the heads of the lists. The last
+	 * entry (BACKLOG) contains all entries considered 'full'
+	 * XXX to optimize things, entry i could contain queues with
+	 * 2^{i-1}+1 .. 2^i entries.
+	 */
+#define BACKLOG	30
+	uint32_t	llmask;
+	struct list_head ll[BACKLOG + 10];
+};
+
+/* FI2Q and Q2FI converts from flow_id to dn_queue and back.
+ * We cannot easily use pointer arithmetic because it is variable size.
+  */
+#define FI2Q(c, i)	((struct dn_queue *)((c)->q + (c)->q_len * (i)))
+#define Q2FI(c, q)	(((char *)(q) - (c)->q)/(c)->q_len)
+
+int debug = 0;
+
+struct dn_parms dn_cfg;
+
+static void controller(struct cfg_s *c);
+
+/* release a packet: put the mbuf in the freelist, and the queue in
+ * the bucket.
+ */
+int
+drop(struct cfg_s *c, struct mbuf *m)
+{
+	struct dn_queue *q;
+	int i;
+
+	c->drop++;
+	q = FI2Q(c, m->flow_id);
+	i = q->ni.length; // XXX or ffs...
+
+	ND("q %p id %d current length %d", q, m->flow_id, i);
+	if (i < BACKLOG) {
+		struct list_head *h = &q->ni.h;
+		c->llmask &= ~(1<<(i+1));
+		c->llmask |= (1<<(i));
+		list_del(h);
+		list_add_tail(h, &c->ll[i]);
+	}
+	m->m_nextpkt = c->freelist;
+	c->freelist = m;
+	return 0;
+}
+
+/* dequeue returns NON-NULL when a packet is dropped */
+static int
+enqueue(struct cfg_s *c, void *_m)
+{
+	struct mbuf *m = _m;
+	if (c->enq)
+		return c->enq(c->si, FI2Q(c, m->flow_id), m);
+	if (c->head == NULL)
+		c->head = m;
+	else
+		c->tail->m_nextpkt = m;
+	c->tail = m;
+	return 0; /* default - success */
+}
+
+/* dequeue returns NON-NULL when a packet is available */
+static void *
+dequeue(struct cfg_s *c)
+{
+	struct mbuf *m;
+	if (c->deq)
+		return c->deq(c->si);
+	if ((m = c->head)) {
+		m = c->head;
+		c->head = m->m_nextpkt;
+		m->m_nextpkt = NULL;
+	}
+	return m;
+}
+
+static int
+mainloop(struct cfg_s *c)
+{
+	int i;
+	struct mbuf *m;
+
+	for (i=0; i < c->loops; i++) {
+		/* implement histeresis */
+		controller(c);
+		DX(3, "loop %d enq %d send %p rx %d",
+			i, c->_enqueue, c->tosend, c->can_dequeue);
+		if ( (m = c->tosend) ) {
+			c->_enqueue++;
+			if (enqueue(c, m)) {
+				drop(c, m);
+				ND("loop %d enqueue fail", i );
+			} else {
+				ND("enqueue ok");
+				c->pending++;
+			}
+		}
+		if (c->can_dequeue) {
+			c->dequeue++;
+			if ((m = dequeue(c))) {
+				c->pending--;
+				drop(c, m);
+				c->drop--;	/* compensate */
+			}
+		}
+	}
+	DX(1, "mainloop ends %d", i);
+	return 0;
+}
+
+int
+dump(struct cfg_s *c)
+{
+	int i;
+	struct dn_queue *q;
+
+	for (i=0; i < c->flows; i++) {
+		q = FI2Q(c, i);
+		DX(1, "queue %4d tot %10lld", i, q->ni.tot_bytes);
+	}
+	DX(1, "done %d loops\n", c->loops);
+	return 0;
+}
+
+/* interpret a number in human form */
+static long
+getnum(const char *s, char **next, const char *key)
+{
+	char *end = NULL;
+	long l;
+
+	if (next)	/* default */
+		*next = NULL;
+	if (s && *s) {
+		DX(3, "token is <%s> %s", s, key ? key : "-");
+		l = strtol(s, &end, 0);
+	} else {
+		DX(3, "empty string");
+		l = -1;
+	}
+	if (l < 0) {
+		DX(2, "invalid %s for %s", s ? s : "NULL", (key ? key : "") );
+		return 0;	// invalid 
+	}
+	if (!end || !*end)
+		return l;
+	if (*end == 'n')
+		l = -l;	/* multiply by n */
+	else if (*end == 'K')
+		l = l*1000;
+	else if (*end == 'M')
+		l = l*1000000;
+	else if (*end == 'k')
+		l = l*1024;
+	else if (*end == 'm')
+		l = l*1024*1024;
+	else if (*end == 'w')
+		;
+	else {/* not recognized */
+		D("suffix %s for %s, next %p", end, key, next);
+		end--;
+	}
+	end++;
+	DX(3, "suffix now %s for %s, next %p", end, key, next);
+	if (next && *end) {
+		DX(3, "setting next to %s for %s", end, key);
+		*next = end;
+	}
+	return l;
+}
+
+/*
+ * flowsets are a comma-separated list of
+ *     weight:maxlen:flows
+ * indicating how many flows are hooked to that fs.
+ * Both weight and range can be min-max-steps.
+ * In a first pass we just count the number of flowsets and flows,
+ * in a second pass we complete the setup.
+ */
+static void
+parse_flowsets(struct cfg_s *c, const char *fs, int pass)
+{
+	char *s, *cur, *next;
+	int n_flows = 0, n_fs = 0, wsum = 0;
+	int i, j;
+	struct dn_fs *prev = NULL;
+
+	DX(3, "--- pass %d flows %d flowsets %d", pass, c->flows, c->flowsets);
+	if (pass == 0)
+		c->fs_config = fs;
+	s = c->fs_config ? strdup(c->fs_config) : NULL;
+	if (s == NULL) {
+		if (pass == 0)
+			D("no fsconfig");
+		return;
+	}
+	for (next = s; (cur = strsep(&next, ","));) {
+		char *p = NULL;
+		int w, w_h, w_steps, wi;
+		int len, len_h, l_steps, li;
+		int flows;
+
+		w = getnum(strsep(&cur, ":"), &p, "weight");
+		if (w <= 0)
+			w = 1;
+		w_h = p ? getnum(p+1, &p, "weight_max") : w;
+		w_steps = p ? getnum(p+1, &p, "w_steps") : (w_h == w ?1:2);
+		len = getnum(strsep(&cur, ":"), &p, "len");
+		if (len <= 0)
+			len = 1000;
+		len_h = p ? getnum(p+1, &p, "len_max") : len;
+		l_steps = p ? getnum(p+1, &p, "l_steps") : (len_h == len ? 1 : 2);
+		flows = getnum(strsep(&cur, ":"), NULL, "flows");
+		if (flows == 0)
+			flows = 1;
+		DX(4, "weight %d..%d (%d) len %d..%d (%d) flows %d",
+			w, w_h, w_steps, len, len_h, l_steps, flows);
+		if (w == 0 || w_h < w || len == 0 || len_h < len ||
+				flows == 0) {
+			DX(4,"wrong parameters %s", fs);
+			return;
+		}
+		n_flows += flows * w_steps * l_steps;
+		for (i = 0; i < w_steps; i++) {
+			wi = w + ((w_h - w)* i)/(w_steps == 1 ? 1 : (w_steps-1));
+			for (j = 0; j < l_steps; j++, n_fs++) {
+				struct dn_fs *fs = &c->fs[n_fs].fs; // tentative
+				int x;
+
+				li = len + ((len_h - len)* j)/(l_steps == 1 ? 1 : (l_steps-1));
+				x = (wi*2048)/li;
+				DX(3, "----- fs %4d weight %4d lmax %4d X %4d flows %d",
+					n_fs, wi, li, x, flows);
+				if (pass == 0)
+					continue;
+				if (c->fs == NULL || c->flowsets <= n_fs) {
+					D("error in number of flowsets");
+					return;
+				}
+				wsum += wi * flows;
+				fs->par[0] = wi;
+				fs->par[1] = li;
+				fs->index = n_fs;
+				fs->n_flows = flows;
+				fs->cur = fs->first_flow = prev==NULL ? 0 : prev->next_flow;
+				fs->next_flow = fs->first_flow + fs->n_flows;
+				fs->y = x * flows;
+				fs->base_y = (prev == NULL) ? 0 : prev->next_y;
+				fs->next_y = fs->base_y + fs->y;
+				prev = fs;
+			}
+		}
+	}
+	c->max_y = prev ? prev->base_y + prev->y : 0;
+	c->flows = n_flows;
+	c->flowsets = n_fs;
+	c->wsum = wsum;
+	if (pass == 0)
+		return;
+
+	/* now link all flows to their parent flowsets */
+	DX(1,"%d flows on %d flowsets max_y %d", c->flows, c->flowsets, c->max_y);
+	for (i=0; i < c->flowsets; i++) {
+		struct dn_fs *fs = &c->fs[i].fs;
+		DX(1, "fs %3d w %5d l %4d flow %5d .. %5d y %6d .. %6d",
+			i, fs->par[0], fs->par[1],
+			fs->first_flow, fs->next_flow,
+			fs->base_y, fs->next_y);
+		for (j = fs->first_flow; j < fs->next_flow; j++) {
+			struct dn_queue *q = FI2Q(c, j);
+			q->fs = &c->fs[i];
+		}
+	}
+}
+
+static int
+init(struct cfg_s *c)
+{
+	int i;
+	int ac = c->ac;
+	char * const *av = c->av;
+
+	c->si_len = sizeof(struct dn_sch_inst);
+	c->q_len = sizeof(struct dn_queue);
+	moduledata_t *mod = NULL;
+	struct dn_alg *p = NULL;
+
+	c->th_min = 0;
+	c->th_max = -20;/* 20 packets per flow */
+	c->lmin = c->lmax = 1280;	/* packet len */
+	c->flows = 1;
+	c->flowsets = 1;
+	c->name = "null";
+	ac--; av++;
+	while (ac > 1) {
+		if (!strcmp(*av, "-n")) {
+			c->loops = getnum(av[1], NULL, av[0]);
+		} else if (!strcmp(*av, "-d")) {
+			debug = atoi(av[1]);
+		} else if (!strcmp(*av, "-alg")) {
+			extern moduledata_t *_g_dn_fifo;
+			extern moduledata_t *_g_dn_wf2qp;
+			extern moduledata_t *_g_dn_rr;
+			extern moduledata_t *_g_dn_qfq;
+#ifdef WITH_KPS
+			extern moduledata_t *_g_dn_kps;
+#endif
+			if (!strcmp(av[1], "rr"))
+				mod = _g_dn_rr;
+			else if (!strcmp(av[1], "wf2qp"))
+				mod = _g_dn_wf2qp;
+			else if (!strcmp(av[1], "fifo"))
+				mod = _g_dn_fifo;
+			else if (!strcmp(av[1], "qfq"))
+				mod = _g_dn_qfq;
+#ifdef WITH_KPS
+			else if (!strcmp(av[1], "kps"))
+				mod = _g_dn_kps;
+#endif
+			else
+				mod = NULL;
+			c->name = mod ? mod->name : "NULL";
+			DX(3, "using scheduler %s", c->name);
+		} else if (!strcmp(*av, "-len")) {
+			c->lmin = getnum(av[1], NULL, av[0]);
+			c->lmax = c->lmin;
+			DX(3, "setting max to %d", c->th_max);
+		} else if (!strcmp(*av, "-burst")) {
+			c->maxburst = getnum(av[1], NULL, av[0]);
+			DX(3, "setting max to %d", c->th_max);
+		} else if (!strcmp(*av, "-qmax")) {
+			c->th_max = getnum(av[1], NULL, av[0]);
+			DX(3, "setting max to %d", c->th_max);
+		} else if (!strcmp(*av, "-qmin")) {
+			c->th_min = getnum(av[1], NULL, av[0]);
+			DX(3, "setting min to %d", c->th_min);
+		} else if (!strcmp(*av, "-flows")) {
+			c->flows = getnum(av[1], NULL, av[0]);
+			DX(3, "setting flows to %d", c->flows);
+		} else if (!strcmp(*av, "-flowsets")) {
+			parse_flowsets(c, av[1], 0);
+			DX(3, "setting flowsets to %d", c->flowsets);
+		} else {
+			D("option %s not recognised, ignore", *av);
+		}
+		ac -= 2; av += 2;
+	}
+	if (c->maxburst <= 0)
+		c->maxburst = 1;
+	if (c->loops <= 0)
+		c->loops = 1;
+	if (c->flows <= 0)
+		c->flows = 1;
+	if (c->flowsets <= 0)
+		c->flowsets = 1;
+	if (c->lmin <= 0)
+		c->lmin = 1;
+	if (c->lmax <= 0)
+		c->lmax = 1;
+	/* multiply by N */
+	if (c->th_min < 0)
+		c->th_min = c->flows * -c->th_min;
+	if (c->th_max < 0)
+		c->th_max = c->flows * -c->th_max;
+	if (c->th_max <= c->th_min)
+		c->th_max = c->th_min + 1;
+	if (mod) {
+		p = mod->p;
+		DX(3, "using module %s f %p p %p", mod->name, mod->f, mod->p);
+		DX(3, "modname %s ty %d", p->name, p->type);
+		c->enq = p->enqueue;
+		c->deq = p->dequeue;
+		c->si_len += p->si_datalen;
+		c->q_len += p->q_datalen;
+		c->schk_len += p->schk_datalen;
+	}
+	/* allocate queues, flowsets and one scheduler */
+	c->q = calloc(c->flows, c->q_len);
+	c->fs = calloc(c->flowsets, sizeof(struct dn_fsk));
+	c->si = calloc(1, c->si_len);
+	c->sched = calloc(c->flows, c->schk_len);
+	if (c->q == NULL || c->fs == NULL) {
+		D("error allocating memory for flows");
+		exit(1);
+	}
+	c->si->sched = c->sched;
+	if (p) {
+		if (p->config)
+			p->config(c->sched);
+		if (p->new_sched)
+			p->new_sched(c->si);
+	}
+	/* parse_flowsets links queues to their flowsets */
+	parse_flowsets(c, av[1], 1);
+	/* complete the work calling new_fsk */
+	for (i = 0; i < c->flowsets; i++) {
+		if (c->fs[i].fs.par[1] == 0)
+			c->fs[i].fs.par[1] = 1000;	/* default pkt len */
+		c->fs[i].sched = c->sched;
+		if (p && p->new_fsk)
+			p->new_fsk(&c->fs[i]);
+	}
+
+	/* initialize the lists for the generator, and put
+	 * all flows in the list for backlog = 0
+	 */
+	for (i=0; i <= BACKLOG+5; i++)
+		INIT_LIST_HEAD(&c->ll[i]);
+
+	for (i = 0; i < c->flows; i++) {
+		struct dn_queue *q = FI2Q(c, i);
+		if (q->fs == NULL)
+			q->fs = &c->fs[0]; /* XXX */
+		q->_si = c->si;
+		if (p && p->new_queue)
+			p->new_queue(q);
+		INIT_LIST_HEAD(&q->ni.h);
+		list_add_tail(&q->ni.h, &c->ll[0]);
+	}
+	c->llmask = 1;
+	return 0;
+}
+
+
+int
+main(int ac, char *av[])
+{
+	struct cfg_s c;
+	struct timeval end;
+	double ll;
+	int i;
+	char msg[40];
+
+	bzero(&c, sizeof(c));
+	c.ac = ac;
+	c.av = av;
+	init(&c);
+	gettimeofday(&c.time, NULL);
+	mainloop(&c);
+	gettimeofday(&end, NULL);
+	end.tv_sec -= c.time.tv_sec;
+	end.tv_usec -= c.time.tv_usec;
+	if (end.tv_usec < 0) {
+		end.tv_usec += 1000000;
+		end.tv_sec--;
+	}
+	c.time = end;
+	ll = end.tv_sec*1000000 + end.tv_usec;
+	ll *= 1000;	/* convert to nanoseconds */
+	ll /= c._enqueue;
+	sprintf(msg, "1::%d", c.flows);
+	D("%-8s n %d %d time %d.%06d %8.3f qlen %d %d flows %s drops %d",
+		c.name, c._enqueue, c.loops,
+		(int)c.time.tv_sec, (int)c.time.tv_usec, ll,
+		c.th_min, c.th_max,
+		c.fs_config ? c.fs_config : msg, c.drop);
+	dump(&c);
+	DX(1, "done ac %d av %p", ac, av);
+	for (i=0; i < ac; i++)
+		DX(1, "arg %d %s", i, av[i]);
+	return 0;
+}
+
+/*
+ * The controller decides whether in this iteration we should send
+ * (the packet is in c->tosend) and/or receive (flag c->can_dequeue)
+ */
+static void
+controller(struct cfg_s *c)
+{
+	struct mbuf *m;
+	struct dn_fs *fs;
+	int flow_id;
+
+	/* histeresis between max and min */
+	if (c->state == 0 && c->pending >= c->th_max)
+		c->state = 1;
+	else if (c->state == 1 && c->pending <= c->th_min)
+		c->state = 0;
+	ND(1, "state %d pending %2d", c->state, c->pending);
+	c->can_dequeue = c->state;
+	c->tosend = NULL;
+	if (c->state)
+		return;
+
+    if (1) {
+	int i;
+	struct dn_queue *q;
+	struct list_head *h;
+
+	i = ffs(c->llmask) - 1;
+	if (i < 0) {
+		DX(2, "no candidate");
+		c->can_dequeue = 1;
+		return;
+	}
+	h = &c->ll[i];
+	ND(1, "backlog %d p %p prev %p next %p", i, h, h->prev, h->next);
+	q = list_first_entry(h, struct dn_queue, ni.h);
+	list_del(&q->ni.h);
+	flow_id = Q2FI(c, q);
+	DX(2, "extracted flow %p %d backlog %d", q, flow_id, i);
+	if (list_empty(h)) {
+		ND(2, "backlog %d empty", i);
+		c->llmask &= ~(1<<i);
+	}
+	ND(1, "before %d p %p prev %p next %p", i+1, h+1, h[1].prev, h[1].next);
+	list_add_tail(&q->ni.h, h+1);
+	ND(1, " after %d p %p prev %p next %p", i+1, h+1, h[1].prev, h[1].next);
+	if (i < BACKLOG) {
+		ND(2, "backlog %d full", i+1);
+		c->llmask |= 1<<(1+i);
+	}
+	fs = &q->fs->fs;
+	c->cur_fs = q->fs - c->fs;
+	fs->cur = flow_id;
+    } else {
+	/* XXX this does not work ? */
+	/* now decide whom to send the packet, and the length */
+	/* lookup in the flow table */
+	if (c->cur_y >= c->max_y) {	/* handle wraparound */
+		c->cur_y = 0;
+		c->cur_fs = 0;
+	}
+	fs = &c->fs[c->cur_fs].fs;
+	flow_id = fs->cur++;
+	if (fs->cur >= fs->next_flow)
+		fs->cur = fs->first_flow;
+	c->cur_y++;
+	if (c->cur_y >= fs->next_y)
+		c->cur_fs++;
+    }
+
+	/* construct a packet */
+	if (c->freelist) {
+		m = c->tosend = c->freelist;
+		c->freelist = c->freelist->m_nextpkt;
+	} else {
+		m = c->tosend = calloc(1, sizeof(struct mbuf));
+	}
+	if (m == NULL)
+		return;
+
+	m->cfg = c;
+	m->m_nextpkt = NULL;
+	m->m_pkthdr.len = fs->par[1]; // XXX maxlen
+	m->flow_id = flow_id;
+
+	ND(2,"y %6d flow %5d fs %3d weight %4d len %4d",
+		c->cur_y, m->flow_id, c->cur_fs,
+		fs->par[0], m->m_pkthdr.len);
+
+}
+
+/*
+Packet allocation:
+to achieve a distribution that matches weights, for each X=w/lmax class
+we should generate a number of packets proportional to Y = X times the number
+of flows in the class.
+So we construct an array with the cumulative distribution of Y's,
+and use it to identify the flow via inverse mapping (if the Y's are
+not too many we can use an array for the lookup). In practice,
+each flow will have X entries [virtually] pointing to it.
+
+*/
diff --git a/sys/netinet/ipfw/test/mylist.h b/sys/netinet/ipfw/test/mylist.h
new file mode 100644
index 0000000..ebc7d11
--- /dev/null
+++ b/sys/netinet/ipfw/test/mylist.h
@@ -0,0 +1,49 @@
+/*
+ * $FreeBSD$
+ *
+ * linux-like bidirectional lists
+ */
+
+#ifndef _MYLIST_H
+#define _MYLIST_H
+struct list_head {
+        struct list_head *prev, *next;
+};
+
+#define INIT_LIST_HEAD(l) do {  (l)->prev = (l)->next = (l); } while (0)
+#define list_empty(l)   ( (l)->next == l )
+static inline void
+__list_add(struct list_head *new, struct list_head *prev,
+        struct list_head *next)
+{
+        next->prev = new;
+        new->next = next;
+        new->prev = prev;
+        prev->next = new;
+}
+ 
+static inline void
+list_add_tail(struct list_head *new, struct list_head *head)
+{
+        __list_add(new, head->prev, head);
+}
+
+#define list_first_entry(pL, ty, member)        \
+        (ty *)((char *)((pL)->next) - offsetof(ty, member))
+
+static inline void
+__list_del(struct list_head *prev, struct list_head *next)
+{
+        next->prev = prev;
+        prev->next = next;
+}
+
+static inline void
+list_del(struct list_head *entry)
+{
+	ND("called on %p", entry);
+        __list_del(entry->prev, entry->next);
+        entry->next = entry->prev = NULL;
+}
+
+#endif /* _MYLIST_H */
diff --git a/sys/netinet/ipfw/test/test_dn_heap.c b/sys/netinet/ipfw/test/test_dn_heap.c
new file mode 100644
index 0000000..d460cf2
--- /dev/null
+++ b/sys/netinet/ipfw/test/test_dn_heap.c
@@ -0,0 +1,162 @@
+/*-
+ * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Userland code for testing binary heaps and hash tables
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+
+#include <stdio.h>
+#include <strings.h>
+#include <stdlib.h>
+
+#include  "dn_heap.h"
+#define log(x, arg...)	fprintf(stderr, ## arg)
+#define panic(x...)	fprintf(stderr, ## x), exit(1)
+
+#include <string.h>
+
+struct x {
+	struct x *ht_link;
+	char buf[0];
+};
+
+uint32_t hf(uintptr_t key, int flags, void *arg)
+{
+	return (flags & DNHT_KEY_IS_OBJ) ?
+		((struct x *)key)->buf[0] : *(char *)key;
+}
+
+int matchf(void *obj, uintptr_t key, int flags, void *arg)
+{
+	char *s = (flags & DNHT_KEY_IS_OBJ) ?
+		((struct x *)key)->buf : (char *)key;
+	return (strcmp(((struct x *)obj)->buf, s) == 0);
+}
+
+void *newfn(uintptr_t key, int flags, void *arg)
+{
+	char *s = (char *)key;
+	struct x *p = malloc(sizeof(*p) + 1 + strlen(s));
+	if (p)
+		strcpy(p->buf, s);
+	return p;
+}
+
+char *strings[] = {
+	"undici", "unico", "doppio", "devoto",
+	"uno", "due", "tre", "quattro", "cinque", "sei",
+	"uno", "due", "tre", "quattro", "cinque", "sei",
+	NULL,
+};
+
+int doprint(void *_x, void *arg)
+{
+	struct x *x = _x;
+	printf("found element <%s>\n", x->buf);
+	return (int)arg;
+}
+
+static void
+test_hash()
+{
+	char **p;
+	struct dn_ht *h;
+	uintptr_t x = 0;
+	uintptr_t x1 = 0;
+
+	/* first, find and allocate */
+	h = dn_ht_init(NULL, 10, 0, hf, matchf, newfn);
+
+	for (p = strings; *p; p++) {
+		dn_ht_find(h, (uintptr_t)*p, DNHT_INSERT, NULL);
+	}
+	dn_ht_scan(h, doprint, 0);
+	printf("/* second -- find without allocate */\n");
+	h = dn_ht_init(NULL, 10, 0, hf, matchf, NULL);
+	for (p = strings; *p; p++) {
+		void **y = newfn((uintptr_t)*p, 0, NULL);
+		if (x == 0)
+			x = (uintptr_t)y;
+		else {
+			if (x1 == 0)
+				x1 = (uintptr_t)*p;
+		}
+		dn_ht_find(h, (uintptr_t)y, DNHT_INSERT | DNHT_KEY_IS_OBJ, NULL);
+	}
+	dn_ht_scan(h, doprint, 0);
+	printf("remove %p gives %p\n", (void *)x,
+		dn_ht_find(h, x, DNHT_KEY_IS_OBJ | DNHT_REMOVE, NULL));
+	printf("remove %p gives %p\n", (void *)x,
+		dn_ht_find(h, x, DNHT_KEY_IS_OBJ | DNHT_REMOVE, NULL));
+	printf("remove %p gives %p\n", (void *)x,
+		dn_ht_find(h, x1, DNHT_REMOVE, NULL));
+	printf("remove %p gives %p\n", (void *)x,
+		dn_ht_find(h, x1, DNHT_REMOVE, NULL));
+	dn_ht_scan(h, doprint, 0);
+}
+
+int
+main(int argc, char *argv[])
+{
+	struct dn_heap h;
+	int i, n, n2, n3;
+
+	test_hash();
+	return 0;
+
+	/* n = elements, n2 = cycles */
+	n = (argc > 1) ? atoi(argv[1]) : 0;
+	if (n <= 0 || n > 1000000)
+		n = 100;
+	n2 = (argc > 2) ? atoi(argv[2]) : 0;
+	if (n2 <= 0)
+		n = 1000000;
+	n3 = (argc > 3) ? atoi(argv[3]) : 0;
+	bzero(&h, sizeof(h));
+	heap_init(&h, n, -1);
+	while (n2-- > 0) {
+		uint64_t prevk = 0;
+		for (i=0; i < n; i++)
+			heap_insert(&h, n3 ? n-i: random(), (void *)(100+i));
+		
+		for (i=0; h.elements > 0; i++) {
+			uint64_t k = h.p[0].key;
+			if (k < prevk)
+				panic("wrong sequence\n");
+			prevk = k;
+			if (0)
+			printf("%d key %llu, val %p\n",
+				i, h.p[0].key, h.p[0].object);
+			heap_extract(&h, NULL);
+		}
+	}
+	return 0;
+}
diff --git a/sys/netinet/ipfw/test/test_dn_sched.c b/sys/netinet/ipfw/test/test_dn_sched.c
new file mode 100644
index 0000000..667c42e
--- /dev/null
+++ b/sys/netinet/ipfw/test/test_dn_sched.c
@@ -0,0 +1,76 @@
+/*
+ * $FreeBSD$
+ *
+ * library functions for userland testing of dummynet schedulers
+ */
+
+#include "dn_test.h"
+
+void
+m_freem(struct mbuf *m)
+{
+	printf("free %p\n", m);
+}
+
+int
+dn_sched_modevent(module_t mod, int cmd, void *arg)
+{
+	return 0;
+}
+
+void
+dn_free_pkts(struct mbuf *m)
+{
+	struct mbuf *x;
+	while ( (x = m) ) {
+		m = m->m_nextpkt;
+		m_freem(x);
+	}
+}
+		
+int
+dn_delete_queue(void *_q, void *do_free)
+{
+	struct dn_queue *q = _q;
+        if (q->mq.head)
+                dn_free_pkts(q->mq.head);
+        free(q);
+        return 0;
+}
+
+/*
+ * This is a simplified function for testing purposes, which does
+ * not implement statistics or random loss.
+ * Enqueue a packet in q, subject to space and queue management policy
+ * (whose parameters are in q->fs).
+ * Update stats for the queue and the scheduler.
+ * Return 0 on success, 1 on drop. The packet is consumed anyways.
+ */
+int
+dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop)
+{
+        if (drop)
+                goto drop;
+        if (q->ni.length >= 200)
+                goto drop;
+        mq_append(&q->mq, m);
+        q->ni.length++;
+        q->ni.tot_bytes += m->m_pkthdr.len;
+        return 0;
+
+drop:
+        q->ni.drops++;
+        return 1;
+}
+
+int
+ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg)
+{
+        if (*v < lo) {
+                *v = dflt;
+        } else if (*v > hi) {
+                *v = hi;
+        }
+        return *v;
+}
+
author	luigi <luigi@FreeBSD.org>	2010-03-02 17:40:48 +0000
committer	luigi <luigi@FreeBSD.org>	2010-03-02 17:40:48 +0000
commit	5ceeac4aa882074e2b1e42fbc20c79ebbd24f4c5 (patch)
tree	225c58cb171a7858ca9880ee492dd9f3d281f623
parent	a8f766213ad48d0cb09214bdf6ac6a3b74221b03 (diff)
download	FreeBSD-src-5ceeac4aa882074e2b1e42fbc20c79ebbd24f4c5.zip FreeBSD-src-5ceeac4aa882074e2b1e42fbc20c79ebbd24f4c5.tar.gz