summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author luigi <luigi@FreeBSD.org> 2010-03-23 09:58:59 +0000
committer luigi <luigi@FreeBSD.org> 2010-03-23 09:58:59 +0000
commit 153fa4f49e7ae4d39851638cfb970d383c0f8b91 (patch)
tree 003b5213e062bbdc0758d00a9edc114e010723bd
parent 237d6f0e4a51d1e1fc9572f2c25fb79712126d1f (diff)
downloadFreeBSD-src-153fa4f49e7ae4d39851638cfb970d383c0f8b91.zip
FreeBSD-src-153fa4f49e7ae4d39851638cfb970d383c0f8b91.tar.gz
MFC of a large number of ipfw and dummynet fixes and enhancements
done in CURRENT over the last 4 months.
HEAD and RELENG_8 are almost in sync now for ipfw, dummynet,
the pfil hooks and related components.

Among the most noticeable changes:
- r200855 more efficient lookup of skipto rules, and remove O(N)
  blocks from critical sections in the kernel;
- r204591 large restructuring of the dummynet module, with support
  for multiple scheduling algorithms (4 available so far).
See the original commit logs for details.

Changes in the kernel/userland ABI should be harmless because the
kernel is able to understand previous requests from RELENG_8 and
RELENG_7. For this reason, this changeset would be applicable
to RELENG_7 as well, but I am not sure if it is worthwhile.
-rw-r--r--sbin/ipfw/Makefile1
-rw-r--r--sbin/ipfw/altq.c1
-rw-r--r--sbin/ipfw/dummynet.c965
-rw-r--r--sbin/ipfw/ipfw.8264
-rw-r--r--sbin/ipfw/ipfw2.c365
-rw-r--r--sbin/ipfw/ipfw2.h31
-rw-r--r--sbin/ipfw/main.c158
-rw-r--r--sys/conf/files11
-rw-r--r--sys/net/if_bridge.c34
-rw-r--r--sys/net/if_ethersubr.c32
-rw-r--r--sys/net/radix.c58
-rw-r--r--sys/net/radix.h3
-rw-r--r--sys/net/route.c9
-rw-r--r--sys/netgraph/ng_ipfw.c51
-rw-r--r--sys/netgraph/ng_ipfw.h24
-rw-r--r--sys/netinet/in.h26
-rw-r--r--sys/netinet/ip_divert.c90
-rw-r--r--sys/netinet/ip_divert.h63
-rw-r--r--sys/netinet/ip_dummynet.h485
-rw-r--r--sys/netinet/ip_fw.h158
-rw-r--r--sys/netinet/ip_var.h47
-rw-r--r--sys/netinet/ipfw/dn_heap.c550
-rw-r--r--sys/netinet/ipfw/dn_heap.h191
-rw-r--r--sys/netinet/ipfw/dn_sched.h189
-rw-r--r--sys/netinet/ipfw/dn_sched_fifo.c120
-rw-r--r--sys/netinet/ipfw/dn_sched_qfq.c864
-rw-r--r--sys/netinet/ipfw/dn_sched_rr.c307
-rw-r--r--sys/netinet/ipfw/dn_sched_wf2q.c373
-rw-r--r--sys/netinet/ipfw/dummynet.txt860
-rw-r--r--sys/netinet/ipfw/ip_dn_glue.c845
-rw-r--r--sys/netinet/ipfw/ip_dn_io.c788
-rw-r--r--sys/netinet/ipfw/ip_dn_private.h402
-rw-r--r--sys/netinet/ipfw/ip_dummynet.c3880
-rw-r--r--sys/netinet/ipfw/ip_fw2.c3606
-rw-r--r--sys/netinet/ipfw/ip_fw_dynamic.c1244
-rw-r--r--sys/netinet/ipfw/ip_fw_log.c435
-rw-r--r--sys/netinet/ipfw/ip_fw_nat.c491
-rw-r--r--sys/netinet/ipfw/ip_fw_pfil.c568
-rw-r--r--sys/netinet/ipfw/ip_fw_private.h301
-rw-r--r--sys/netinet/ipfw/ip_fw_sockopt.c1287
-rw-r--r--sys/netinet/ipfw/ip_fw_table.c286
-rw-r--r--sys/netinet/ipfw/test/Makefile50
-rw-r--r--sys/netinet/ipfw/test/dn_test.h175
-rw-r--r--sys/netinet/ipfw/test/main.c636
-rw-r--r--sys/netinet/ipfw/test/mylist.h49
-rw-r--r--sys/netinet/ipfw/test/test_dn_heap.c162
-rw-r--r--sys/netinet/ipfw/test/test_dn_sched.c89
-rw-r--r--sys/netinet/raw_ip.c10
48 files changed, 14723 insertions, 6911 deletions
diff --git a/sbin/ipfw/Makefile b/sbin/ipfw/Makefile
index c09ebca..b25f38c 100644
--- a/sbin/ipfw/Makefile
+++ b/sbin/ipfw/Makefile
@@ -3,6 +3,7 @@
PROG= ipfw
SRCS= ipfw2.c dummynet.c ipv6.c main.c nat.c altq.c
WARNS?= 2
+DPADD= ${LIBUTIL}
LDADD= -lutil
MAN= ipfw.8
diff --git a/sbin/ipfw/altq.c b/sbin/ipfw/altq.c
index b00a1e0..8cf19e5 100644
--- a/sbin/ipfw/altq.c
+++ b/sbin/ipfw/altq.c
@@ -39,6 +39,7 @@
#include <net/if.h> /* IFNAMSIZ */
#include <net/pfvar.h>
+#include <netinet/in.h> /* in_addr */
#include <netinet/ip_fw.h>
/*
diff --git a/sbin/ipfw/dummynet.c b/sbin/ipfw/dummynet.c
index 9e68e65..eb6547a 100644
--- a/sbin/ipfw/dummynet.c
+++ b/sbin/ipfw/dummynet.c
@@ -1,10 +1,5 @@
/*
- * Copyright (c) 2002-2003 Luigi Rizzo
- * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp
- * Copyright (c) 1994 Ugen J.S.Antsilevich
- *
- * Idea and grammar partially left from:
- * Copyright (c) 1993 Daniel Boulet
+ * Copyright (c) 2002-2003,2010 Luigi Rizzo
*
* Redistribution and use in source forms, with and without modification,
* are permitted provided that this entire comment appears intact.
@@ -15,8 +10,6 @@
*
* This software is provided ``AS IS'' without any warranties of any kind.
*
- * NEW command line interface for IP firewall facility
- *
* $FreeBSD$
*
* dummynet support
@@ -24,7 +17,6 @@
#include <sys/types.h>
#include <sys/socket.h>
-#include <sys/queue.h>
/* XXX there are several sysctl leftover here */
#include <sys/sysctl.h>
@@ -46,6 +38,7 @@
#include <netinet/ip_dummynet.h>
#include <arpa/inet.h> /* inet_ntoa */
+
static struct _s_x dummynet_params[] = {
{ "plr", TOK_PLR },
{ "noerror", TOK_NOERROR },
@@ -56,27 +49,59 @@ static struct _s_x dummynet_params[] = {
{ "src-port", TOK_SRCPORT },
{ "proto", TOK_PROTO },
{ "weight", TOK_WEIGHT },
+ { "lmax", TOK_LMAX },
+ { "maxlen", TOK_LMAX },
{ "all", TOK_ALL },
- { "mask", TOK_MASK },
+ { "mask", TOK_MASK }, /* alias for both */
+ { "sched_mask", TOK_SCHED_MASK },
+ { "flow_mask", TOK_FLOW_MASK },
{ "droptail", TOK_DROPTAIL },
{ "red", TOK_RED },
{ "gred", TOK_GRED },
{ "bw", TOK_BW },
{ "bandwidth", TOK_BW },
{ "delay", TOK_DELAY },
+ { "link", TOK_LINK },
{ "pipe", TOK_PIPE },
{ "queue", TOK_QUEUE },
+ { "flowset", TOK_FLOWSET },
+ { "sched", TOK_SCHED },
+ { "pri", TOK_PRI },
+ { "priority", TOK_PRI },
+ { "type", TOK_TYPE },
{ "flow-id", TOK_FLOWID},
{ "dst-ipv6", TOK_DSTIP6},
{ "dst-ip6", TOK_DSTIP6},
{ "src-ipv6", TOK_SRCIP6},
{ "src-ip6", TOK_SRCIP6},
- { "profile", TOK_PIPE_PROFILE},
+ { "profile", TOK_PROFILE},
{ "burst", TOK_BURST},
{ "dummynet-params", TOK_NULL },
{ NULL, 0 } /* terminator */
};
+#define O_NEXT(p, len) ((void *)((char *)p + len))
+
+static void
+oid_fill(struct dn_id *oid, int len, int type, uintptr_t id)
+{
+ oid->len = len;
+ oid->type = type;
+ oid->subtype = 0;
+ oid->id = id;
+}
+
+/* make room in the buffer and move the pointer forward */
+static void *
+o_next(struct dn_id **o, int len, int type)
+{
+ struct dn_id *ret = *o;
+ oid_fill(ret, len, type, 0);
+ *o = O_NEXT(*o, len);
+ return ret;
+}
+
+#if 0
static int
sort_q(void *arg, const void *pa, const void *pb)
{
@@ -108,117 +133,84 @@ sort_q(void *arg, const void *pa, const void *pb)
res = 1;
return (int)(rev ? res : -res);
}
+#endif
+/* print a mask and header for the subsequent list of flows */
static void
-list_queues(struct dn_flow_set *fs, struct dn_flow_queue *q)
+print_mask(struct ipfw_flow_id *id)
+{
+ if (!IS_IP6_FLOW_ID(id)) {
+ printf(" "
+ "mask: %s 0x%02x 0x%08x/0x%04x -> 0x%08x/0x%04x\n",
+ id->extra ? "queue," : "",
+ id->proto,
+ id->src_ip, id->src_port,
+ id->dst_ip, id->dst_port);
+
+ printf("BKT Prot ___Source IP/port____ "
+ "____Dest. IP/port____ "
+ "Tot_pkt/bytes Pkt/Byte Drp\n");
+ } else {
+ char buf[255];
+ printf("\n mask: %sproto: 0x%02x, flow_id: 0x%08x, ",
+ id->extra ? "queue," : "",
+ id->proto, id->flow_id6);
+ inet_ntop(AF_INET6, &(id->src_ip6), buf, sizeof(buf));
+ printf("%s/0x%04x -> ", buf, id->src_port);
+ inet_ntop(AF_INET6, &(id->dst_ip6), buf, sizeof(buf));
+ printf("%s/0x%04x\n", buf, id->dst_port);
+
+ printf("BKT ___Prot___ _flow-id_ "
+ "______________Source IPv6/port_______________ "
+ "_______________Dest. IPv6/port_______________ "
+ "Tot_pkt/bytes Pkt/Byte Drp\n");
+ }
+}
+
+static void
+list_flow(struct dn_flow *ni)
{
- int l;
- int index_printed, indexes = 0;
char buff[255];
struct protoent *pe;
+ struct in_addr ina;
+ struct ipfw_flow_id *id = &ni->fid;
- if (fs->rq_elements == 0)
- return;
-
- if (co.do_sort != 0)
- qsort_r(q, fs->rq_elements, sizeof *q, NULL, sort_q);
-
- /* Print IPv4 flows */
- index_printed = 0;
- for (l = 0; l < fs->rq_elements; l++) {
- struct in_addr ina;
-
+ pe = getprotobynumber(id->proto);
/* XXX: Should check for IPv4 flows */
- if (IS_IP6_FLOW_ID(&(q[l].id)))
- continue;
-
- if (!index_printed) {
- index_printed = 1;
- if (indexes > 0) /* currently a no-op */
- printf("\n");
- indexes++;
- printf(" "
- "mask: 0x%02x 0x%08x/0x%04x -> 0x%08x/0x%04x\n",
- fs->flow_mask.proto,
- fs->flow_mask.src_ip, fs->flow_mask.src_port,
- fs->flow_mask.dst_ip, fs->flow_mask.dst_port);
-
- printf("BKT Prot ___Source IP/port____ "
- "____Dest. IP/port____ "
- "Tot_pkt/bytes Pkt/Byte Drp\n");
- }
-
- printf("%3d ", q[l].hash_slot);
- pe = getprotobynumber(q[l].id.proto);
+ printf("%3u%c", (ni->oid.id) & 0xff,
+ id->extra ? '*' : ' ');
+ if (!IS_IP6_FLOW_ID(id)) {
if (pe)
printf("%-4s ", pe->p_name);
else
- printf("%4u ", q[l].id.proto);
- ina.s_addr = htonl(q[l].id.src_ip);
+ printf("%4u ", id->proto);
+ ina.s_addr = htonl(id->src_ip);
printf("%15s/%-5d ",
- inet_ntoa(ina), q[l].id.src_port);
- ina.s_addr = htonl(q[l].id.dst_ip);
+ inet_ntoa(ina), id->src_port);
+ ina.s_addr = htonl(id->dst_ip);
printf("%15s/%-5d ",
- inet_ntoa(ina), q[l].id.dst_port);
- printf("%4llu %8llu %2u %4u %3u\n",
- align_uint64(&q[l].tot_pkts),
- align_uint64(&q[l].tot_bytes),
- q[l].len, q[l].len_bytes, q[l].drops);
- if (co.verbose)
- printf(" S %20llu F %20llu\n",
- align_uint64(&q[l].S), align_uint64(&q[l].F));
- }
-
- /* Print IPv6 flows */
- index_printed = 0;
- for (l = 0; l < fs->rq_elements; l++) {
- if (!IS_IP6_FLOW_ID(&(q[l].id)))
- continue;
-
- if (!index_printed) {
- index_printed = 1;
- if (indexes > 0)
- printf("\n");
- indexes++;
- printf("\n mask: proto: 0x%02x, flow_id: 0x%08x, ",
- fs->flow_mask.proto, fs->flow_mask.flow_id6);
- inet_ntop(AF_INET6, &(fs->flow_mask.src_ip6),
- buff, sizeof(buff));
- printf("%s/0x%04x -> ", buff, fs->flow_mask.src_port);
- inet_ntop( AF_INET6, &(fs->flow_mask.dst_ip6),
- buff, sizeof(buff) );
- printf("%s/0x%04x\n", buff, fs->flow_mask.dst_port);
-
- printf("BKT ___Prot___ _flow-id_ "
- "______________Source IPv6/port_______________ "
- "_______________Dest. IPv6/port_______________ "
- "Tot_pkt/bytes Pkt/Byte Drp\n");
- }
- printf("%3d ", q[l].hash_slot);
- pe = getprotobynumber(q[l].id.proto);
+ inet_ntoa(ina), id->dst_port);
+ } else {
+ /* Print IPv6 flows */
if (pe != NULL)
printf("%9s ", pe->p_name);
else
- printf("%9u ", q[l].id.proto);
- printf("%7d %39s/%-5d ", q[l].id.flow_id6,
- inet_ntop(AF_INET6, &(q[l].id.src_ip6), buff, sizeof(buff)),
- q[l].id.src_port);
+ printf("%9u ", id->proto);
+ printf("%7d %39s/%-5d ", id->flow_id6,
+ inet_ntop(AF_INET6, &(id->src_ip6), buff, sizeof(buff)),
+ id->src_port);
printf(" %39s/%-5d ",
- inet_ntop(AF_INET6, &(q[l].id.dst_ip6), buff, sizeof(buff)),
- q[l].id.dst_port);
- printf(" %4llu %8llu %2u %4u %3u\n",
- align_uint64(&q[l].tot_pkts),
- align_uint64(&q[l].tot_bytes),
- q[l].len, q[l].len_bytes, q[l].drops);
- if (co.verbose)
- printf(" S %20llu F %20llu\n",
- align_uint64(&q[l].S),
- align_uint64(&q[l].F));
+ inet_ntop(AF_INET6, &(id->dst_ip6), buff, sizeof(buff)),
+ id->dst_port);
}
+ printf("%4llu %8llu %2u %4u %3u\n",
+ align_uint64(&ni->tot_pkts),
+ align_uint64(&ni->tot_bytes),
+ ni->length, ni->len_bytes, ni->drops);
}
static void
-print_flowset_parms(struct dn_flow_set *fs, char *prefix)
+print_flowset_parms(struct dn_fs *fs, char *prefix)
{
int l;
char qs[30];
@@ -226,7 +218,7 @@ print_flowset_parms(struct dn_flow_set *fs, char *prefix)
char red[90]; /* Display RED parameters */
l = fs->qsize;
- if (fs->flags_fs & DN_QSIZE_IS_BYTES) {
+ if (fs->flags & DN_QSIZE_BYTES) {
if (l >= 8192)
sprintf(qs, "%d KB", l / 1024);
else
@@ -237,23 +229,34 @@ print_flowset_parms(struct dn_flow_set *fs, char *prefix)
sprintf(plr, "plr %f", 1.0 * fs->plr / (double)(0x7fffffff));
else
plr[0] = '\0';
- if (fs->flags_fs & DN_IS_RED) /* RED parameters */
+
+ if (fs->flags & DN_IS_RED) /* RED parameters */
sprintf(red,
"\n\t %cRED w_q %f min_th %d max_th %d max_p %f",
- (fs->flags_fs & DN_IS_GENTLE_RED) ? 'G' : ' ',
+ (fs->flags & DN_IS_GENTLE_RED) ? 'G' : ' ',
1.0 * fs->w_q / (double)(1 << SCALE_RED),
- SCALE_VAL(fs->min_th),
- SCALE_VAL(fs->max_th),
+ fs->min_th,
+ fs->max_th,
1.0 * fs->max_p / (double)(1 << SCALE_RED));
else
sprintf(red, "droptail");
- printf("%s %s%s %d queues (%d buckets) %s\n",
- prefix, qs, plr, fs->rq_elements, fs->rq_size, red);
+ if (prefix[0]) {
+ printf("%s %s%s %d queues (%d buckets) %s\n",
+ prefix, qs, plr, fs->oid.id, fs->buckets, red);
+ prefix[0] = '\0';
+ } else {
+ printf("q%05d %s%s %d flows (%d buckets) sched %d "
+ "weight %d lmax %d pri %d %s\n",
+ fs->fs_nr, qs, plr, fs->oid.id, fs->buckets,
+ fs->sched_nr, fs->par[0], fs->par[1], fs->par[2], red);
+ if (fs->flags & DN_HAVE_MASK)
+ print_mask(&fs->flow_mask);
+ }
}
static void
-print_extra_delay_parms(struct dn_pipe *p)
+print_extra_delay_parms(struct dn_profile *p)
{
double loss;
if (p->samples_no <= 0)
@@ -265,105 +268,126 @@ print_extra_delay_parms(struct dn_pipe *p)
p->name, loss, p->samples_no);
}
-void
-ipfw_list_pipes(void *data, uint nbytes, int ac, char *av[])
+static void
+flush_buf(char *buf)
{
- int rulenum;
- void *next = data;
- struct dn_pipe *p = (struct dn_pipe *) data;
- struct dn_flow_set *fs;
- struct dn_flow_queue *q;
- int l;
-
- if (ac > 0)
- rulenum = strtoul(*av++, NULL, 10);
- else
- rulenum = 0;
- for (; nbytes >= sizeof *p; p = (struct dn_pipe *)next) {
- double b = p->bandwidth;
- char buf[30];
- char prefix[80];
- char burst[5 + 7];
-
- if (SLIST_NEXT(p, next) != (struct dn_pipe *)DN_IS_PIPE)
- break; /* done with pipes, now queues */
-
- /*
- * compute length, as pipe have variable size
- */
- l = sizeof(*p) + p->fs.rq_elements * sizeof(*q);
- next = (char *)p + l;
- nbytes -= l;
-
- if ((rulenum != 0 && rulenum != p->pipe_nr) || co.do_pipe == 2)
- continue;
-
- /*
- * Print rate (or clocking interface)
- */
- if (p->if_name[0] != '\0')
- sprintf(buf, "%s", p->if_name);
- else if (b == 0)
- sprintf(buf, "unlimited");
- else if (b >= 1000000)
- sprintf(buf, "%7.3f Mbit/s", b/1000000);
- else if (b >= 1000)
- sprintf(buf, "%7.3f Kbit/s", b/1000);
- else
- sprintf(buf, "%7.3f bit/s ", b);
-
- sprintf(prefix, "%05d: %s %4d ms ",
- p->pipe_nr, buf, p->delay);
-
- print_flowset_parms(&(p->fs), prefix);
-
- if (humanize_number(burst, sizeof(burst), p->burst,
- "Byte", HN_AUTOSCALE, 0) < 0 || co.verbose)
- printf("\t burst: %ju Byte\n", p->burst);
- else
- printf("\t burst: %s\n", burst);
-
- print_extra_delay_parms(p);
-
- q = (struct dn_flow_queue *)(p+1);
- list_queues(&(p->fs), q);
- }
- for (fs = next; nbytes >= sizeof *fs; fs = next) {
- char prefix[80];
-
- if (SLIST_NEXT(fs, next) != (struct dn_flow_set *)DN_IS_QUEUE)
- break;
- l = sizeof(*fs) + fs->rq_elements * sizeof(*q);
- next = (char *)fs + l;
- nbytes -= l;
-
- if (rulenum != 0 && ((rulenum != fs->fs_nr && co.do_pipe == 2) ||
- (rulenum != fs->parent_nr && co.do_pipe == 1))) {
- continue;
- }
-
- q = (struct dn_flow_queue *)(fs+1);
- sprintf(prefix, "q%05d: weight %d pipe %d ",
- fs->fs_nr, fs->weight, fs->parent_nr);
- print_flowset_parms(fs, prefix);
- list_queues(fs, q);
+ if (buf[0])
+ printf("%s\n", buf);
+ buf[0] = '\0';
+}
+
+/*
+ * generic list routine. We expect objects in a specific order, i.e.
+ * PIPES AND SCHEDULERS:
+ * link; scheduler; internal flowset if any; instances
+ * we can tell a pipe from the number.
+ *
+ * FLOWSETS:
+ * flowset; queues;
+ * link i (int queue); scheduler i; si(i) { flowsets() : queues }
+ */
+static void
+list_pipes(struct dn_id *oid, struct dn_id *end)
+{
+ char buf[160]; /* pending buffer */
+ buf[0] = '\0';
+
+ for (; oid != end; oid = O_NEXT(oid, oid->len)) {
+ if (oid->len < sizeof(*oid))
+ errx(1, "invalid oid len %d\n", oid->len);
+
+ switch (oid->type) {
+ default:
+ flush_buf(buf);
+ printf("unrecognized object %d size %d\n", oid->type, oid->len);
+ break;
+ case DN_TEXT: /* list of attached flowsets */
+ {
+ int i, l;
+ struct {
+ struct dn_id id;
+ uint32_t p[0];
+ } *d = (void *)oid;
+ l = (oid->len - sizeof(*oid))/sizeof(d->p[0]);
+ if (l == 0)
+ break;
+ printf(" Children flowsets: ");
+ for (i = 0; i < l; i++)
+ printf("%u ", d->p[i]);
+ printf("\n");
+ break;
+ }
+ case DN_CMD_GET:
+ if (co.verbose)
+ printf("answer for cmd %d, len %d\n", oid->type, oid->id);
+ break;
+ case DN_SCH: {
+ struct dn_sch *s = (struct dn_sch *)oid;
+ flush_buf(buf);
+ printf(" sched %d type %s flags 0x%x %d buckets %d active\n",
+ s->sched_nr,
+ s->name, s->flags, s->buckets, s->oid.id);
+ if (s->flags & DN_HAVE_MASK)
+ print_mask(&s->sched_mask);
+ }
+ break;
+
+ case DN_FLOW:
+ list_flow((struct dn_flow *)oid);
+ break;
+
+ case DN_LINK: {
+ struct dn_link *p = (struct dn_link *)oid;
+ double b = p->bandwidth;
+ char bwbuf[30];
+ char burst[5 + 7];
+
+ /* This starts a new object so flush buffer */
+ flush_buf(buf);
+ /* data rate */
+ if (b == 0)
+ sprintf(bwbuf, "unlimited ");
+ else if (b >= 1000000)
+ sprintf(bwbuf, "%7.3f Mbit/s", b/1000000);
+ else if (b >= 1000)
+ sprintf(bwbuf, "%7.3f Kbit/s", b/1000);
+ else
+ sprintf(bwbuf, "%7.3f bit/s ", b);
+
+ if (humanize_number(burst, sizeof(burst), p->burst,
+ "", HN_AUTOSCALE, 0) < 0 || co.verbose)
+ sprintf(burst, "%d", (int)p->burst);
+ sprintf(buf, "%05d: %s %4d ms burst %s",
+ p->link_nr % DN_MAX_ID, bwbuf, p->delay, burst);
+ }
+ break;
+
+ case DN_FS:
+ print_flowset_parms((struct dn_fs *)oid, buf);
+ break;
+ case DN_PROFILE:
+ flush_buf(buf);
+ print_extra_delay_parms((struct dn_profile *)oid);
}
+ flush_buf(buf); // XXX does it really go here ?
+ }
}
/*
- * Delete pipe or queue i
+ * Delete pipe, queue or scheduler i
*/
int
-ipfw_delete_pipe(int pipe_or_queue, int i)
+ipfw_delete_pipe(int do_pipe, int i)
{
- struct dn_pipe p;
-
- memset(&p, 0, sizeof p);
- if (pipe_or_queue == 1)
- p.pipe_nr = i; /* pipe */
- else
- p.fs.fs_nr = i; /* queue */
- i = do_cmd(IP_DUMMYNET_DEL, &p, sizeof p);
+ struct {
+ struct dn_id oid;
+ uintptr_t a[1]; /* add more if we want a list */
+ } cmd;
+ oid_fill((void *)&cmd, sizeof(cmd), DN_CMD_DELETE, DN_API_VERSION);
+ cmd.oid.subtype = (do_pipe == 1) ? DN_LINK :
+ ( (do_pipe == 2) ? DN_FS : DN_SCH);
+ cmd.a[0] = i;
+ i = do_cmd(IP_DUMMYNET3, &cmd, cmd.oid.len);
if (i) {
i = 1;
warn("rule %u: setsockopt(IP_DUMMYNET_DEL)", i);
@@ -400,7 +424,7 @@ ipfw_delete_pipe(int pipe_or_queue, int i)
* The empirical curve may have both vertical and horizontal lines.
* Vertical lines represent constant delay for a range of
* probabilities; horizontal lines correspond to a discontinuty
- * in the delay distribution: the pipe will use the largest delay
+ * in the delay distribution: the link will use the largest delay
* for a given probability.
*
* To pass the curve to dummynet, we must store the parameters
@@ -490,9 +514,12 @@ static void
read_bandwidth(char *arg, int *bandwidth, char *if_name, int namelen)
{
if (*bandwidth != -1)
- warn("duplicate token, override bandwidth value!");
+ warnx("duplicate token, override bandwidth value!");
if (arg[0] >= 'a' && arg[0] <= 'z') {
+ if (!if_name) {
+ errx(1, "no if support");
+ }
if (namelen >= IFNAMSIZ)
warn("interface name truncated");
namelen--;
@@ -508,7 +535,7 @@ read_bandwidth(char *arg, int *bandwidth, char *if_name, int namelen)
if (*end == 'K' || *end == 'k') {
end++;
bw *= 1000;
- } else if (*end == 'M') {
+ } else if (*end == 'M' || *end == 'm') {
end++;
bw *= 1000000;
}
@@ -521,7 +548,8 @@ read_bandwidth(char *arg, int *bandwidth, char *if_name, int namelen)
errx(EX_DATAERR, "bandwidth too large");
*bandwidth = bw;
- if_name[0] = '\0';
+ if (if_name)
+ if_name[0] = '\0';
}
}
@@ -551,7 +579,8 @@ compare_points(const void *vp1, const void *vp2)
#define ED_EFMT(s) EX_DATAERR,"error in %s at line %d: "#s,filename,lineno
static void
-load_extra_delays(const char *filename, struct dn_pipe *p)
+load_extra_delays(const char *filename, struct dn_profile *p,
+ struct dn_link *link)
{
char line[ED_MAX_LINE_LEN];
FILE *f;
@@ -566,6 +595,9 @@ load_extra_delays(const char *filename, struct dn_pipe *p)
struct point points[ED_MAX_SAMPLES_NO];
int points_no = 0;
+ /* XXX link never NULL? */
+ p->link_nr = link->link_nr;
+
profile_name[0] = '\0';
f = fopen(filename, "r");
if (f == NULL)
@@ -606,7 +638,8 @@ load_extra_delays(const char *filename, struct dn_pipe *p)
ED_MAX_SAMPLES_NO);
do_points = 0;
} else if (!strcasecmp(name, ED_TOK_BW)) {
- read_bandwidth(arg, &p->bandwidth, p->if_name, sizeof(p->if_name));
+ char buf[IFNAMSIZ];
+ read_bandwidth(arg, &link->bandwidth, buf, sizeof(buf));
} else if (!strcasecmp(name, ED_TOK_LOSS)) {
if (loss != -1.0)
errx(ED_EFMT("duplicated token: %s"), name);
@@ -676,17 +709,17 @@ load_extra_delays(const char *filename, struct dn_pipe *p)
double y2 = points[i+1].prob * samples;
double x2 = points[i+1].delay;
- int index = y1;
+ int ix = y1;
int stop = y2;
if (x1 == x2) {
- for (; index<stop; ++index)
- p->samples[index] = x1;
+ for (; ix<stop; ++ix)
+ p->samples[ix] = x1;
} else {
double m = (y2-y1)/(x2-x1);
double c = y1 - m*x1;
- for (; index<stop ; ++index)
- p->samples[index] = (index - c)/m;
+ for (; ix<stop ; ++ix)
+ p->samples[ix] = (ix - c)/m;
}
}
p->samples_no = samples;
@@ -694,27 +727,120 @@ load_extra_delays(const char *filename, struct dn_pipe *p)
strncpy(p->name, profile_name, sizeof(p->name));
}
+/*
+ * configuration of pipes, schedulers, flowsets.
+ * When we configure a new scheduler, an empty pipe is created, so:
+ *
+ * do_pipe = 1 -> "pipe N config ..." only for backward compatibility
+ * sched N+Delta type fifo sched_mask ...
+ * pipe N+Delta <parameters>
+ * flowset N+Delta pipe N+Delta (no parameters)
+ * sched N type wf2q+ sched_mask ...
+ * pipe N <parameters>
+ *
+ * do_pipe = 2 -> flowset N config
+ * flowset N parameters
+ *
+ * do_pipe = 3 -> sched N config
+ * sched N parameters (default no pipe)
+ * optional Pipe N config ...
+ * pipe ==>
+ */
void
ipfw_config_pipe(int ac, char **av)
{
- int samples[ED_MAX_SAMPLES_NO];
- struct dn_pipe p;
- int i;
+ int i, j;
char *end;
void *par = NULL;
-
- memset(&p, 0, sizeof p);
- p.bandwidth = -1;
+ struct dn_id *buf, *base;
+ struct dn_sch *sch = NULL;
+ struct dn_link *p = NULL;
+ struct dn_fs *fs = NULL;
+ struct dn_profile *pf = NULL;
+ struct ipfw_flow_id *mask = NULL;
+ int lmax;
+ uint32_t _foo = 0, *flags = &_foo , *buckets = &_foo;
+
+ /*
+ * allocate space for 1 header,
+ * 1 scheduler, 1 link, 1 flowset, 1 profile
+ */
+ lmax = sizeof(struct dn_id); /* command header */
+ lmax += sizeof(struct dn_sch) + sizeof(struct dn_link) +
+ sizeof(struct dn_fs) + sizeof(struct dn_profile);
av++; ac--;
/* Pipe number */
if (ac && isdigit(**av)) {
i = atoi(*av); av++; ac--;
- if (co.do_pipe == 1)
- p.pipe_nr = i;
- else
- p.fs.fs_nr = i;
+ } else
+ i = -1;
+ if (i <= 0)
+ errx(EX_USAGE, "need a pipe/flowset/sched number");
+ base = buf = safe_calloc(1, lmax);
+ /* all commands start with a 'CONFIGURE' and a version */
+ o_next(&buf, sizeof(struct dn_id), DN_CMD_CONFIG);
+ base->id = DN_API_VERSION;
+
+ switch (co.do_pipe) {
+ case 1: /* "pipe N config ..." */
+ /* Allocate space for the WF2Q+ scheduler, its link
+ * and the FIFO flowset. Set the number, but leave
+ * the scheduler subtype and other parameters to 0
+ * so the kernel will use appropriate defaults.
+ * XXX todo: add a flag to record if a parameter
+ * is actually configured.
+ * If we do a 'pipe config' mask -> sched_mask.
+ * The FIFO scheduler and link are derived from the
+ * WF2Q+ one in the kernel.
+ */
+ sch = o_next(&buf, sizeof(*sch), DN_SCH);
+ p = o_next(&buf, sizeof(*p), DN_LINK);
+ fs = o_next(&buf, sizeof(*fs), DN_FS);
+
+ sch->sched_nr = i;
+ sch->oid.subtype = 0; /* defaults to WF2Q+ */
+ mask = &sch->sched_mask;
+ flags = &sch->flags;
+ buckets = &sch->buckets;
+ *flags |= DN_PIPE_CMD;
+
+ p->link_nr = i;
+
+ /* This flowset is only for the FIFO scheduler */
+ fs->fs_nr = i + 2*DN_MAX_ID;
+ fs->sched_nr = i + DN_MAX_ID;
+ break;
+
+ case 2: /* "queue N config ... " */
+ fs = o_next(&buf, sizeof(*fs), DN_FS);
+ fs->fs_nr = i;
+ mask = &fs->flow_mask;
+ flags = &fs->flags;
+ buckets = &fs->buckets;
+ break;
+
+ case 3: /* "sched N config ..." */
+ sch = o_next(&buf, sizeof(*sch), DN_SCH);
+ fs = o_next(&buf, sizeof(*fs), DN_FS);
+ sch->sched_nr = i;
+ mask = &sch->sched_mask;
+ flags = &sch->flags;
+ buckets = &sch->buckets;
+ /* fs is used only with !MULTIQUEUE schedulers */
+ fs->fs_nr = i + DN_MAX_ID;
+ fs->sched_nr = i;
+ break;
}
+ /* set to -1 those fields for which we want to reuse existing
+ * values from the kernel.
+ * Also, *_nr and subtype = 0 mean reuse the value from the kernel.
+ * XXX todo: support reuse of the mask.
+ */
+ if (p)
+ p->bandwidth = -1;
+ for (j = 0; j < sizeof(fs->par)/sizeof(fs->par[0]); j++)
+ fs->par[j] = -1;
while (ac > 0) {
double d;
int tok = match_token(dummynet_params, *av);
@@ -722,41 +848,48 @@ ipfw_config_pipe(int ac, char **av)
switch(tok) {
case TOK_NOERROR:
- p.fs.flags_fs |= DN_NOERROR;
+ NEED(fs, "noerror is only for pipes");
+ fs->flags |= DN_NOERROR;
break;
case TOK_PLR:
+ NEED(fs, "plr is only for pipes");
NEED1("plr needs argument 0..1\n");
d = strtod(av[0], NULL);
if (d > 1)
d = 1;
else if (d < 0)
d = 0;
- p.fs.plr = (int)(d*0x7fffffff);
+ fs->plr = (int)(d*0x7fffffff);
ac--; av++;
break;
case TOK_QUEUE:
+ NEED(fs, "queue is only for pipes or flowsets");
NEED1("queue needs queue size\n");
end = NULL;
- p.fs.qsize = strtoul(av[0], &end, 0);
+ fs->qsize = strtoul(av[0], &end, 0);
if (*end == 'K' || *end == 'k') {
- p.fs.flags_fs |= DN_QSIZE_IS_BYTES;
- p.fs.qsize *= 1024;
+ fs->flags |= DN_QSIZE_BYTES;
+ fs->qsize *= 1024;
} else if (*end == 'B' ||
_substrcmp2(end, "by", "bytes") == 0) {
- p.fs.flags_fs |= DN_QSIZE_IS_BYTES;
+ fs->flags |= DN_QSIZE_BYTES;
}
ac--; av++;
break;
case TOK_BUCKETS:
+ NEED(fs, "buckets is only for pipes or flowsets");
NEED1("buckets needs argument\n");
- p.fs.rq_size = strtoul(av[0], NULL, 0);
+ *buckets = strtoul(av[0], NULL, 0);
ac--; av++;
break;
+ case TOK_FLOW_MASK:
+ case TOK_SCHED_MASK:
case TOK_MASK:
+ NEED(mask, "tok_mask");
NEED1("mask needs mask specifier\n");
/*
* per-flow queue, mask is dst_ip, dst_port,
@@ -764,7 +897,7 @@ ipfw_config_pipe(int ac, char **av)
*/
par = NULL;
- bzero(&p.fs.flow_mask, sizeof(p.fs.flow_mask));
+ bzero(mask, sizeof(*mask));
end = NULL;
while (ac >= 1) {
@@ -780,44 +913,55 @@ ipfw_config_pipe(int ac, char **av)
case TOK_ALL:
/*
* special case, all bits significant
+ * except 'extra' (the queue number)
*/
- p.fs.flow_mask.dst_ip = ~0;
- p.fs.flow_mask.src_ip = ~0;
- p.fs.flow_mask.dst_port = ~0;
- p.fs.flow_mask.src_port = ~0;
- p.fs.flow_mask.proto = ~0;
- n2mask(&(p.fs.flow_mask.dst_ip6), 128);
- n2mask(&(p.fs.flow_mask.src_ip6), 128);
- p.fs.flow_mask.flow_id6 = ~0;
- p.fs.flags_fs |= DN_HAVE_FLOW_MASK;
+ mask->dst_ip = ~0;
+ mask->src_ip = ~0;
+ mask->dst_port = ~0;
+ mask->src_port = ~0;
+ mask->proto = ~0;
+ n2mask(&mask->dst_ip6, 128);
+ n2mask(&mask->src_ip6, 128);
+ mask->flow_id6 = ~0;
+ *flags |= DN_HAVE_MASK;
+ goto end_mask;
+
+ case TOK_QUEUE:
+ mask->extra = ~0;
+ *flags |= DN_HAVE_MASK;
goto end_mask;
case TOK_DSTIP:
- p32 = &p.fs.flow_mask.dst_ip;
+ mask->addr_type = 4;
+ p32 = &mask->dst_ip;
break;
case TOK_SRCIP:
- p32 = &p.fs.flow_mask.src_ip;
+ mask->addr_type = 4;
+ p32 = &mask->src_ip;
break;
case TOK_DSTIP6:
- pa6 = &(p.fs.flow_mask.dst_ip6);
+ mask->addr_type = 6;
+ pa6 = &mask->dst_ip6;
break;
case TOK_SRCIP6:
- pa6 = &(p.fs.flow_mask.src_ip6);
+ mask->addr_type = 6;
+ pa6 = &mask->src_ip6;
break;
case TOK_FLOWID:
- p20 = &p.fs.flow_mask.flow_id6;
+ mask->addr_type = 6;
+ p20 = &mask->flow_id6;
break;
case TOK_DSTPORT:
- p16 = &p.fs.flow_mask.dst_port;
+ p16 = &mask->dst_port;
break;
case TOK_SRCPORT:
- p16 = &p.fs.flow_mask.src_port;
+ p16 = &mask->src_port;
break;
case TOK_PROTO:
@@ -857,10 +1001,10 @@ ipfw_config_pipe(int ac, char **av)
if (a > 0xFF)
errx(EX_DATAERR,
"proto mask must be 8 bit");
- p.fs.flow_mask.proto = (uint8_t)a;
+ mask->proto = (uint8_t)a;
}
if (a != 0)
- p.fs.flags_fs |= DN_HAVE_FLOW_MASK;
+ *flags |= DN_HAVE_MASK;
ac--; av++;
} /* end while, config masks */
end_mask:
@@ -869,9 +1013,9 @@ end_mask:
case TOK_RED:
case TOK_GRED:
NEED1("red/gred needs w_q/min_th/max_th/max_p\n");
- p.fs.flags_fs |= DN_IS_RED;
+ fs->flags |= DN_IS_RED;
if (tok == TOK_GRED)
- p.fs.flags_fs |= DN_IS_GENTLE_RED;
+ fs->flags |= DN_IS_GENTLE_RED;
/*
* the format for parameters is w_q/min_th/max_th/max_p
*/
@@ -879,82 +1023,108 @@ end_mask:
double w_q = strtod(end, NULL);
if (w_q > 1 || w_q <= 0)
errx(EX_DATAERR, "0 < w_q <= 1");
- p.fs.w_q = (int) (w_q * (1 << SCALE_RED));
+ fs->w_q = (int) (w_q * (1 << SCALE_RED));
}
if ((end = strsep(&av[0], "/"))) {
- p.fs.min_th = strtoul(end, &end, 0);
+ fs->min_th = strtoul(end, &end, 0);
if (*end == 'K' || *end == 'k')
- p.fs.min_th *= 1024;
+ fs->min_th *= 1024;
}
if ((end = strsep(&av[0], "/"))) {
- p.fs.max_th = strtoul(end, &end, 0);
+ fs->max_th = strtoul(end, &end, 0);
if (*end == 'K' || *end == 'k')
- p.fs.max_th *= 1024;
+ fs->max_th *= 1024;
}
if ((end = strsep(&av[0], "/"))) {
double max_p = strtod(end, NULL);
if (max_p > 1 || max_p <= 0)
errx(EX_DATAERR, "0 < max_p <= 1");
- p.fs.max_p = (int)(max_p * (1 << SCALE_RED));
+ fs->max_p = (int)(max_p * (1 << SCALE_RED));
}
ac--; av++;
break;
case TOK_DROPTAIL:
- p.fs.flags_fs &= ~(DN_IS_RED|DN_IS_GENTLE_RED);
+ NEED(fs, "droptail is only for flowsets");
+ fs->flags &= ~(DN_IS_RED|DN_IS_GENTLE_RED);
break;
case TOK_BW:
+ NEED(p, "bw is only for links");
NEED1("bw needs bandwidth or interface\n");
- if (co.do_pipe != 1)
- errx(EX_DATAERR, "bandwidth only valid for pipes");
- read_bandwidth(av[0], &p.bandwidth, p.if_name, sizeof(p.if_name));
+ read_bandwidth(av[0], &p->bandwidth, NULL, 0);
ac--; av++;
break;
case TOK_DELAY:
- if (co.do_pipe != 1)
- errx(EX_DATAERR, "delay only valid for pipes");
+ NEED(p, "delay is only for links");
NEED1("delay needs argument 0..10000ms\n");
- p.delay = strtoul(av[0], NULL, 0);
+ p->delay = strtoul(av[0], NULL, 0);
+ ac--; av++;
+ break;
+
+ case TOK_TYPE: {
+ int l;
+ NEED(sch, "type is only for schedulers");
+ NEED1("type needs a string");
+ l = strlen(av[0]);
+ if (l == 0 || l > 15)
+ errx(1, "type %s too long\n", av[0]);
+ strcpy(sch->name, av[0]);
+ sch->oid.subtype = 0; /* use string */
ac--; av++;
break;
+ }
case TOK_WEIGHT:
- if (co.do_pipe == 1)
- errx(EX_DATAERR,"weight only valid for queues");
- NEED1("weight needs argument 0..100\n");
- p.fs.weight = strtoul(av[0], &end, 0);
+ NEED(fs, "weight is only for flowsets");
+ NEED1("weight needs argument\n");
+ fs->par[0] = strtol(av[0], &end, 0);
+ ac--; av++;
+ break;
+
+ case TOK_LMAX:
+ NEED(fs, "lmax is only for flowsets");
+ NEED1("lmax needs argument\n");
+ fs->par[1] = strtol(av[0], &end, 0);
ac--; av++;
break;
+ case TOK_PRI:
+ NEED(fs, "priority is only for flowsets");
+ NEED1("priority needs argument\n");
+ fs->par[2] = strtol(av[0], &end, 0);
+ ac--; av++;
+ break;
+
+ case TOK_SCHED:
case TOK_PIPE:
- if (co.do_pipe == 1)
- errx(EX_DATAERR,"pipe only valid for queues");
- NEED1("pipe needs pipe_number\n");
- p.fs.parent_nr = strtoul(av[0], &end, 0);
+ NEED(fs, "pipe/sched");
+ NEED1("pipe/link/sched needs number\n");
+ fs->sched_nr = strtoul(av[0], &end, 0);
ac--; av++;
break;
- case TOK_PIPE_PROFILE:
- if (co.do_pipe != 1)
- errx(EX_DATAERR, "extra delay only valid for pipes");
+ case TOK_PROFILE:
+ NEED((!pf), "profile already set");
+ NEED(p, "profile");
+ {
NEED1("extra delay needs the file name\n");
- p.samples = &samples[0];
- load_extra_delays(av[0], &p);
+ pf = o_next(&buf, sizeof(*pf), DN_PROFILE);
+ load_extra_delays(av[0], pf, p); //XXX can't fail?
--ac; ++av;
+ }
break;
case TOK_BURST:
- if (co.do_pipe != 1)
- errx(EX_DATAERR, "burst only valid for pipes");
+ NEED(p, "burst");
NEED1("burst needs argument\n");
errno = 0;
- if (expand_number(av[0], &p.burst) < 0)
+ if (expand_number(av[0], (int64_t *)&p->burst) < 0)
if (errno != ERANGE)
errx(EX_DATAERR,
"burst: invalid argument");
- if (errno || p.burst > (1ULL << 48) - 1)
+ if (errno || p->burst > (1ULL << 48) - 1)
errx(EX_DATAERR,
"burst: out of range (0..2^48-1)");
ac--; av++;
@@ -964,26 +1134,17 @@ end_mask:
errx(EX_DATAERR, "unrecognised option ``%s''", av[-1]);
}
}
- if (co.do_pipe == 1) {
- if (p.pipe_nr == 0)
- errx(EX_DATAERR, "pipe_nr must be > 0");
- if (p.delay > 10000)
- errx(EX_DATAERR, "delay must be < 10000");
- } else { /* co.do_pipe == 2, queue */
- if (p.fs.parent_nr == 0)
- errx(EX_DATAERR, "pipe must be > 0");
- if (p.fs.weight >100)
- errx(EX_DATAERR, "weight must be <= 100");
- }
- /* check for bandwidth value */
- if (p.bandwidth == -1) {
- p.bandwidth = 0;
- if (p.samples_no > 0)
- errx(EX_DATAERR, "profile requires a bandwidth limit");
+ /* check validity of parameters */
+ if (p) {
+ if (p->delay > 10000)
+ errx(EX_DATAERR, "delay must be < 10000");
+ if (p->bandwidth == -1)
+ p->bandwidth = 0;
}
-
- if (p.fs.flags_fs & DN_QSIZE_IS_BYTES) {
+ if (fs) {
+ /* XXX accept a 0 scheduler to keep the default */
+ if (fs->flags & DN_QSIZE_BYTES) {
size_t len;
long limit;
@@ -991,9 +1152,9 @@ end_mask:
if (sysctlbyname("net.inet.ip.dummynet.pipe_byte_limit",
&limit, &len, NULL, 0) == -1)
limit = 1024*1024;
- if (p.fs.qsize > limit)
+ if (fs->qsize > limit)
errx(EX_DATAERR, "queue size must be < %ldB", limit);
- } else {
+ } else {
size_t len;
long limit;
@@ -1001,27 +1162,25 @@ end_mask:
if (sysctlbyname("net.inet.ip.dummynet.pipe_slot_limit",
&limit, &len, NULL, 0) == -1)
limit = 100;
- if (p.fs.qsize > limit)
+ if (fs->qsize > limit)
errx(EX_DATAERR, "2 <= queue size <= %ld", limit);
- }
- if (p.fs.flags_fs & DN_IS_RED) {
+ }
+
+ if (fs->flags & DN_IS_RED) {
size_t len;
int lookup_depth, avg_pkt_size;
- double s, idle, weight, w_q;
- struct clockinfo ck;
- int t;
+ double w_q;
- if (p.fs.min_th >= p.fs.max_th)
+ if (fs->min_th >= fs->max_th)
errx(EX_DATAERR, "min_th %d must be < than max_th %d",
- p.fs.min_th, p.fs.max_th);
- if (p.fs.max_th == 0)
+ fs->min_th, fs->max_th);
+ if (fs->max_th == 0)
errx(EX_DATAERR, "max_th must be > 0");
len = sizeof(int);
if (sysctlbyname("net.inet.ip.dummynet.red_lookup_depth",
&lookup_depth, &len, NULL, 0) == -1)
- errx(1, "sysctlbyname(\"%s\")",
- "net.inet.ip.dummynet.red_lookup_depth");
+ lookup_depth = 256;
if (lookup_depth == 0)
errx(EX_DATAERR, "net.inet.ip.dummynet.red_lookup_depth"
" must be greater than zero");
@@ -1029,18 +1188,13 @@ end_mask:
len = sizeof(int);
if (sysctlbyname("net.inet.ip.dummynet.red_avg_pkt_size",
&avg_pkt_size, &len, NULL, 0) == -1)
+ avg_pkt_size = 512;
- errx(1, "sysctlbyname(\"%s\")",
- "net.inet.ip.dummynet.red_avg_pkt_size");
if (avg_pkt_size == 0)
errx(EX_DATAERR,
"net.inet.ip.dummynet.red_avg_pkt_size must"
" be greater than zero");
- len = sizeof(struct clockinfo);
- if (sysctlbyname("kern.clockrate", &ck, &len, NULL, 0) == -1)
- errx(1, "sysctlbyname(\"%s\")", "kern.clockrate");
-
/*
* Ticks needed for sending a medium-sized packet.
* Unfortunately, when we are configuring a WF2Q+ queue, we
@@ -1050,38 +1204,181 @@ end_mask:
* correct. But on the other hand, why do we want RED with
* WF2Q+ ?
*/
+#if 0
if (p.bandwidth==0) /* this is a WF2Q+ queue */
s = 0;
else
s = (double)ck.hz * avg_pkt_size * 8 / p.bandwidth;
-
+#endif
/*
* max idle time (in ticks) before avg queue size becomes 0.
* NOTA: (3/w_q) is approx the value x so that
* (1-w_q)^x < 10^-3.
*/
- w_q = ((double)p.fs.w_q) / (1 << SCALE_RED);
+ w_q = ((double)fs->w_q) / (1 << SCALE_RED);
+#if 0 // go in kernel
idle = s * 3. / w_q;
- p.fs.lookup_step = (int)idle / lookup_depth;
- if (!p.fs.lookup_step)
- p.fs.lookup_step = 1;
+ fs->lookup_step = (int)idle / lookup_depth;
+ if (!fs->lookup_step)
+ fs->lookup_step = 1;
weight = 1 - w_q;
- for (t = p.fs.lookup_step; t > 1; --t)
+ for (t = fs->lookup_step; t > 1; --t)
weight *= 1 - w_q;
- p.fs.lookup_weight = (int)(weight * (1 << SCALE_RED));
+ fs->lookup_weight = (int)(weight * (1 << SCALE_RED));
+#endif
+ }
}
- if (p.samples_no <= 0) {
- i = do_cmd(IP_DUMMYNET_CONFIGURE, &p, sizeof p);
- } else {
- struct dn_pipe_max pm;
- int len = sizeof(pm);
- memcpy(&pm.pipe, &p, sizeof(pm.pipe));
- memcpy(&pm.samples, samples, sizeof(pm.samples));
-
- i = do_cmd(IP_DUMMYNET_CONFIGURE, &pm, len);
- }
+ i = do_cmd(IP_DUMMYNET3, base, (char *)buf - (char *)base);
if (i)
err(1, "setsockopt(%s)", "IP_DUMMYNET_CONFIGURE");
}
+
+/*
+ * Send a DN_CMD_FLUSH request to the kernel through the IP_DUMMYNET3
+ * socket option.  The request consists of the object id header alone.
+ * The outcome of the request is not checked.
+ */
+void
+dummynet_flush(void)
+{
+	struct dn_id flush_req;
+
+	/* header only: its own size is the length of the whole request */
+	oid_fill(&flush_req, sizeof(flush_req), DN_CMD_FLUSH, DN_API_VERSION);
+	do_cmd(IP_DUMMYNET3, &flush_req, flush_req.len);
+}
+
+/*
+ * Parse the arguments of 'ipfw [pipe|sched|queue] show [range list]'.
+ *
+ * Each argument is a comma-separated list of items, where an item is
+ * either a single number 'n' (taken as the range n-n) or a range 'n1-n2'.
+ * Items with n2 < n1, or with either end >= DN_MAX_ID-1, are dropped
+ * silently.  An unexpected character produces a warning and parsing
+ * resumes right after it.
+ *
+ * ac, av	remaining arguments; the scan stops at a NULL entry or
+ *		when ac runs out.
+ * v, len	output array and its size in uint32_t units.  Accepted
+ *		ranges are stored as consecutive (low, high) pairs.
+ *		If v is NULL or len < 2, or once the array is full,
+ *		ranges are only counted.
+ *
+ * When listing pipes (co.do_pipe == 1) both ends of every accepted
+ * range are offset by DN_MAX_ID.
+ *
+ * Returns the number of accepted ranges.
+ */
+static int
+parse_range(int ac, char *av[], uint32_t *v, int len)
+{
+	int n = 0;
+	char *endptr, *s;
+	uint32_t base[2];	/* scratch pair used when only counting */
+
+	if (v == NULL || len < 2) {
+		v = base;
+		len = 2;
+	}
+
+	for (s = *av; s != NULL; av++, ac--) {
+		v[0] = strtoul(s, &endptr, 10);
+		v[1] = (*endptr != '-') ? v[0] :
+			 strtoul(endptr+1, &endptr, 10);
+		if (*endptr == '\0') { /* prepare for next round */
+			s = (ac > 0) ? *(av+1) : NULL;
+		} else {
+			/*
+			 * More input follows within the same argument:
+			 * undo the av++/ac-- of the loop header, so that
+			 * the next argument is not skipped.
+			 */
+			ac++;
+			av--;
+			if (*endptr != ',') {
+				/* syntax error, errno is meaningless here */
+				warnx("invalid number: %s", s);
+				s = ++endptr;
+				continue;
+			}
+			/* continue processing from here */
+			s = ++endptr;
+		}
+		/* v[] is unsigned: only the upper bounds need checking */
+		if (v[1] < v[0] ||
+		    v[0] >= DN_MAX_ID-1 || v[1] >= DN_MAX_ID-1) {
+			continue; /* invalid entry */
+		}
+		n++;
+		/* translate if 'pipe list' */
+		if (co.do_pipe == 1) {
+			v[0] += DN_MAX_ID;
+			v[1] += DN_MAX_ID;
+		}
+		/* advance to the next slot, or fall back to the scratch pair */
+		v = (n*2 < len) ? v + 2 : base;
+	}
+	return n;
+}
+
+/*
+ * Main entry point for the dummynet list functions.  co.do_pipe selects
+ * what is requested: 1 = pipes (DN_LINK), 2 = queues (DN_FS),
+ * 3 = schedulers (DN_SCH).
+ *
+ * av[0] is the 'list' | 'show' keyword and is skipped.  The remaining
+ * arguments are optional filters: individual entries, ranges, or lists
+ * (space or commas are valid separators).
+ * Format for a range can be n1-n2 or n3 n4 n5 ...
+ * In a range n1 must be <= n2, otherwise the range is ignored.
+ * A number 'n4' is translated into the range 'n4-n4'.
+ * All numbers must be < DN_MAX_ID-1.
+ *
+ * show_counters is not used by this function.
+ */
+void
+dummynet_list(int ac, char *av[], int show_counters)
+{
+	struct dn_id *oid, *x = NULL;
+	int ret, i, l;
+	int n;		/* # of ranges */
+	int buflen;
+	int max_size;	/* largest obj passed up */
+
+	ac--;
+	av++;		/* skip 'list' | 'show' word */
+
+	n = parse_range(ac, av, NULL, 0);	/* Count # of ranges. */
+
+	/* Allocate space to store ranges */
+	l = sizeof(*oid) + sizeof(uint32_t) * n * 2;
+	oid = safe_calloc(1, l);
+	oid_fill(oid, l, DN_CMD_GET, DN_API_VERSION);
+
+	if (n > 0)	/* store ranges in idx */
+		parse_range(ac, av, (uint32_t *)(oid + 1), n*2);
+	/*
+	 * Compute the size of the largest object returned. If the
+	 * response leaves at least this much spare space in the
+	 * buffer, then surely the response is complete; otherwise
+	 * there might be a risk of truncation and we will need to
+	 * retry with a larger buffer.
+	 * XXX don't bother with smaller structs.
+	 */
+	max_size = sizeof(struct dn_fs);
+	if (max_size < sizeof(struct dn_sch))
+		max_size = sizeof(struct dn_sch);
+	if (max_size < sizeof(struct dn_flow))
+		max_size = sizeof(struct dn_flow);
+
+	switch (co.do_pipe) {
+	case 1:
+		oid->subtype = DN_LINK;	/* list pipe */
+		break;
+	case 2:
+		oid->subtype = DN_FS;	/* list queue */
+		break;
+	case 3:
+		oid->subtype = DN_SCH;	/* list sched */
+		break;
+	}
+
+	/*
+	 * Ask the kernel an estimate of the required space (result
+	 * in oid.id), unless we are requesting a subset of objects,
+	 * in which case the kernel does not give an exact answer.
+	 * In any case, space might grow in the meantime due to the
+	 * creation of new queues, so we must be prepared to retry.
+	 */
+	if (n > 0) {
+		buflen = 4*1024;
+		/*
+		 * The request (header plus ranges, l bytes) is copied
+		 * into the reply buffer below, so the buffer must be
+		 * able to hold it even for a long list of ranges.
+		 */
+		if (buflen < l + max_size)
+			buflen = l + max_size;
+	} else {
+		/* a negative option name makes do_cmd() use getsockopt() */
+		ret = do_cmd(-IP_DUMMYNET3, oid, (uintptr_t)&l);
+		if (ret != 0 || oid->id <= sizeof(*oid))
+			goto done;
+		buflen = oid->id + max_size;
+		oid->len = sizeof(*oid); /* restore */
+	}
+	/*
+	 * Try a few times, until the buffer fits.  If all attempts are
+	 * used up, the last response is listed anyway.
+	 */
+	for (i = 0; i < 20; i++) {
+		l = buflen;
+		x = safe_realloc(x, l);
+		bcopy(oid, x, oid->len);
+		ret = do_cmd(-IP_DUMMYNET3, x, (uintptr_t)&l);
+		if (ret != 0 || x->id <= sizeof(*oid))
+			goto done; /* no response */
+		if (l + max_size <= buflen)
+			break; /* ok */
+		buflen *= 2;	 /* double for next attempt */
+	}
+	list_pipes(x, O_NEXT(x, l));
+done:
+	free(x);	/* free(NULL) is a no-op */
+	free(oid);
+}
diff --git a/sbin/ipfw/ipfw.8 b/sbin/ipfw/ipfw.8
index f8b0746..897cd3f 100644
--- a/sbin/ipfw/ipfw.8
+++ b/sbin/ipfw/ipfw.8
@@ -6,8 +6,10 @@
.Os
.Sh NAME
.Nm ipfw
-.Nd IP firewall and traffic shaper control program
+.Nd User interface for firewall, traffic shaper, packet scheduler,
+in-kernel NAT.
.Sh SYNOPSIS
+.Ss FIREWALL CONFIGURATION
.Nm
.Op Fl cq
.Cm add
@@ -26,12 +28,6 @@
.Op Cm set Ar N
.Brq Cm delete | zero | resetlog
.Op Ar number ...
-.Nm
-.Cm enable
-.Brq Cm firewall | altq | one_pass | debug | verbose | dyn_keepalive
-.Nm
-.Cm disable
-.Brq Cm firewall | altq | one_pass | debug | verbose | dyn_keepalive
.Pp
.Nm
.Cm set Oo Cm disable Ar number ... Oc Op Cm enable Ar number ...
@@ -43,7 +39,16 @@
.Cm set swap Ar number number
.Nm
.Cm set show
+.Ss SYSCTL SHORTCUTS
+.Pp
+.Nm
+.Cm enable
+.Brq Cm firewall | altq | one_pass | debug | verbose | dyn_keepalive
+.Nm
+.Cm disable
+.Brq Cm firewall | altq | one_pass | debug | verbose | dyn_keepalive
.Pp
+.Ss LOOKUP TABLES
.Nm
.Cm table Ar number Cm add Ar addr Ns Oo / Ns Ar masklen Oc Op Ar value
.Nm
@@ -57,17 +62,19 @@
.Brq Ar number | all
.Cm list
.Pp
+.Ss DUMMYNET CONFIGURATION (TRAFFIC SHAPER AND PACKET SCHEDULER)
.Nm
-.Brq Cm pipe | queue
+.Brq Cm pipe | queue | sched
.Ar number
.Cm config
.Ar config-options
.Nm
.Op Fl s Op Ar field
-.Brq Cm pipe | queue
+.Brq Cm pipe | queue | sched
.Brq Cm delete | list | show
.Op Ar number ...
.Pp
+.Ss IN-KERNEL NAT
.Nm
.Op Fl q
.Cm nat
@@ -89,28 +96,27 @@ The
.Nm
utility is the user interface for controlling the
.Xr ipfw 4
-firewall and the
+firewall, the
.Xr dummynet 4
-traffic shaper in
-.Fx .
+traffic shaper/packet scheduler, and the
+in-kernel NAT services.
.Pp
-An
-.Nm
-configuration, or
+A firewall configuration, or
.Em ruleset ,
is made of a list of
.Em rules
numbered from 1 to 65535.
-Packets are passed to
-.Nm
+Packets are passed to the firewall
from a number of different places in the protocol stack
(depending on the source and destination of the packet,
-it is possible that
-.Nm
-is invoked multiple times on the same packet).
+it is possible for the firewall to be
+invoked multiple times on the same packet).
The packet passed to the firewall is compared
-against each of the rules in the firewall
-.Em ruleset .
+against each of the rules in the
+.Em ruleset ,
+in rule-number order
+(multiple rules with the same number are permitted, in which case
+they are processed in order of insertion).
When a match is found, the action corresponding to the
matching rule is performed.
.Pp
@@ -118,9 +124,7 @@ Depending on the action and certain system settings, packets
can be reinjected into the firewall at some rule after the
matching one for further processing.
.Pp
-An
-.Nm
-ruleset always includes a
+A ruleset always includes a
.Em default
rule (numbered 65535) which cannot be modified or deleted,
and matches all packets.
@@ -137,14 +141,14 @@ If the ruleset includes one or more rules with the
or
.Cm limit
option,
-.Nm
-will have a
+the firewall will have a
.Em stateful
-behaviour, i.e., upon a match it will create dynamic rules matching
-the exact parameters (source and destination addresses and ports)
-of the matching packet.
-.Pp
-These dynamic rules, which have a limited lifetime, are checked
+behaviour, i.e., upon a match it will create
+.Em dynamic rules ,
+i.e. rules that match packets with the same 5-tuple
+(protocol, source and destination addresses and ports)
+as the packet which caused their creation.
+Dynamic rules, which have a limited lifetime, are checked
at the first occurrence of a
.Cm check-state ,
.Cm keep-state
@@ -283,6 +287,7 @@ When listing, show last match timestamp as seconds from the epoch.
This form can be more convenient for postprocessing by scripts.
.El
.Pp
+.Ss LIST OF RULES AND PREPROCESSING
To ease configuration, rules can be put into a file which is
processed using
.Nm
@@ -322,14 +327,16 @@ This allows for flexible configuration files (like conditionalizing
them on the local hostname) and the use of macros to centralize
frequently required arguments like IP addresses.
.Pp
+.Ss TRAFFIC SHAPER CONFIGURATION
The
.Nm
-.Cm pipe
+.Cm pipe , queue
and
-.Cm queue
-commands are used to configure the traffic shaper, as shown in the
+.Cm sched
+commands are used to configure the traffic shaper and packet scheduler.
+See the
.Sx TRAFFIC SHAPER (DUMMYNET) CONFIGURATION
-Section below.
+Section below for details.
.Pp
If the world and the kernel get out of sync the
.Nm
@@ -362,7 +369,7 @@ have this picture in mind in order to design a correct ruleset.
| to devices |
.Ed
.Pp
-As can be noted from the above picture, the number of
+The number of
times the same packet goes through the firewall can
vary between 0 and 4 depending on packet source and
destination, and system configuration.
@@ -421,9 +428,9 @@ Keywords are case-sensitive, whereas arguments may
or may not be case-sensitive depending on their nature
(e.g.\& uid's are, hostnames are not).
.Pp
-In
-.Nm ipfw2
-you can introduce spaces after commas ',' to make
+Some arguments (e.g. port or address lists) are comma-separated
+lists of values.
+In this case, spaces after commas ',' are allowed to make
the line more readable.
You can also put the entire
command (including flags) into a single argument.
@@ -434,9 +441,7 @@ ipfw -q add deny src-ip 10.0.0.0/24, 127.0.0.1/8
ipfw "-q add deny src-ip 10.0.0.0/24, 127.0.0.1/8"
.Ed
.Sh RULE FORMAT
-The format of
-.Nm
-rules is the following:
+The format of firewall rules is the following:
.Bd -ragged -offset indent
.Bk -words
.Op Ar rule_number
@@ -496,7 +501,7 @@ in future forwarding decisions.
.El
.Pp
Note that some of the above information, e.g.\& source MAC or IP addresses and
-TCP/UDP ports, could easily be spoofed, so filtering on those fields
+TCP/UDP ports, can be easily spoofed, so filtering on those fields
alone might not guarantee the desired results.
.Bl -tag -width indent
.It Ar rule_number
@@ -1002,6 +1007,7 @@ The second format
with multiple addresses) is provided for convenience only and
its use is discouraged.
.It Ar addr : Oo Cm not Oc Bro
+.Bl -tag -width indent
.Cm any | me | me6 |
.Cm table Ns Pq Ar number Ns Op , Ns Ar value
.Ar | addr-list | addr-set
@@ -1023,6 +1029,7 @@ is also specified, an entry will match only if it has this value.
See the
.Sx LOOKUP TABLES
section below for more information on lookup tables.
+.El
.It Ar addr-list : ip-addr Ns Op Ns , Ns Ar addr-list
.It Ar ip-addr :
A host or subnet address specified in one of the following ways:
@@ -1389,6 +1396,20 @@ of source and destination addresses and ports can be
specified.
Currently,
only IPv4 flows are supported.
+.It Cm lookup Bro Cm dst-ip | dst-port | src-ip | src-port | uid | jail Brc Ar N
+Search an entry in lookup table
+.Ar N
+that matches the field specified as argument.
+If not found, the match fails.
+Otherwise, the match succeeds and
+.Cm tablearg
+is set to the value extracted from the table.
+.Pp
+This option can be useful to quickly dispatch traffic based on
+certain packet fields.
+See the
+.Sx LOOKUP TABLES
+section below for more information on lookup tables.
.It Cm { MAC | mac } Ar dst-mac src-mac
Match packets with a given
.Ar dst-mac
@@ -1480,7 +1501,7 @@ is invalid) whenever
.Cm xmit
is used.
.Pp
-A packet may not have a receive or transmit interface: packets
+A packet might not have a receive or transmit interface: packets
originating from the local host have no receive interface,
while packets destined for the local host have no transmit
interface.
@@ -1627,15 +1648,17 @@ because it engages only on packets with source addresses of directly
connected networks instead of all source addresses.
.El
.Sh LOOKUP TABLES
-Lookup tables are useful to handle large sparse address sets,
-typically from a hundred to several thousands of entries.
+Lookup tables are useful to handle large sparse sets of
+addresses or other search keys (e.g. ports, jail IDs).
+In the rest of this section we will use the term ``address''
+to mean any unsigned value of up to 32 bits.
There may be up to 128 different lookup tables, numbered 0 to 127.
.Pp
Each entry is represented by an
.Ar addr Ns Op / Ns Ar masklen
and will match all addresses with base
.Ar addr
-(specified as an IP address or a hostname)
+(specified as an IP address, a hostname or an unsigned integer)
and mask width of
.Ar masklen
bits.
@@ -1653,9 +1676,9 @@ is not specified, it defaults to 0.
.Pp
An entry can be added to a table
.Pq Cm add ,
-removed from a table
-.Pq Cm delete ,
-a table can be examined
+or removed from a table
+.Pq Cm delete .
+A table can be examined
.Pq Cm list
or flushed
.Pq Cm flush .
@@ -1664,7 +1687,7 @@ Internally, each table is stored in a Radix tree, the same way as
the routing table (see
.Xr route 4 ) .
.Pp
-Lookup tables currently support IPv4 addresses only.
+Lookup tables currently support only ports, jail IDs and IPv4 addresses.
.Pp
The
.Cm tablearg
@@ -1822,9 +1845,9 @@ for more examples on how to use dynamic rules.
.Nm
is also the user interface for the
.Nm dummynet
-traffic shaper and network emulator, a subsystem that
+traffic shaper, packet scheduler and network emulator, a subsystem that
can artificially queue, delay or drop packets
-emulator the behaviour of certain network links
+emulating the behaviour of certain network links
or queueing systems.
.Pp
.Nm dummynet
@@ -1836,26 +1859,33 @@ Matching packets are then passed to either of two
different objects, which implement the traffic regulation:
.Bl -hang -offset XXXX
.It Em pipe
-A pipe emulates a link with given bandwidth, propagation delay,
+A
+.Em pipe
+emulates a
+.Em link
+with given bandwidth and propagation delay,
+driven by a FIFO scheduler and a single queue with programmable
queue size and packet loss rate.
-Packets are queued in front of the pipe as they come out from the classifier,
-and then transferred to the pipe according to the pipe's parameters.
+Packets are appended to the queue as they come out from
+.Nm ipfw ,
+and then transferred in FIFO order to the link at the desired rate.
.It Em queue
-A queue
-is an abstraction used to implement the WF2Q+
-(Worst-case Fair Weighted Fair Queueing) policy, which is
-an efficient variant of the WFQ policy.
-.Pp
-The queue associates a
-.Em weight
-and a reference pipe to each flow (a flow is a set of packets
-with the same addresses and ports after masking).
-All backlogged flows (i.e., those
-with packets queued) linked to the same pipe share the pipe's
-bandwidth proportionally to their weights.
-Note that weights are not priorities; a flow with a lower weight
-is still guaranteed to get its fraction of the bandwidth even if a
-flow with a higher weight is permanently backlogged.
+A
+.Em queue
+is an abstraction used to implement packet scheduling
+using one of several packet scheduling algorithms.
+Packets sent to a
+.Em queue
+are first grouped into flows according to a mask on the 5-tuple.
+Flows are then passed to the scheduler associated to the
+.Em queue ,
+and each flow uses scheduling parameters (weight and others)
+as configured in the
+.Em queue
+itself.
+A scheduler in turn is connected to an emulated link,
+and arbitrates the link's bandwidth among backlogged flows according to
+weights and to the features of the scheduling algorithm in use.
.El
.Pp
In practice,
@@ -1864,6 +1894,52 @@ can be used to set hard limits to the bandwidth that a flow can use, whereas
.Em queues
can be used to determine how different flows share the available bandwidth.
.Pp
+A graphical representation of the binding of queues,
+flows, schedulers and links is below.
+.Bd -literal -offset indent
+ (flow_mask|sched_mask) sched_mask
+ +---------+ weight Wx +-------------+
+ | |->-[flow]-->--| |-+
+ -->--| QUEUE x | ... | | |
+ | |->-[flow]-->--| SCHEDuler N | |
+ +---------+ | | |
+ ... | +--[LINK N]-->--
+ +---------+ weight Wy | | +--[LINK N]-->--
+ | |->-[flow]-->--| | |
+ -->--| QUEUE y | ... | | |
+ | |->-[flow]-->--| | |
+ +---------+ +-------------+ |
+ +-------------+
+.Ed
+It is important to understand the role of the SCHED_MASK
+and FLOW_MASK, which are configured through the commands
+.Dl "ipfw sched N config mask SCHED_MASK ..."
+and
+.Dl "ipfw queue X config mask FLOW_MASK ..." .
+.Pp
+The SCHED_MASK is used to assign flows to one or more
+scheduler instances, one for each
+value of the packet's 5-tuple after applying SCHED_MASK.
+As an example, using ``src-ip 0xffffff00'' creates one instance
+for each /24 source subnet.
+.Pp
+The FLOW_MASK, together with the SCHED_MASK, is used to split
+packets into flows. As an example, using
+``src-ip 0x000000ff''
+together with the previous SCHED_MASK makes a flow for
+each individual source address. In turn, flows for each /24
+subnet will be sent to the same scheduler instance.
+.Pp
+The above diagram holds even for the
+.Em pipe
+case, with the only restriction that a
+.Em pipe
+only supports a SCHED_MASK, and forces the use of a FIFO
+scheduler (these are for backward compatibility reasons;
+in fact, internally, a
+.Nm dummynet's
+pipe is implemented exactly as above).
+.Pp
There are two modes of
.Nm dummynet
operation:
@@ -1895,16 +1971,19 @@ mode can be enabled by setting the
.Xr sysctl 8
variable to a non-zero value.
.Pp
-.Ss PIPE AND QUEUE CONFIGURATION
+.Ss PIPE, QUEUE AND SCHEDULER CONFIGURATION
The
-.Em pipe
-and
+.Em pipe ,
.Em queue
+and
+.Em scheduler
configuration commands are the following:
.Bd -ragged -offset indent
.Cm pipe Ar number Cm config Ar pipe-configuration
.Pp
.Cm queue Ar number Cm config Ar queue-configuration
+.Pp
+.Cm sched Ar number Cm config Ar sched-configuration
.Ed
.Pp
The following parameters can be configured for a pipe:
@@ -2057,6 +2136,41 @@ Specifies the weight to be used for flows matching this queue.
The weight must be in the range 1..100, and defaults to 1.
.El
.Pp
+The following parameters can be configured for a scheduler:
+.Pp
+.Bl -tag -width indent -compact
+.It Cm type Ar {fifo | wf2qp | rr | qfq}
+specifies the scheduling algorithm to use.
+.Bl -tag -width indent -compact
+.It Cm fifo
+is just a FIFO scheduler (which means that all packets
+are stored in the same queue as they arrive to the scheduler).
+FIFO has O(1) per-packet time complexity, with very low
+constants (estimate 60-80ns on a 2GHz desktop machine)
+but gives no service guarantees.
+.It Cm wf2qp
+implements the WF2Q+ algorithm, which is a Weighted Fair Queueing
+algorithm which permits flows to share bandwidth according to
+their weights. Note that weights are not priorities; even a flow
+with a minuscule weight will never starve.
+WF2Q+ has O(log N) per-packet processing cost, where N is the number
+of flows, and is the default algorithm used by previous versions of
+dummynet's queues.
+.It Cm rr
+implements the Deficit Round Robin algorithm, which has O(1) processing
+costs (roughly, 100-150ns per packet)
+and permits bandwidth allocation according to weights, but
+with poor service guarantees.
+.It Cm qfq
+implements the QFQ algorithm, which is a very fast variant of
+WF2Q+, with similar service guarantees and O(1) processing
+costs (roughly, 200-250ns per packet).
+.El
+.El
+.Pp
+In addition to the type, all parameters allowed for a pipe can also
+be specified for a scheduler.
+.Pp
Finally, the following parameters can be configured for both
pipes and queues:
.Pp
diff --git a/sbin/ipfw/ipfw2.c b/sbin/ipfw/ipfw2.c
index b19f390..1ab827f 100644
--- a/sbin/ipfw/ipfw2.c
+++ b/sbin/ipfw/ipfw2.c
@@ -57,7 +57,7 @@ struct cmdline_opts co; /* global options */
int resvd_set_number = RESVD_SET;
#define GET_UINT_ARG(arg, min, max, tok, s_x) do { \
- if (!ac) \
+ if (!av[0]) \
errx(EX_USAGE, "%s: missing argument", match_value(s_x, tok)); \
if (_substrcmp(*av, "tablearg") == 0) { \
arg = IP_FW_TABLEARG; \
@@ -65,23 +65,23 @@ int resvd_set_number = RESVD_SET;
} \
\
{ \
- long val; \
+ long _xval; \
char *end; \
\
- val = strtol(*av, &end, 10); \
+ _xval = strtol(*av, &end, 10); \
\
- if (!isdigit(**av) || *end != '\0' || (val == 0 && errno == EINVAL)) \
+ if (!isdigit(**av) || *end != '\0' || (_xval == 0 && errno == EINVAL)) \
errx(EX_DATAERR, "%s: invalid argument: %s", \
match_value(s_x, tok), *av); \
\
- if (errno == ERANGE || val < min || val > max) \
+ if (errno == ERANGE || _xval < min || _xval > max) \
errx(EX_DATAERR, "%s: argument is out of range (%u..%u): %s", \
match_value(s_x, tok), min, max, *av); \
\
- if (val == IP_FW_TABLEARG) \
+ if (_xval == IP_FW_TABLEARG) \
errx(EX_DATAERR, "%s: illegal argument value: %s", \
match_value(s_x, tok), *av); \
- arg = val; \
+ arg = _xval; \
} \
} while (0)
@@ -224,6 +224,15 @@ static struct _s_x rule_action_params[] = {
{ NULL, 0 } /* terminator */
};
+/*
+ * The 'lookup' instruction accepts one of the following arguments.
+ * -1 is a terminator for the list.
+ * The position of an argument in this array is the value found in the
+ * second 32-bit data word (d[1]) of an O_IP_DST_LOOKUP instruction:
+ * print_ip() uses that word as an index into this table to recover
+ * the name of the key.
+ */
+static int lookup_key[] = {
+ TOK_DSTIP, TOK_SRCIP, TOK_DSTPORT, TOK_SRCPORT,
+ TOK_UID, TOK_JAIL, TOK_DSCP, -1 };
+
static struct _s_x rule_options[] = {
{ "tagged", TOK_TAGGED },
{ "uid", TOK_UID },
@@ -249,6 +258,7 @@ static struct _s_x rule_options[] = {
{ "iplen", TOK_IPLEN },
{ "ipid", TOK_IPID },
{ "ipprecedence", TOK_IPPRECEDENCE },
+ { "dscp", TOK_DSCP },
{ "iptos", TOK_IPTOS },
{ "ipttl", TOK_IPTTL },
{ "ipversion", TOK_IPVER },
@@ -290,6 +300,7 @@ static struct _s_x rule_options[] = {
{ "dst-ip6", TOK_DSTIP6},
{ "src-ipv6", TOK_SRCIP6},
{ "src-ip6", TOK_SRCIP6},
+ { "lookup", TOK_LOOKUP},
{ "//", TOK_COMMENT },
{ "not", TOK_NOT }, /* pseudo option */
@@ -343,6 +354,7 @@ safe_realloc(void *ptr, size_t size)
/*
* conditionally runs the command.
+ * Selected options or negative -> getsockopt
*/
int
do_cmd(int optname, void *optval, uintptr_t optlen)
@@ -362,11 +374,15 @@ do_cmd(int optname, void *optval, uintptr_t optlen)
optname == IP_FW_ADD || optname == IP_FW_TABLE_LIST ||
optname == IP_FW_TABLE_GETSIZE ||
optname == IP_FW_NAT_GET_CONFIG ||
- optname == IP_FW_NAT_GET_LOG)
+ optname < 0 ||
+ optname == IP_FW_NAT_GET_LOG) {
+ if (optname < 0)
+ optname = -optname;
i = getsockopt(s, IPPROTO_IP, optname, optval,
(socklen_t *)optlen);
- else
+ } else {
i = setsockopt(s, IPPROTO_IP, optname, optval, optlen);
+ }
return i;
}
@@ -739,9 +755,19 @@ static void
print_ip(ipfw_insn_ip *cmd, char const *s)
{
struct hostent *he = NULL;
- int len = F_LEN((ipfw_insn *)cmd);
+ uint32_t len = F_LEN((ipfw_insn *)cmd);
uint32_t *a = ((ipfw_insn_u32 *)cmd)->d;
+ if (cmd->o.opcode == O_IP_DST_LOOKUP && len > F_INSN_SIZE(ipfw_insn_u32)) {
+ uint32_t d = a[1];
+ const char *arg = "<invalid>";
+
+ if (d < sizeof(lookup_key)/sizeof(lookup_key[0]))
+ arg = match_value(rule_options, lookup_key[d]);
+ printf("%s lookup %s %d", cmd->o.len & F_NOT ? " not": "",
+ arg, cmd->o.arg1);
+ return;
+ }
printf("%s%s ", cmd->o.len & F_NOT ? " not": "", s);
if (cmd->o.opcode == O_IP_SRC_ME || cmd->o.opcode == O_IP_DST_ME) {
@@ -1108,9 +1134,11 @@ show_ipfw(struct ip_fw *rule, int pcwidth, int bcwidth)
else
printf(" log");
}
+#ifndef NO_ALTQ
if (altqptr) {
print_altq_cmd(altqptr);
}
+#endif
if (tagptr) {
if (tagptr->len & F_NOT)
PRINT_UINT_ARG(" untag ", tagptr->arg1);
@@ -1595,26 +1623,33 @@ show_dyn_ipfw(ipfw_dyn_rule *d, int pcwidth, int bcwidth)
* ipfw set move rule X to Y
*/
void
-ipfw_sets_handler(int ac, char *av[])
+ipfw_sets_handler(char *av[])
{
uint32_t set_disable, masks[2];
int i, nbytes;
uint16_t rulenum;
uint8_t cmd, new_set;
- ac--;
av++;
- if (!ac)
+ if (av[0] == NULL)
errx(EX_USAGE, "set needs command");
if (_substrcmp(*av, "show") == 0) {
- void *data;
+ void *data = NULL;
char const *msg;
-
- nbytes = sizeof(struct ip_fw);
+ int nalloc;
+
+ nalloc = nbytes = sizeof(struct ip_fw);
+ while (nbytes >= nalloc) {
+ if (data)
+ free(data);
+ nalloc = nalloc * 2 + 200;
+ nbytes = nalloc;
data = safe_calloc(1, nbytes);
if (do_cmd(IP_FW_GET, data, (uintptr_t)&nbytes) < 0)
err(EX_OSERR, "getsockopt(IP_FW_GET)");
+ }
+
bcopy(&((struct ip_fw *)data)->next_rule,
&set_disable, sizeof(set_disable));
@@ -1631,8 +1666,8 @@ ipfw_sets_handler(int ac, char *av[])
}
printf("\n");
} else if (_substrcmp(*av, "swap") == 0) {
- ac--; av++;
- if (ac != 2)
+ av++;
+ if ( av[0] == NULL || av[1] == NULL )
errx(EX_USAGE, "set swap needs 2 set numbers\n");
rulenum = atoi(av[0]);
new_set = atoi(av[1]);
@@ -1643,13 +1678,14 @@ ipfw_sets_handler(int ac, char *av[])
masks[0] = (4 << 24) | (new_set << 16) | (rulenum);
i = do_cmd(IP_FW_DEL, masks, sizeof(uint32_t));
} else if (_substrcmp(*av, "move") == 0) {
- ac--; av++;
- if (ac && _substrcmp(*av, "rule") == 0) {
+ av++;
+ if (av[0] && _substrcmp(*av, "rule") == 0) {
cmd = 2;
- ac--; av++;
+ av++;
} else
cmd = 3;
- if (ac != 3 || _substrcmp(av[1], "to") != 0)
+ if (av[0] == NULL || av[1] == NULL || av[2] == NULL ||
+ av[3] != NULL || _substrcmp(av[1], "to") != 0)
errx(EX_USAGE, "syntax: set move [rule] X to Y\n");
rulenum = atoi(av[0]);
new_set = atoi(av[2]);
@@ -1664,10 +1700,10 @@ ipfw_sets_handler(int ac, char *av[])
_substrcmp(*av, "enable") == 0 ) {
int which = _substrcmp(*av, "enable") == 0 ? 1 : 0;
- ac--; av++;
+ av++;
masks[0] = masks[1] = 0;
- while (ac) {
+ while (av[0]) {
if (isdigit(**av)) {
i = atoi(*av);
if (i < 0 || i > RESVD_SET)
@@ -1681,7 +1717,7 @@ ipfw_sets_handler(int ac, char *av[])
else
errx(EX_DATAERR,
"invalid set command %s\n", *av);
- av++; ac--;
+ av++;
}
if ( (masks[0] & masks[1]) != 0 )
errx(EX_DATAERR,
@@ -1695,12 +1731,11 @@ ipfw_sets_handler(int ac, char *av[])
}
void
-ipfw_sysctl_handler(int ac, char *av[], int which)
+ipfw_sysctl_handler(char *av[], int which)
{
- ac--;
av++;
- if (ac == 0) {
+ if (av[0] == NULL) {
warnx("missing keyword to enable/disable\n");
} else if (_substrcmp(*av, "firewall") == 0) {
sysctlbyname("net.inet.ip.fw.enable", NULL, 0,
@@ -1717,8 +1752,10 @@ ipfw_sysctl_handler(int ac, char *av[], int which)
} else if (_substrcmp(*av, "dyn_keepalive") == 0) {
sysctlbyname("net.inet.ip.fw.dyn_keepalive", NULL, 0,
&which, sizeof(which));
+#ifndef NO_ALTQ
} else if (_substrcmp(*av, "altq") == 0) {
altq_set_enabled(which);
+#endif
} else {
warnx("unrecognize enable/disable keyword: %s\n", *av);
}
@@ -1751,6 +1788,10 @@ ipfw_list(int ac, char *av[], int show_counters)
fprintf(stderr, "Testing only, list disabled\n");
return;
}
+ if (co.do_pipe) {
+ dummynet_list(ac, av, show_counters);
+ return;
+ }
ac--;
av++;
@@ -1767,11 +1808,6 @@ ipfw_list(int ac, char *av[], int show_counters)
co.do_pipe ? "DUMMYNET" : "FW");
}
- if (co.do_pipe) {
- ipfw_list_pipes(data, nbytes, ac, av);
- goto done;
- }
-
/*
* Count static rules. They have variable size so we
* need to scan the list to count them.
@@ -2119,7 +2155,7 @@ fill_ip(ipfw_insn_ip *cmd, char *av)
return;
}
/* A single IP can be stored in an optimized format */
- if (d[1] == ~0 && av == NULL && len == 0) {
+ if (d[1] == (uint32_t)~0 && av == NULL && len == 0) {
cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32);
return;
}
@@ -2188,29 +2224,28 @@ fill_flags(ipfw_insn *cmd, enum ipfw_opcodes opcode,
void
-ipfw_delete(int ac, char *av[])
+ipfw_delete(char *av[])
{
uint32_t rulenum;
int i;
int exitval = EX_OK;
int do_set = 0;
-
- av++; ac--;
+ av++;
NEED1("missing rule specification");
- if (ac > 0 && _substrcmp(*av, "set") == 0) {
+ if ( *av && _substrcmp(*av, "set") == 0) {
/* Do not allow using the following syntax:
* ipfw set N delete set M
*/
if (co.use_set)
errx(EX_DATAERR, "invalid syntax");
do_set = 1; /* delete set */
- ac--; av++;
+ av++;
}
/* Rule number */
- while (ac && isdigit(**av)) {
- i = atoi(*av); av++; ac--;
+ while (*av && isdigit(**av)) {
+ i = atoi(*av); av++;
if (co.do_nat) {
exitval = do_cmd(IP_FW_NAT_DEL, &i, sizeof i);
if (exitval) {
@@ -2264,7 +2299,8 @@ fill_iface(ipfw_insn_if *cmd, char *arg)
static void
get_mac_addr_mask(const char *p, uint8_t *addr, uint8_t *mask)
{
- int i, l;
+ int i;
+ size_t l;
char *ap, *ptr, *optr;
struct ether_addr *mac;
const char *macset = "0123456789abcdefABCDEF:";
@@ -2286,11 +2322,11 @@ get_mac_addr_mask(const char *p, uint8_t *addr, uint8_t *mask)
if (ptr != NULL) { /* we have mask? */
if (p[ptr - optr - 1] == '/') { /* mask len */
- l = strtol(ptr, &ap, 10);
- if (*ap != 0 || l > ETHER_ADDR_LEN * 8 || l < 0)
+ long ml = strtol(ptr, &ap, 10);
+ if (*ap != 0 || ml > ETHER_ADDR_LEN * 8 || ml < 0)
errx(EX_DATAERR, "Incorrect mask length");
- for (i = 0; l > 0 && i < ETHER_ADDR_LEN; l -= 8, i++)
- mask[i] = (l >= 8) ? 0xff: (~0) << (8 - l);
+ for (i = 0; ml > 0 && i < ETHER_ADDR_LEN; ml -= 8, i++)
+ mask[i] = (ml >= 8) ? 0xff: (~0) << (8 - ml);
} else { /* mask */
l = strlen(ptr);
if (strspn(ptr, macset) != l ||
@@ -2325,7 +2361,7 @@ next_cmd(ipfw_insn *cmd)
* Takes arguments and copies them into a comment
*/
static void
-fill_comment(ipfw_insn *cmd, int ac, char **av)
+fill_comment(ipfw_insn *cmd, char **av)
{
int i, l;
char *p = (char *)(cmd + 1);
@@ -2334,7 +2370,7 @@ fill_comment(ipfw_insn *cmd, int ac, char **av)
cmd->len = (cmd->len & (F_NOT | F_OR));
/* Compute length of comment string. */
- for (i = 0, l = 0; i < ac; i++)
+ for (i = 0, l = 0; av[i] != NULL; i++)
l += strlen(av[i]) + 1;
if (l == 0)
return;
@@ -2343,7 +2379,7 @@ fill_comment(ipfw_insn *cmd, int ac, char **av)
"comment too long (max 80 chars)");
l = 1 + (l+3)/4;
cmd->len = (cmd->len & (F_NOT | F_OR)) | l;
- for (i = 0; i < ac; i++) {
+ for (i = 0; av[i] != NULL; i++) {
strcpy(p, av[i]);
p += strlen(av[i]);
*p++ = ' ';
@@ -2368,11 +2404,11 @@ fill_cmd(ipfw_insn *cmd, enum ipfw_opcodes opcode, int flags, uint16_t arg)
* two microinstructions, and returns the pointer to the last one.
*/
static ipfw_insn *
-add_mac(ipfw_insn *cmd, int ac, char *av[])
+add_mac(ipfw_insn *cmd, char *av[])
{
ipfw_insn_mac *mac;
- if (ac < 2)
+ if ( ( av[0] == NULL ) || ( av[1] == NULL ) )
errx(EX_DATAERR, "MAC dst src");
cmd->opcode = O_MACADDR2;
@@ -2386,9 +2422,9 @@ add_mac(ipfw_insn *cmd, int ac, char *av[])
}
static ipfw_insn *
-add_mactype(ipfw_insn *cmd, int ac, char *av)
+add_mactype(ipfw_insn *cmd, char *av)
{
- if (ac < 1)
+ if (!av)
errx(EX_DATAERR, "missing MAC type");
if (strcmp(av, "any") != 0) { /* we have a non-null type */
fill_newports((ipfw_insn_u16 *)cmd, av, IPPROTO_ETHERTYPE);
@@ -2496,6 +2532,7 @@ add_dstip(ipfw_insn *cmd, char *av)
static ipfw_insn *
add_ports(ipfw_insn *cmd, char *av, u_char proto, int opcode)
{
+ /* XXX "any" is trapped before. Perhaps "to" */
if (_substrcmp(av, "any") == 0) {
return NULL;
} else if (fill_newports((ipfw_insn_u16 *)cmd, av, proto)) {
@@ -2519,11 +2556,11 @@ add_src(ipfw_insn *cmd, char *av, u_char proto)
*ch = '\0';
if (proto == IPPROTO_IPV6 || strcmp(av, "me6") == 0 ||
- inet_pton(AF_INET6, host, &a))
+ inet_pton(AF_INET6, host, &a) == 1)
ret = add_srcip6(cmd, av);
/* XXX: should check for IPv4, not !IPv6 */
if (ret == NULL && (proto == IPPROTO_IP || strcmp(av, "me") == 0 ||
- !inet_pton(AF_INET6, host, &a)))
+ inet_pton(AF_INET6, host, &a) != 1))
ret = add_srcip(cmd, av);
if (ret == NULL && strcmp(av, "any") != 0)
ret = cmd;
@@ -2545,11 +2582,11 @@ add_dst(ipfw_insn *cmd, char *av, u_char proto)
*ch = '\0';
if (proto == IPPROTO_IPV6 || strcmp(av, "me6") == 0 ||
- inet_pton(AF_INET6, host, &a))
+ inet_pton(AF_INET6, host, &a) == 1)
ret = add_dstip6(cmd, av);
/* XXX: should check for IPv4, not !IPv6 */
if (ret == NULL && (proto == IPPROTO_IP || strcmp(av, "me") == 0 ||
- !inet_pton(AF_INET6, host, &a)))
+ inet_pton(AF_INET6, host, &a) != 1))
ret = add_dstip(cmd, av);
if (ret == NULL && strcmp(av, "any") != 0)
ret = cmd;
@@ -2571,7 +2608,7 @@ add_dst(ipfw_insn *cmd, char *av, u_char proto)
*
*/
void
-ipfw_add(int ac, char *av[])
+ipfw_add(char *av[])
{
/*
* rules are added into the 'rulebuf' and then copied in
@@ -2610,37 +2647,36 @@ ipfw_add(int ac, char *av[])
cmd = (ipfw_insn *)cmdbuf;
action = (ipfw_insn *)actbuf;
- av++; ac--;
+ av++;
/* [rule N] -- Rule number optional */
- if (ac && isdigit(**av)) {
+ if (av[0] && isdigit(**av)) {
rule->rulenum = atoi(*av);
av++;
- ac--;
}
/* [set N] -- set number (0..RESVD_SET), optional */
- if (ac > 1 && _substrcmp(*av, "set") == 0) {
+ if (av[0] && av[1] && _substrcmp(*av, "set") == 0) { /* both "set" and N must be present */
int set = strtoul(av[1], NULL, 10);
if (set < 0 || set > RESVD_SET)
errx(EX_DATAERR, "illegal set %s", av[1]);
rule->set = set;
- av += 2; ac -= 2;
+ av += 2;
}
/* [prob D] -- match probability, optional */
- if (ac > 1 && _substrcmp(*av, "prob") == 0) {
+ if (av[0] && av[1] && _substrcmp(*av, "prob") == 0) {
match_prob = strtod(av[1], NULL);
if (match_prob <= 0 || match_prob > 1)
errx(EX_DATAERR, "illegal match prob. %s", av[1]);
- av += 2; ac -= 2;
+ av += 2;
}
/* action -- mandatory */
NEED1("missing action");
i = match_token(rule_actions, *av);
- ac--; av++;
+ av++;
action->len = 1; /* default */
switch(i) {
case TOK_CHECKSTATE:
@@ -2676,14 +2712,14 @@ ipfw_add(int ac, char *av[])
action->opcode = O_REJECT;
NEED1("missing reject code");
fill_reject_code(&action->arg1, *av);
- ac--; av++;
+ av++;
break;
case TOK_UNREACH6:
action->opcode = O_UNREACH6;
NEED1("missing unreach code");
fill_unreach6_code(&action->arg1, *av);
- ac--; av++;
+ av++;
break;
case TOK_COUNT:
@@ -2716,7 +2752,7 @@ ipfw_add(int ac, char *av[])
case TOK_TEE:
action->opcode = O_TEE;
chkarg:
- if (!ac)
+ if (!av[0])
errx(EX_USAGE, "missing argument for %s", *(av - 1));
if (isdigit(**av)) {
action->arg1 = strtoul(*av, NULL, 10);
@@ -2735,7 +2771,7 @@ chkarg:
errx(EX_DATAERR, "illegal divert/tee port");
} else
errx(EX_DATAERR, "illegal argument for %s", *(av - 1));
- ac--; av++;
+ av++;
break;
case TOK_FORWARD: {
@@ -2773,13 +2809,13 @@ chkarg:
p->sa.sin_addr.s_addr = INADDR_ANY;
else
lookup_host(*av, &(p->sa.sin_addr));
- ac--; av++;
+ av++;
break;
}
case TOK_COMMENT:
/* pretend it is a 'count' rule followed by the comment */
action->opcode = O_COUNT;
- ac++; av--; /* go back... */
+ av--; /* go back... */
break;
case TOK_SETFIB:
@@ -2794,7 +2830,7 @@ chkarg:
errx(EX_DATAERR, "fibs not suported.\n");
if (action->arg1 >= numfibs) /* Temporary */
errx(EX_DATAERR, "fib too large.\n");
- ac--; av++;
+ av++;
break;
}
@@ -2814,8 +2850,8 @@ chkarg:
* If they exist, it go first in the cmdbuf, but then it is
* skipped in the copy section to the end of the buffer.
*/
- while (ac != 0 && (i = match_token(rule_action_params, *av)) != -1) {
- ac--; av++;
+ while (av[0] != NULL && (i = match_token(rule_action_params, *av)) != -1) {
+ av++;
switch (i) {
case TOK_LOG:
{
@@ -2828,15 +2864,15 @@ chkarg:
have_log = (ipfw_insn *)c;
cmd->len = F_INSN_SIZE(ipfw_insn_log);
cmd->opcode = O_LOG;
- if (ac && _substrcmp(*av, "logamount") == 0) {
- ac--; av++;
+ if (av[0] && _substrcmp(*av, "logamount") == 0) {
+ av++;
NEED1("logamount requires argument");
l = atoi(*av);
if (l < 0)
errx(EX_DATAERR,
"logamount must be positive");
c->max_log = l;
- ac--; av++;
+ av++;
} else {
len = sizeof(c->max_log);
if (sysctlbyname("net.inet.ip.fw.verbose_limit",
@@ -2847,6 +2883,7 @@ chkarg:
}
break;
+#ifndef NO_ALTQ
case TOK_ALTQ:
{
ipfw_insn_altq *a = (ipfw_insn_altq *)cmd;
@@ -2859,9 +2896,10 @@ chkarg:
cmd->len = F_INSN_SIZE(ipfw_insn_altq);
cmd->opcode = O_ALTQ;
a->qid = altq_name_to_qid(*av);
- ac--; av++;
+ av++;
}
break;
+#endif
case TOK_TAG:
case TOK_UNTAG: {
@@ -2874,7 +2912,7 @@ chkarg:
rule_action_params);
have_tag = cmd;
fill_cmd(cmd, O_TAG, (i == TOK_TAG) ? 0: F_NOT, tag);
- ac--; av++;
+ av++;
break;
}
@@ -2888,13 +2926,13 @@ chkarg:
goto done;
#define OR_START(target) \
- if (ac && (*av[0] == '(' || *av[0] == '{')) { \
+ if (av[0] && (*av[0] == '(' || *av[0] == '{')) { \
if (open_par) \
errx(EX_USAGE, "nested \"(\" not allowed\n"); \
prev = NULL; \
open_par = 1; \
if ( (av[0])[1] == '\0') { \
- ac--; av++; \
+ av++; \
} else \
(*av)++; \
} \
@@ -2903,30 +2941,30 @@ chkarg:
#define CLOSE_PAR \
if (open_par) { \
- if (ac && ( \
+ if (av[0] && ( \
strcmp(*av, ")") == 0 || \
strcmp(*av, "}") == 0)) { \
prev = NULL; \
open_par = 0; \
- ac--; av++; \
+ av++; \
} else \
errx(EX_USAGE, "missing \")\"\n"); \
}
#define NOT_BLOCK \
- if (ac && _substrcmp(*av, "not") == 0) { \
+ if (av[0] && _substrcmp(*av, "not") == 0) { \
if (cmd->len & F_NOT) \
errx(EX_USAGE, "double \"not\" not allowed\n"); \
cmd->len |= F_NOT; \
- ac--; av++; \
+ av++; \
}
#define OR_BLOCK(target) \
- if (ac && _substrcmp(*av, "or") == 0) { \
+ if (av[0] && _substrcmp(*av, "or") == 0) { \
if (prev == NULL || open_par == 0) \
errx(EX_DATAERR, "invalid OR block"); \
prev->len |= F_OR; \
- ac--; av++; \
+ av++; \
goto target; \
} \
CLOSE_PAR;
@@ -2943,15 +2981,15 @@ chkarg:
NEED1("missing protocol");
if (_substrcmp(*av, "MAC") == 0 ||
_substrcmp(*av, "mac") == 0) {
- ac--; av++; /* the "MAC" keyword */
- add_mac(cmd, ac, av); /* exits in case of errors */
+ av++; /* the "MAC" keyword */
+ add_mac(cmd, av); /* exits in case of errors */
cmd = next_cmd(cmd);
- ac -= 2; av += 2; /* dst-mac and src-mac */
+ av += 2; /* dst-mac and src-mac */
NOT_BLOCK;
NEED1("missing mac type");
- if (add_mactype(cmd, ac, av[0]))
+ if (add_mactype(cmd, av[0]))
cmd = next_cmd(cmd);
- ac--; av++; /* any or mac-type */
+ av++; /* any or mac-type */
goto read_options;
}
#endif
@@ -2963,7 +3001,7 @@ chkarg:
NOT_BLOCK;
NEED1("missing protocol");
if (add_proto_compat(cmd, *av, &proto)) {
- av++; ac--;
+ av++;
if (F_LEN(cmd) != 0) {
prev = cmd;
cmd = next_cmd(cmd);
@@ -2977,9 +3015,9 @@ chkarg:
/*
* "from", mandatory
*/
- if (!ac || _substrcmp(*av, "from") != 0)
+ if ((av[0] == NULL) || _substrcmp(*av, "from") != 0)
errx(EX_USAGE, "missing ``from''");
- ac--; av++;
+ av++;
/*
* source IP, mandatory
@@ -2988,7 +3026,7 @@ chkarg:
NOT_BLOCK; /* optional "not" */
NEED1("missing source address");
if (add_src(cmd, *av, proto)) {
- ac--; av++;
+ av++;
if (F_LEN(cmd) != 0) { /* ! any */
prev = cmd;
cmd = next_cmd(cmd);
@@ -3001,10 +3039,10 @@ chkarg:
* source ports, optional
*/
NOT_BLOCK; /* optional "not" */
- if (ac) {
+ if ( av[0] != NULL ) {
if (_substrcmp(*av, "any") == 0 ||
add_ports(cmd, *av, proto, O_IP_SRCPORT)) {
- ac--; av++;
+ av++;
if (F_LEN(cmd) != 0)
cmd = next_cmd(cmd);
}
@@ -3013,9 +3051,9 @@ chkarg:
/*
* "to", mandatory
*/
- if (!ac || _substrcmp(*av, "to") != 0)
+ if ( (av[0] == NULL) || _substrcmp(*av, "to") != 0 )
errx(EX_USAGE, "missing ``to''");
- av++; ac--;
+ av++;
/*
* destination, mandatory
@@ -3024,7 +3062,7 @@ chkarg:
NOT_BLOCK; /* optional "not" */
NEED1("missing dst address");
if (add_dst(cmd, *av, proto)) {
- ac--; av++;
+ av++;
if (F_LEN(cmd) != 0) { /* ! any */
prev = cmd;
cmd = next_cmd(cmd);
@@ -3037,17 +3075,17 @@ chkarg:
* dest. ports, optional
*/
NOT_BLOCK; /* optional "not" */
- if (ac) {
+ if (av[0]) {
if (_substrcmp(*av, "any") == 0 ||
add_ports(cmd, *av, proto, O_IP_DSTPORT)) {
- ac--; av++;
+ av++;
if (F_LEN(cmd) != 0)
cmd = next_cmd(cmd);
}
}
read_options:
- if (ac && first_cmd == cmd) {
+ if (av[0] && first_cmd == cmd) {
/*
* nothing specified so far, store in the rule to ease
* printout later.
@@ -3055,7 +3093,7 @@ read_options:
rule->_pad = 1;
}
prev = NULL;
- while (ac) {
+ while ( av[0] != NULL ) {
char *s;
ipfw_insn_u32 *cmd32; /* alias for cmd */
@@ -3069,7 +3107,7 @@ read_options:
s++;
}
i = match_token(rule_options, s);
- ac--; av++;
+ av++;
switch(i) {
case TOK_NOT:
if (cmd->len & F_NOT)
@@ -3131,7 +3169,7 @@ read_options:
NEED1("recv, xmit, via require interface name"
" or address");
fill_iface((ipfw_insn_if *)cmd, av[0]);
- ac--; av++;
+ av++;
if (F_LEN(cmd) == 0) /* not a valid address */
break;
if (i == TOK_XMIT)
@@ -3145,13 +3183,13 @@ read_options:
case TOK_ICMPTYPES:
NEED1("icmptypes requires list of types");
fill_icmptypes((ipfw_insn_u32 *)cmd, *av);
- av++; ac--;
+ av++;
break;
case TOK_ICMP6TYPES:
NEED1("icmptypes requires list of types");
fill_icmp6types((ipfw_insn_icmp6 *)cmd, *av);
- av++; ac--;
+ av++;
break;
case TOK_IPTTL:
@@ -3161,7 +3199,7 @@ read_options:
errx(EX_DATAERR, "invalid ipttl %s", *av);
} else
fill_cmd(cmd, O_IPTTL, 0, strtoul(*av, NULL, 0));
- ac--; av++;
+ av++;
break;
case TOK_IPID:
@@ -3171,7 +3209,7 @@ read_options:
errx(EX_DATAERR, "invalid ipid %s", *av);
} else
fill_cmd(cmd, O_IPID, 0, strtoul(*av, NULL, 0));
- ac--; av++;
+ av++;
break;
case TOK_IPLEN:
@@ -3181,32 +3219,32 @@ read_options:
errx(EX_DATAERR, "invalid ip len %s", *av);
} else
fill_cmd(cmd, O_IPLEN, 0, strtoul(*av, NULL, 0));
- ac--; av++;
+ av++;
break;
case TOK_IPVER:
NEED1("ipver requires version");
fill_cmd(cmd, O_IPVER, 0, strtoul(*av, NULL, 0));
- ac--; av++;
+ av++;
break;
case TOK_IPPRECEDENCE:
NEED1("ipprecedence requires value");
fill_cmd(cmd, O_IPPRECEDENCE, 0,
(strtoul(*av, NULL, 0) & 7) << 5);
- ac--; av++;
+ av++;
break;
case TOK_IPOPTS:
NEED1("missing argument for ipoptions");
fill_flags(cmd, O_IPOPT, f_ipopts, *av);
- ac--; av++;
+ av++;
break;
case TOK_IPTOS:
NEED1("missing argument for iptos");
fill_flags(cmd, O_IPTOS, f_iptos, *av);
- ac--; av++;
+ av++;
break;
case TOK_UID:
@@ -3223,7 +3261,7 @@ read_options:
errx(EX_DATAERR, "uid \"%s\" nonexistent", *av);
cmd32->d[0] = pwd->pw_uid;
cmd->len |= F_INSN_SIZE(ipfw_insn_u32);
- ac--; av++;
+ av++;
}
break;
@@ -3241,7 +3279,7 @@ read_options:
errx(EX_DATAERR, "gid \"%s\" nonexistent", *av);
cmd32->d[0] = grp->gr_gid;
cmd->len |= F_INSN_SIZE(ipfw_insn_u32);
- ac--; av++;
+ av++;
}
break;
@@ -3257,7 +3295,7 @@ read_options:
errx(EX_DATAERR, "jail requires prison ID");
cmd32->d[0] = (uint32_t)jid;
cmd->len |= F_INSN_SIZE(ipfw_insn_u32);
- ac--; av++;
+ av++;
}
break;
@@ -3278,13 +3316,13 @@ read_options:
} else
fill_cmd(cmd, O_TCPDATALEN, 0,
strtoul(*av, NULL, 0));
- ac--; av++;
+ av++;
break;
case TOK_TCPOPTS:
NEED1("missing argument for tcpoptions");
fill_flags(cmd, O_TCPOPTS, f_tcpopts, *av);
- ac--; av++;
+ av++;
break;
case TOK_TCPSEQ:
@@ -3293,21 +3331,21 @@ read_options:
cmd->len = F_INSN_SIZE(ipfw_insn_u32);
cmd->opcode = (i == TOK_TCPSEQ) ? O_TCPSEQ : O_TCPACK;
cmd32->d[0] = htonl(strtoul(*av, NULL, 0));
- ac--; av++;
+ av++;
break;
case TOK_TCPWIN:
NEED1("tcpwin requires length");
fill_cmd(cmd, O_TCPWIN, 0,
htons(strtoul(*av, NULL, 0)));
- ac--; av++;
+ av++;
break;
case TOK_TCPFLAGS:
NEED1("missing argument for tcpflags");
cmd->opcode = O_TCPFLAGS;
fill_flags(cmd, O_TCPFLAGS, f_tcpflags, *av);
- ac--; av++;
+ av++;
break;
case TOK_KEEPSTATE:
@@ -3337,11 +3375,11 @@ read_options:
cmd->opcode = O_LIMIT;
c->limit_mask = c->conn_limit = 0;
- while (ac > 0) {
+ while ( av[0] != NULL ) {
if ((val = match_token(limit_masks, *av)) <= 0)
break;
c->limit_mask |= val;
- ac--; av++;
+ av++;
}
if (c->limit_mask == 0)
@@ -3350,14 +3388,14 @@ read_options:
GET_UINT_ARG(c->conn_limit, IPFW_ARG_MIN, IPFW_ARG_MAX,
TOK_LIMIT, rule_options);
- ac--; av++;
+ av++;
break;
}
case TOK_PROTO:
NEED1("missing protocol");
if (add_proto(cmd, *av, &proto)) {
- ac--; av++;
+ av++;
} else
errx(EX_DATAERR, "invalid protocol ``%s''",
*av);
@@ -3366,28 +3404,28 @@ read_options:
case TOK_SRCIP:
NEED1("missing source IP");
if (add_srcip(cmd, *av)) {
- ac--; av++;
+ av++;
}
break;
case TOK_DSTIP:
NEED1("missing destination IP");
if (add_dstip(cmd, *av)) {
- ac--; av++;
+ av++;
}
break;
case TOK_SRCIP6:
NEED1("missing source IP6");
if (add_srcip6(cmd, *av)) {
- ac--; av++;
+ av++;
}
break;
case TOK_DSTIP6:
NEED1("missing destination IP6");
if (add_dstip6(cmd, *av)) {
- ac--; av++;
+ av++;
}
break;
@@ -3395,7 +3433,7 @@ read_options:
NEED1("missing source port");
if (_substrcmp(*av, "any") == 0 ||
add_ports(cmd, *av, proto, O_IP_SRCPORT)) {
- ac--; av++;
+ av++;
} else
errx(EX_DATAERR, "invalid source port %s", *av);
break;
@@ -3404,23 +3442,22 @@ read_options:
NEED1("missing destination port");
if (_substrcmp(*av, "any") == 0 ||
add_ports(cmd, *av, proto, O_IP_DSTPORT)) {
- ac--; av++;
+ av++;
} else
errx(EX_DATAERR, "invalid destination port %s",
*av);
break;
case TOK_MAC:
- if (add_mac(cmd, ac, av)) {
- ac -= 2; av += 2;
- }
+ if (add_mac(cmd, av))
+ av += 2;
break;
case TOK_MACTYPE:
NEED1("missing mac type");
- if (!add_mactype(cmd, ac, *av))
+ if (!add_mactype(cmd, *av))
errx(EX_DATAERR, "invalid mac type %s", *av);
- ac--; av++;
+ av++;
break;
case TOK_VERREVPATH:
@@ -3449,7 +3486,7 @@ read_options:
case TOK_EXT6HDR:
fill_ext6hdr( cmd, *av );
- ac--; av++;
+ av++;
break;
case TOK_FLOWID:
@@ -3457,17 +3494,16 @@ read_options:
errx( EX_USAGE, "flow-id filter is active "
"only for ipv6 protocol\n");
fill_flow6( (ipfw_insn_u32 *) cmd, *av );
- ac--; av++;
+ av++;
break;
case TOK_COMMENT:
- fill_comment(cmd, ac, av);
- av += ac;
- ac = 0;
+ fill_comment(cmd, av);
+ av[0]=NULL;
break;
case TOK_TAGGED:
- if (ac > 0 && strpbrk(*av, "-,")) {
+ if (av[0] && strpbrk(*av, "-,")) {
if (!add_ports(cmd, *av, 0, O_TAGGED))
errx(EX_DATAERR, "tagged: invalid tag"
" list: %s", *av);
@@ -3479,13 +3515,38 @@ read_options:
TOK_TAGGED, rule_options);
fill_cmd(cmd, O_TAGGED, 0, tag);
}
- ac--; av++;
+ av++;
break;
case TOK_FIB:
NEED1("fib requires fib number");
fill_cmd(cmd, O_FIB, 0, strtoul(*av, NULL, 0));
- ac--; av++;
+ av++;
+ break;
+
+ case TOK_LOOKUP: {
+ ipfw_insn_u32 *c = (ipfw_insn_u32 *)cmd;
+ char *p;
+ int j;
+
+ if (!av[0] || !av[1])
+ errx(EX_USAGE, "format: lookup argument tablenum");
+ cmd->opcode = O_IP_DST_LOOKUP;
+ cmd->len |= F_INSN_SIZE(ipfw_insn) + 2;
+ i = match_token(rule_options, *av);
+ for (j = 0; lookup_key[j] >= 0 ; j++) {
+ if (i == lookup_key[j])
+ break;
+ }
+ if (lookup_key[j] <= 0)
+ errx(EX_USAGE, "format: cannot lookup on %s", *av);
+ c->d[1] = j; // i converted to option
+ av++;
+ cmd->arg1 = strtoul(*av, &p, 0);
+ if (p && *p)
+ errx(EX_USAGE, "format: lookup argument tablenum");
+ av++;
+ }
break;
default:
@@ -3662,6 +3723,10 @@ ipfw_flush(int force)
if (c == 'N') /* user said no */
return;
}
+ if (co.do_pipe) {
+ dummynet_flush();
+ return;
+ }
/* `ipfw set N flush` - is the same that `ipfw delete set N` */
if (co.use_set) {
uint32_t arg = ((co.use_set - 1) & 0xffff) | (1 << 24);
@@ -3775,14 +3840,14 @@ ipfw_table_handler(int ac, char *av[])
}
}
} else if (_substrcmp(*av, "flush") == 0) {
- a = is_all ? tables_max : (ent.tbl + 1);
+ a = is_all ? tables_max : (uint32_t)(ent.tbl + 1);
do {
if (do_cmd(IP_FW_TABLE_FLUSH, &ent.tbl,
sizeof(ent.tbl)) < 0)
err(EX_OSERR, "setsockopt(IP_FW_TABLE_FLUSH)");
} while (++ent.tbl < a);
} else if (_substrcmp(*av, "list") == 0) {
- a = is_all ? tables_max : (ent.tbl + 1);
+ a = is_all ? tables_max : (uint32_t)(ent.tbl + 1);
do {
table_list(ent, is_all);
} while (++ent.tbl < a);
diff --git a/sbin/ipfw/ipfw2.h b/sbin/ipfw/ipfw2.h
index d3ce7fb..d172984 100644
--- a/sbin/ipfw/ipfw2.h
+++ b/sbin/ipfw/ipfw2.h
@@ -35,7 +35,7 @@ struct cmdline_opts {
int do_resolv; /* try to resolve all ip to names */
int do_time; /* Show time stamps */
int do_quiet; /* Be quiet in add and flush */
- int do_pipe; /* this cmd refers to a pipe */
+ int do_pipe; /* this cmd refers to a pipe/queue/sched */
int do_nat; /* this cmd refers to a nat config */
int do_dynamic; /* display dynamic rules */
int do_expired; /* display expired dynamic rules */
@@ -82,7 +82,10 @@ enum tokens {
TOK_ACCEPT,
TOK_COUNT,
TOK_PIPE,
+ TOK_LINK,
TOK_QUEUE,
+ TOK_FLOWSET,
+ TOK_SCHED,
TOK_DIVERT,
TOK_TEE,
TOK_NETGRAPH,
@@ -122,6 +125,7 @@ enum tokens {
TOK_IPLEN,
TOK_IPID,
TOK_IPPRECEDENCE,
+ TOK_DSCP,
TOK_IPTOS,
TOK_IPTTL,
TOK_IPVER,
@@ -151,15 +155,23 @@ enum tokens {
TOK_SRCPORT,
TOK_ALL,
TOK_MASK,
+ TOK_FLOW_MASK,
+ TOK_SCHED_MASK,
TOK_BW,
TOK_DELAY,
- TOK_PIPE_PROFILE,
+ TOK_PROFILE,
TOK_BURST,
TOK_RED,
TOK_GRED,
TOK_DROPTAIL,
TOK_PROTO,
+ /* dummynet tokens */
TOK_WEIGHT,
+ TOK_LMAX,
+ TOK_PRI,
+ TOK_TYPE,
+ TOK_SLOTSIZE,
+
TOK_IP,
TOK_IF,
TOK_ALOG,
@@ -186,12 +198,14 @@ enum tokens {
TOK_FIB,
TOK_SETFIB,
+ TOK_LOOKUP,
};
/*
* the following macro returns an error message if we run out of
* arguments.
*/
-#define NEED1(msg) {if (!ac) errx(EX_USAGE, msg);}
+#define NEED(_p, msg) {if (!(_p)) errx(EX_USAGE, msg);} /* _p parenthesized so any expression is negated as a whole */
+#define NEED1(msg) {if (!(*av)) errx(EX_USAGE, msg);} /* fails when the current argument (*av) is NULL */
unsigned long long align_uint64(const uint64_t *pll);
@@ -235,14 +249,14 @@ struct _ipfw_insn_icmp6;
extern int resvd_set_number;
/* first-level command handlers */
-void ipfw_add(int ac, char *av[]);
+void ipfw_add(char *av[]);
void ipfw_show_nat(int ac, char **av);
void ipfw_config_pipe(int ac, char **av);
void ipfw_config_nat(int ac, char **av);
-void ipfw_sets_handler(int ac, char *av[]);
+void ipfw_sets_handler(char *av[]);
void ipfw_table_handler(int ac, char *av[]);
-void ipfw_sysctl_handler(int ac, char *av[], int which);
-void ipfw_delete(int ac, char *av[]);
+void ipfw_sysctl_handler(char *av[], int which);
+void ipfw_delete(char *av[]);
void ipfw_flush(int force);
void ipfw_zero(int ac, char *av[], int optname);
void ipfw_list(int ac, char *av[], int show_counters);
@@ -254,7 +268,8 @@ u_int32_t altq_name_to_qid(const char *name);
void print_altq_cmd(struct _ipfw_insn_altq *altqptr);
/* dummynet.c */
-void ipfw_list_pipes(void *data, uint nbytes, int ac, char *av[]);
+void dummynet_list(int ac, char *av[], int show_counters);
+void dummynet_flush(void);
int ipfw_delete_pipe(int pipe_or_queue, int n);
/* ipv6.c */
diff --git a/sbin/ipfw/main.c b/sbin/ipfw/main.c
index 3916057..cd39cf1 100644
--- a/sbin/ipfw/main.c
+++ b/sbin/ipfw/main.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2002-2003 Luigi Rizzo
+ * Copyright (c) 2002-2003,2010 Luigi Rizzo
* Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp
* Copyright (c) 1994 Ugen J.S.Antsilevich
*
@@ -80,31 +80,27 @@ help(void)
}
/*
- * Free a the (locally allocated) copy of command line arguments.
- */
-static void
-free_args(int ac, char **av)
-{
- int i;
-
- for (i=0; i < ac; i++)
- free(av[i]);
- free(av);
-}
-
-/*
* Called with the arguments, including program name because getopt
* wants it to be present.
* Returns 0 if successful, 1 if empty command, errx() in case of errors.
+ * First thing we do is process parameters creating an argv[] array
+ * which includes the program name and a NULL entry at the end.
+ * If we are called with a single string, we split it on whitespace.
+ * Also, arguments with a trailing ',' are joined to the next one.
+ * The pointers (av[]) and data are in a single chunk of memory.
+ * av[0] points to the original program name, all other entries
+ * point into the allocated chunk.
*/
static int
ipfw_main(int oldac, char **oldav)
{
- int ch, ac, save_ac;
+ int ch, ac;
const char *errstr;
char **av, **save_av;
int do_acct = 0; /* Show packet/byte count */
int try_next = 0; /* set if pipe cmd not found */
+ int av_size; /* compute the av size */
+ char *av_p; /* used to build the av list */
#define WHITESP " \t\f\v\n\r"
if (oldac < 2)
@@ -112,10 +108,9 @@ ipfw_main(int oldac, char **oldav)
if (oldac == 2) {
/*
- * If we are called with a single string, try to split it into
- * arguments for subsequent parsing.
- * But first, remove spaces after a ',', by copying the string
- * in-place.
+ * If we are called with one argument, try to split it into
+ * words for subsequent parsing. Spaces after a ',' are
+ * removed by copying the string in-place.
*/
char *arg = oldav[1]; /* The string is the first arg. */
int l = strlen(arg);
@@ -150,31 +145,59 @@ ipfw_main(int oldac, char **oldav)
ac++;
/*
- * Allocate the argument list, including one entry for
- * the program name because getopt expects it.
+ * Allocate the argument list structure as a single block
+ * of memory, containing pointers and the argument
+ * strings. We include one entry for the program name
+ * because getopt expects it, and a NULL at the end
+ * to simplify further parsing.
*/
- av = safe_calloc(ac + 1, sizeof(char *));
+ ac++; /* add 1 for the program name */
+ av_size = (ac+1) * sizeof(char *) + l + 1;
+ av = safe_calloc(av_size, 1);
/*
- * Second, copy arguments from arg[] to av[]. For each one,
+ * Init the argument pointer to the end of the array
+ * and copy arguments from arg[] to av[]. For each one,
* j is the initial character, i is the one past the end.
*/
- for (ac = 1, i = j = 0; i < l; i++)
+ av_p = (char *)&av[ac+1];
+ for (ac = 1, i = j = 0; i < l; i++) {
if (index(WHITESP, arg[i]) != NULL || i == l-1) {
if (i == l-1)
i++;
- av[ac] = safe_calloc(i-j+1, 1);
- bcopy(arg+j, av[ac], i-j);
+ bcopy(arg+j, av_p, i-j);
+ av[ac] = av_p;
+ av_p += i-j; /* the length of the string */
+ *av_p++ = '\0';
ac++;
j = i + 1;
}
+ }
} else {
/*
* If an argument ends with ',' join with the next one.
*/
- int first, i, l;
+ int first, i, l=0;
+
+ /*
+ * Allocate the argument list structure as a single block
+ * of memory, containing both pointers and the argument
+ * strings. We include some space for the program name
+ * because getopt expects it.
+ * We add an extra pointer to the end of the array,
+ * to simplify further parsing.
+ */
+ for (i=0; i<oldac; i++)
+ l += strlen(oldav[i]);
- av = safe_calloc(oldac, sizeof(char *));
+ av_size = (oldac+1) * sizeof(char *) + l + oldac;
+ av = safe_calloc(av_size, 1);
+
+ /*
+ * Init the argument pointer to the end of the array
+ * and copy arguments from arg[] to av[]
+ */
+ av_p = (char *)&av[oldac+1];
for (first = i = ac = 1, l = 0; i < oldac; i++) {
char *arg = oldav[i];
int k = strlen(arg);
@@ -182,11 +205,12 @@ ipfw_main(int oldac, char **oldav)
l += k;
if (arg[k-1] != ',' || i == oldac-1) {
/* Time to copy. */
- av[ac] = safe_calloc(l+1, 1);
+ av[ac] = av_p;
for (l=0; first <= i; first++) {
- strcat(av[ac]+l, oldav[first]);
- l += strlen(oldav[first]);
+ strcat(av_p, oldav[first]);
+ av_p += strlen(oldav[first]);
}
+ *av_p++ = '\0';
ac++;
l = 0;
first = i+1;
@@ -194,13 +218,47 @@ ipfw_main(int oldac, char **oldav)
}
}
- av[0] = strdup(oldav[0]); /* copy progname from the caller */
+ /*
+ * set the progname pointer to the original string
+ * and terminate the array with null
+ */
+ av[0] = oldav[0];
+ av[ac] = NULL;
+
/* Set the force flag for non-interactive processes */
if (!co.do_force)
co.do_force = !isatty(STDIN_FILENO);
+#ifdef EMULATE_SYSCTL /* sysctl emulation */
+ if ( ac >= 2 && !strcmp(av[1], "sysctl")) {
+ char *s;
+ int i;
+
+ if (ac != 3) {
+ printf( "sysctl emulation usage:\n"
+ " ipfw sysctl name[=value]\n"
+ " ipfw sysctl -a\n");
+ return 0;
+ }
+ s = index(av[2], '=');
+ if (s == NULL) {
+ s = !strcmp(av[2], "-a") ? NULL : av[2];
+ sysctlbyname(s, NULL, NULL, NULL, 0);
+ } else { /* ipfw sysctl x.y.z=value */
+ /* assume an INT value, will extend later */
+ if (s[1] == '\0') {
+ printf("ipfw sysctl: missing value\n\n");
+ return 0;
+ }
+ *s = '\0';
+ i = strtol(s+1, NULL, 0);
+ sysctlbyname(av[2], NULL, NULL, &i, sizeof(int));
+ }
+ return 0;
+ }
+#endif
+
/* Save arguments for final freeing of memory. */
- save_ac = ac;
save_av = av;
optind = optreset = 1; /* restart getopt() */
@@ -232,7 +290,7 @@ ipfw_main(int oldac, char **oldav)
break;
case 'h': /* help */
- free_args(save_ac, save_av);
+ free(save_av);
help();
break; /* NOTREACHED */
@@ -273,7 +331,7 @@ ipfw_main(int oldac, char **oldav)
break;
default:
- free_args(save_ac, save_av);
+ free(save_av);
return 1;
}
@@ -304,6 +362,10 @@ ipfw_main(int oldac, char **oldav)
co.do_pipe = 1;
else if (_substrcmp(*av, "queue") == 0)
co.do_pipe = 2;
+ else if (_substrcmp(*av, "flowset") == 0)
+ co.do_pipe = 2;
+ else if (_substrcmp(*av, "sched") == 0)
+ co.do_pipe = 3;
else if (!strncmp(*av, "set", strlen(*av))) {
if (ac > 1 && isdigit(av[1][0])) {
co.use_set = strtonum(av[1], 0, resvd_set_number,
@@ -335,7 +397,7 @@ ipfw_main(int oldac, char **oldav)
if (co.use_set == 0) {
if (_substrcmp(*av, "add") == 0)
- ipfw_add(ac, av);
+ ipfw_add(av);
else if (co.do_nat && _substrcmp(*av, "show") == 0)
ipfw_show_nat(ac, av);
else if (co.do_pipe && _substrcmp(*av, "config") == 0)
@@ -343,20 +405,20 @@ ipfw_main(int oldac, char **oldav)
else if (co.do_nat && _substrcmp(*av, "config") == 0)
ipfw_config_nat(ac, av);
else if (_substrcmp(*av, "set") == 0)
- ipfw_sets_handler(ac, av);
+ ipfw_sets_handler(av);
else if (_substrcmp(*av, "table") == 0)
ipfw_table_handler(ac, av);
else if (_substrcmp(*av, "enable") == 0)
- ipfw_sysctl_handler(ac, av, 1);
+ ipfw_sysctl_handler(av, 1);
else if (_substrcmp(*av, "disable") == 0)
- ipfw_sysctl_handler(ac, av, 0);
+ ipfw_sysctl_handler(av, 0);
else
try_next = 1;
}
if (co.use_set || try_next) {
if (_substrcmp(*av, "delete") == 0)
- ipfw_delete(ac, av);
+ ipfw_delete(av);
else if (_substrcmp(*av, "flush") == 0)
ipfw_flush(co.do_force);
else if (_substrcmp(*av, "zero") == 0)
@@ -373,7 +435,7 @@ ipfw_main(int oldac, char **oldav)
}
/* Free memory allocated in the argument parsing. */
- free_args(save_ac, save_av);
+ free(save_av);
return 0;
}
@@ -521,6 +583,20 @@ ipfw_readfile(int ac, char *av[])
int
main(int ac, char *av[])
{
+#if defined(_WIN32) && defined(TCC)
+ {
+ WSADATA wsaData;
+ int ret=0;
+ unsigned short wVersionRequested = MAKEWORD(2, 2);
+ ret = WSAStartup(wVersionRequested, &wsaData);
+ if (ret != 0) {
+ /* Tell the user that we could not find a usable */
+ /* Winsock DLL. */
+ printf("WSAStartup failed with error: %d\n", ret);
+ return 1;
+ }
+ }
+#endif
/*
* If the last argument is an absolute pathname, interpret it
* as a file to be preprocessed.
diff --git a/sys/conf/files b/sys/conf/files
index 313b51e..b488012 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -2474,13 +2474,24 @@ netinet/in_proto.c optional inet \
compile-with "${NORMAL_C} -I$S/contrib/pf"
netinet/in_rmx.c optional inet
netinet/ip_divert.c optional inet ipdivert ipfirewall
+netinet/ipfw/dn_heap.c optional inet dummynet
+netinet/ipfw/dn_sched_fifo.c optional inet dummynet
+netinet/ipfw/dn_sched_rr.c optional inet dummynet
+netinet/ipfw/dn_sched_wf2q.c optional inet dummynet
+netinet/ipfw/dn_sched_qfq.c optional inet dummynet
netinet/ipfw/ip_dummynet.c optional inet dummynet
+netinet/ipfw/ip_dn_io.c optional inet dummynet
+netinet/ipfw/ip_dn_glue.c optional inet dummynet
netinet/ip_ecn.c optional inet | inet6
netinet/ip_encap.c optional inet | inet6
netinet/ip_fastfwd.c optional inet
netinet/ipfw/ip_fw2.c optional inet ipfirewall \
compile-with "${NORMAL_C} -I$S/contrib/pf"
+netinet/ipfw/ip_fw_dynamic.c optional inet ipfirewall
+netinet/ipfw/ip_fw_log.c optional inet ipfirewall
netinet/ipfw/ip_fw_pfil.c optional inet ipfirewall
+netinet/ipfw/ip_fw_sockopt.c optional inet ipfirewall
+netinet/ipfw/ip_fw_table.c optional inet ipfirewall
netinet/ipfw/ip_fw_nat.c optional inet ipfirewall_nat
netinet/ip_icmp.c optional inet
netinet/ip_input.c optional inet
diff --git a/sys/net/if_bridge.c b/sys/net/if_bridge.c
index 1873095..fe4e18b 100644
--- a/sys/net/if_bridge.c
+++ b/sys/net/if_bridge.c
@@ -134,7 +134,7 @@ __FBSDID("$FreeBSD$");
#include <net/route.h>
#include <netinet/ip_fw.h>
-#include <netinet/ip_dummynet.h>
+#include <netinet/ipfw/ip_fw_private.h>
/*
* Size of the route hash table. Must be a power of two.
@@ -3058,20 +3058,28 @@ bridge_pfil(struct mbuf **mp, struct ifnet *bifp, struct ifnet *ifp, int dir)
goto bad;
}
- if (V_ip_fw_chk_ptr && pfil_ipfw != 0 && dir == PFIL_OUT && ifp != NULL) {
- struct dn_pkt_tag *dn_tag;
+ /* XXX this section is also in if_ethersubr.c */
+ // XXX PFIL_OUT or DIR_OUT ?
+ if (V_ip_fw_chk_ptr && pfil_ipfw != 0 &&
+ dir == PFIL_OUT && ifp != NULL) {
+ struct m_tag *mtag;
error = -1;
- dn_tag = ip_dn_claim_tag(*mp);
- if (dn_tag != NULL) {
- if (dn_tag->rule != NULL && V_fw_one_pass)
- /* packet already partially processed */
+ /* fetch the start point from existing tags, if any */
+ mtag = m_tag_locate(*mp, MTAG_IPFW_RULE, 0, NULL);
+ if (mtag == NULL) {
+ args.rule.slot = 0;
+ } else {
+ struct ipfw_rule_ref *r;
+
+ /* XXX can we free the tag after use ? */
+ mtag->m_tag_id = PACKET_TAG_NONE;
+ r = (struct ipfw_rule_ref *)(mtag + 1);
+ /* packet already partially processed ? */
+ if (r->info & IPFW_ONEPASS)
goto ipfwpass;
- args.rule = dn_tag->rule; /* matching rule to restart */
- args.rule_id = dn_tag->rule_id;
- args.chain_id = dn_tag->chain_id;
- } else
- args.rule = NULL;
+ args.rule = *r;
+ }
args.m = *mp;
args.oif = ifp;
@@ -3097,7 +3105,7 @@ bridge_pfil(struct mbuf **mp, struct ifnet *bifp, struct ifnet *ifp, int dir)
* packet will return to us via bridge_dummynet().
*/
args.oif = ifp;
- ip_dn_io_ptr(mp, DN_TO_IFB_FWD, &args);
+ ip_dn_io_ptr(mp, DIR_FWD | PROTO_IFB, &args);
return (error);
}
diff --git a/sys/net/if_ethersubr.c b/sys/net/if_ethersubr.c
index 5dff9bc..46c44e9 100644
--- a/sys/net/if_ethersubr.c
+++ b/sys/net/if_ethersubr.c
@@ -70,9 +70,9 @@
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/if_ether.h>
-#include <netinet/ip_fw.h>
-#include <netinet/ip_dummynet.h>
#include <netinet/ip_var.h>
+#include <netinet/ip_fw.h>
+#include <netinet/ipfw/ip_fw_private.h>
#endif
#ifdef INET6
#include <netinet6/nd6.h>
@@ -466,19 +466,23 @@ ether_ipfw_chk(struct mbuf **m0, struct ifnet *dst, int shared)
struct mbuf *m;
int i;
struct ip_fw_args args;
- struct dn_pkt_tag *dn_tag;
-
- dn_tag = ip_dn_claim_tag(*m0);
+ struct m_tag *mtag;
- if (dn_tag != NULL) {
- if (dn_tag->rule != NULL && V_fw_one_pass)
+ /* fetch start point from rule, if any */
+ mtag = m_tag_locate(*m0, MTAG_IPFW_RULE, 0, NULL);
+ if (mtag == NULL) {
+ args.rule.slot = 0;
+ } else {
/* dummynet packet, already partially processed */
+ struct ipfw_rule_ref *r;
+
+ /* XXX can we free it after use ? */
+ mtag->m_tag_id = PACKET_TAG_NONE;
+ r = (struct ipfw_rule_ref *)(mtag + 1);
+ if (r->info & IPFW_ONEPASS)
return (1);
- args.rule = dn_tag->rule; /* matching rule to restart */
- args.rule_id = dn_tag->rule_id;
- args.chain_id = dn_tag->chain_id;
- } else
- args.rule = NULL;
+ args.rule = *r;
+ }
/*
* I need some amt of data to be contiguous, and in case others need
@@ -529,6 +533,7 @@ ether_ipfw_chk(struct mbuf **m0, struct ifnet *dst, int shared)
return 1;
if (ip_dn_io_ptr && (i == IP_FW_DUMMYNET)) {
+ int dir;
/*
* Pass the pkt to dummynet, which consumes it.
* If shared, make a copy and keep the original.
@@ -544,7 +549,8 @@ ether_ipfw_chk(struct mbuf **m0, struct ifnet *dst, int shared)
*/
*m0 = NULL ;
}
- ip_dn_io_ptr(&m, dst ? DN_TO_ETH_OUT: DN_TO_ETH_DEMUX, &args);
+ dir = PROTO_LAYER2 | (dst ? DIR_OUT : DIR_IN);
+ ip_dn_io_ptr(&m, dir, &args);
return 0;
}
/*
diff --git a/sys/net/radix.c b/sys/net/radix.c
index 39b198e..9f2383d 100644
--- a/sys/net/radix.c
+++ b/sys/net/radix.c
@@ -33,7 +33,6 @@
/*
* Routines to build and maintain radix trees for routing lookups.
*/
-#ifndef _RADIX_H_
#include <sys/param.h>
#ifdef _KERNEL
#include <sys/lock.h>
@@ -41,20 +40,21 @@
#include <sys/rwlock.h>
#include <sys/systm.h>
#include <sys/malloc.h>
-#include <sys/domain.h>
-#else
-#include <stdlib.h>
-#endif
#include <sys/syslog.h>
#include <net/radix.h>
-#endif
-
#include "opt_mpath.h"
-
#ifdef RADIX_MPATH
#include <net/radix_mpath.h>
#endif
-
+#else /* !_KERNEL */
+#include <stdio.h>
+#include <strings.h>
+#include <stdlib.h>
+#define log(x, arg...) fprintf(stderr, ## arg)
+#define panic(x) fprintf(stderr, "PANIC: %s", x), exit(1)
+#define min(a, b) ((a) < (b) ? (a) : (b) )
+#include <net/radix.h>
+#endif /* !_KERNEL */
static int rn_walktree_from(struct radix_node_head *h, void *a, void *m,
walktree_f_t *f, void *w);
@@ -72,6 +72,8 @@ static struct radix_node_head *mask_rnhead;
/*
* Work area -- the following point to 3 buffers of size max_keylen,
* allocated in this order in a block of memory malloc'ed by rn_init.
+ * rn_zeros and rn_ones are set in rn_init and are read-only afterwards.
+ * addmask_key is read and written in rn_addmask and is not thread-safe.
*/
static char *rn_zeros, *rn_ones, *addmask_key;
@@ -135,8 +137,9 @@ static int rn_satisfies_leaf(char *trial, struct radix_node *leaf,
* To make the assumption more explicit, we use the LEN() macro to access
* this field. It is safe to pass an expression with side effects
* to LEN() as the argument is evaluated only once.
+ * We cast the result to int as this is the dominant usage.
*/
-#define LEN(x) (*(const u_char *)(x))
+#define LEN(x) ( (int) (*(const u_char *)(x)) )
/*
* XXX THIS NEEDS TO BE FIXED
@@ -197,7 +200,7 @@ rn_refines(m_arg, n_arg)
{
register caddr_t m = m_arg, n = n_arg;
register caddr_t lim, lim2 = lim = n + LEN(n);
- int longer = LEN(n++) - (int)LEN(m++);
+ int longer = LEN(n++) - LEN(m++);
int masks_are_equal = 1;
if (longer > 0)
@@ -250,10 +253,10 @@ rn_satisfies_leaf(trial, leaf, skip)
char *cplim;
int length = min(LEN(cp), LEN(cp2));
- if (cp3 == 0)
+ if (cp3 == NULL)
cp3 = rn_ones;
else
- length = min(length, *(u_char *)cp3);
+ length = min(length, LEN(cp3));
cplim = cp + length; cp3 += skip; cp2 += skip;
for (cp += skip; cp < cplim; cp++, cp2++, cp3++)
if ((*cp ^ *cp2) & *cp3)
@@ -424,7 +427,7 @@ rn_insert(v_arg, head, dupentry, nodes)
{
caddr_t v = v_arg;
struct radix_node *top = head->rnh_treetop;
- int head_off = top->rn_offset, vlen = (int)LEN(v);
+ int head_off = top->rn_offset, vlen = LEN(v);
register struct radix_node *t = rn_search(v_arg, top);
register caddr_t cp = v + head_off;
register int b;
@@ -933,7 +936,7 @@ on1:
if (m)
log(LOG_ERR,
"rn_delete: Orphaned Mask %p at %p\n",
- (void *)m, (void *)x);
+ m, x);
}
}
/*
@@ -1158,17 +1161,28 @@ rn_inithead(head, off)
return (1);
}
+int
+rn_detachhead(void **head)
+{
+ struct radix_node_head *rnh;
+
+ KASSERT((head != NULL && *head != NULL),
+ ("%s: head already freed", __func__));
+ rnh = *head;
+
+ /* Free <left,root,right> nodes. */
+ Free(rnh);
+
+ *head = NULL;
+ return (1);
+}
+
void
-rn_init()
+rn_init(int maxk)
{
char *cp, *cplim;
-#ifdef _KERNEL
- struct domain *dom;
- for (dom = domains; dom; dom = dom->dom_next)
- if (dom->dom_maxrtkey > max_keylen)
- max_keylen = dom->dom_maxrtkey;
-#endif
+ max_keylen = maxk;
if (max_keylen == 0) {
log(LOG_ERR,
"rn_init: radix functions require max_keylen be set\n");
diff --git a/sys/net/radix.h b/sys/net/radix.h
index e84072f..29659b5 100644
--- a/sys/net/radix.h
+++ b/sys/net/radix.h
@@ -160,8 +160,9 @@ struct radix_node_head {
#define RADIX_NODE_HEAD_WLOCK_ASSERT(rnh) rw_assert(&(rnh)->rnh_lock, RA_WLOCKED)
#endif /* _KERNEL */
-void rn_init(void);
+void rn_init(int);
int rn_inithead(void **, int);
+int rn_detachhead(void **);
int rn_refines(void *, void *);
struct radix_node
*rn_addmask(void *, int, int),
diff --git a/sys/net/route.c b/sys/net/route.c
index b00ea69..a938c9c 100644
--- a/sys/net/route.c
+++ b/sys/net/route.c
@@ -169,13 +169,20 @@ rt_tables_get_rnh(int table, int fam)
static void
route_init(void)
{
+ struct domain *dom;
+ int max_keylen = 0;
/* whack the tunable ints into line. */
if (rt_numfibs > RT_MAXFIBS)
rt_numfibs = RT_MAXFIBS;
if (rt_numfibs == 0)
rt_numfibs = 1;
- rn_init(); /* initialize all zeroes, all ones, mask table */
+
+ for (dom = domains; dom; dom = dom->dom_next)
+ if (dom->dom_maxrtkey > max_keylen)
+ max_keylen = dom->dom_maxrtkey;
+
+ rn_init(max_keylen); /* init all zeroes, all ones, mask table */
}
SYSINIT(route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, 0);
diff --git a/sys/netgraph/ng_ipfw.c b/sys/netgraph/ng_ipfw.c
index 46bac8e..d331828 100644
--- a/sys/netgraph/ng_ipfw.c
+++ b/sys/netgraph/ng_ipfw.c
@@ -43,9 +43,10 @@
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
+#include <netinet/ip_var.h>
#include <netinet/ip_fw.h>
+#include <netinet/ipfw/ip_fw_private.h>
#include <netinet/ip.h>
-#include <netinet/ip_var.h>
#include <netgraph/ng_message.h>
#include <netgraph/ng_parse.h>
@@ -220,21 +221,23 @@ ng_ipfw_findhook1(node_p node, u_int16_t rulenum)
static int
ng_ipfw_rcvdata(hook_p hook, item_p item)
{
- struct ng_ipfw_tag *ngit;
+ struct ipfw_rule_ref *tag;
struct mbuf *m;
NGI_GET_M(item, m);
NG_FREE_ITEM(item);
- if ((ngit = (struct ng_ipfw_tag *)m_tag_locate(m, NGM_IPFW_COOKIE, 0,
- NULL)) == NULL) {
+ tag = (struct ipfw_rule_ref *)
+ m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL);
+ if (tag == NULL) {
NG_FREE_M(m);
return (EINVAL); /* XXX: find smth better */
};
- switch (ngit->dir) {
- case NG_IPFW_OUT:
- {
+ if (tag->info & IPFW_INFO_IN) {
+ ip_input(m);
+ return (0);
+ } else {
struct ip *ip;
if (m->m_len < sizeof(struct ip) &&
@@ -243,27 +246,16 @@ ng_ipfw_rcvdata(hook_p hook, item_p item)
ip = mtod(m, struct ip *);
- ip->ip_len = ntohs(ip->ip_len);
- ip->ip_off = ntohs(ip->ip_off);
+ SET_HOST_IPLEN(ip);
return ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL);
}
- case NG_IPFW_IN:
- ip_input(m);
- return (0);
- default:
- panic("ng_ipfw_rcvdata: bad dir %u", ngit->dir);
- }
-
- /* not reached */
- return (0);
}
static int
ng_ipfw_input(struct mbuf **m0, int dir, struct ip_fw_args *fwa, int tee)
{
struct mbuf *m;
- struct ng_ipfw_tag *ngit;
struct ip *ip;
hook_p hook;
int error = 0;
@@ -272,7 +264,7 @@ ng_ipfw_input(struct mbuf **m0, int dir, struct ip_fw_args *fwa, int tee)
* Node must be loaded and corresponding hook must be present.
*/
if (fw_node == NULL ||
- (hook = ng_ipfw_findhook1(fw_node, fwa->cookie)) == NULL) {
+ (hook = ng_ipfw_findhook1(fw_node, fwa->rule.info)) == NULL) {
if (tee == 0)
m_freem(*m0);
return (ESRCH); /* no hook associated with this rule */
@@ -284,20 +276,21 @@ ng_ipfw_input(struct mbuf **m0, int dir, struct ip_fw_args *fwa, int tee)
* a copy of a packet and forward it into netgraph without a tag.
*/
if (tee == 0) {
+ struct m_tag *tag;
+ struct ipfw_rule_ref *r;
m = *m0;
*m0 = NULL; /* it belongs now to netgraph */
- if ((ngit = (struct ng_ipfw_tag *)m_tag_alloc(NGM_IPFW_COOKIE,
- 0, TAGSIZ, M_NOWAIT|M_ZERO)) == NULL) {
+ tag = m_tag_alloc(MTAG_IPFW_RULE, 0, sizeof(*r),
+ M_NOWAIT|M_ZERO);
+ if (tag == NULL) {
m_freem(m);
return (ENOMEM);
}
- ngit->rule = fwa->rule;
- ngit->rule_id = fwa->rule_id;
- ngit->chain_id = fwa->chain_id;
- ngit->dir = dir;
- ngit->ifp = fwa->oif;
- m_tag_prepend(m, &ngit->mt);
+ r = (struct ipfw_rule_ref *)(tag + 1);
+ *r = fwa->rule;
+ r->info = dir ? IPFW_INFO_IN : IPFW_INFO_OUT;
+ m_tag_prepend(m, tag);
} else
if ((m = m_dup(*m0, M_DONTWAIT)) == NULL)
@@ -308,8 +301,6 @@ ng_ipfw_input(struct mbuf **m0, int dir, struct ip_fw_args *fwa, int tee)
return (EINVAL);
ip = mtod(m, struct ip *);
- ip->ip_len = htons(ip->ip_len);
- ip->ip_off = htons(ip->ip_off);
NG_SEND_DATA_ONLY(error, hook, m);
diff --git a/sys/netgraph/ng_ipfw.h b/sys/netgraph/ng_ipfw.h
index 29039f2..c2cab6a 100644
--- a/sys/netgraph/ng_ipfw.h
+++ b/sys/netgraph/ng_ipfw.h
@@ -26,26 +26,8 @@
* $FreeBSD$
*/
+#ifndef _NG_IPFW_H
+#define _NG_IPFW_H
#define NG_IPFW_NODE_TYPE "ipfw"
#define NGM_IPFW_COOKIE 1105988990
-
-#ifdef _KERNEL
-
-typedef int ng_ipfw_input_t(struct mbuf **, int, struct ip_fw_args *, int);
-extern ng_ipfw_input_t *ng_ipfw_input_p;
-#define NG_IPFW_LOADED (ng_ipfw_input_p != NULL)
-
-struct ng_ipfw_tag {
- struct m_tag mt; /* tag header */
- struct ip_fw *rule; /* matching rule */
- uint32_t rule_id; /* matching rule id */
- uint32_t chain_id; /* ruleset id */
- struct ifnet *ifp; /* interface, for ip_output */
- int dir;
-#define NG_IPFW_OUT 0
-#define NG_IPFW_IN 1
-};
-
-#define TAGSIZ (sizeof(struct ng_ipfw_tag) - sizeof(struct m_tag))
-
-#endif /* _KERNEL */
+#endif /* _NG_IPFW_H */
diff --git a/sys/netinet/in.h b/sys/netinet/in.h
index 7aa1645..4a7e11a 100644
--- a/sys/netinet/in.h
+++ b/sys/netinet/in.h
@@ -754,6 +754,32 @@ void in_ifdetach(struct ifnet *);
#define sintosa(sin) ((struct sockaddr *)(sin))
#define ifatoia(ifa) ((struct in_ifaddr *)(ifa))
+/*
+ * Historically, BSD keeps ip_len and ip_off in host format
+ * when doing layer 3 processing, and this often requires
+ * translating the format back and forth.
+ * To make the process explicit, we define a couple of macros
+ * that also take into account the fact that at some point
+ * we may want to keep those fields always in net format.
+ */
+
+#if (BYTE_ORDER == BIG_ENDIAN) || defined(HAVE_NET_IPLEN)
+#define SET_NET_IPLEN(p) do {} while (0)
+#define SET_HOST_IPLEN(p) do {} while (0)
+#else
+#define SET_NET_IPLEN(p) do { \
+ struct ip *h_ip = (p); \
+ h_ip->ip_len = htons(h_ip->ip_len); \
+ h_ip->ip_off = htons(h_ip->ip_off); \
+ } while (0)
+
+#define SET_HOST_IPLEN(p) do { \
+ struct ip *h_ip = (p); \
+ h_ip->ip_len = ntohs(h_ip->ip_len); \
+ h_ip->ip_off = ntohs(h_ip->ip_off); \
+ } while (0)
+#endif /* !HAVE_NET_IPLEN */
+
#endif /* _KERNEL */
/* INET6 stuff */
diff --git a/sys/netinet/ip_divert.c b/sys/netinet/ip_divert.c
index 401c090..9e7f062 100644
--- a/sys/netinet/ip_divert.c
+++ b/sys/netinet/ip_divert.c
@@ -32,14 +32,10 @@ __FBSDID("$FreeBSD$");
#if !defined(KLD_MODULE)
#include "opt_inet.h"
-#include "opt_ipfw.h"
#include "opt_sctp.h"
#ifndef INET
#error "IPDIVERT requires INET."
#endif
-#ifndef IPFIREWALL
-#error "IPDIVERT requires IPFIREWALL"
-#endif
#endif
#include <sys/param.h>
@@ -72,9 +68,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
-#include <netinet/ip_divert.h>
#include <netinet/ip_var.h>
-#include <netinet/ip_fw.h>
#ifdef SCTP
#include <netinet/sctp_crc32.h>
#endif
@@ -92,27 +86,29 @@ __FBSDID("$FreeBSD$");
#define DIVRCVQ (65536 + 100)
/*
- * Divert sockets work in conjunction with ipfw, see the divert(4)
- * manpage for features.
- * Internally, packets selected by ipfw in ip_input() or ip_output(),
- * and never diverted before, are passed to the input queue of the
- * divert socket with a given 'divert_port' number (as specified in
- * the matching ipfw rule), and they are tagged with a 16 bit cookie
- * (representing the rule number of the matching ipfw rule), which
- * is passed to process reading from the socket.
+ * Divert sockets work in conjunction with ipfw or other packet filters,
+ * see the divert(4) manpage for features.
+ * Packets are selected by the packet filter and tagged with an
+ * MTAG_IPFW_RULE tag carrying the 'divert port' number (as set by
+ * the packet filter) and information on the matching filter rule for
+ * subsequent reinjection. The divert_port is used to put the packet
+ * on the corresponding divert socket, while the rule number is passed
+ * up (at least partially) as the sin_port in the struct sockaddr.
*
- * Packets written to the divert socket are again tagged with a cookie
- * (usually the same as above) and a destination address.
- * If the destination address is INADDR_ANY then the packet is
- * treated as outgoing and sent to ip_output(), otherwise it is
- * treated as incoming and sent to ip_input().
- * In both cases, the packet is tagged with the cookie.
+ * Packets written to the divert socket carry in sin_addr a
+ * destination address, and in sin_port the number of the filter rule
+ * after which to continue processing.
+ * If the destination address is INADDR_ANY, the packet is treated as
+ * outgoing and sent to ip_output(); otherwise it is treated as
+ * incoming and sent to ip_input().
+ * Further, sin_zero carries some information on the interface,
+ * which can be used in the reinject -- see comments in the code.
*
* On reinjection, processing in ip_input() and ip_output()
* will be exactly the same as for the original packet, except that
- * ipfw processing will start at the rule number after the one
- * written in the cookie (so, tagging a packet with a cookie of 0
- * will cause it to be effectively considered as a standard packet).
+ * packet filter processing will start at the rule number after the one
+ * written in the sin_port (ipfw does not allow a rule #0, so sin_port=0
+ * will apply the entire ruleset to the packet).
*/
/* Internal variables. */
@@ -193,7 +189,7 @@ div_destroy(void)
* IPPROTO_DIVERT is not in the real IP protocol number space; this
* function should never be called. Just in case, drop any packets.
*/
-void
+static void
div_input(struct mbuf *m, int off)
{
@@ -217,9 +213,8 @@ divert_packet(struct mbuf *m, int incoming)
struct sockaddr_in divsrc;
struct m_tag *mtag;
- mtag = m_tag_find(m, PACKET_TAG_DIVERT, NULL);
+ mtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL);
if (mtag == NULL) {
- printf("%s: no divert tag\n", __func__);
m_freem(m);
return;
}
@@ -244,14 +239,15 @@ divert_packet(struct mbuf *m, int incoming)
ip->ip_len = htons(ip->ip_len);
}
#endif
+ bzero(&divsrc, sizeof(divsrc));
+ divsrc.sin_len = sizeof(divsrc);
+ divsrc.sin_family = AF_INET;
+ /* record matching rule, in host format */
+ divsrc.sin_port = ((struct ipfw_rule_ref *)(mtag+1))->rulenum;
/*
* Record receive interface address, if any.
* But only for incoming packets.
*/
- bzero(&divsrc, sizeof(divsrc));
- divsrc.sin_len = sizeof(divsrc);
- divsrc.sin_family = AF_INET;
- divsrc.sin_port = divert_cookie(mtag); /* record matching rule */
if (incoming) {
struct ifaddr *ifa;
struct ifnet *ifp;
@@ -299,7 +295,7 @@ divert_packet(struct mbuf *m, int incoming)
/* Put packet on socket queue, if any */
sa = NULL;
- nport = htons((u_int16_t)divert_info(mtag));
+ nport = htons((u_int16_t)(((struct ipfw_rule_ref *)(mtag+1))->info));
INP_INFO_RLOCK(&V_divcbinfo);
LIST_FOREACH(inp, &V_divcb, inp_list) {
/* XXX why does only one socket match? */
@@ -338,7 +334,7 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin,
struct mbuf *control)
{
struct m_tag *mtag;
- struct divert_tag *dt;
+ struct ipfw_rule_ref *dt;
int error = 0;
struct mbuf *options;
@@ -353,23 +349,31 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin,
if (control)
m_freem(control); /* XXX */
- if ((mtag = m_tag_find(m, PACKET_TAG_DIVERT, NULL)) == NULL) {
- mtag = m_tag_get(PACKET_TAG_DIVERT, sizeof(struct divert_tag),
- M_NOWAIT | M_ZERO);
+ mtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL);
+ if (mtag == NULL) {
+ /* this should be normal */
+ mtag = m_tag_alloc(MTAG_IPFW_RULE, 0,
+ sizeof(struct ipfw_rule_ref), M_NOWAIT | M_ZERO);
if (mtag == NULL) {
error = ENOBUFS;
goto cantsend;
}
- dt = (struct divert_tag *)(mtag+1);
m_tag_prepend(m, mtag);
- } else
- dt = (struct divert_tag *)(mtag+1);
+ }
+ dt = (struct ipfw_rule_ref *)(mtag+1);
/* Loopback avoidance and state recovery */
if (sin) {
int i;
- dt->cookie = sin->sin_port;
+ /* set the starting point. We provide a non-zero slot,
+ * but a non-matching chain_id to skip that info and use
+ * the rulenum/rule_id.
+ */
+ dt->slot = 1; /* dummy, chain_id is invalid */
+ dt->chain_id = 0;
+ dt->rulenum = sin->sin_port+1; /* host format ? */
+ dt->rule_id = 0;
/*
* Find receive interface with the given name, stuffed
* (if it exists) in the sin_zero[] field.
@@ -387,7 +391,7 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin,
struct ip *const ip = mtod(m, struct ip *);
struct inpcb *inp;
- dt->info |= IP_FW_DIVERT_OUTPUT_FLAG;
+ dt->info |= IPFW_IS_DIVERT | IPFW_INFO_OUT;
INP_INFO_WLOCK(&V_divcbinfo);
inp = sotoinpcb(so);
INP_RLOCK(inp);
@@ -453,7 +457,7 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin,
m_freem(options);
}
} else {
- dt->info |= IP_FW_DIVERT_LOOPBACK_FLAG;
+ dt->info |= IPFW_IS_DIVERT | IPFW_INFO_IN;
if (m->m_pkthdr.rcvif == NULL) {
/*
* No luck with the name, check by IP address.
@@ -587,7 +591,7 @@ div_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
return div_output(so, m, (struct sockaddr_in *)nam, control);
}
-void
+static void
div_ctlinput(int cmd, struct sockaddr *sa, void *vip)
{
struct in_addr faddr;
@@ -800,5 +804,5 @@ static moduledata_t ipdivertmod = {
};
DECLARE_MODULE(ipdivert, ipdivertmod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY);
-MODULE_DEPEND(dummynet, ipfw, 2, 2, 2);
+MODULE_DEPEND(ipdivert, ipfw, 2, 2, 2);
MODULE_VERSION(ipdivert, 1);
diff --git a/sys/netinet/ip_divert.h b/sys/netinet/ip_divert.h
index 5036355..b8bcf4f 100644
--- a/sys/netinet/ip_divert.h
+++ b/sys/netinet/ip_divert.h
@@ -36,53 +36,20 @@
#define _NETINET_IP_DIVERT_H_
/*
- * Sysctl declaration.
- */
-#ifdef SYSCTL_DECL
-SYSCTL_DECL(_net_inet_divert);
-#endif
-
-/*
- * Divert socket definitions.
- */
-struct divert_tag {
- u_int32_t info; /* port & flags */
- u_int16_t cookie; /* ipfw rule number */
-};
-
-/*
- * Return the divert cookie associated with the mbuf; if any.
- */
-static __inline u_int16_t
-divert_cookie(struct m_tag *mtag)
-{
- return ((struct divert_tag *)(mtag+1))->cookie;
-}
-static __inline u_int16_t
-divert_find_cookie(struct mbuf *m)
-{
- struct m_tag *mtag = m_tag_find(m, PACKET_TAG_DIVERT, NULL);
- return mtag ? divert_cookie(mtag) : 0;
-}
-
-/*
- * Return the divert info associated with the mbuf; if any.
+ * divert has no custom kernel-userland API.
+ *
+ * All communication occurs through a sockaddr_in socket where
+ *
+ * kernel-->userland
+ * sin_port = matching rule, host format;
+ * sin_addr = IN: first address of the incoming interface;
+ * OUT: INADDR_ANY
+ * sin_zero = if it fits, the interface name (max 7 bytes + NUL)
+ *
+ * userland->kernel
+ * sin_port = restart-rule - 1, host order
+ * (we restart at sin_port + 1)
+ * sin_addr = IN: address of the incoming interface;
+ * OUT: INADDR_ANY
*/
-static __inline u_int32_t
-divert_info(struct m_tag *mtag)
-{
- return ((struct divert_tag *)(mtag+1))->info;
-}
-static __inline u_int32_t
-divert_find_info(struct mbuf *m)
-{
- struct m_tag *mtag = m_tag_find(m, PACKET_TAG_DIVERT, NULL);
- return mtag ? divert_info(mtag) : 0;
-}
-
-typedef void ip_divert_packet_t(struct mbuf *m, int incoming);
-extern ip_divert_packet_t *ip_divert_ptr;
-
-extern void div_input(struct mbuf *, int);
-extern void div_ctlinput(int, struct sockaddr *, void *);
#endif /* _NETINET_IP_DIVERT_H_ */
diff --git a/sys/netinet/ip_dummynet.h b/sys/netinet/ip_dummynet.h
index b5ef19e..0bbc326 100644
--- a/sys/netinet/ip_dummynet.h
+++ b/sys/netinet/ip_dummynet.h
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa
+ * Copyright (c) 1998-2010 Luigi Rizzo, Universita` di Pisa
* Portions Copyright (c) 2000 Akamba Corp.
* All rights reserved
*
@@ -31,262 +31,124 @@
#define _IP_DUMMYNET_H
/*
- * Definition of dummynet data structures. In the structures, I decided
- * not to use the macros in <sys/queue.h> in the hope of making the code
- * easier to port to other architectures. The type of lists and queue we
- * use here is pretty simple anyways.
- */
-
-/*
- * We start with a heap, which is used in the scheduler to decide when
- * to transmit packets etc.
- *
- * The key for the heap is used for two different values:
+ * Definition of the kernel-userland API for dummynet.
*
- * 1. timer ticks- max 10K/second, so 32 bits are enough;
+ * Setsockopt() and getsockopt() pass a batch of objects, each
+ * of them starting with a "struct dn_id" which should fully identify
+ * the object and its relation with others in the sequence.
+ * The first object in each request should have
+ * type= DN_CMD_*, id = DN_API_VERSION.
+ * For other objects, type and subtype specify the object, len indicates
+ * the total length including the header, and 'id' identifies the specific
+ * object.
*
- * 2. virtual times. These increase in steps of len/x, where len is the
- * packet length, and x is either the weight of the flow, or the
- * sum of all weights.
- * If we limit to max 1000 flows and a max weight of 100, then
- * x needs 17 bits. The packet size is 16 bits, so we can easily
- * overflow if we do not allow errors.
- * So we use a key "dn_key" which is 64 bits. Some macros are used to
- * compare key values and handle wraparounds.
- * MAX64 returns the largest of two key values.
- * MY_M is used as a shift count when doing fixed point arithmetic
- * (a better name would be useful...).
+ * Most objects are numbered with an identifier in the range 1..65535.
+ * DN_MAX_ID indicates the first value outside the range.
*/
-typedef u_int64_t dn_key ; /* sorting key */
-#define DN_KEY_LT(a,b) ((int64_t)((a)-(b)) < 0)
-#define DN_KEY_LEQ(a,b) ((int64_t)((a)-(b)) <= 0)
-#define DN_KEY_GT(a,b) ((int64_t)((a)-(b)) > 0)
-#define DN_KEY_GEQ(a,b) ((int64_t)((a)-(b)) >= 0)
-#define MAX64(x,y) (( (int64_t) ( (y)-(x) )) > 0 ) ? (y) : (x)
-#define MY_M 16 /* number of left shift to obtain a larger precision */
-/*
- * XXX With this scaling, max 1000 flows, max weight 100, 1Gbit/s, the
- * virtual time wraps every 15 days.
- */
+#define DN_API_VERSION 12500000
+#define DN_MAX_ID 0x10000
+struct dn_id {
+ uint16_t len; /* total obj len including this header */
+ uint8_t type;
+ uint8_t subtype;
+ uint32_t id; /* generic id */
+};
/*
- * The maximum hash table size for queues. This value must be a power
- * of 2.
- */
-#define DN_MAX_HASH_SIZE 65536
-
-/*
- * A heap entry is made of a key and a pointer to the actual
- * object stored in the heap.
- * The heap is an array of dn_heap_entry entries, dynamically allocated.
- * Current size is "size", with "elements" actually in use.
- * The heap normally supports only ordered insert and extract from the top.
- * If we want to extract an object from the middle of the heap, we
- * have to know where the object itself is located in the heap (or we
- * need to scan the whole array). To this purpose, an object has a
- * field (int) which contains the index of the object itself into the
- * heap. When the object is moved, the field must also be updated.
- * The offset of the index in the object is stored in the 'offset'
- * field in the heap descriptor. The assumption is that this offset
- * is non-zero if we want to support extract from the middle.
+ * These values are in the type field of struct dn_id.
+ * To preserve the ABI, never rearrange the list or delete
+ * entries with the exception of DN_LAST
*/
-struct dn_heap_entry {
- dn_key key ; /* sorting key. Topmost element is smallest one */
- void *object ; /* object pointer */
+enum {
+ DN_NONE = 0,
+ DN_LINK = 1,
+ DN_FS,
+ DN_SCH,
+ DN_SCH_I,
+ DN_QUEUE,
+ DN_DELAY_LINE,
+ DN_PROFILE,
+ DN_FLOW, /* struct dn_flow */
+ DN_TEXT, /* opaque text is the object */
+
+ DN_CMD_CONFIG = 0x80, /* objects follow */
+ DN_CMD_DELETE, /* subtype + list of entries */
+ DN_CMD_GET, /* subtype + list of entries */
+ DN_CMD_FLUSH,
+ /* for compatibility with FreeBSD 7.2/8 */
+ DN_COMPAT_PIPE,
+ DN_COMPAT_QUEUE,
+ DN_GET_COMPAT,
+
+ /* special commands for emulation of sysctl variables */
+ DN_SYSCTL_GET,
+ DN_SYSCTL_SET,
+
+ DN_LAST,
} ;
-struct dn_heap {
- int size ;
- int elements ;
- int offset ; /* XXX if > 0 this is the offset of direct ptr to obj */
- struct dn_heap_entry *p ; /* really an array of "size" entries */
+enum { /* subtype for schedulers, flowset and the like */
+ DN_SCHED_UNKNOWN = 0,
+ DN_SCHED_FIFO = 1,
+ DN_SCHED_WF2QP = 2,
+ /* others are in individual modules */
} ;
-#ifdef _KERNEL
-/*
- * Packets processed by dummynet have an mbuf tag associated with
- * them that carries their dummynet state. This is used within
- * the dummynet code as well as outside when checking for special
- * processing requirements.
- */
-struct dn_pkt_tag {
- struct ip_fw *rule; /* matching rule */
- uint32_t rule_id; /* matching rule id */
- uint32_t chain_id; /* ruleset id */
- int dn_dir; /* action when packet comes out. */
-#define DN_TO_IP_OUT 1
-#define DN_TO_IP_IN 2
-/* Obsolete: #define DN_TO_BDG_FWD 3 */
-#define DN_TO_ETH_DEMUX 4
-#define DN_TO_ETH_OUT 5
-#define DN_TO_IP6_IN 6
-#define DN_TO_IP6_OUT 7
-#define DN_TO_IFB_FWD 8
-
- dn_key output_time; /* when the pkt is due for delivery */
- struct ifnet *ifp; /* interface, for ip_output */
- struct _ip6dn_args ip6opt; /* XXX ipv6 options */
+enum { /* user flags */
+ DN_HAVE_MASK = 0x0001, /* fs or sched has a mask */
+ DN_NOERROR = 0x0002, /* do not report errors */
+ DN_QHT_HASH = 0x0004, /* qht is a hash table */
+ DN_QSIZE_BYTES = 0x0008, /* queue size is in bytes */
+ DN_HAS_PROFILE = 0x0010, /* a link has a profile */
+ DN_IS_RED = 0x0020,
+ DN_IS_GENTLE_RED= 0x0040,
+ DN_PIPE_CMD = 0x1000, /* pipe config... */
};
-#endif /* _KERNEL */
-
-/*
- * Overall structure of dummynet (with WF2Q+):
-
-In dummynet, packets are selected with the firewall rules, and passed
-to two different objects: PIPE or QUEUE.
-
-A QUEUE is just a queue with configurable size and queue management
-policy. It is also associated with a mask (to discriminate among
-different flows), a weight (used to give different shares of the
-bandwidth to different flows) and a "pipe", which essentially
-supplies the transmit clock for all queues associated with that
-pipe.
-
-A PIPE emulates a fixed-bandwidth link, whose bandwidth is
-configurable. The "clock" for a pipe can come from either an
-internal timer, or from the transmit interrupt of an interface.
-A pipe is also associated with one (or more, if masks are used)
-queue, where all packets for that pipe are stored.
-
-The bandwidth available on the pipe is shared by the queues
-associated with that pipe (only one in case the packet is sent
-to a PIPE) according to the WF2Q+ scheduling algorithm and the
-configured weights.
-
-In general, incoming packets are stored in the appropriate queue,
-which is then placed into one of a few heaps managed by a scheduler
-to decide when the packet should be extracted.
-The scheduler (a function called dummynet()) is run at every timer
-tick, and grabs queues from the head of the heaps when they are
-ready for processing.
-
-There are three data structures definining a pipe and associated queues:
-
- + dn_pipe, which contains the main configuration parameters related
- to delay and bandwidth;
- + dn_flow_set, which contains WF2Q+ configuration, flow
- masks, plr and RED configuration;
- + dn_flow_queue, which is the per-flow queue (containing the packets)
-
-Multiple dn_flow_set can be linked to the same pipe, and multiple
-dn_flow_queue can be linked to the same dn_flow_set.
-All data structures are linked in a linear list which is used for
-housekeeping purposes.
-
-During configuration, we create and initialize the dn_flow_set
-and dn_pipe structures (a dn_pipe also contains a dn_flow_set).
-
-At runtime: packets are sent to the appropriate dn_flow_set (either
-WFQ ones, or the one embedded in the dn_pipe for fixed-rate flows),
-which in turn dispatches them to the appropriate dn_flow_queue
-(created dynamically according to the masks).
-
-The transmit clock for fixed rate flows (ready_event()) selects the
-dn_flow_queue to be used to transmit the next packet. For WF2Q,
-wfq_ready_event() extract a pipe which in turn selects the right
-flow using a number of heaps defined into the pipe itself.
-
- *
- */
/*
- * per flow queue. This contains the flow identifier, the queue
- * of packets, counters, and parameters used to support both RED and
- * WF2Q+.
- *
- * A dn_flow_queue is created and initialized whenever a packet for
- * a new flow arrives.
+ * link template.
*/
-struct dn_flow_queue {
- struct dn_flow_queue *next ;
- struct ipfw_flow_id id ;
-
- struct mbuf *head, *tail ; /* queue of packets */
- u_int len ;
- u_int len_bytes ;
+struct dn_link {
+ struct dn_id oid;
/*
- * When we emulate MAC overheads, or channel unavailability due
- * to other traffic on a shared medium, we augment the packet at
- * the head of the queue with an 'extra_bits' field representsing
- * the additional delay the packet will be subject to:
- * extra_bits = bw*unavailable_time.
- * With large bandwidth and large delays, extra_bits (and also numbytes)
- * can become very large, so better play safe and use 64 bit
- */
- uint64_t numbytes ; /* credit for transmission (dynamic queues) */
- int64_t extra_bits; /* extra bits simulating unavailable channel */
-
- u_int64_t tot_pkts ; /* statistics counters */
- u_int64_t tot_bytes ;
- u_int32_t drops ;
-
- int hash_slot ; /* debugging/diagnostic */
-
- /* RED parameters */
- int avg ; /* average queue length est. (scaled) */
- int count ; /* arrivals since last RED drop */
- int random ; /* random value (scaled) */
- dn_key idle_time; /* start of queue idle time */
-
- /* WF2Q+ support */
- struct dn_flow_set *fs ; /* parent flow set */
- int heap_pos ; /* position (index) of struct in heap */
- dn_key sched_time ; /* current time when queue enters ready_heap */
-
- dn_key S,F ; /* start time, finish time */
- /*
- * Setting F < S means the timestamp is invalid. We only need
- * to test this when the queue is empty.
+ * Userland sets bw and delay in bits/s and milliseconds.
+ * The kernel converts this back and forth to bits/tick and ticks.
+ * XXX what about burst ?
*/
+ int32_t link_nr;
+ int bandwidth; /* bit/s or bits/tick. */
+ int delay; /* ms and ticks */
+ uint64_t burst; /* scaled. bits*Hz XXX */
} ;
/*
- * flow_set descriptor. Contains the "template" parameters for the
- * queue configuration, and pointers to the hash table of dn_flow_queue's.
- *
- * The hash table is an array of lists -- we identify the slot by
- * hashing the flow-id, then scan the list looking for a match.
- * The size of the hash table (buckets) is configurable on a per-queue
- * basis.
- *
- * A dn_flow_set is created whenever a new queue or pipe is created (in the
- * latter case, the structure is located inside the struct dn_pipe).
+ * A flowset, which is a template for flows. Contains parameters
+ * from the command line: id, target scheduler, queue sizes, plr,
+ * flow masks, buckets for the flow hash, and possibly scheduler-
+ * specific parameters (weight, quantum and so on).
*/
-struct dn_flow_set {
- SLIST_ENTRY(dn_flow_set) next; /* linked list in a hash slot */
-
- u_short fs_nr ; /* flow_set number */
- u_short flags_fs;
-#define DN_HAVE_FLOW_MASK 0x0001
-#define DN_IS_RED 0x0002
-#define DN_IS_GENTLE_RED 0x0004
-#define DN_QSIZE_IS_BYTES 0x0008 /* queue size is measured in bytes */
-#define DN_NOERROR 0x0010 /* do not report ENOBUFS on drops */
-#define DN_HAS_PROFILE 0x0020 /* the pipe has a delay profile. */
-#define DN_IS_PIPE 0x4000
-#define DN_IS_QUEUE 0x8000
-
- struct dn_pipe *pipe ; /* pointer to parent pipe */
- u_short parent_nr ; /* parent pipe#, 0 if local to a pipe */
-
- int weight ; /* WFQ queue weight */
+struct dn_fs {
+ struct dn_id oid;
+ uint32_t fs_nr; /* the flowset number */
+ uint32_t flags; /* userland flags */
int qsize ; /* queue size in slots or bytes */
- int plr ; /* pkt loss rate (2^31-1 means 100%) */
+ int32_t plr; /* PLR, pkt loss rate (2^31-1 means 100%) */
+ uint32_t buckets; /* buckets used for the queue hash table */
struct ipfw_flow_id flow_mask ;
-
- /* hash table of queues onto this flow_set */
- int rq_size ; /* number of slots */
- int rq_elements ; /* active elements */
- struct dn_flow_queue **rq; /* array of rq_size entries */
-
- u_int32_t last_expired ; /* do not expire too frequently */
- int backlogged ; /* #active queues for this flowset */
-
- /* RED parameters */
+ uint32_t sched_nr; /* the scheduler we attach to */
+ /* generic scheduler parameters. Leave them at -1 if unset.
+ * Now we use 0: weight, 1: lmax, 2: priority
+ */
+ int par[4];
+
+ /* RED/GRED parameters.
+ * weight and probabilities are in the range 0..1 represented
+ * in fixed point arithmetic with SCALE_RED decimal bits.
+ */
#define SCALE_RED 16
#define SCALE(x) ( (x) << SCALE_RED )
#define SCALE_VAL(x) ( (x) >> SCALE_RED )
@@ -295,102 +157,107 @@ struct dn_flow_set {
int max_th ; /* maximum threshold for queue (scaled) */
int min_th ; /* minimum threshold for queue (scaled) */
int max_p ; /* maximum value for p_b (scaled) */
- u_int c_1 ; /* max_p/(max_th-min_th) (scaled) */
- u_int c_2 ; /* max_p*min_th/(max_th-min_th) (scaled) */
- u_int c_3 ; /* for GRED, (1-max_p)/max_th (scaled) */
- u_int c_4 ; /* for GRED, 1 - 2*max_p (scaled) */
- u_int * w_q_lookup ; /* lookup table for computing (1-w_q)^t */
- u_int lookup_depth ; /* depth of lookup table */
- int lookup_step ; /* granularity inside the lookup table */
- int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */
- int avg_pkt_size ; /* medium packet size */
- int max_pkt_size ; /* max packet size */
+
};
-SLIST_HEAD(dn_flow_set_head, dn_flow_set);
/*
- * Pipe descriptor. Contains global parameters, delay-line queue,
- * and the flow_set used for fixed-rate queues.
- *
- * For WF2Q+ support it also has 3 heaps holding dn_flow_queue:
- * not_eligible_heap, for queues whose start time is higher
- * than the virtual time. Sorted by start time.
- * scheduler_heap, for queues eligible for scheduling. Sorted by
- * finish time.
- * idle_heap, all flows that are idle and can be removed. We
- * do that on each tick so we do not slow down too much
- * operations during forwarding.
- *
+ * dn_flow collects flow_id and stats for queues and scheduler
+ * instances, and is used to pass this information to userland.
+ * oid.type/oid.subtype describe the object, oid.id is the number
+ * of the parent object.
*/
-struct dn_pipe { /* a pipe */
- SLIST_ENTRY(dn_pipe) next; /* linked list in a hash slot */
-
- int pipe_nr ; /* number */
- int bandwidth; /* really, bytes/tick. */
- int delay ; /* really, ticks */
-
- struct mbuf *head, *tail ; /* packets in delay line */
-
- /* WF2Q+ */
- struct dn_heap scheduler_heap ; /* top extract - key Finish time*/
- struct dn_heap not_eligible_heap; /* top extract- key Start time */
- struct dn_heap idle_heap ; /* random extract - key Start=Finish time */
-
- dn_key V ; /* virtual time */
- int sum; /* sum of weights of all active sessions */
-
- /* Same as in dn_flow_queue, numbytes can become large */
- int64_t numbytes; /* bits I can transmit (more or less). */
- uint64_t burst; /* burst size, scaled: bits * hz */
+struct dn_flow {
+ struct dn_id oid;
+ struct ipfw_flow_id fid;
+ uint64_t tot_pkts; /* statistics counters */
+ uint64_t tot_bytes;
+ uint32_t length; /* Queue length, in packets */
+ uint32_t len_bytes; /* Queue length, in bytes */
+ uint32_t drops;
+};
- dn_key sched_time ; /* time pipe was scheduled in ready_heap */
- dn_key idle_time; /* start of pipe idle time */
/*
- * When the tx clock come from an interface (if_name[0] != '\0'), its name
- * is stored below, whereas the ifp is filled when the rule is configured.
+ * Scheduler template, mostly indicating the name, number,
+ * sched_mask and buckets.
*/
- char if_name[IFNAMSIZ];
- struct ifnet *ifp ;
- int ready ; /* set if ifp != NULL and we got a signal from it */
+struct dn_sch {
+ struct dn_id oid;
+ uint32_t sched_nr; /* N, scheduler number */
+ uint32_t buckets; /* number of buckets for the instances */
+ uint32_t flags; /* have_mask, ... */
+
+ char name[16]; /* null terminated */
+ /* mask to select the appropriate scheduler instance */
+ struct ipfw_flow_id sched_mask; /* M */
+};
- struct dn_flow_set fs ; /* used with fixed-rate flows */
+/* A delay profile is attached to a link.
+ * Note that a profile, as any other object, cannot be longer than 2^16
+ */
+#define ED_MAX_SAMPLES_NO 1024
+struct dn_profile {
+ struct dn_id oid;
/* fields to simulate a delay profile */
-
#define ED_MAX_NAME_LEN 32
char name[ED_MAX_NAME_LEN];
+ int link_nr;
int loss_level;
- int samples_no;
- int *samples;
-};
-
-/* dn_pipe_max is used to pass pipe configuration from userland onto
- * kernel space and back
- */
-#define ED_MAX_SAMPLES_NO 1024
-struct dn_pipe_max {
- struct dn_pipe pipe;
- int samples[ED_MAX_SAMPLES_NO];
+ int bandwidth; // XXX use link bandwidth?
+ int samples_no; /* actual length of samples[] */
+ int samples[ED_MAX_SAMPLES_NO]; /* may be shorter */
};
-SLIST_HEAD(dn_pipe_head, dn_pipe);
-#ifdef _KERNEL
/*
- * Return the dummynet tag; if any.
- * Make sure that the dummynet tag is not reused by lower layers.
+ * Overall structure of dummynet
+
+In dummynet, packets are selected with the firewall rules, and passed
+to two different objects: PIPE or QUEUE (bad name).
+
+A QUEUE defines a classifier, which groups packets into flows
+according to a 'mask', puts them into independent queues (one
+per flow) with configurable size and queue management policy,
+and passes flows to a scheduler:
+
+ (flow_mask|sched_mask) sched_mask
+ +---------+ weight Wx +-------------+
+ | |->-[flow]-->--| |-+
+ -->--| QUEUE x | ... | | |
+ | |->-[flow]-->--| SCHEDuler N | |
+ +---------+ | | |
+ ... | +--[LINK N]-->--
+ +---------+ weight Wy | | +--[LINK N]-->--
+ | |->-[flow]-->--| | |
+ -->--| QUEUE y | ... | | |
+ | |->-[flow]-->--| | |
+ +---------+ +-------------+ |
+ +-------------+
+
+Many QUEUE objects can connect to the same scheduler, each
+QUEUE object can have its own set of parameters.
+
+In turn, the SCHEDuler 'forks' multiple instances according
+to a 'sched_mask', each instance manages its own set of queues
+and transmits on a private instance of a configurable LINK.
+
+A PIPE is a simplified version of the above, where there
+is no flow_mask, and each scheduler instance handles a single queue.
+
+The following data structures (visible from userland) describe
+the objects used by dummynet:
+
+ + dn_link, contains the main configuration parameters related
+ to delay and bandwidth;
+ + dn_profile describes a delay profile;
+ + dn_flow describes the flow status (flow id, statistics)
+
+ + dn_sch describes a scheduler
+ + dn_fs describes a flowset (mask, weight, queue parameters)
+
+ *
*/
-static __inline struct dn_pkt_tag *
-ip_dn_claim_tag(struct mbuf *m)
-{
- struct m_tag *mtag = m_tag_find(m, PACKET_TAG_DUMMYNET, NULL);
- if (mtag != NULL) {
- mtag->m_tag_id = PACKET_TAG_NONE;
- return ((struct dn_pkt_tag *)(mtag + 1));
- } else
- return (NULL);
-}
-#endif
+
#endif /* _IP_DUMMYNET_H */
diff --git a/sys/netinet/ip_fw.h b/sys/netinet/ip_fw.h
index 1e6feb4..cf5d8d0 100644
--- a/sys/netinet/ip_fw.h
+++ b/sys/netinet/ip_fw.h
@@ -461,7 +461,7 @@ typedef struct _ipfw_insn_icmp6 {
*/
struct ip_fw {
- struct ip_fw *next; /* linked list of rules */
+ struct ip_fw *x_next; /* linked list of rules */
struct ip_fw *next_rule; /* ptr to next [skipto] rule */
/* 'next_rule' is used to pass up 'set_disable' status */
@@ -487,24 +487,29 @@ struct ip_fw {
#define RULESIZE(rule) (sizeof(struct ip_fw) + \
((struct ip_fw *)(rule))->cmd_len * 4 - 4)
+#if 1 // should be moved to in.h
/*
* This structure is used as a flow mask and a flow id for various
* parts of the code.
+ * addr_type is used in userland and kernel to mark the address type.
+ * fib is used in the kernel to record the fib in use.
+ * _flags is used in the kernel to store tcp flags for dynamic rules.
*/
struct ipfw_flow_id {
- u_int32_t dst_ip;
- u_int32_t src_ip;
- u_int16_t dst_port;
- u_int16_t src_port;
- u_int8_t fib;
- u_int8_t proto;
- u_int8_t flags; /* protocol-specific flags */
- uint8_t addr_type; /* 4 = ipv4, 6 = ipv6, 1=ether ? */
- struct in6_addr dst_ip6; /* could also store MAC addr! */
+ uint32_t dst_ip;
+ uint32_t src_ip;
+ uint16_t dst_port;
+ uint16_t src_port;
+ uint8_t fib;
+ uint8_t proto;
+ uint8_t _flags; /* protocol-specific flags */
+ uint8_t addr_type; /* 4=ip4, 6=ip6, 1=ether ? */
+ struct in6_addr dst_ip6;
struct in6_addr src_ip6;
- u_int32_t flow_id6;
- u_int32_t frag_id6;
+ uint32_t flow_id6;
+ uint32_t extra; /* queue/pipe or frag_id */
};
+#endif
#define IS_IP6_FLOW_ID(id) ((id)->addr_type == 6)
@@ -571,133 +576,4 @@ typedef struct _ipfw_table {
ipfw_table_entry ent[0]; /* entries */
} ipfw_table;
-/*
- * Main firewall chains definitions and global var's definitions.
- */
-#ifdef _KERNEL
-
-#define MTAG_IPFW 1148380143 /* IPFW-tagged cookie */
-
-/* Return values from ipfw_chk() */
-enum {
- IP_FW_PASS = 0,
- IP_FW_DENY,
- IP_FW_DIVERT,
- IP_FW_TEE,
- IP_FW_DUMMYNET,
- IP_FW_NETGRAPH,
- IP_FW_NGTEE,
- IP_FW_NAT,
- IP_FW_REASS,
-};
-
-/* flags for divert mtag */
-#define IP_FW_DIVERT_LOOPBACK_FLAG 0x00080000
-#define IP_FW_DIVERT_OUTPUT_FLAG 0x00100000
-
-/*
- * Structure for collecting parameters to dummynet for ip6_output forwarding
- */
-struct _ip6dn_args {
- struct ip6_pktopts *opt_or;
- struct route_in6 ro_or;
- int flags_or;
- struct ip6_moptions *im6o_or;
- struct ifnet *origifp_or;
- struct ifnet *ifp_or;
- struct sockaddr_in6 dst_or;
- u_long mtu_or;
- struct route_in6 ro_pmtu_or;
-};
-
-/*
- * Arguments for calling ipfw_chk() and dummynet_io(). We put them
- * all into a structure because this way it is easier and more
- * efficient to pass variables around and extend the interface.
- */
-struct ip_fw_args {
- struct mbuf *m; /* the mbuf chain */
- struct ifnet *oif; /* output interface */
- struct sockaddr_in *next_hop; /* forward address */
- struct ip_fw *rule; /* matching rule */
- uint32_t rule_id; /* matching rule id */
- uint32_t chain_id; /* ruleset id */
- struct ether_header *eh; /* for bridged packets */
-
- struct ipfw_flow_id f_id; /* grabbed from IP header */
- uint32_t cookie; /* a cookie depending on rule action */
- struct inpcb *inp;
-
- struct _ip6dn_args dummypar; /* dummynet->ip6_output */
- struct sockaddr_in hopstore; /* store here if cannot use a pointer */
-};
-
-/*
- * Function definitions.
- */
-
-/* Firewall hooks */
-struct sockopt;
-struct dn_flow_set;
-
-int ipfw_check_in(void *, struct mbuf **, struct ifnet *, int, struct inpcb *inp);
-int ipfw_check_out(void *, struct mbuf **, struct ifnet *, int, struct inpcb *inp);
-
-int ipfw_chk(struct ip_fw_args *);
-
-int ipfw_hook(void);
-int ipfw6_hook(void);
-int ipfw_unhook(void);
-int ipfw6_unhook(void);
-#ifdef NOTYET
-void ipfw_nat_destroy(void);
-#endif
-
-VNET_DECLARE(int, fw_one_pass);
-VNET_DECLARE(int, fw_enable);
-#define V_fw_one_pass VNET(fw_one_pass)
-#define V_fw_enable VNET(fw_enable)
-
-#ifdef INET6
-VNET_DECLARE(int, fw6_enable);
-#define V_fw6_enable VNET(fw6_enable)
-#endif
-
-struct ip_fw_chain {
- struct ip_fw *rules; /* list of rules */
- struct ip_fw *reap; /* list of rules to reap */
- LIST_HEAD(, cfg_nat) nat; /* list of nat entries */
- struct radix_node_head *tables[IPFW_TABLES_MAX];
- struct rwlock rwmtx;
- uint32_t id; /* ruleset id */
-};
-
-#ifdef IPFW_INTERNAL
-
-#define IPFW_LOCK_INIT(_chain) \
- rw_init(&(_chain)->rwmtx, "IPFW static rules")
-#define IPFW_LOCK_DESTROY(_chain) rw_destroy(&(_chain)->rwmtx)
-#define IPFW_WLOCK_ASSERT(_chain) rw_assert(&(_chain)->rwmtx, RA_WLOCKED)
-
-#define IPFW_RLOCK(p) rw_rlock(&(p)->rwmtx)
-#define IPFW_RUNLOCK(p) rw_runlock(&(p)->rwmtx)
-#define IPFW_WLOCK(p) rw_wlock(&(p)->rwmtx)
-#define IPFW_WUNLOCK(p) rw_wunlock(&(p)->rwmtx)
-
-#define LOOKUP_NAT(l, i, p) do { \
- LIST_FOREACH((p), &(l.nat), _next) { \
- if ((p)->id == (i)) { \
- break; \
- } \
- } \
- } while (0)
-
-typedef int ipfw_nat_t(struct ip_fw_args *, struct cfg_nat *, struct mbuf *);
-typedef int ipfw_nat_cfg_t(struct sockopt *);
-#endif
-
-VNET_DECLARE(struct ip_fw_chain, layer3_chain);
-#define V_layer3_chain VNET(layer3_chain)
-
-#endif /* _KERNEL */
#endif /* _IPFW2_H */
diff --git a/sys/netinet/ip_var.h b/sys/netinet/ip_var.h
index a1d2166..d041dd3 100644
--- a/sys/netinet/ip_var.h
+++ b/sys/netinet/ip_var.h
@@ -249,7 +249,43 @@ VNET_DECLARE(struct pfil_head, inet_pfil_hook); /* packet filter hooks */
void in_delayed_cksum(struct mbuf *m);
-/* ipfw and dummynet hooks. Most are declared in raw_ip.c */
+/* Hooks for ipfw, dummynet, divert etc. Most are declared in raw_ip.c */
+/*
+ * Reference to an ipfw or packet filter rule that can be carried
+ * outside critical sections.
+ * A rule is identified by rulenum:rule_id which is ordered.
+ * In version chain_id the rule can be found in slot 'slot', so
+ * we don't need a lookup if chain_id == chain->id.
+ *
+ * On exit from the firewall this structure refers to the rule after
+ * the matching one (slot points to the new rule; rulenum:rule_id-1
+ * is the matching rule), and additional info (e.g. info often contains
+ * the insn argument or tablearg in the low 16 bits, in host format).
+ * On entry, the structure is valid if slot>0, and refers to the starting
+ * rules. 'info' contains the reason for reinject, e.g. divert port,
+ * divert direction, and so on.
+ */
+struct ipfw_rule_ref {
+ uint32_t slot; /* slot for matching rule */
+ uint32_t rulenum; /* matching rule number */
+ uint32_t rule_id; /* matching rule id */
+ uint32_t chain_id; /* ruleset id */
+ uint32_t info; /* see below */
+};
+
+enum {
+ IPFW_INFO_MASK = 0x0000ffff,
+ IPFW_INFO_OUT = 0x00000000, /* outgoing, just for convenience */
+ IPFW_INFO_IN = 0x80000000, /* incoming, overloads dir */
+ IPFW_ONEPASS = 0x40000000, /* One-pass, do not reinject */
+ IPFW_IS_MASK = 0x30000000, /* which source ? */
+ IPFW_IS_DIVERT = 0x20000000,
+ IPFW_IS_DUMMYNET =0x10000000,
+ IPFW_IS_PIPE = 0x08000000, /* pipe = 1, queue = 0 */
+};
+#define MTAG_IPFW 1148380143 /* IPFW-tagged cookie */
+#define MTAG_IPFW_RULE 1262273568 /* rule reference */
+
struct ip_fw_args;
typedef int (*ip_fw_chk_ptr_t)(struct ip_fw_args *args);
typedef int (*ip_fw_ctl_ptr_t)(struct sockopt *);
@@ -258,9 +294,14 @@ VNET_DECLARE(ip_fw_ctl_ptr_t, ip_fw_ctl_ptr);
#define V_ip_fw_chk_ptr VNET(ip_fw_chk_ptr)
#define V_ip_fw_ctl_ptr VNET(ip_fw_ctl_ptr)
+/* Divert hooks. */
+extern void (*ip_divert_ptr)(struct mbuf *m, int incoming);
+/* ng_ipfw hooks -- XXX make it the same as divert and dummynet */
+extern int (*ng_ipfw_input_p)(struct mbuf **, int,
+ struct ip_fw_args *, int);
+
extern int (*ip_dn_ctl_ptr)(struct sockopt *);
-extern int (*ip_dn_io_ptr)(struct mbuf **m, int dir, struct ip_fw_args *fwa);
-extern void (*ip_dn_ruledel_ptr)(void *); /* in ip_fw2.c */
+extern int (*ip_dn_io_ptr)(struct mbuf **, int, struct ip_fw_args *);
VNET_DECLARE(int, ip_do_randomid);
#define V_ip_do_randomid VNET(ip_do_randomid)
diff --git a/sys/netinet/ipfw/dn_heap.c b/sys/netinet/ipfw/dn_heap.c
new file mode 100644
index 0000000..6773851
--- /dev/null
+++ b/sys/netinet/ipfw/dn_heap.c
@@ -0,0 +1,550 @@
+/*-
+ * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Binary heap and hash tables, used in dummynet
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#ifdef _KERNEL
+__FBSDID("$FreeBSD$");
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <netinet/ipfw/dn_heap.h>
+#ifndef log
+#define log(x, arg...)
+#endif
+
+#else /* !_KERNEL */
+
+#include <stdio.h>
+#include <dn_test.h>
+#include <strings.h>
+#include <stdlib.h>
+
+#include "dn_heap.h"
+#define log(x, arg...) fprintf(stderr, ## arg)
+#define panic(x...) fprintf(stderr, ## x), exit(1)
+#define MALLOC_DEFINE(a, b, c)
+static void *my_malloc(int s) { return malloc(s); }
+static void my_free(void *p) { free(p); }
+#define malloc(s, t, w) my_malloc(s)
+#define free(p, t) my_free(p)
+#endif /* !_KERNEL */
+
+MALLOC_DEFINE(M_DN_HEAP, "dummynet", "dummynet heap");
+
+/*
+ * Heap management functions.
+ *
+ * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2.
+ * Some macros help finding parent/children so we can optimize them.
+ *
+ * heap_resize() is called to expand the heap when needed: heap_insert()
+ * asks for 16 more entries and the size is rounded up to 2^k - 1.
+ * Returns 1 on error, 0 on success
+ */
+#define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 )
+#define HEAP_LEFT(x) ( (x)+(x) + 1 )
+#define HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; }
+#define HEAP_INCREMENT 15
+
+static int
+heap_resize(struct dn_heap *h, unsigned int new_size)
+{
+ struct dn_heap_entry *p;
+
+ if (h->size >= new_size ) /* have enough room */
+ return 0;
+#if 1 /* round up to the next power of 2, minus 1 */
+ new_size |= new_size >> 1;
+ new_size |= new_size >> 2;
+ new_size |= new_size >> 4;
+ new_size |= new_size >> 8;
+ new_size |= new_size >> 16;
+#else
+ new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT;
+#endif
+ p = malloc(new_size * sizeof(*p), M_DN_HEAP, M_NOWAIT);
+ if (p == NULL) {
+ printf("--- %s, resize %d failed\n", __func__, new_size );
+ return 1; /* error */
+ }
+ if (h->size > 0) {
+ bcopy(h->p, p, h->size * sizeof(*p) );
+ free(h->p, M_DN_HEAP);
+ }
+ h->p = p;
+ h->size = new_size;
+ return 0;
+}
+
+int
+heap_init(struct dn_heap *h, int size, int ofs)
+{
+ if (heap_resize(h, size))
+ return 1;
+ h->elements = 0;
+ h->ofs = ofs;
+ return 0;
+}
+
+/*
+ * Insert element in heap. Normally, p != NULL, we insert p in
+ * a new position and bubble up. If p == NULL, then the element is
+ * already in place, and key is the position where to start the
+ * bubble-up.
+ * Returns 1 on failure (cannot allocate new heap entry)
+ *
+ * If ofs > 0 the position (index, int) of the element in the heap is
+ * also stored in the element itself at the given offset in bytes.
+ */
+#define SET_OFFSET(h, i) do { \
+ if (h->ofs > 0) \
+ *((int32_t *)((char *)(h->p[i].object) + h->ofs)) = i; \
+ } while (0)
+/*
+ * RESET_OFFSET is used for sanity checks. It sets ofs
+ * to an invalid value.
+ */
+#define RESET_OFFSET(h, i) do { \
+ if (h->ofs > 0) \
+ *((int32_t *)((char *)(h->p[i].object) + h->ofs)) = -16; \
+ } while (0)
+
+int
+heap_insert(struct dn_heap *h, uint64_t key1, void *p)
+{
+ int son = h->elements;
+
+ //log("%s key %llu p %p\n", __FUNCTION__, key1, p);
+ if (p == NULL) { /* data already there, set starting point */
+ son = key1;
+ } else { /* insert new element at the end, possibly resize */
+ son = h->elements;
+ if (son == h->size) /* need resize... */
+ // XXX expand by 16 or so
+ if (heap_resize(h, h->elements+16) )
+ return 1; /* failure... */
+ h->p[son].object = p;
+ h->p[son].key = key1;
+ h->elements++;
+ }
+ /* make sure that son >= father along the path */
+ while (son > 0) {
+ int father = HEAP_FATHER(son);
+ struct dn_heap_entry tmp;
+
+ if (DN_KEY_LT( h->p[father].key, h->p[son].key ) )
+ break; /* found right position */
+ /* son smaller than father, swap and repeat */
+ HEAP_SWAP(h->p[son], h->p[father], tmp);
+ SET_OFFSET(h, son);
+ son = father;
+ }
+ SET_OFFSET(h, son);
+ return 0;
+}
+
+/*
+ * remove top element from heap, or obj if obj != NULL
+ */
+void
+heap_extract(struct dn_heap *h, void *obj)
+{
+ int child, father, max = h->elements - 1;
+
+ if (max < 0) {
+ printf("--- %s: empty heap 0x%p\n", __FUNCTION__, h);
+ return;
+ }
+ if (obj == NULL)
+ father = 0; /* default: move up smallest child */
+ else { /* extract specific element, index is at offset */
+ if (h->ofs <= 0)
+ panic("%s: extract from middle not set on %p\n",
+ __FUNCTION__, h);
+ father = *((int *)((char *)obj + h->ofs));
+ if (father < 0 || father >= h->elements) {
+ panic("%s: father %d out of bound 0..%d\n",
+ __FUNCTION__, father, h->elements);
+ }
+ }
+ /*
+ * below, father is the index of the empty element, which
+ * we replace at each step with the smallest child until we
+ * reach the bottom level.
+ */
+ // XXX why removing RESET_OFFSET increases runtime by 10% ?
+ RESET_OFFSET(h, father);
+ while ( (child = HEAP_LEFT(father)) <= max ) {
+ if (child != max &&
+ DN_KEY_LT(h->p[child+1].key, h->p[child].key) )
+ child++; /* take right child, otherwise left */
+ h->p[father] = h->p[child];
+ SET_OFFSET(h, father);
+ father = child;
+ }
+ h->elements--;
+ if (father != max) {
+ /*
+ * Fill hole with last entry and bubble up,
+ * reusing the insert code
+ */
+ h->p[father] = h->p[max];
+ heap_insert(h, father, NULL);
+ }
+}
+
+#if 0
+/*
+ * change object position and update references
+ * XXX this one is never used!
+ */
+static void
+heap_move(struct dn_heap *h, uint64_t new_key, void *object)
+{
+ int temp, i, max = h->elements-1;
+ struct dn_heap_entry *p, buf;
+
+ if (h->ofs <= 0)
+ panic("cannot move items on this heap");
+ p = h->p; /* shortcut */
+
+ i = *((int *)((char *)object + h->ofs));
+ if (DN_KEY_LT(new_key, p[i].key) ) { /* must move up */
+ p[i].key = new_key;
+ for (; i>0 &&
+ DN_KEY_LT(new_key, p[(temp = HEAP_FATHER(i))].key);
+ i = temp ) { /* bubble up */
+ HEAP_SWAP(p[i], p[temp], buf);
+ SET_OFFSET(h, i);
+ }
+ } else { /* must move down */
+ p[i].key = new_key;
+ while ( (temp = HEAP_LEFT(i)) <= max ) {
+ /* found left child */
+ if (temp != max &&
+ DN_KEY_LT(p[temp+1].key, p[temp].key))
+ temp++; /* select child with min key */
+ if (DN_KEY_LT(>p[temp].key, new_key)) {
+ /* go down */
+ HEAP_SWAP(p[i], p[temp], buf);
+ SET_OFFSET(h, i);
+ } else
+ break;
+ i = temp;
+ }
+ }
+ SET_OFFSET(h, i);
+}
+#endif /* heap_move, unused */
+
+/*
+ * heapify() will reorganize data inside an array to maintain the
+ * heap property. It is needed when we delete a bunch of entries.
+ */
+static void
+heapify(struct dn_heap *h)
+{
+ int i;
+
+ for (i = 0; i < h->elements; i++ )
+ heap_insert(h, i , NULL);
+}
+
+int
+heap_scan(struct dn_heap *h, int (*fn)(void *, uintptr_t),
+ uintptr_t arg)
+{
+ int i, ret, found;
+
+ for (i = found = 0 ; i < h->elements ;) {
+ ret = fn(h->p[i].object, arg);
+ if (ret & HEAP_SCAN_DEL) {
+ h->elements-- ;
+ h->p[i] = h->p[h->elements] ;
+ found++ ;
+ } else
+ i++ ;
+ if (ret & HEAP_SCAN_END)
+ break;
+ }
+ if (found)
+ heapify(h);
+ return found;
+}
+
+/*
+ * cleanup the heap and free data structure
+ */
+void
+heap_free(struct dn_heap *h)
+{
+ if (h->size >0 )
+ free(h->p, M_DN_HEAP);
+ bzero(h, sizeof(*h) );
+}
+
+/*
+ * hash table support.
+ */
+
+struct dn_ht {
+ int buckets; /* how many buckets, really buckets - 1*/
+ int entries; /* how many entries */
+ int ofs; /* offset of link field */
+ uint32_t (*hash)(uintptr_t, int, void *arg);
+ int (*match)(void *_el, uintptr_t key, int, void *);
+ void *(*newh)(uintptr_t, int, void *);
+ void **ht; /* bucket heads */
+};
+/*
+ * Initialize, allocating bucket pointers inline.
+ * Recycle previous record if possible.
+ * If the 'newh' function is not supplied, we assume that the
+ * key passed to dn_ht_find() is itself the object to be stored.
+ */
+struct dn_ht *
+dn_ht_init(struct dn_ht *ht, int buckets, int ofs,
+ uint32_t (*h)(uintptr_t, int, void *),
+ int (*match)(void *, uintptr_t, int, void *),
+ void *(*newh)(uintptr_t, int, void *))
+{
+ int l;
+
+ /*
+ * Notes about rounding the bucket count to a power of two.
+ * Given the requested bucket count, we compute the nearest lower and
+ * higher powers of two, minus 1 (b_min and b_max respectively), because
+ * the result is used as a mask that is ANDed with the index returned
+ * by the hash function.
+ * To choose between these two values, the requested bucket count is
+ * compared with b_min: if it is greater than 4/3 of b_min we use
+ * b_max, otherwise b_min.
+ * This ratio tries to round to the nearest power of two, favouring
+ * the larger size when the difference between the two powers is
+ * relatively big.
+ * Rounding the bucket count to a power of two avoids the use of a
+ * modulo operation when computing the bucket index.
+ * ht->buckets stores the bucket count - 1, so that the bucket is
+ * selected by ANDing the hash value with ht->buckets instead of
+ * taking a modulo.
+ */
+ int b_min; /* min buckets */
+ int b_max; /* max buckets */
+ int b_ori; /* original buckets */
+
+ if (h == NULL || match == NULL) {
+ printf("--- missing hash or match function");
+ return NULL;
+ }
+ if (buckets < 1 || buckets > 65536)
+ return NULL;
+
+ b_ori = buckets;
+ /* calculate next power of 2, - 1*/
+ buckets |= buckets >> 1;
+ buckets |= buckets >> 2;
+ buckets |= buckets >> 4;
+ buckets |= buckets >> 8;
+ buckets |= buckets >> 16;
+
+ b_max = buckets; /* Next power */
+ b_min = buckets >> 1; /* Previous power */
+
+ /* Calculate the 'nearest' bucket size */
+ if (b_min * 4000 / 3000 < b_ori)
+ buckets = b_max;
+ else
+ buckets = b_min;
+
+ if (ht) { /* see if we can reuse */
+ if (buckets <= ht->buckets) {
+ ht->buckets = buckets;
+ } else {
+ /* free pointers if not allocated inline */
+ if (ht->ht != (void *)(ht + 1))
+ free(ht->ht, M_DN_HEAP);
+ free(ht, M_DN_HEAP);
+ ht = NULL;
+ }
+ }
+ if (ht == NULL) {
+ /* Allocate buckets + 1 entries because buckets is used as
+ * the mask ANDed with the index returned by the hash function
+ */
+ l = sizeof(*ht) + (buckets + 1) * sizeof(void **);
+ ht = malloc(l, M_DN_HEAP, M_NOWAIT | M_ZERO);
+ }
+ if (ht) {
+ ht->ht = (void **)(ht + 1);
+ ht->buckets = buckets;
+ ht->ofs = ofs;
+ ht->hash = h;
+ ht->match = match;
+ ht->newh = newh;
+ }
+ return ht;
+}
+
+/* dummy callback for dn_ht_free to unlink all */
+static int
+do_del(void *obj, void *arg)
+{
+ return DNHT_SCAN_DEL;
+}
+
+void
+dn_ht_free(struct dn_ht *ht, int flags)
+{
+ if (ht == NULL)
+ return;
+ if (flags & DNHT_REMOVE) {
+ (void)dn_ht_scan(ht, do_del, NULL);
+ } else {
+ if (ht->ht && ht->ht != (void *)(ht + 1))
+ free(ht->ht, M_DN_HEAP);
+ free(ht, M_DN_HEAP);
+ }
+}
+
+int
+dn_ht_entries(struct dn_ht *ht)
+{
+ return ht ? ht->entries : 0;
+}
+
+/* lookup and optionally create or delete element */
+void *
+dn_ht_find(struct dn_ht *ht, uintptr_t key, int flags, void *arg)
+{
+ int i;
+ void **pp, *p;
+
+ if (ht == NULL) /* easy on an empty hash */
+ return NULL;
+ i = (ht->buckets == 1) ? 0 :
+ (ht->hash(key, flags, arg) & ht->buckets);
+
+ for (pp = &ht->ht[i]; (p = *pp); pp = (void **)((char *)p + ht->ofs)) {
+ if (flags & DNHT_MATCH_PTR) {
+ if (key == (uintptr_t)p)
+ break;
+ } else if (ht->match(p, key, flags, arg)) /* found match */
+ break;
+ }
+ if (p) {
+ if (flags & DNHT_REMOVE) {
+ /* link in the next element */
+ *pp = *(void **)((char *)p + ht->ofs);
+ *(void **)((char *)p + ht->ofs) = NULL;
+ ht->entries--;
+ }
+ } else if (flags & DNHT_INSERT) {
+ // printf("%s before calling new, bucket %d ofs %d\n",
+ // __FUNCTION__, i, ht->ofs);
+ p = ht->newh ? ht->newh(key, flags, arg) : (void *)key;
+ // printf("%s newh returns %p\n", __FUNCTION__, p);
+ if (p) {
+ ht->entries++;
+ *(void **)((char *)p + ht->ofs) = ht->ht[i];
+ ht->ht[i] = p;
+ }
+ }
+ return p;
+}
+
+/*
+ * do a scan with the option to delete the object. Extract next before
+ * running the callback because the element may be destroyed there.
+ */
+int
+dn_ht_scan(struct dn_ht *ht, int (*fn)(void *, void *), void *arg)
+{
+ int i, ret, found = 0;
+ void **curp, *cur, *next;
+
+ if (ht == NULL || fn == NULL)
+ return 0;
+ for (i = 0; i <= ht->buckets; i++) {
+ curp = &ht->ht[i];
+ while ( (cur = *curp) != NULL) {
+ next = *(void **)((char *)cur + ht->ofs);
+ ret = fn(cur, arg);
+ if (ret & DNHT_SCAN_DEL) {
+ found++;
+ ht->entries--;
+ *curp = next;
+ } else {
+ curp = (void **)((char *)cur + ht->ofs);
+ }
+ if (ret & DNHT_SCAN_END)
+ return found;
+ }
+ }
+ return found;
+}
+
+/*
+ * Similar to dn_ht_scan(), except that the scan is performed only
+ * in the bucket '*bucket'. If the requested bucket number is out of
+ * range, it is reset to 0 and the scan is done on bucket 0.
+ */
+int
+dn_ht_scan_bucket(struct dn_ht *ht, int *bucket, int (*fn)(void *, void *),
+ void *arg)
+{
+ int i, ret, found = 0;
+ void **curp, *cur, *next;
+
+ if (ht == NULL || fn == NULL)
+ return 0;
+ if (*bucket > ht->buckets)
+ *bucket = 0;
+ i = *bucket;
+
+ curp = &ht->ht[i];
+ while ( (cur = *curp) != NULL) {
+ next = *(void **)((char *)cur + ht->ofs);
+ ret = fn(cur, arg);
+ if (ret & DNHT_SCAN_DEL) {
+ found++;
+ ht->entries--;
+ *curp = next;
+ } else {
+ curp = (void **)((char *)cur + ht->ofs);
+ }
+ if (ret & DNHT_SCAN_END)
+ return found;
+ }
+ return found;
+}
+
diff --git a/sys/netinet/ipfw/dn_heap.h b/sys/netinet/ipfw/dn_heap.h
new file mode 100644
index 0000000..c95473a
--- /dev/null
+++ b/sys/netinet/ipfw/dn_heap.h
@@ -0,0 +1,191 @@
+/*-
+ * Copyright (c) 1998-2010 Luigi Rizzo, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Binary heap and hash tables, header file
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IP_DN_HEAP_H
+#define _IP_DN_HEAP_H
+
+#define DN_KEY_LT(a,b) ((int64_t)((a)-(b)) < 0) /* a before b; compares the signed difference, so it tolerates key wraparound */
+#define DN_KEY_LEQ(a,b) ((int64_t)((a)-(b)) <= 0) /* as DN_KEY_LT, but also true when a == b */
+
+/*
+ * This module implements a binary heap supporting random extraction.
+ *
+ * A heap entry contains an uint64_t key and a pointer to object.
+ * DN_KEY_LT(a,b) returns true if key 'a' is smaller than 'b'
+ *
+ * The heap is a struct dn_heap plus a dynamically allocated
+ * array of dn_heap_entry entries. 'size' represents the size of
+ * the array, 'elements' count entries in use. The topmost
+ * element has the smallest key.
+ * The heap supports ordered insert, and extract from the top.
+ * To extract an object from the middle of the heap, the object
+ * must reserve an 'int32_t' to store the position of the object
+ * in the heap itself, and the location of this field must be
+ * passed as an argument to heap_init() -- use -1 if the feature
+ * is not used.
+ */
+struct dn_heap_entry {
+ uint64_t key; /* sorting key, smallest comes first */
+ void *object; /* object pointer */
+};
+
+struct dn_heap {
+ int size; /* the size of the array */
+ int elements; /* elements in use */
+ int ofs; /* offset in the object of heap index, -1 if not used */
+ struct dn_heap_entry *p; /* array of "size" entries */
+};
+
+enum {
+ HEAP_SCAN_DEL = 1, /* heap_scan() callback: remove the current element */
+ HEAP_SCAN_END = 2, /* heap_scan() callback: terminate the scan */
+};
+
+/*
+ * heap_init() reinitializes the heap setting the size and the offset
+ * of the index for random extraction (use -1 if not used).
+ * The 'elements' counter is set to 0.
+ *
+ * SET_HEAP_OFS() indicates where, in the object, is stored the index
+ * for random extractions from the heap.
+ *
+ * heap_free() frees the memory associated to a heap.
+ *
+ * heap_insert() adds a key-pointer pair to the heap
+ *
+ * HEAP_TOP() returns a pointer to the top element of the heap,
+ * but makes no checks on its existence (XXX should we change ?)
+ *
+ * heap_extract() removes the entry at the top, returning the pointer.
+ * (the key should have been read before).
+ *
+ * heap_scan() invokes a callback on each entry of the heap.
+ * The callback can return a combination of HEAP_SCAN_DEL and
+ * HEAP_SCAN_END. HEAP_SCAN_DEL means the current element must
+ * be removed, and HEAP_SCAN_END means to terminate the scan.
+ * heap_scan() returns the number of elements removed.
+ * Because the order is not guaranteed, we should use heap_scan()
+ * only as a last resort mechanism.
+ */
+#define HEAP_TOP(h) ((h)->p) /* first entry of the array; no check that the heap is non-empty */
+#define SET_HEAP_OFS(h, n) do { (h)->ofs = n; } while (0) /* n: offset of the heap index inside the object */
+int heap_init(struct dn_heap *h, int size, int ofs);
+int heap_insert(struct dn_heap *h, uint64_t key1, void *p);
+void heap_extract(struct dn_heap *h, void *obj);
+void heap_free(struct dn_heap *h);
+int heap_scan(struct dn_heap *, int (*)(void *, uintptr_t), uintptr_t); /* callback returns HEAP_SCAN_* bits */
+
+/*------------------------------------------------------
+ * This module implements a generic hash table with support for
+ * running callbacks on the entire table. To avoid allocating
+ * memory during hash table operations, objects must reserve
+ * space for a link field. XXX if the heap is moderately full,
+ * an SLIST suffices, and we can tolerate the cost of a hash
+ * computation on each removal.
+ *
+ * dn_ht_init() initializes the table, setting the number of
+ * buckets, the offset of the link field, the main callbacks.
+ * Callbacks are:
+ *
+ * hash(key, flags, arg) called to return a bucket index.
+ * match(obj, key, flags, arg) called to determine if key
+ * matches the current 'obj' in the heap
+ * newh(key, flags, arg) optional, used to allocate a new
+ * object during insertions.
+ *
+ * dn_ht_free() frees the heap or unlink elements.
+ * DNHT_REMOVE unlink elements, 0 frees the heap.
+ * You need two calls to do both.
+ *
+ * dn_ht_find() is the main lookup function, which can also be
+ * used to insert or delete elements in the hash table.
+ * The final 'arg' is passed to all callbacks.
+ *
+ * dn_ht_scan() is used to invoke a callback on all entries of
+ * the heap, or possibly on just one bucket. The callback
+ * is invoked with a pointer to the object, and must return
+ * one of DNHT_SCAN_DEL or DNHT_SCAN_END to request the
+ * removal of the object from the heap and the end of the
+ * scan, respectively.
+ *
+ * dn_ht_scan_bucket() is similar to dn_ht_scan(), except that it scans
+ * only the specific bucket of the table. The bucket is a in-out
+ * parameter and return a valid bucket number if the original
+ * is invalid.
+ *
+ * A combination of flags can be used to modify the operation
+ * of the dn_ht_find(), and of the callbacks:
+ *
+ * DNHT_KEY_IS_OBJ means the key is the object pointer.
+ * It is usually of interest for the hash and match functions.
+ *
+ * DNHT_MATCH_PTR during a lookup, match pointers instead
+ * of calling match(). Normally used when removing specific
+ * entries. Does not imply KEY_IS_OBJ as the latter _is_ used
+ * by the match function.
+ *
+ * DNHT_INSERT insert the element if not found.
+ * Calls newh() to allocate a new object unless
+ * DNHT_KEY_IS_OBJ is set.
+ *
+ * DNHT_UNIQUE only insert if object not found.
+ * XXX should it imply DNHT_INSERT ?
+ *
+ * DNHT_REMOVE remove objects if we find them.
+ */
+struct dn_ht; /* should be opaque */
+
+struct dn_ht *dn_ht_init(struct dn_ht *, int buckets, int ofs,
+ uint32_t (*hash)(uintptr_t, int, void *),
+ int (*match)(void *, uintptr_t, int, void *),
+ void *(*newh)(uintptr_t, int, void *));
+void dn_ht_free(struct dn_ht *, int flags); /* flags: DNHT_REMOVE to unlink elements, 0 to free the table */
+
+void *dn_ht_find(struct dn_ht *, uintptr_t, int, void *); /* args: table, key, DNHT_* flags, arg passed to the callbacks */
+int dn_ht_scan(struct dn_ht *, int (*)(void *, void *), void *); /* callback returns DNHT_SCAN_* bits */
+int dn_ht_scan_bucket(struct dn_ht *, int * , int (*)(void *, void *), void *); /* the int * is the in-out bucket number */
+int dn_ht_entries(struct dn_ht *);
+
+enum { /* flags values.
+ * first two are returned by the scan callback to indicate
+ * to delete the matching element or to end the scan;
+ * the remaining ones are passed in the 'flags' argument of
+ * dn_ht_find() (and DNHT_REMOVE also to dn_ht_free()).
+ */
+ DNHT_SCAN_DEL = 0x0001,
+ DNHT_SCAN_END = 0x0002,
+ DNHT_KEY_IS_OBJ = 0x0004, /* key is the obj pointer */
+ DNHT_MATCH_PTR = 0x0008, /* match by pointer, not match() */
+ DNHT_INSERT = 0x0010, /* insert if not found */
+ DNHT_UNIQUE = 0x0020, /* report error if already there */
+ DNHT_REMOVE = 0x0040, /* remove on find or dn_ht_free */
+};
+
+#endif /* _IP_DN_HEAP_H */
diff --git a/sys/netinet/ipfw/dn_sched.h b/sys/netinet/ipfw/dn_sched.h
new file mode 100644
index 0000000..fe54b02
--- /dev/null
+++ b/sys/netinet/ipfw/dn_sched.h
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2010 Riccardo Panicucci, Luigi Rizzo, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * The API to write a packet scheduling algorithm for dummynet.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _DN_SCHED_H
+#define _DN_SCHED_H
+
+#define DN_MULTIQUEUE 0x01 /* value for dn_alg.flags: scheduler supports multiple queues */
+/*
+ * Descriptor for a scheduling algorithm.
+ * Contains all function pointers for a given scheduler
+ * This is typically created when a module is loaded, and stored
+ * in a global list of schedulers.
+ */
+struct dn_alg {
+ uint32_t type; /* the scheduler type */
+ const char *name; /* scheduler name */
+ uint32_t flags; /* DN_MULTIQUEUE if supports multiple queues */
+
+ /*
+ * The following define the size of 3 optional data structures
+ * that may need to be allocated at runtime, and are appended
+ * to each of the base data structures: scheduler, sched.inst,
+ * and queue. We don't have a per-flowset structure.
+ */
+ /* + parameters attached to the template, e.g.
+ * default queue sizes, weights, quantum size, and so on;
+ */
+ size_t schk_datalen;
+
+ /* + per-instance parameters, such as timestamps,
+ * containers for queues, etc;
+ */
+ size_t si_datalen;
+
+ size_t q_datalen; /* per-queue parameters (e.g. S,F) */
+
+ /*
+ * Methods implemented by the scheduler:
+ * enqueue enqueue packet 'm' on scheduler 's', queue 'q'.
+ * q is NULL for !MULTIQUEUE.
+ * Return 0 on success, 1 on drop (packet consumed anyways).
+ * Note that q should be interpreted only as a hint
+ * on the flow that the mbuf belongs to: while a
+ * scheduler will normally enqueue m into q, it is ok
+ * to leave q alone and put the mbuf elsewhere.
+ * This function is called in two cases:
+ * - when a new packet arrives to the scheduler;
+ * - when a scheduler is reconfigured. In this case the
+ * call is issued by the new_queue callback, with a
+ * non empty queue (q) and m pointing to the first
+ * mbuf in the queue. For this reason, the function
+ * should internally check for (m != q->mq.head)
+ * before calling dn_enqueue().
+ *
+ * dequeue Called when scheduler instance 's' can
+ * dequeue a packet. Return NULL if none are available.
+ * XXX what about non work-conserving ?
+ *
+ * config called on 'sched X config ...', normally writes
+ * in the area of size sch_arg
+ *
+ * destroy called on 'sched delete', frees everything
+ * in sch_arg (other parts are handled by more specific
+ * functions)
+ *
+ * new_sched called when a new instance is created, e.g.
+ * to create the local queue for !MULTIQUEUE, set V or
+ * copy parameters for WFQ, and so on.
+ *
+ * free_sched called when deleting an instance, cleans
+ * extra data in the per-instance area.
+ *
+ * new_fsk called when a flowset is linked to a scheduler,
+ * e.g. to validate parameters such as weights etc.
+ * free_fsk when a flowset is unlinked from a scheduler.
+ * (probably unnecessary)
+ *
+ * new_queue called to set the per-queue parameters,
+ * e.g. S and F, adjust sum of weights in the parent, etc.
+ *
+ * The new_queue callback is normally called when
+ * creating a new queue. In some cases (such as a
+ * scheduler change or reconfiguration) it can be called
+ * with a non empty queue. In that case the new_queue
+ * callback may need to call the enqueue function, and
+ * it should eventually call enqueue() passing as m
+ * the first element in the queue.
+ *
+ * free_queue actions related to a queue removal, e.g. undo
+ * all the above. If the queue has data in it, also remove
+ * from the scheduler. This can e.g. happen during a reconfigure.
+ */
+ int (*enqueue)(struct dn_sch_inst *, struct dn_queue *,
+ struct mbuf *);
+ struct mbuf * (*dequeue)(struct dn_sch_inst *);
+
+ int (*config)(struct dn_schk *);
+ int (*destroy)(struct dn_schk*);
+ int (*new_sched)(struct dn_sch_inst *);
+ int (*free_sched)(struct dn_sch_inst *);
+ int (*new_fsk)(struct dn_fsk *f);
+ int (*free_fsk)(struct dn_fsk *f);
+ int (*new_queue)(struct dn_queue *q);
+ int (*free_queue)(struct dn_queue *q);
+
+ /* run-time fields */
+ int ref_count; /* XXX number of instances in the system */
+ SLIST_ENTRY(dn_alg) next; /* Next scheduler in the list */
+};
+
+/*
+ * MSVC does not support designated initializers so we need this ugly
+ * macro: _SI() drops its argument (the '.field =' part) on _WIN32,
+ * leaving a positional initializer, and keeps it everywhere else.
+ * Initializers written with _SI() must therefore list the values in
+ * the same order as the fields of the structure.
+ */
+#ifdef _WIN32
+#define _SI(fld)
+#else
+#define _SI(fld) fld
+#endif
+
+/*
+ * Additionally, dummynet exports some functions and macros
+ * to be used by schedulers:
+ */
+
+void dn_free_pkts(struct mbuf *mnext);
+int dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop);
+/* bound a variable between min and max */
+int ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg);
+
+/*
+ * Detach the first packet of queue 'q' and return it, or return NULL
+ * if the queue holds no packet. Packet and byte counters of the queue
+ * (and of the scheduler instance, when q->_si is set) are decreased,
+ * and q->q_time records dn_cfg.curr_time when the queue drains.
+ * Must be the very last thing done on a dequeue as the queue itself
+ * may go away.
+ */
+static __inline struct mbuf*
+dn_dequeue(struct dn_queue *q)
+{
+	struct mbuf *pkt;
+
+	pkt = q->mq.head;
+	if (pkt != NULL) {
+		q->mq.head = pkt->m_nextpkt;
+		/* per-queue accounting */
+		q->ni.len_bytes -= pkt->m_pkthdr.len;
+		q->ni.length--;
+		/* per-instance accounting, if the queue is attached */
+		if (q->_si != NULL) {
+			q->_si->ni.len_bytes -= pkt->m_pkthdr.len;
+			q->_si->ni.length--;
+		}
+		/* queue is now idle: remember when it happened */
+		if (q->ni.length == 0)
+			q->q_time = dn_cfg.curr_time;
+	}
+	return pkt;
+}
+
+int dn_sched_modevent(module_t mod, int cmd, void *arg); /* event handler installed by DECLARE_DNSCHED_MODULE(); arg is the 'dnsched' descriptor */
+
+#define DECLARE_DNSCHED_MODULE(name, dnsched) \
+ static moduledata_t name##_mod = { \
+ #name, dn_sched_modevent, dnsched \
+ }; \
+ DECLARE_MODULE(name, name##_mod, \
+ SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); \
+ MODULE_DEPEND(name, dummynet, 3, 3, 3);
+#endif /* _DN_SCHED_H */
diff --git a/sys/netinet/ipfw/dn_sched_fifo.c b/sys/netinet/ipfw/dn_sched_fifo.c
new file mode 100644
index 0000000..0bb3800
--- /dev/null
+++ b/sys/netinet/ipfw/dn_sched_fifo.c
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ */
+
+#ifdef _KERNEL
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/kernel.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <net/if.h> /* IFNAMSIZ */
+#include <netinet/in.h>
+#include <netinet/ip_var.h> /* ipfw_rule_ref */
+#include <netinet/ip_fw.h> /* flow_id */
+#include <netinet/ip_dummynet.h>
+#include <netinet/ipfw/dn_heap.h>
+#include <netinet/ipfw/ip_dn_private.h>
+#include <netinet/ipfw/dn_sched.h>
+#else
+#include <dn_test.h>
+#endif
+
+/*
+ * This file implements a FIFO scheduler for a single queue.
+ * The queue is allocated as part of the scheduler instance,
+ * and there is a single flowset in the template which stores
+ * queue size and policy.
+ * Enqueue and dequeue use the default library functions.
+ */
+static int
+fifo_enqueue(struct dn_sch_inst *si, struct dn_queue *q, struct mbuf *m)
+{
+	/* the only queue is stored right after the instance; 'q' is unused */
+	struct dn_queue *fifo_q = (struct dn_queue *)(si + 1);
+
+	/*
+	 * XXX a call with q != NULL and m == NULL is a re-enqueue from
+	 * an existing scheduler, which we should handle.
+	 */
+	return dn_enqueue(fifo_q, m, 0);
+}
+
+static struct mbuf *
+fifo_dequeue(struct dn_sch_inst *si)
+{
+	/* the only queue is stored right after the instance */
+	struct dn_queue *fifo_q = (struct dn_queue *)(si + 1);
+
+	return dn_dequeue(fifo_q);
+}
+
+static int
+fifo_new_sched(struct dn_sch_inst *si)
+{
+	struct dn_queue *fifo_q;
+
+	/* the queue is embedded in the instance, right after 'si' */
+	fifo_q = (struct dn_queue *)(si + 1);
+	set_oid(&fifo_q->ni.oid, DN_QUEUE, sizeof(*fifo_q));
+	/* link the queue to the scheduler's flowset and to the instance */
+	fifo_q->fs = si->sched->fs;
+	fifo_q->_si = si;
+	return 0;
+}
+
+static int
+fifo_free_sched(struct dn_sch_inst *si)
+{
+	struct dn_queue *fifo_q;
+
+	fifo_q = (struct dn_queue *)(si + 1);
+	/* release the packets still queued, then wipe the queue */
+	dn_free_pkts(fifo_q->mq.head);
+	bzero(fifo_q, sizeof(*fifo_q));
+	return 0;
+}
+
+/*
+ * FIFO scheduler descriptor
+ * contains the type of the scheduler, the name, the size of extra
+ * data structures, and function pointers.
+ * Entries are listed in the order of the fields of struct dn_alg,
+ * because _SI() may drop the field designators.
+ */
+static struct dn_alg fifo_desc = {
+ _SI( .type = ) DN_SCHED_FIFO,
+ _SI( .name = ) "FIFO",
+ _SI( .flags = ) 0, /* DN_MULTIQUEUE not set */
+
+ _SI( .schk_datalen = ) 0,
+ _SI( .si_datalen = ) sizeof(struct dn_queue), /* room for the queue embedded after the instance */
+ _SI( .q_datalen = ) 0,
+
+ _SI( .enqueue = ) fifo_enqueue,
+ _SI( .dequeue = ) fifo_dequeue,
+ _SI( .config = ) NULL,
+ _SI( .destroy = ) NULL,
+ _SI( .new_sched = ) fifo_new_sched,
+ _SI( .free_sched = ) fifo_free_sched,
+ _SI( .new_fsk = ) NULL,
+ _SI( .free_fsk = ) NULL,
+ _SI( .new_queue = ) NULL,
+ _SI( .free_queue = ) NULL,
+};
+
+DECLARE_DNSCHED_MODULE(dn_fifo, &fifo_desc);
diff --git a/sys/netinet/ipfw/dn_sched_qfq.c b/sys/netinet/ipfw/dn_sched_qfq.c
new file mode 100644
index 0000000..44555ee
--- /dev/null
+++ b/sys/netinet/ipfw/dn_sched_qfq.c
@@ -0,0 +1,864 @@
+/*
+ * Copyright (c) 2010 Fabio Checconi, Luigi Rizzo, Paolo Valente
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ */
+
+#ifdef _KERNEL
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/kernel.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <net/if.h> /* IFNAMSIZ */
+#include <netinet/in.h>
+#include <netinet/ip_var.h> /* ipfw_rule_ref */
+#include <netinet/ip_fw.h> /* flow_id */
+#include <netinet/ip_dummynet.h>
+#include <netinet/ipfw/dn_heap.h>
+#include <netinet/ipfw/ip_dn_private.h>
+#include <netinet/ipfw/dn_sched.h>
+#else
+#include <dn_test.h>
+#endif
+
+#ifdef QFQ_DEBUG
+struct qfq_sched;
+static void dump_sched(struct qfq_sched *q, const char *msg);
+#define NO(x) x /* debug build: the wrapped code is kept */
+#else
+#define NO(x) /* non-debug build: the wrapped code is dropped */
+#endif
+#define DN_SCHED_QFQ 4 // XXX Where?
+typedef unsigned long bitmap; /* used for the group bitmaps and for the per-group slot mask */
+
+/*
+ * bitmaps ops are critical. Some linux versions have __fls
+ * and the bitmap ops. Some machines have ffs
+ */
+#if defined(_WIN32)
+/*
+ * Fallback fls(): 1-based position of the most significant bit set
+ * in n, 0 if n is 0.
+ */
+int fls(unsigned int n)
+{
+	int bits = 0;
+
+	while (n > 0) {
+		n >>= 1;
+		bits++;
+	}
+	return bits;
+}
+#endif
+
+#if !defined(_KERNEL) || defined( __FreeBSD__ ) || defined(_WIN32)
+/*
+ * Zero-based index of the most significant bit set in 'word',
+ * built on top of the one-based fls().
+ * NOTE(review): fls() takes an int-sized argument, so bits of 'word'
+ * above that width do not appear to be examined -- confirm whether
+ * callers can pass wider values.
+ */
+static inline unsigned long __fls(unsigned long word)
+{
+	int one_based = fls(word);
+
+	return one_based - 1;
+}
+#endif
+
+#if !defined(_KERNEL) || !defined(__linux__)
+/*
+ * Single-bit operations on a 'bitmap' (an unsigned long):
+ *   test_bit(ix, p)	non-zero if bit ix of *p is set
+ *   __set_bit(ix, p)	set bit ix of *p
+ *   __clear_bit(ix, p)	clear bit ix of *p
+ *
+ * The mask is built as 1UL << ix so that the shift is done in the
+ * type of the bitmap. With a plain int constant, 1 << 31 overflows a
+ * signed int, and where long is wider than int the sign-extended
+ * result would also touch every bit above 31.
+ */
+#ifdef QFQ_DEBUG
+int test_bit(int ix, bitmap *p)
+{
+	if (ix < 0 || ix > 31)
+		D("bad index %d", ix);
+	/* normalize, the masked value may not fit the int return type */
+	return (*p & (1UL << ix)) != 0;
+}
+void __set_bit(int ix, bitmap *p)
+{
+	if (ix < 0 || ix > 31)
+		D("bad index %d", ix);
+	*p |= (1UL << ix);
+}
+void __clear_bit(int ix, bitmap *p)
+{
+	if (ix < 0 || ix > 31)
+		D("bad index %d", ix);
+	*p &= ~(1UL << ix);
+}
+#else /* !QFQ_DEBUG */
+/* XXX do we have fast version, or leave it to the compiler ? */
+#define test_bit(ix, pData)	((*(pData)) & (1UL << (ix)))
+#define __set_bit(ix, pData)	(*(pData)) |= (1UL << (ix))
+#define __clear_bit(ix, pData)	(*(pData)) &= ~(1UL << (ix))
+#endif /* !QFQ_DEBUG */
+#endif /* !__linux__ */
+
+#ifdef __MIPSEL__
+/* must stay token-identical to the definition above */
+#define __clear_bit(ix, pData)	(*(pData)) &= ~(1UL << (ix))
+#endif
+
+/*-------------------------------------------*/
+/*
+
+Virtual time computations.
+
+S, F and V are all computed in fixed point arithmetic with
+FRAC_BITS decimal bits.
+
+ QFQ_MAX_INDEX is the maximum index allowed for a group. We need
+ one bit per index.
+ QFQ_MAX_WSHIFT is the maximum power of two supported as a weight.
+ The layout of the bits is as below:
+
+ [ MTU_SHIFT ][ FRAC_BITS ]
+ [ MAX_INDEX ][ MIN_SLOT_SHIFT ]
+ ^.__grp->index = 0
+ *.__grp->slot_shift
+
+ where MIN_SLOT_SHIFT is derived by difference from the others.
+
+The max group index corresponds to Lmax/w_min, where
+Lmax=1<<MTU_SHIFT, w_min = 1 .
+From this, and knowing how many groups (MAX_INDEX) we want,
+we can derive the shift corresponding to each group.
+
+Because we often need to compute
+ F = S + len/w_i and V = V + len/wsum
+instead of storing w_i store the value
+ inv_w = (1<<FRAC_BITS)/w_i
+so we can do F = S + len * inv_w * wsum.
+We use W_TOT in the formulas so we can easily move between
+static and adaptive weight sum.
+
+The per-scheduler-instance data contain all the data structures
+for the scheduler: bitmaps and bucket lists.
+
+ */
+/*
+ * Maximum number of consecutive slots occupied by backlogged classes
+ * inside a group. This is approx lmax/lmin + 5.
+ * XXX check because it poses constraints on MAX_INDEX
+ * It is also the size of the slots[] array in struct qfq_group.
+ */
+#define QFQ_MAX_SLOTS 32
+/*
+ * Shifts used for class<->group mapping. Class weights are
+ * in the range [1, QFQ_MAX_WEIGHT], we map each class i to the
+ * group with the smallest index that can support the L_i / r_i
+ * configured for the class.
+ *
+ * grp->index is the index of the group; and grp->slot_shift
+ * is the shift for the corresponding (scaled) sigma_i.
+ *
+ * When computing the group index, we do (len<<FP_SHIFT)/weight,
+ * then compute an FLS (which is like a log2()), and if the result
+ * is below the MAX_INDEX region we use 0 (which is the same as
+ * using a larger len).
+ */
+#define QFQ_MAX_INDEX 19 /* highest group index: groups are numbered 0..QFQ_MAX_INDEX */
+#define QFQ_MAX_WSHIFT 16 /* log2(max_weight) */
+
+#define QFQ_MAX_WEIGHT (1<<QFQ_MAX_WSHIFT) /* largest weight accepted for a class */
+#define QFQ_MAX_WSUM (2*QFQ_MAX_WEIGHT) /* upper bound on the sum of the weights of a scheduler instance */
+//#define IWSUM (q->i_wsum)
+#define IWSUM ((1<<FRAC_BITS)/QFQ_MAX_WSUM) /* fixed-point inverse of the (static) weight sum */
+
+#define FRAC_BITS 30 /* fixed point arithmetic */
+#define ONE_FP (1UL << FRAC_BITS) /* 1.0 in fixed point */
+
+#define QFQ_MTU_SHIFT 11 /* log2(max_len) */
+#define QFQ_MIN_SLOT_SHIFT (FRAC_BITS + QFQ_MTU_SHIFT - QFQ_MAX_INDEX) /* derived by difference from the other shifts */
+
+/*
+ * Possible group states, also indexes for the bitmaps array in
+ * struct qfq_sched. We rely on ER, IR, EB, IB being numbered 0..3:
+ * bit 0 is set when the group is ineligible (I vs. E), bit 1 when
+ * it is blocked (B; R presumably stands for "ready").
+ */
+enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE };
+
+struct qfq_group;
+/*
+ * additional queue info. Some of this info should come from
+ * the flowset, we copy them here for faster processing.
+ * This is an overlay of the struct dn_queue
+ */
+struct qfq_class {
+ struct dn_queue _q; /* must stay first: a struct dn_queue * is cast to struct qfq_class * */
+ uint64_t S, F; /* flow timestamps (exact) */
+ struct qfq_class *next; /* Link for the slot list. */
+
+ /* group we belong to. In principle we would need the index,
+ * which is log_2(lmax/weight), but we never reference it
+ * directly, only the group.
+ */
+ struct qfq_group *grp;
+
+ /* these are copied from the flowset. */
+ uint32_t inv_w; /* ONE_FP/weight; 0 when the weight is not accounted in wsum */
+ uint32_t lmax; /* Max packet size for this flow. */
+};
+
+/* Group descriptor, see the paper for details.
+ * Basically this contains the bucket lists.
+ * slots[] is used as a circular array starting at 'front':
+ * bit i of full_slots refers to slots[(front + i) % QFQ_MAX_SLOTS].
+ */
+struct qfq_group {
+ uint64_t S, F; /* group timestamps (approx). */
+ unsigned int slot_shift; /* Slot shift. */
+ unsigned int index; /* Group index. */
+ unsigned int front; /* Index of the front slot. */
+ bitmap full_slots; /* non-empty slots */
+
+ /* Array of lists of active classes. */
+ struct qfq_class *slots[QFQ_MAX_SLOTS];
+};
+
+/*
+ * Scheduler instance descriptor: virtual time, weight sum, one
+ * bitmap of groups per state (see enum qfq_state) and the groups.
+ * The fields wrapped in NO() exist only when QFQ_DEBUG is defined.
+ */
+struct qfq_sched {
+	uint64_t	V;		/* Precise virtual time. */
+	uint32_t	wsum;		/* weight sum */
+	NO(uint32_t	i_wsum;		/* ONE_FP/w_sum */
+	uint32_t	queued;		/* debugging, read by qfq_dequeue() */
+	uint32_t	loops;		/* debugging */)
+	bitmap bitmaps[QFQ_MAX_STATE];	/* Group bitmaps. */
+	struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */
+};
+
+/*---- support functions ----------------------------*/
+
+/*
+ * Wraparound-aware "a > b": the comparison is done on the sign of
+ * the 64-bit difference rather than on the raw values.
+ */
+static inline int qfq_gt(uint64_t a, uint64_t b)
+{
+	int64_t delta = (int64_t)(a - b);
+
+	return delta > 0;
+}
+
+/* Slotted value of a precise timestamp: clear its low 'shift' bits. */
+static inline uint64_t qfq_round_down(uint64_t ts, unsigned int shift)
+{
+	uint64_t low_bits = (1ULL << shift) - 1;
+
+	return ts & ~low_bits;
+}
+
+/*
+ * Group of scheduler 'q' selected by the lowest bit set in 'bitmap'.
+ * Callers in this file pass a non-zero bitmap; ffs() is one-based,
+ * hence the -1 to obtain the array index.
+ */
+static inline struct qfq_group *qfq_ffs(struct qfq_sched *q,
+    unsigned long bitmap)
+{
+	return &q->groups[ffs(bitmap) - 1];
+}
+
+/*
+ * Group index for a flow, from its inverse weight and its maximum
+ * packet length: index = log_2(maxlen/weight), after scaling by
+ * QFQ_MIN_SLOT_SHIFT. Anything below the scaled range maps to 0.
+ * This is used only once at flow creation.
+ */
+static int qfq_calc_index(uint32_t inv_w, unsigned int maxlen)
+{
+	uint64_t slot_size = (uint64_t)maxlen * inv_w;
+	unsigned long size_map =
+	    (unsigned long)(slot_size >> QFQ_MIN_SLOT_SHIFT);
+	int index = 0;
+
+	if (size_map != 0) {
+		index = __fls(size_map) + 1;	// basically a log_2()
+		/* an exact power of two fits in the group below */
+		if (slot_size == (1ULL << (index + QFQ_MIN_SLOT_SHIFT - 1)))
+			index--;
+		if (index < 0)
+			index = 0;
+	}
+
+	ND("W = %d, L = %d, I = %d\n", ONE_FP/inv_w, maxlen, index);
+	return index;
+}
+/*---- end support functions ----*/
+
+/*-------- API calls --------------------------------*/
+/*
+ * Validate and copy parameters from flowset.
+ *
+ * The weight is read from par[0] of the flowset and the maximum
+ * packet length from par[1]. A weight of 0 or above QFQ_MAX_WEIGHT is
+ * replaced by 1. The class is then bound to its group and its weight
+ * is added to the weight sum of the scheduler instance.
+ *
+ * Returns 0 on success, EINVAL if adding the weight would push the
+ * weight sum above QFQ_MAX_WSUM. In the latter case cl->inv_w is left
+ * untouched: qfq_free_queue() uses a non-zero inv_w as the sign that
+ * the weight is accounted in wsum, so it must not be set for a class
+ * that was never admitted.
+ */
+static int
+qfq_new_queue(struct dn_queue *_q)
+{
+	struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1);
+	struct qfq_class *cl = (struct qfq_class *)_q;
+	int i;
+	uint32_t w;	/* approximated weight */
+	uint32_t inv_w;	/* ONE_FP/weight, stored only on success */
+
+	/* import parameters from the flowset. They should be correct
+	 * already.
+	 */
+	w = _q->fs->fs.par[0];
+	cl->lmax = _q->fs->fs.par[1];
+	if (!w || w > QFQ_MAX_WEIGHT) {
+		w = 1;
+		D("rounding weight to 1");
+	}
+	inv_w = ONE_FP/w;
+	/* the weight actually represented by the rounded inverse */
+	w = ONE_FP/inv_w;
+	if (q->wsum + w > QFQ_MAX_WSUM)
+		return EINVAL;
+
+	cl->inv_w = inv_w;
+	i = qfq_calc_index(cl->inv_w, cl->lmax);
+	cl->grp = &q->groups[i];
+	q->wsum += w;
+	// XXX cl->S = q->V; ?
+	// XXX compute q->i_wsum
+	return 0;
+}
+
+/*
+ * Remove an empty queue: give its weight back to the weight sum of
+ * the scheduler instance. inv_w is zeroed afterwards so that a second
+ * call on the same queue does nothing. Always returns 0.
+ */
+static int
+qfq_free_queue(struct dn_queue *_q)
+{
+	struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1);
+	struct qfq_class *cl = (struct qfq_class *)_q;
+
+	if (cl->inv_w == 0)
+		return 0;	/* weight not accounted, nothing to undo */
+	q->wsum -= ONE_FP/cl->inv_w;
+	cl->inv_w = 0;
+	return 0;
+}
+
+/*
+ * Keep only the bits of 'bitmap' at position 'from' and above, so
+ * that a following ffs() behaves like an ffs_from().
+ */
+static inline unsigned long
+mask_from(unsigned long bitmap, int from)
+{
+	unsigned long below = (1UL << from) - 1;
+
+	return bitmap & ~below;
+}
+
+/*
+ * Compute the state of a group, relying on ER=0, IR=1, EB=2, IB=3.
+ * Bit 0 comes from eligibility (set when grp->S is after q->V).
+ * EB is then or-ed in when the lowest-indexed group in ER, among
+ * those with index >= grp->index, has a finish time that precedes
+ * grp->F, i.e. someone is blocking us.
+ */
+static inline unsigned int
+qfq_calc_state(struct qfq_sched *q, struct qfq_group *grp)
+{
+	unsigned int state;
+	unsigned long blockers;
+
+	/* if S > V we are not eligible */
+	state = qfq_gt(grp->S, q->V);
+	blockers = mask_from(q->bitmaps[ER], grp->index);
+	if (blockers != 0 && qfq_gt(grp->F, qfq_ffs(q, blockers)->F))
+		state |= EB;
+
+	return state;
+}
+
+/*
+ * Move the groups selected by 'mask' from bitmap 'src' to bitmap
+ * 'dst': the selected bits that are set in src get set in dst, and
+ * all the selected bits are cleared in src.
+ * Callers should make sure that src != dst.
+ */
+static inline void
+qfq_move_groups(struct qfq_sched *q, unsigned long mask, int src, int dst)
+{
+	unsigned long moving = q->bitmaps[src] & mask;
+
+	q->bitmaps[dst] |= moving;
+	q->bitmaps[src] &= ~mask;
+}
+
+/*
+ * Move the groups with index below 'index' from the blocked bitmaps
+ * to the corresponding non-blocked ones (EB -> ER, IB -> IR).
+ * Nothing is done when there is a group in ER with index above
+ * 'index' whose finish time is not after 'old_finish'.
+ */
+static inline void
+qfq_unblock_groups(struct qfq_sched *q, int index, uint64_t old_finish)
+{
+	unsigned long higher = mask_from(q->bitmaps[ER], index + 1);
+	unsigned long lower;
+
+	if (higher != 0 && !qfq_gt(qfq_ffs(q, higher)->F, old_finish))
+		return;
+
+	lower = (1UL << index) - 1;
+	qfq_move_groups(q, lower, EB, ER);
+	qfq_move_groups(q, lower, IB, IR);
+}
+
+/*
+ * perhaps
+ *
+ old_V ^= q->V;
+ old_V >>= QFQ_MIN_SLOT_SHIFT;
+ if (old_V) {
+ ...
+ }
+ *
+ */
+/*
+ * Called after q->V has moved from old_V: if the virtual time has
+ * entered a different slot (at QFQ_MIN_SLOT_SHIFT granularity), the
+ * groups with index up to the highest slot bit that changed are
+ * moved from the ineligible bitmaps to the eligible ones.
+ */
+static inline void
+qfq_make_eligible(struct qfq_sched *q, uint64_t old_V)
+{
+	unsigned long now_slot = q->V >> QFQ_MIN_SLOT_SHIFT;
+	unsigned long prev_slot = old_V >> QFQ_MIN_SLOT_SHIFT;
+	unsigned long changed = now_slot ^ prev_slot;
+	unsigned long mask;
+
+	if (changed == 0)
+		return;		/* still in the same slot */
+
+	mask = (2UL << (__fls(changed))) - 1;
+	qfq_move_groups(q, mask, IR, ER);
+	qfq_move_groups(q, mask, IB, EB);
+}
+
+/*
+ * Put class 'cl' at the head of the list of the slot corresponding
+ * to 'roundedS' and mark the slot as full.
+ * roundedS is always cl->S rounded on grp->slot_shift bits.
+ * XXX we should make sure that the slot offset is less than 32.
+ * This is guaranteed by the input values.
+ */
+static inline void
+qfq_slot_insert(struct qfq_group *grp, struct qfq_class *cl, uint64_t roundedS)
+{
+	/* distance, in slots, from the start time of the group */
+	uint64_t offset = (roundedS - grp->S) >> grp->slot_shift;
+	/* position in the circular array */
+	unsigned int pos = (grp->front + offset) % QFQ_MAX_SLOTS;
+
+	cl->next = grp->slots[pos];
+	grp->slots[pos] = cl;
+	__set_bit(offset, &grp->full_slots);
+}
+
+/*
+ * Unlink the first class from the front slot of the group, and
+ * clear bit 0 of full_slots if the slot becomes empty.
+ * The front slot must not be empty.
+ */
+static inline void
+qfq_front_slot_remove(struct qfq_group *grp)
+{
+	struct qfq_class *rest = grp->slots[grp->front]->next;
+
+	grp->slots[grp->front] = rest;
+	if (rest == NULL)
+		__clear_bit(0, &grp->full_slots);
+}
+
+/*
+ * Return the first class in the first non-empty slot of the group,
+ * or NULL if all the slots are empty. As a side effect, 'front' and
+ * full_slots are adjusted so that the first non-empty slot is the
+ * front one, i.e. bit 0 of full_slots.
+ */
+static inline struct qfq_class *
+qfq_slot_scan(struct qfq_group *grp)
+{
+	int skip;
+
+	ND("grp %d full %x", grp->index, grp->full_slots);
+	if (grp->full_slots == 0)
+		return NULL;
+
+	/* number of empty slots in front of the first full one */
+	skip = ffs(grp->full_slots) - 1;
+	if (skip > 0) {
+		grp->full_slots >>= skip;
+		grp->front = (grp->front + skip) % QFQ_MAX_SLOTS;
+	}
+
+	return grp->slots[grp->front];
+}
+
+/*
+ * Adjust the bucket list when the start time of a group decreases to
+ * 'roundedS': instead of moving the objects, the front index is moved
+ * down (modulo QFQ_MAX_SLOTS) by the number of slots between the new
+ * and the old start time. The mask of occupied slots is shifted by
+ * the same amount, because ffs() is used on it to find the first
+ * non-empty slot.
+ * This covers decreases in the group's start time, but what about
+ * increases of the start time ?
+ * Here too we should make sure that the shift is less than 32.
+ * 'q' is not used.
+ */
+static inline void
+qfq_slot_rotate(struct qfq_sched *q, struct qfq_group *grp, uint64_t roundedS)
+{
+	unsigned int back = (grp->S - roundedS) >> grp->slot_shift;
+
+	grp->front = (grp->front - back) % QFQ_MAX_SLOTS;
+	grp->full_slots <<= back;
+}
+
+
+/*
+ * Re-evaluate eligibility after V changed from old_V.
+ * Does nothing when both the IR and IB bitmaps are empty.  Otherwise,
+ * if ER is empty, V is first raised to the start time of the group
+ * that qfq_ffs() picks among the ineligible ones (only if that start
+ * time is ahead of V); then qfq_make_eligible() is run.
+ */
+static inline void
+qfq_update_eligible(struct qfq_sched *q, uint64_t old_V)
+{
+	bitmap waiting = q->bitmaps[IR] | q->bitmaps[IB];
+
+	if (!waiting)
+		return;
+
+	if (!q->bitmaps[ER]) {
+		struct qfq_group *first = qfq_ffs(q, waiting);
+
+		if (qfq_gt(first->S, q->V))
+			q->V = first->S;
+	}
+	qfq_make_eligible(q, old_V);
+}
+
+/*
+ * Update the timestamps of class cl after one of its packets has been
+ * served: the new start time is the old finish time.
+ * - If the class has no more packets it is removed from the front
+ *   slot and 1 is returned.
+ * - Otherwise the finish time is recomputed from the length of the new
+ *   head packet; if the rounded start time still equals grp->S the
+ *   class stays where it is and 0 is returned, else it is moved to the
+ *   slot for its new start time and 1 is returned.
+ * A return value of 1 means the group needs updating as well.
+ * The q argument is not used.
+ */
+static inline int
+qfq_update_class(struct qfq_sched *q, struct qfq_group *grp,
+    struct qfq_class *cl)
+{
+	uint64_t roundedS;
+	unsigned int len;
+
+	cl->S = cl->F;
+	if (cl->_q.mq.head == NULL) {
+		/* class went idle */
+		qfq_front_slot_remove(grp);
+		return 1;
+	}
+
+	len = cl->_q.mq.head->m_pkthdr.len;
+	cl->F = cl->S + (uint64_t)len * cl->inv_w;
+	roundedS = qfq_round_down(cl->S, grp->slot_shift);
+	if (roundedS != grp->S) {
+		/* start time left the front slot: re-bucket the class */
+		qfq_front_slot_remove(grp);
+		qfq_slot_insert(grp, cl, roundedS);
+		return 1;
+	}
+	return 0;
+}
+/*
+ * Dequeue one packet from the scheduler instance.
+ * The group is the one qfq_ffs() selects from the ER bitmap and the
+ * class is the head of that group's front slot.  After the packet is
+ * extracted V is advanced by len * IWSUM, then the class, the group
+ * and the eligibility bitmaps are brought up to date.
+ * Returns the mbuf, or NULL when ER is empty or when the selected
+ * class unexpectedly has no packet.
+ */
+static struct mbuf *
+qfq_dequeue(struct dn_sch_inst *si)
+{
+ struct qfq_sched *q = (struct qfq_sched *)(si + 1); /* private data follows si */
+ struct qfq_group *grp;
+ struct qfq_class *cl;
+ struct mbuf *m;
+ uint64_t old_V; /* V before this dequeue, for qfq_update_eligible() */
+
+ NO(q->loops++;)
+ if (!q->bitmaps[ER]) { /* no group in ER: nothing to serve */
+ NO(if (q->queued)
+ dump_sched(q, "start dequeue");)
+ return NULL;
+ }
+
+ grp = qfq_ffs(q, q->bitmaps[ER]);
+
+ cl = grp->slots[grp->front];
+ /* extract from the first bucket in the bucket list */
+ m = dn_dequeue(&cl->_q);
+
+ if (!m) { /* a class in the front slot should always have a packet */
+ D("BUG/* non-workconserving leaf */");
+ return NULL;
+ }
+ NO(q->queued--;)
+ old_V = q->V;
+ q->V += (uint64_t)m->m_pkthdr.len * IWSUM;
+ ND("m is %p F 0x%llx V now 0x%llx", m, cl->F, q->V);
+
+ if (qfq_update_class(q, grp, cl)) { /* nonzero: the group must be updated too */
+ uint64_t old_F = grp->F; /* finish time before any change below */
+ cl = qfq_slot_scan(grp); /* new first class of the group, if any */
+ if (!cl) { /* group gone, remove from ER */
+ __clear_bit(grp->index, &q->bitmaps[ER]);
+ // grp->S = grp->F + 1; // XXX debugging only
+ } else {
+ uint64_t roundedS = qfq_round_down(cl->S, grp->slot_shift);
+ unsigned int s;
+
+ if (grp->S == roundedS) /* group start unchanged: skip the unblock step */
+ goto skip_unblock;
+ grp->S = roundedS;
+ grp->F = roundedS + (2ULL << grp->slot_shift); /* F = S + 2 slots */
+ /* remove from ER and put in the new set */
+ __clear_bit(grp->index, &q->bitmaps[ER]);
+ s = qfq_calc_state(q, grp);
+ __set_bit(grp->index, &q->bitmaps[s]);
+ }
+ /* we need to unblock even if the group has gone away */
+ qfq_unblock_groups(q, grp->index, old_F);
+ }
+
+skip_unblock:
+ qfq_update_eligible(q, old_V);
+ NO(if (!q->bitmaps[ER] && q->queued)
+ dump_sched(q, "end dequeue");)
+
+ return m;
+}
+
+/*
+ * Assign a reasonable start time for a new flow k in group i.
+ * Admissible values for \hat(F) are multiples of \sigma_i
+ * no greater than V+\sigma_i . Larger values mean that
+ * we had a wraparound so we consider the timestamp to be stale.
+ *
+ * If F is not stale and F >= V then we set S = F.
+ * Otherwise we should assign S = V, but this may violate
+ * the ordering in ER. So, if we have groups in ER, set S to
+ * the F_j of the first group j which would be blocking us.
+ * We are guaranteed not to move S backward because
+ * otherwise our group i would still be blocked.
+ *
+ * roundedF and limit are derived from the 64-bit timestamps cl->F and
+ * q->V and are compared against the 64-bit next->F, so they must be
+ * 64 bits wide themselves: storing them in 32 bits would drop the high
+ * half of the virtual time and make the qfq_gt() comparisons
+ * meaningless once timestamps exceed 2^32.  For the same reason the
+ * slot size is built with a 64-bit shift (1ULL), matching the
+ * "2ULL << slot_shift" used when computing group finish times.
+ */
+static inline void
+qfq_update_start(struct qfq_sched *q, struct qfq_class *cl)
+{
+	unsigned long mask;
+	uint64_t limit, roundedF;
+	int slot_shift = cl->grp->slot_shift;
+
+	roundedF = qfq_round_down(cl->F, slot_shift);
+	limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift);
+
+	if (!qfq_gt(cl->F, q->V) || qfq_gt(roundedF, limit)) {
+		/* timestamp was stale */
+		mask = mask_from(q->bitmaps[ER], cl->grp->index);
+		if (mask) {
+			struct qfq_group *next = qfq_ffs(q, mask);
+
+			if (qfq_gt(roundedF, next->F)) {
+				/* start right after the group blocking us */
+				cl->S = next->F;
+				return;
+			}
+		}
+		cl->S = q->V;
+	} else {	/* timestamp is not stale */
+		cl->S = cl->F;
+	}
+}
+/*
+ * Enqueue mbuf m on queue _q (a struct qfq_class) of this instance.
+ * Returns 1 if dn_enqueue() dropped the packet, 0 otherwise.
+ * Scheduler state is only touched when m ends up at the head of the
+ * queue, i.e. when the class was idle: its start/finish times are
+ * computed and the class is inserted in the proper slot of its group,
+ * updating the group timestamps and state bitmaps when needed.
+ */
+static int
+qfq_enqueue(struct dn_sch_inst *si, struct dn_queue *_q, struct mbuf *m)
+{
+ struct qfq_sched *q = (struct qfq_sched *)(si + 1); /* private data follows si */
+ struct qfq_group *grp;
+ struct qfq_class *cl = (struct qfq_class *)_q;
+ uint64_t roundedS;
+ int s; /* index into q->bitmaps[] returned by qfq_calc_state() */
+
+ NO(q->loops++;)
+ DX(4, "len %d flow %p inv_w 0x%x grp %d", m->m_pkthdr.len,
+ _q, cl->inv_w, cl->grp->index);
+ /* XXX verify that the packet obeys the parameters */
+ if (m != _q->mq.head) { /* m is not already queued as the head packet */
+ if (dn_enqueue(_q, m, 0)) /* packet was dropped */
+ return 1;
+ NO(q->queued++;)
+ if (m != _q->mq.head) /* queue already had packets: nothing else to do */
+ return 0;
+ }
+ /* If we reach this point, the queue was idle */
+ grp = cl->grp;
+ qfq_update_start(q, cl); /* adjust start time */
+ /* compute new finish time and rounded start. */
+ cl->F = cl->S + (uint64_t)(m->m_pkthdr.len) * cl->inv_w;
+ roundedS = qfq_round_down(cl->S, grp->slot_shift);
+
+ /*
+ * insert cl in the correct bucket.
+ * If cl->S >= grp->S we don't need to adjust the
+ * bucket list and simply go to the insertion phase.
+ * Otherwise grp->S is decreasing, we must make room
+ * in the bucket list, and also recompute the group state.
+ * Finally, if there were no flows in this group and nobody
+ * was in ER make sure to adjust V.
+ */
+ if (grp->full_slots) {
+ if (!qfq_gt(grp->S, cl->S))
+ goto skip_update;
+ /* create a slot for this cl->S */
+ qfq_slot_rotate(q, grp, roundedS);
+ /* group was surely ineligible, remove */
+ __clear_bit(grp->index, &q->bitmaps[IR]);
+ __clear_bit(grp->index, &q->bitmaps[IB]);
+ } else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V))
+ q->V = roundedS;
+
+ grp->S = roundedS;
+ grp->F = roundedS + (2ULL << grp->slot_shift); // i.e. 2\sigma_i
+ s = qfq_calc_state(q, grp);
+ __set_bit(grp->index, &q->bitmaps[s]);
+ ND("new state %d 0x%x", s, q->bitmaps[s]);
+ ND("S %llx F %llx V %llx", cl->S, cl->F, q->V);
+skip_update:
+ qfq_slot_insert(grp, cl, roundedS);
+
+ return 0;
+}
+
+
+#if 0
+static inline void
+qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp,
+ struct qfq_class *cl, struct qfq_class **pprev)
+{
+ unsigned int i, offset; /* bucket index in slots[], bit index in full_slots */
+ uint64_t roundedS;
+ /* locate the bucket holding cl from its rounded start time */
+ roundedS = qfq_round_down(cl->S, grp->slot_shift);
+ offset = (roundedS - grp->S) >> grp->slot_shift;
+ i = (grp->front + offset) % QFQ_MAX_SLOTS;
+
+#ifdef notyet
+ if (!pprev) { /* caller gave no link: search the bucket for cl */
+ pprev = &grp->slots[i];
+ while (*pprev && *pprev != cl)
+ pprev = &(*pprev)->next;
+ }
+#endif
+ /* NOTE(review): without 'notyet', pprev must be the link pointing at cl */
+ *pprev = cl->next;
+ if (!grp->slots[i]) /* bucket is now empty */
+ __clear_bit(offset, &grp->full_slots);
+}
+
+/*
+ * called to forcibly destroy a queue.
+ * If the queue is not in the front bucket, or if it has
+ * other queues in the front bucket, we can simply remove
+ * the queue with no other side effects.
+ * Otherwise we must propagate the event up.
+ * XXX description to be completed.
+ *
+ * NOTE(review): this function sits inside an "#if 0" region and is
+ * not compiled; it reads cl->index, whereas the live code in this
+ * file reaches the group through cl->grp -- confirm before enabling.
+ */
+static void
+qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl,
+ struct qfq_class **pprev)
+{
+ struct qfq_group *grp = &q->groups[cl->index];
+ unsigned long mask;
+ uint64_t roundedS;
+ int s;
+
+ cl->F = cl->S; // not needed if the class goes away.
+ qfq_slot_remove(q, grp, cl, pprev);
+
+ if (!grp->full_slots) {
+ /* nothing left in the group, remove from all sets.
+ * Do ER last because if we were blocking other groups
+ * we must unblock them.
+ */
+ __clear_bit(grp->index, &q->bitmaps[IR]);
+ __clear_bit(grp->index, &q->bitmaps[EB]);
+ __clear_bit(grp->index, &q->bitmaps[IB]);
+
+ if (test_bit(grp->index, &q->bitmaps[ER]) &&
+ !(q->bitmaps[ER] & ~((1UL << grp->index) - 1))) {
+ mask = q->bitmaps[ER] & ((1UL << grp->index) - 1); /* ER groups with a lower index */
+ if (mask)
+ mask = ~((1UL << __fls(mask)) - 1); /* bits from the highest such group upwards */
+ else
+ mask = ~0UL; /* no lower group in ER: select every group */
+ qfq_move_groups(q, mask, EB, ER);
+ qfq_move_groups(q, mask, IB, IR);
+ }
+ __clear_bit(grp->index, &q->bitmaps[ER]);
+ } else if (!grp->slots[grp->front]) { /* front bucket emptied, others remain */
+ cl = qfq_slot_scan(grp);
+ roundedS = qfq_round_down(cl->S, grp->slot_shift);
+ if (grp->S != roundedS) { /* group start changed: recompute its state */
+ __clear_bit(grp->index, &q->bitmaps[ER]);
+ __clear_bit(grp->index, &q->bitmaps[IR]);
+ __clear_bit(grp->index, &q->bitmaps[EB]);
+ __clear_bit(grp->index, &q->bitmaps[IB]);
+ grp->S = roundedS;
+ grp->F = roundedS + (2ULL << grp->slot_shift);
+ s = qfq_calc_state(q, grp);
+ __set_bit(grp->index, &q->bitmaps[s]);
+ }
+ }
+ qfq_update_eligible(q, q->V);
+}
+#endif
+
+/*
+ * Flowset hook: bound the two QFQ parameters through ipdn_bound_var().
+ * par[0] is the weight (default 1, range 1..QFQ_MAX_WEIGHT) and par[1]
+ * the maximum packet length (default 1500, range 1..2000).
+ * Always returns 0.
+ */
+static int
+qfq_new_fsk(struct dn_fsk *f)
+{
+	ipdn_bound_var(f->fs.par + 0, 1, 1, QFQ_MAX_WEIGHT, "qfq weight");
+	ipdn_bound_var(f->fs.par + 1, 1500, 1, 2000, "qfq maxlen");
+	ND("weight %d len %d\n", f->fs.par[0], f->fs.par[1]);
+
+	return 0;
+}
+
+/*
+ * Initialize a new scheduler instance: every group from 0 to
+ * QFQ_MAX_INDEX gets its own index and a slot_shift that grows by one
+ * with each group, ending at QFQ_MTU_SHIFT + FRAC_BITS for the last.
+ * Always returns 0.
+ */
+static int
+qfq_new_sched(struct dn_sch_inst *si)
+{
+	struct qfq_sched *q = (struct qfq_sched *)(si + 1);
+	int idx = 0;
+
+	while (idx <= QFQ_MAX_INDEX) {
+		struct qfq_group *g = &q->groups[idx];
+
+		g->slot_shift = QFQ_MTU_SHIFT + FRAC_BITS -
+		    (QFQ_MAX_INDEX - idx);
+		g->index = idx;
+		idx++;
+	}
+	return 0;
+}
+
+/*
+ * QFQ scheduler descriptor
+ * Lists the scheduler type and name, the amount of private data to
+ * reserve for each object and the callbacks implemented above.
+ * Callbacks left NULL are not provided by this scheduler.
+ */
+static struct dn_alg qfq_desc = {
+ _SI( .type = ) DN_SCHED_QFQ,
+ _SI( .name = ) "QFQ",
+ _SI( .flags = ) DN_MULTIQUEUE,
+
+ _SI( .schk_datalen = ) 0, /* no private data requested for the scheduler itself */
+ _SI( .si_datalen = ) sizeof(struct qfq_sched), /* accessed as (si + 1) in the callbacks */
+ _SI( .q_datalen = ) sizeof(struct qfq_class) - sizeof(struct dn_queue), /* only the part beyond dn_queue */
+
+ _SI( .enqueue = ) qfq_enqueue,
+ _SI( .dequeue = ) qfq_dequeue,
+
+ _SI( .config = ) NULL,
+ _SI( .destroy = ) NULL,
+ _SI( .new_sched = ) qfq_new_sched,
+ _SI( .free_sched = ) NULL,
+ _SI( .new_fsk = ) qfq_new_fsk,
+ _SI( .free_fsk = ) NULL,
+ _SI( .new_queue = ) qfq_new_queue,
+ _SI( .free_queue = ) qfq_free_queue,
+};
+
+DECLARE_DNSCHED_MODULE(dn_qfq, &qfq_desc);
+
+#ifdef QFQ_DEBUG
+/*
+ * Debugging aid: for every group whose bit is set in mask, print the
+ * non-empty buckets, the full_slots bitmap and the S/F timestamps.
+ */
+static void
+dump_groups(struct qfq_sched *q, uint32_t mask)
+{
+	int gi, slot;
+
+	for (gi = 0; gi <= QFQ_MAX_INDEX; gi++) {
+		struct qfq_group *g;
+		int selected = (mask & (1<<gi)) != 0;
+
+		if (!selected)
+			continue;
+		g = &q->groups[gi];
+		for (slot = 0; slot < QFQ_MAX_SLOTS; slot++) {
+			if (g->slots[slot] == NULL)
+				continue;
+			D(" bucket %d %p", slot, g->slots[slot]);
+		}
+		D("full_slots 0x%x", g->full_slots);
+		D(" %2d S 0x%20llx F 0x%llx %c", gi,
+		    g->S, g->F,
+		    selected ? '1' : '0');
+	}
+}
+
+/*
+ * Debugging aid: print a banner carrying msg, the four state bitmaps
+ * (ER, EB, IR, IB) and then every group through dump_groups().
+ */
+static void
+dump_sched(struct qfq_sched *q, const char *msg)
+{
+	D("--- in %s: ---", msg);
+	ND("loops %d queued %d V 0x%llx", q->loops, q->queued, q->V);
+	D(" ER 0x%08x", q->bitmaps[ER]);
+	D(" EB 0x%08x", q->bitmaps[EB]);
+	D(" IR 0x%08x", q->bitmaps[IR]);
+	D(" IB 0x%08x", q->bitmaps[IB]);
+	dump_groups(q, 0xffffffff);
+}	/* no ';' here: it would be an empty declaration at file scope */
+#endif /* QFQ_DEBUG */
diff --git a/sys/netinet/ipfw/dn_sched_rr.c b/sys/netinet/ipfw/dn_sched_rr.c
new file mode 100644
index 0000000..fc7be00
--- /dev/null
+++ b/sys/netinet/ipfw/dn_sched_rr.c
@@ -0,0 +1,307 @@
+/*
+ * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ */
+
+#ifdef _KERNEL
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/kernel.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <net/if.h> /* IFNAMSIZ */
+#include <netinet/in.h>
+#include <netinet/ip_var.h> /* ipfw_rule_ref */
+#include <netinet/ip_fw.h> /* flow_id */
+#include <netinet/ip_dummynet.h>
+#include <netinet/ipfw/dn_heap.h>
+#include <netinet/ipfw/ip_dn_private.h>
+#include <netinet/ipfw/dn_sched.h>
+#else
+#include <dn_test.h>
+#endif
+
+#define DN_SCHED_RR 3 // XXX Where?
+
+struct rr_queue {
+ struct dn_queue q; /* Standard queue; first member, the code casts dn_queue * to rr_queue * */
+ int status; /* 1: queue is in the list, 0: it is not */
+ int credit; /* Number of bytes the queue may still transmit */
+ int quantum; /* credit refill; set to par[0] * par[1] in rr_new_queue() */
+ struct rr_queue *qnext; /* next queue in the circular round robin list */
+};
+
+/* struct rr_schk contains global config parameters
+ * and is right after dn_schk.
+ * The fields are filled in by rr_config() and used by rr_new_fsk()
+ * as bounds and default for the flowset quantum parameter.
+ */
+struct rr_schk {
+ int min_q; /* Min quantum */
+ int max_q; /* Max quantum */
+ int q_bytes; /* Bytes per quantum (default value) */
+};
+
+/* per-instance round robin list, right after dn_sch_inst.
+ * The list is circular: tail->qnext points back to head. Both
+ * pointers are NULL when no queue is linked.
+ */
+struct rr_si {
+ struct rr_queue *head, *tail; /* head is the queue currently being served */
+};
+
+/*
+ * Link queue q at the tail of the circular round robin list of si,
+ * give it a full quantum of credit and mark it as being in the list.
+ */
+static inline void
+rr_append(struct rr_queue *q, struct rr_si *si)
+{
+	if (si->head != NULL)
+		si->tail->qnext = q;	/* chain after the current tail */
+	else
+		si->head = q;		/* list was empty */
+	si->tail = q;
+	q->qnext = si->head;		/* close the ring */
+
+	q->credit = q->quantum;
+	q->status = 1;			/* now in the rr list */
+}
+
+/*
+ * Unlink the head queue from the circular list and clear its status.
+ * Nothing is done on an empty list; removing the only element leaves
+ * both head and tail set to NULL.
+ */
+static inline void
+rr_remove_head(struct rr_si *si)
+{
+	struct rr_queue *old = si->head;
+
+	if (old == NULL)
+		return;
+	old->status = 0;
+
+	if (old != si->tail) {
+		/* more queues left: advance head and re-close the ring */
+		si->head = old->qnext;
+		si->tail->qnext = si->head;
+		return;
+	}
+	si->head = si->tail = NULL;
+}
+
+/*
+ * Remove queue q from the circular list of si.
+ * Nothing is done when q is not marked as linked (status != 1).
+ * The head is handled by rr_remove_head(); any other element is
+ * found by walking the ring looking for its predecessor.
+ *
+ * The list is circular, so a qnext pointer is never NULL while
+ * walking it: the walk is explicitly limited to one full lap and
+ * stops when it gets back to the head.  This way a queue that is
+ * flagged as linked but is not actually on this list cannot make the
+ * function spin forever.
+ * XXX see if it can be merged with remove_queue()
+ */
+static inline void
+remove_queue_q(struct rr_queue *q, struct rr_si *si)
+{
+	struct rr_queue *prev;
+
+	if (q->status != 1)
+		return;
+	if (q == si->head) {
+		rr_remove_head(si);
+		return;
+	}
+	if (si->head == NULL)
+		return;		/* empty list, nothing to search */
+
+	prev = si->head;
+	do {
+		if (prev->qnext == q) {
+			prev->qnext = q->qnext;
+			if (q == si->tail)
+				si->tail = prev;
+			q->status = 0;
+			break;
+		}
+		prev = prev->qnext;
+	} while (prev != NULL && prev != si->head);	/* one lap at most */
+}
+
+
+/*
+ * Rotate the ring by one position: the queue after the head becomes
+ * the new head and the old head becomes the tail.  No-op on an empty
+ * list.
+ */
+static inline void
+next_pointer(struct rr_si *si)
+{
+	if (si->head != NULL) {
+		si->tail = si->tail->qnext;
+		si->head = si->head->qnext;
+	}
+}
+
+/*
+ * Enqueue m on queue q.  Returns 1 when dn_enqueue() dropped the
+ * packet, 0 otherwise.  When m ends up as the head packet (the queue
+ * was idle) and the queue is not yet on the round robin list, it is
+ * appended to the list.
+ */
+static int
+rr_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m)
+{
+	struct rr_queue *rrq = (struct rr_queue *)q;
+
+	if (m != q->mq.head) {
+		if (dn_enqueue(q, m, 0))	/* packet was dropped */
+			return 1;
+		if (m != q->mq.head)		/* queue was already busy */
+			return 0;
+	}
+
+	/* the queue was idle: link it unless it is still on the list */
+	if (rrq->status != 1)
+		rr_append(rrq, (struct rr_si *)(_si + 1));
+
+	return 0;
+}
+
+/*
+ * Pick the next packet to send, or NULL when no queue has packets.
+ * Starting from the head of the ring: empty queues are unlinked; a
+ * queue whose head packet is longer than its credit gets one more
+ * quantum and the ring moves on to the next queue; otherwise the
+ * packet length is charged to the credit and the packet is returned.
+ */
+static struct mbuf *
+rr_dequeue(struct dn_sch_inst *_si)
+{
+	/* Access scheduler instance private data */
+	struct rr_si *si = (struct rr_si *)(_si + 1);
+	struct rr_queue *rrq;
+	struct mbuf *m;
+	uint64_t len;
+
+	for (rrq = si->head; rrq != NULL; rrq = si->head) {
+		m = rrq->q.mq.head;
+		if (m == NULL) {
+			/* empty queue, remove from list */
+			rr_remove_head(si);
+			continue;
+		}
+
+		len = m->m_pkthdr.len;
+		if (len <= rrq->credit) {
+			/* enough credit: charge it and send */
+			rrq->credit -= len;
+			return dn_dequeue(&rrq->q);
+		}
+
+		/* packet too big: refill and try the next queue */
+		rrq->credit += rrq->quantum;
+		next_pointer(si);
+	}
+
+	/* no packet to dequeue */
+	return NULL;
+}
+
+/*
+ * Scheduler configuration hook: store the quantum limits in the
+ * rr_schk area that follows the dn_schk (64..2048 bytes, default
+ * 1500).  Always returns 0.
+ */
+static int
+rr_config(struct dn_schk *_schk)
+{
+	struct rr_schk *cfg = (struct rr_schk *)(_schk + 1);
+
+	ND("called");
+
+	cfg->q_bytes = 1500;	/* default quantum */
+	cfg->min_q = 64;	/* smallest accepted quantum */
+	cfg->max_q = 2048;	/* largest accepted quantum */
+
+	return 0;
+}
+
+/*
+ * Instance creation hook: start with an empty round robin list.
+ * Always returns 0.
+ */
+static int
+rr_new_sched(struct dn_sch_inst *_si)
+{
+	struct rr_si *rr = (struct rr_si *)(_si + 1);
+
+	ND("called");
+	rr->tail = NULL;
+	rr->head = NULL;
+
+	return 0;
+}
+
+/*
+ * Instance destruction hook.  The instance argument is not touched:
+ * there is currently nothing to release here.  Always returns 0.
+ */
+static int
+rr_free_sched(struct dn_sch_inst *_si)
+{
+	ND("called");
+
+	return 0;
+}
+
+/*
+ * Flowset hook: bound the RR parameters through ipdn_bound_var().
+ * par[0] is the weight (default 1, range 1..65536); par[1] is the
+ * quantum step, whose default and limits come from the rr_schk area
+ * that follows the scheduler.  Always returns 0.
+ */
+static int
+rr_new_fsk(struct dn_fsk *fs)
+{
+	struct rr_schk *cfg = (struct rr_schk *)(fs->sched + 1);
+
+	ipdn_bound_var(fs->fs.par + 0, 1, 1, 65536, "RR weight");
+	ipdn_bound_var(fs->fs.par + 1, cfg->q_bytes,
+	    cfg->min_q, cfg->max_q, "RR quantum");
+
+	return 0;
+}
+
+/*
+ * Queue creation hook: tag the queue with the RR subtype, compute its
+ * quantum as par[0] * par[1] of the flowset, start with a full credit
+ * and, if the queue already holds packets, link it on the round robin
+ * list of its instance.  Always returns 0.
+ */
+static int
+rr_new_queue(struct dn_queue *_q)
+{
+	struct rr_queue *rrq = (struct rr_queue *)_q;
+
+	_q->ni.oid.subtype = DN_SCHED_RR;
+
+	rrq->status = 0;
+	rrq->quantum = _q->fs->fs.par[0] * _q->fs->fs.par[1];
+	ND("called, q->quantum %d", rrq->quantum);
+	rrq->credit = rrq->quantum;
+
+	if (_q->mq.head == NULL)
+		return 0;
+
+	/* Queue NOT empty, insert in the queue list */
+	rr_append(rrq, (struct rr_si *)(_q->_si + 1));
+	return 0;
+}
+
+/*
+ * Queue destruction hook: if the queue is marked as linked, take it
+ * off the round robin list of its instance.  Always returns 0.
+ */
+static int
+rr_free_queue(struct dn_queue *_q)
+{
+	struct rr_queue *rrq = (struct rr_queue *)_q;
+
+	ND("called");
+	if (rrq->status != 1)
+		return 0;
+
+	remove_queue_q(rrq, (struct rr_si *)(_q->_si + 1));
+	return 0;
+}
+
+/*
+ * RR scheduler descriptor
+ * contains the type of the scheduler, the name, the size of the
+ * structures and function pointers.
+ *
+ * schk_datalen must cover struct rr_schk: rr_config() stores the
+ * quantum limits in the area right after the dn_schk and rr_new_fsk()
+ * reads them back from there, so that area has to be requested here.
+ * Leaving it at 0 makes both functions access memory this scheduler
+ * never asked for.
+ */
+static struct dn_alg rr_desc = {
+	_SI( .type = ) DN_SCHED_RR,
+	_SI( .name = ) "RR",
+	_SI( .flags = ) DN_MULTIQUEUE,
+
+	_SI( .schk_datalen = ) sizeof(struct rr_schk),
+	_SI( .si_datalen = ) sizeof(struct rr_si),
+	/* only the part of rr_queue that follows the embedded dn_queue */
+	_SI( .q_datalen = ) sizeof(struct rr_queue) - sizeof(struct dn_queue),
+
+	_SI( .enqueue = ) rr_enqueue,
+	_SI( .dequeue = ) rr_dequeue,
+
+	_SI( .config = ) rr_config,
+	_SI( .destroy = ) NULL,
+	_SI( .new_sched = ) rr_new_sched,
+	_SI( .free_sched = ) rr_free_sched,
+	_SI( .new_fsk = ) rr_new_fsk,
+	_SI( .free_fsk = ) NULL,
+	_SI( .new_queue = ) rr_new_queue,
+	_SI( .free_queue = ) rr_free_queue,
+};
+
+
+DECLARE_DNSCHED_MODULE(dn_rr, &rr_desc);
diff --git a/sys/netinet/ipfw/dn_sched_wf2q.c b/sys/netinet/ipfw/dn_sched_wf2q.c
new file mode 100644
index 0000000..1fbc120
--- /dev/null
+++ b/sys/netinet/ipfw/dn_sched_wf2q.c
@@ -0,0 +1,373 @@
+/*
+ * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
+ * Copyright (c) 2000-2002 Luigi Rizzo, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ */
+
+#ifdef _KERNEL
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/kernel.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <net/if.h> /* IFNAMSIZ */
+#include <netinet/in.h>
+#include <netinet/ip_var.h> /* ipfw_rule_ref */
+#include <netinet/ip_fw.h> /* flow_id */
+#include <netinet/ip_dummynet.h>
+#include <netinet/ipfw/dn_heap.h>
+#include <netinet/ipfw/ip_dn_private.h>
+#include <netinet/ipfw/dn_sched.h>
+#else
+#include <dn_test.h>
+#endif
+
+#ifndef MAX64
+/*
+ * Wraparound-safe maximum of two 64-bit timestamps: y is chosen when
+ * (y - x), taken as a signed 64-bit value, is positive.
+ * The whole expansion is parenthesized so the macro can be used
+ * inside larger expressions; note that the chosen argument is still
+ * evaluated twice.
+ */
+#define MAX64(x,y) ((((int64_t)((y) - (x))) > 0) ? (y) : (x))
+#endif
+
+/*
+ * timestamps are computed on 64 bit using fixed point arithmetic.
+ * LMAX_BITS, WMAX_BITS are the max number of bits for the packet len
+ * and sum of weights, respectively. FRAC_BITS is the number of
+ * fractional bits. We want FRAC_BITS >> WMAX_BITS to avoid too large
+ * errors when computing the inverse, FRAC_BITS < 32 so we can do 1/w
+ * using an unsigned 32-bit division, and to avoid wraparounds we need
+ * LMAX_BITS + WMAX_BITS + FRAC_BITS << 64
+ * As an example
+ * FRAC_BITS = 26, LMAX_BITS=14, WMAX_BITS = 19
+ */
+#ifndef FRAC_BITS
+#define FRAC_BITS 28 /* shift for fixed point arithmetic */
+#define ONE_FP (1UL << FRAC_BITS)
+#endif
+
+/*
+ * Private information for the scheduler instance:
+ * sch_heap (key is Finish time) returns the next queue to serve
+ * ne_heap (key is Start time) stores not-eligible queues
+ * idle_heap (key=start/finish time) stores idle flows. It must
+ * support extract-from-middle.
+ * A flow is only in 1 of the three heaps.
+ * XXX todo: use a more efficient data structure, e.g. a tree sorted
+ * by F with min_subtree(S) in each node
+ */
+struct wf2qp_si {
+ struct dn_heap sch_heap; /* top extract - key Finish time */
+ struct dn_heap ne_heap; /* top extract - key Start time */
+ struct dn_heap idle_heap; /* random extract - key Start=Finish time */
+ uint64_t V; /* virtual time */
+ uint32_t inv_wsum; /* inverse of sum of weights, ONE_FP / wsum */
+ uint32_t wsum; /* sum of weights of the flows accounted for */
+};
+
+struct wf2qp_queue {
+ struct dn_queue _q; /* generic queue; first member, the code casts dn_queue * to wf2qp_queue * */
+ uint64_t S, F; /* start time, finish time; F < S marks the timestamps as invalid */
+ uint32_t inv_w; /* ONE_FP / weight */
+ int32_t heap_pos; /* position (index) of struct in heap */
+};
+
+/*
+ * This file implements a WF2Q+ scheduler as it has been in dummynet
+ * since 2000.
+ * The scheduler supports per-flow queues and has O(log N) complexity.
+ *
+ * WF2Q+ needs to drain entries from the idle heap so that we
+ * can keep the sum of weights up to date. We can do it whenever
+ * we get a chance, or periodically, or following some other
+ * strategy.
+ *
+ * idle_check() drains at most n elements from the idle heap.  Unless
+ * force is set, draining stops at the first entry whose key is not
+ * behind V.  Each drained queue has its timestamps marked invalid and
+ * its weight removed from the sum of weights.
+ */
+static void
+idle_check(struct wf2qp_si *si, int n, int force)
+{
+	struct dn_heap *idle = &si->idle_heap;
+
+	for (; n > 0; n--) {
+		struct dn_queue *q;
+		struct wf2qp_queue *alg_fq;
+
+		if (!(idle->elements > 0))
+			break;		/* heap is empty */
+		if (!force && !DN_KEY_LT(HEAP_TOP(idle)->key, si->V))
+			break;		/* top entry is not old enough */
+
+		q = HEAP_TOP(idle)->object;
+		alg_fq = (struct wf2qp_queue *)q;
+		heap_extract(idle, NULL);
+		/* XXX to let the flowset delete the queue we should
+		 * mark it as 'unused' by the scheduler.
+		 */
+		alg_fq->S = alg_fq->F + 1;	/* Mark timestamp as invalid. */
+		si->wsum -= q->fs->fs.par[0];	/* adjust sum of weights */
+		if (si->wsum > 0)
+			si->inv_wsum = ONE_FP/si->wsum;
+	}
+}
+/*
+ * Enqueue m on queue q.  Returns 1 when dn_enqueue() dropped the
+ * packet, 0 otherwise.
+ * Scheduler state changes only when m ends up as the head packet,
+ * i.e. the queue was idle: the start and finish times are computed
+ * and the queue is stored in the not-eligible heap (S > V) or in the
+ * scheduler heap.
+ */
+static int
+wf2qp_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m)
+{
+ struct dn_fsk *fs = q->fs;
+ struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1); /* private data follows _si */
+ struct wf2qp_queue *alg_fq;
+ uint64_t len = m->m_pkthdr.len;
+
+ if (m != q->mq.head) { /* m is not already queued as the head packet */
+ if (dn_enqueue(q, m, 0)) /* packet was dropped */
+ return 1;
+ if (m != q->mq.head) /* queue was already busy */
+ return 0;
+ }
+
+ /* If we reach this point, queue q was idle */
+ alg_fq = (struct wf2qp_queue *)q;
+
+ if (DN_KEY_LT(alg_fq->F, alg_fq->S)) {
+ /* F<S means timestamps are invalid ->brand new queue. */
+ alg_fq->S = si->V; /* init start time */
+ si->wsum += fs->fs.par[0]; /* add weight of new queue. */
+ si->inv_wsum = ONE_FP/si->wsum;
+ } else { /* if it was idle then it was in the idle heap */
+ heap_extract(&si->idle_heap, q);
+ alg_fq->S = MAX64(alg_fq->F, si->V); /* compute new S */
+ }
+ alg_fq->F = alg_fq->S + len * alg_fq->inv_w; /* F = S + len / weight, in fixed point */
+
+ /* if nothing is backlogged, make sure this flow is eligible */
+ if (si->ne_heap.elements == 0 && si->sch_heap.elements == 0)
+ si->V = MAX64(alg_fq->S, si->V);
+
+ /*
+ * Look at eligibility. A flow is not eligible if S>V (when
+ * this happens, it means that there is some other flow already
+ * scheduled for the same pipe, so the sch_heap cannot be
+ * empty). If the flow is not eligible we just store it in the
+ * ne_heap. Otherwise, we store in the sch_heap.
+ * Note that for all flows in sch_heap (SCH), S_i <= V,
+ * and for all flows in ne_heap (NEH), S_i > V.
+ * So when we need to compute max(V, min(S_i)) forall i in
+ * SCH+NEH, we only need to look into NEH.
+ */
+ if (DN_KEY_LT(si->V, alg_fq->S)) {
+ /* S>V means flow Not eligible. */
+ if (si->sch_heap.elements == 0)
+ D("++ ouch! not eligible but empty scheduler!");
+ heap_insert(&si->ne_heap, alg_fq->S, q);
+ } else {
+ heap_insert(&si->sch_heap, alg_fq->F, q);
+ }
+ return 0;
+}
+
+/* XXX invariant: sch > 0 || V >= min(S in neh)
+ *
+ * Dequeue the next packet, or return NULL when neither the scheduler
+ * heap nor the not-eligible heap holds a queue (in that case the idle
+ * heap is drained completely and V and wsum are reset to 0).
+ * Otherwise the packet comes from the queue at the top of the
+ * scheduler heap; V is advanced by len * inv_wsum and the queue is
+ * re-inserted in the idle, scheduler or not-eligible heap.
+ */
+static struct mbuf *
+wf2qp_dequeue(struct dn_sch_inst *_si)
+{
+ /* Access scheduler instance private data */
+ struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
+ struct mbuf *m;
+ struct dn_queue *q;
+ struct dn_heap *sch = &si->sch_heap;
+ struct dn_heap *neh = &si->ne_heap;
+ struct wf2qp_queue *alg_fq;
+
+ if (sch->elements == 0 && neh->elements == 0) {
+ /* we have nothing to do. We could kill the idle heap
+ * altogether and reset V
+ */
+ idle_check(si, 0x7fffffff, 1); /* force: drain every idle entry */
+ si->V = 0;
+ si->wsum = 0; /* should be set already */
+ return NULL; /* quick return if nothing to do */
+ }
+ idle_check(si, 1, 0); /* drain something from the idle heap */
+
+ /* make sure at least one element is eligible, bumping V
+ * and moving entries that have become eligible.
+ * We need to repeat the first part twice, before and
+ * after extracting the candidate, or enqueue() will
+ * find the data structure in a wrong state.
+ */
+ m = NULL;
+ for(;;) { /* runs twice: the second pass exits at the "if (m)" check */
+ /*
+ * Compute V = max(V, min(S_i)). Remember that all elements
+ * in sch have by definition S_i <= V so if sch is not empty,
+ * V is surely the max and we must not update it. Conversely,
+ * if sch is empty we only need to look at neh.
+ * We don't need to move the queues, as it will be done at the
+ * next enqueue
+ */
+ if (sch->elements == 0 && neh->elements > 0) {
+ si->V = MAX64(si->V, HEAP_TOP(neh)->key);
+ }
+ while (neh->elements > 0 &&
+ DN_KEY_LEQ(HEAP_TOP(neh)->key, si->V)) { /* S <= V: became eligible */
+ q = HEAP_TOP(neh)->object;
+ alg_fq = (struct wf2qp_queue *)q;
+ heap_extract(neh, NULL);
+ heap_insert(sch, alg_fq->F, q);
+ }
+ if (m) /* pkt found in previous iteration */
+ break;
+ /* ok we have at least one eligible pkt */
+ q = HEAP_TOP(sch)->object;
+ alg_fq = (struct wf2qp_queue *)q;
+ m = dn_dequeue(q);
+ heap_extract(sch, NULL); /* Remove queue from heap. */
+ si->V += (uint64_t)(m->m_pkthdr.len) * si->inv_wsum;
+ alg_fq->S = alg_fq->F; /* Update start time. */
+ if (q->mq.head == 0) { /* not backlogged any more. */
+ heap_insert(&si->idle_heap, alg_fq->F, q);
+ } else { /* Still backlogged. */
+ /* Update F, store in neh or sch */
+ uint64_t len = q->mq.head->m_pkthdr.len;
+ alg_fq->F += len * alg_fq->inv_w;
+ if (DN_KEY_LEQ(alg_fq->S, si->V)) {
+ heap_insert(sch, alg_fq->F, q);
+ } else {
+ heap_insert(neh, alg_fq->S, q);
+ }
+ }
+ }
+ return m;
+}
+
+/*
+ * Instance creation hook: set up the three heaps with 16 initial
+ * entries each, all recording the element position in the heap_pos
+ * field of struct wf2qp_queue (needed to extract from the middle).
+ * Returns 0 on success; if any heap_init() fails, all three heaps are
+ * freed and ENOMEM is returned.
+ */
+static int
+wf2qp_new_sched(struct dn_sch_inst *_si)
+{
+	struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
+	int ofs = offsetof(struct wf2qp_queue, heap_pos);
+	int failed;
+
+	/* all heaps support extract from middle */
+	failed = heap_init(&si->idle_heap, 16, ofs) ||
+	    heap_init(&si->sch_heap, 16, ofs) ||
+	    heap_init(&si->ne_heap, 16, ofs);
+	if (!failed)
+		return 0;
+
+	heap_free(&si->ne_heap);
+	heap_free(&si->sch_heap);
+	heap_free(&si->idle_heap);
+	return ENOMEM;
+}
+
+/*
+ * Instance destruction hook: release the scheduler, not-eligible and
+ * idle heaps, in that order.  Always returns 0.
+ */
+static int
+wf2qp_free_sched(struct dn_sch_inst *_si)
+{
+	struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
+	struct dn_heap *heaps[3];
+	int i;
+
+	heaps[0] = &si->sch_heap;
+	heaps[1] = &si->ne_heap;
+	heaps[2] = &si->idle_heap;
+	for (i = 0; i < 3; i++)
+		heap_free(heaps[i]);
+
+	return 0;
+}
+
+/*
+ * Flowset hook: bound par[0], the WF2Q+ weight, through
+ * ipdn_bound_var() (default 1, range 1..100).  Always returns 0.
+ */
+static int
+wf2qp_new_fsk(struct dn_fsk *fs)
+{
+	ipdn_bound_var(fs->fs.par + 0, 1, 1, 100, "WF2Q+ weight");
+
+	return 0;
+}
+
+/*
+ * Queue creation hook: tag the queue with the WF2Q+ subtype, mark its
+ * timestamps as invalid (S = F + 1) and precompute the inverse of the
+ * flowset weight.  If the queue already holds packets it is handed to
+ * wf2qp_enqueue() with its own head packet.  Always returns 0.
+ */
+static int
+wf2qp_new_queue(struct dn_queue *_q)
+{
+	struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)_q;
+
+	_q->ni.oid.subtype = DN_SCHED_WF2QP;
+	alg_fq->inv_w = ONE_FP / _q->fs->fs.par[0];
+	alg_fq->F = 0;			/* not strictly necessary */
+	alg_fq->S = alg_fq->F + 1;	/* mark timestamp as invalid. */
+
+	if (_q->mq.head == NULL)
+		return 0;
+
+	wf2qp_enqueue(_q->_si, _q, _q->mq.head);
+	return 0;
+}
+
+/*
+ * Called when the infrastructure removes a queue (e.g. flowset
+ * is reconfigured). Nothing to do if we did not 'own' the queue
+ * (its timestamps are marked invalid); otherwise subtract its weight
+ * from the sum of weights and extract it from the heap it lives in:
+ * the idle heap when it has no packets, the not-eligible heap when
+ * S > V, the scheduler heap in all other cases.
+ * Always returns 0.
+ */
+static int
+wf2qp_free_queue(struct dn_queue *q)
+{
+	struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q;
+	struct wf2qp_si *si = (struct wf2qp_si *)(q->_si + 1);
+	struct dn_heap *owner;
+
+	if (alg_fq->S >= alg_fq->F + 1)
+		return 0;	/* nothing to do, not in any heap */
+
+	si->wsum -= q->fs->fs.par[0];
+	if (si->wsum > 0)
+		si->inv_wsum = ONE_FP/si->wsum;
+
+	/* extract from the heap. XXX TODO we may need to adjust V
+	 * to make sure the invariants hold.
+	 */
+	if (q->mq.head == NULL)
+		owner = &si->idle_heap;
+	else if (DN_KEY_LT(si->V, alg_fq->S))
+		owner = &si->ne_heap;
+	else
+		owner = &si->sch_heap;
+	heap_extract(owner, q);
+
+	return 0;
+}
+
+/*
+ * WF2Q+ scheduler descriptor
+ * contains the type of the scheduler, the name, the size of the
+ * structures and function pointers.
+ * Callbacks left NULL are not provided by this scheduler.
+ */
+static struct dn_alg wf2qp_desc = {
+ _SI( .type = ) DN_SCHED_WF2QP,
+ _SI( .name = ) "WF2Q+",
+ _SI( .flags = ) DN_MULTIQUEUE,
+
+ /* we need extra space in the si and the queue */
+ _SI( .schk_datalen = ) 0,
+ _SI( .si_datalen = ) sizeof(struct wf2qp_si), /* accessed as (_si + 1) in the callbacks */
+ _SI( .q_datalen = ) sizeof(struct wf2qp_queue) -
+ sizeof(struct dn_queue), /* only the part beyond dn_queue */
+
+ _SI( .enqueue = ) wf2qp_enqueue,
+ _SI( .dequeue = ) wf2qp_dequeue,
+
+ _SI( .config = ) NULL,
+ _SI( .destroy = ) NULL,
+ _SI( .new_sched = ) wf2qp_new_sched,
+ _SI( .free_sched = ) wf2qp_free_sched,
+
+ _SI( .new_fsk = ) wf2qp_new_fsk,
+ _SI( .free_fsk = ) NULL,
+
+ _SI( .new_queue = ) wf2qp_new_queue,
+ _SI( .free_queue = ) wf2qp_free_queue,
+};
+
+
+DECLARE_DNSCHED_MODULE(dn_wf2qp, &wf2qp_desc);
diff --git a/sys/netinet/ipfw/dummynet.txt b/sys/netinet/ipfw/dummynet.txt
new file mode 100644
index 0000000..0ed6ad1
--- /dev/null
+++ b/sys/netinet/ipfw/dummynet.txt
@@ -0,0 +1,860 @@
+#
+# $FreeBSD$
+#
+
+Notes on the internal structure of dummynet (2010 version)
+by Riccardo Panicucci and Luigi Rizzo
+Work supported by the EC project ONELAB2
+
+
+*********
+* INDEX *
+*********
+Implementation of new dummynet
+ Internal structure
+ Files
+Packet arrival
+ The reconfiguration routine
+dummynet_task()
+Configuration
+ Add a pipe
+ Add a scheduler
+ Add a flowset
+Listing object
+Delete of object
+ Delete a pipe
+ Delete a flowset
+ Delete a scheduler
+Compatibility with FreeBSD7.2 and FreeBSD 8 ipfw binary
+ ip_dummynet_glue.c
+ ip_fw_glue.c
+How to configure dummynet
+How to implement a new scheduler
+
+
+
+OPEN ISSUES
+------------------------------
+20100131 deleting RR causes infinite loop
+ presumably in the rr_free_queue() call -- seems to hang
+ forever when deleting a live flow
+------------------------------
+
+Dummynet is a traffic shaper and network emulator. Packets are
+selected by an external filter such as ipfw, and passed to the emulator
+with a tag such as "pipe 10" or "queue 5" which tells what to
+do with the packet. As an example
+
+ ipfw add queue 5 icmp from 10.0.0.2 to all
+
+All packets with the same tag belong to a "flowset", or a set
+of flows which can be further partitioned according to a mask.
+Flowsets are then passed to a scheduler for processing. The
+association of flowsets and schedulers is configurable e.g.
+
+ ipfw queue 5 config sched 10 weight 3 flow_mask xxxx
+ ipfw queue 8 config sched 10 weight 1 ...
+ ipfw queue 3 config sched 20 weight 1 ...
+
+"sched 10" represents one or more scheduler instances,
+selected through a mask on the 5-tuple itself.
+
+ ipfw sched 20 config type FIFO sched_mask yyy ...
+
+There are in fact two masks applied to each packet:
++ the "sched_mask" sends packets arriving to a scheduler_id to
+ one of many instances.
++ the "flow_mask" together with the flowset_id is used to
+ collect packets into independent flows on each scheduler.
+
+As an example, we can have
+ ipfw queue 5 config sched 10 flow_mask src-ip 0x000000ff
+ ipfw sched 10 config type WF2Q+ sched_mask src-ip 0xffffff00
+
+means that sched 10 will have one instance per /24 source subnet,
+and within that, each individual source will be a flow.
+
+Internal structure
+-----------------
+Dummynet-related data is split into several data structures,
+part of them constituting the userland-kernel API, and others
+specific to the kernel.
+NOTE: for up-to-date details please look at the relevant source
+ headers (ip_dummynet.h, ip_dn_private.h, dn_sched.h)
+
+USERLAND-KERNEL API (ip_dummynet.h)
+
+ struct dn_link:
+ contains data about the physical link such as
+ bandwidth, delay, burst size;
+
+ struct dn_fs:
+ describes a flowset, i.e. a template for queues.
+ Main parameters are the scheduler we attach to, a flow_mask,
+ buckets, queue size, plr, weight, and other scheduler-specific
+ parameters.
+
+ struct dn_flow
+ contains information on a flow, including masks and
+ statistics
+
+ struct dn_sch:
+ defines a scheduler (and a link attached to it).
+ Parameters include scheduler type, sched_mask, number of
+ buckets, and possibly other scheduler-specific parameters,
+
+ struct dn_profile:
+ fields to simulate a delay profile
+
+
+KERNEL REPRESENTATION (ip_dn_private.h)
+
+ struct mq
+ a queue of mbufs with head and tail.
+
+ struct dn_queue
+ individual queue of packets, created by a flowset using
+ flow_mask and attached to a scheduler instance selected
+ through sched_mask.
+ A dn_queue has a pointer to the dn_fsk (which in turn counts
+ how many queues point to it), a pointer to the
+ dn_sch_inst it attaches to, and is in a hash table in the
+ flowset. scheduler instances also should store queues in
+ their own containers used for scheduling (lists, trees, etc.)
+ CREATE: done on packet arrivals when a flow matches a flowset.
+ DELETE: done only when deleting the parent dn_sch_inst
+ or draining memory.
+
+ struct dn_fsk
+ includes a dn_fs; a pointer to the dn_schk; a link field
+ for the list of dn_fsk attached to the same scheduler,
+ or for the unlinked list;
+ a refcount for the number of queues pointing to it;
+ The dn_fsk is in a hash table, fshash.
+ CREATE: done on configuration commands.
+ DELETE: on configuration commands.
+
+ struct dn_sch_inst
+ a scheduler instance, created from a dn_schk applying sched_mask.
+ Contains a delay line, a reference to the parent, and scheduler-
+ specific info. Both dn_sch_inst and its delay line can be in the
+ evheap if they have events to be processed.
+ CREATE: created from a dn_schk applying sched_mask
+ DELETE: configuration command delete a scheduler which in turn
+ sweeps the hash table of instances deleting them
+
+ struct dn_schk
+ includes dn_sch, dn_link, a pointer to dn_profile,
+ a hash table of dn_sch_inst, a list of dn_fsk
+ attached to it.
+ CREATE: configuration command. If there are flowsets that
+ refer to this number, they are attached and moved
+ to the hash table
+ DELETE: manual, see dn_sch_inst
+
+
+ fshash schedhash
+ +---------------+ sched +--------------+
+ | sched-------------------->| NEW_SCHK|
+ -<----*sch_chain |<-----------------*fsk_list |
+ |NEW_FSK |<----. | [dn_link] |
+ +---------------+ | +--------------+
+ |qht (hash) | | | siht(hash) |
+ | [dn_queue] | | | [dn_si] |
+ | [dn_queue] | | | [dn_si] |
+ | ... | | | ... |
+ | +--------+ | | | +---------+ |
+ | |dn_queue| | | | |dn_si | |
+ | | fs *----------' | | | |
+ | | si *---------------------->| | |
+ | +---------+ | | +---------+ |
+ +---------------+ +--------------+
+
+The following global data structures contain all
+schedulers and flowsets.
+
+- schedhash[x]: contains all scheduler templates in the system.
+ Looked up only on manual configurations, where flowsets
+ are attached to matching schedulers.
+ We have one entry per 'sched X config' command
+ (plus one for each 'pipe X config').
+
+- fshash[x]: contains all flowsets.
+ We do a lookup on this for each packet.
+ We have one entry for each 'queue X config'
+ (plus one for each 'pipe X config').
+
+Additionally, a list that contains all unlinked flowset:
+- fsu: contains flowset that are not linked with any scheduler.
+ Flowset are put in this list when they refer to a non
+ existing scheduler.
+ We don't need an efficient data structure as we never search
+ here on a packet arrivals.
+
+Scheduler instances and the delay lines associated with each scheduler
+instance need to be woken up at certain times. Because we have many
+such objects, we keep them in a priority heap (system_heap).
+
+Almost all objects in this implementation are preceded by a structure
+(struct dn_id) which makes it easier to identify them.
+
+
+Files
+-----
+The dummynet code is split in several files.
+All kernel code is in sys/netinet/ipfw except ip_dummynet.h
+All userland code is in sbin/ipfw.
+Files are
+- sys/netinet/ip_dummynet.h defines the kernel-userland API
+- ip_dn_private.h contains the kernel-specific APIs
+ and data structures
+- dn_sched.h defines the scheduler API
+- ip_dummynet.c contains module glue and sockopt handlers, with all
+ functions to configure and list objects.
+- ip_dn_io.c contains the functions directly related to packet processing,
+ and run in the critical path. It also contains some functions
+ exported to the schedulers.
+- dn_heap.[ch] implement a binary heap and a generic hash table
+- dn_sched_* implement the various scheduler modules
+
+- dummynet.c is the file used to implement the user side of dummynet.
+ It contains the functions to parse the command line, and functions to
+ show the output of dummynet objects.
+Moreover, there are two new files (ip_dummynet_glue.c and ip_fw_glue.c) that
+are used to allow compatibility with the "ipfw" binary from FreeBSD 7.2 and
+FreeBSD 8.
+
+LOCKING
+=======
+At the moment the entire processing occurs under a single lock
+which is expected to be acquired in exclusive mode
+DN_BH_WLOCK() / DN_BH_WUNLOCK().
+
+In perspective we aim at the following:
+- the 'busy' flag, 'pending' list and all structures modified by packet
+ arrivals and departures are protected by the BH_WLOCK.
+ This is normally acquired in exclusive mode by the packet processing
+ functions for short sections of code (exception -- the timer).
+ If 'busy' is not set, we can do regular packet processing.
+ If 'busy' is set, no pieces can be accessed.
+ We must enqueue the packet on 'pending' and return immediately.
+
+- the 'busy' flag is set/cleared by long sections of code as follows:
+ UH_WLOCK(); KASSERT(busy == 0);
+ BH_WLOCK(); busy=1; BH_WUNLOCK();
+ ... do processing ...
+ BH_WLOCK(); busy=0; drain_queue(pending); BH_WUNLOCK();
+ UH_WUNLOCK();
+ this normally happens when the upper half has something heavy
+ to do. The prologue and epilogue are not in the critical path.
+
+- the main containers (fshash, schedhash, ...) are protected by
+ UH_WLOCK.
+
+Packet processing
+=================
+A packet enters dummynet through dummynet_io(). We first lookup
+the flowset number in fshash using dn_ht_find(), then find the scheduler
+instance using ipdn_si_find(), then possibly identify the correct
+queue with ipdn_q_find().
+If successful, we call the scheduler's enqueue function(), and
+if needed start I/O on the link calling serve_sched().
+If the packet can be returned immediately, this is done by
+leaving *m0 set. Otherwise, the packet is absorbed by dummynet
+and we simply return, possibly with some appropriate error code.
+
+Reconfiguration
+---------------
+Reconfiguration is the complex part of the system because we need to
+keep track of the various objects and containers.
+At the moment we do not use reference counts for objects so all
+processing must be done under a lock.
+
+The main entry point for configuration is the ip_dn_ctl() handler
+for the IP_DUMMYNET3 sockopt (others are provided only for backward
+compatibility). Modifications to the configuration call do_config().
+The argument is a sequence of blocks each starting with a struct dn_id
+which specifies its content.
+The first dn_id must contain as obj.id the DN_API_VERSION
+The obj.type is DN_CMD_CONFIG (followed by actual objects),
+DN_CMD_DELETE (with the correct subtype and list of objects), or
+DN_CMD_FLUSH.
+
+DN_CMD_CONFIG is followed by objects to add/reconfigure. In general,
+if an object already exists it is reconfigured, otherwise it is
+created in a way that keeps the structure consistent.
+We have the following objects in the system, normally numbered with
+an identifier N between 1 and 65535. For certain objects we have
+"shadow" copies numbered N+NMAX and N+2*NMAX which are used to
+implement certain backward compatibility features.
+
+In general we have the following linking
+
+ TRADITIONAL DUMMYNET QUEUES "queue N config ... pipe M ..."
+ corresponds to a dn_fs object numbered N
+
+ TRADITIONAL DUMMYNET PIPES "pipe N config ..."
+ dn_fs N+2*NMAX --> dn_sch N+NMAX type FIFO --> dn_link N+NMAX
+
+ GENERIC SCHEDULER "sched N config ... "
+ [dn_fs N+NMAX] --> dn_sch N --> dn_link N
+ The flowset N+NMAX is created only if the scheduler is not
+ of type MULTIQUEUE.
+
+ DELAY PROFILE "pipe N config profile ..."
+ it is always attached to an existing dn_link N
+
+Because traditional dummynet pipes actually configure both a
+'standalone' instance and one that can be used by queues,
+we do the following:
+
+ "pipe N config ..." configures:
+ dn_sched N type WF2Q+
+ dn_sched N+NMAX type FIFO
+ dn_fs N+2NMAX attached to dn_sched N+NMAX
+ dn_pipe N
+ dn_pipe N+NMAX
+
+ "queue N config" configures
+ dn_fs N
+
+ "sched N config" configures
+ dn_sched N type as desired
+ dn_fs N+NMAX attached to dn_sched N
+
+
+dummynet_task()
+===============
+The dummynet_task() is the main dummynet processing function and is
+called every tick. This function first calculates the new current time, then
+it checks whether it is time to wake up objects from the system_heap by
+comparing the current time with the key of the heap. Two types of objects
+(really, the heap contains pointers to objects) are in the
+system_heap:
+
+- scheduler instance: if a scheduler instance is woken up, the dequeue()
+ function is called as long as the instance has credit. If dequeue() returns
+ packets, the scheduler instance is inserted in the heap with a new key
+ depending on the data that will be sent out. If the scheduler instance is
+ left with some credit, it means that it has no other packets to send and so
+ the instance is no longer inserted in the heap.
+
+ If the scheduler instance extracted from the heap has the DELETE flag set,
+ the dequeue() is not called and the instance is destroyed now.
+
+- delay line: when extracting a delay line, the function transmit_event() is
+ called to send out packet from delay line.
+
+ If the scheduler instance associated with this delay line does not exist,
+ the delay line is deleted now.
+
+Configuration
+=============
+To create a pipe, queue or scheduler, the user should type commands like:
+"ipfw pipe x config"
+"ipfw queue y config pipe x"
+"ipfw pipe x config sched <type>"
+
+The userland side of dummynet prepares a buffer containing the data to pass
+to the kernel side.
+The buffer contains all the structs needed to configure an object. In more
+detail, to configure a pipe all three structs (dn_link, dn_sch, dn_fs) are
+needed, plus the delay profile struct if the pipe has a delay profile.
+
+When configuring a scheduler only the struct dn_sch is written in the buffer,
+while when configuring a flowset only the dn_fs struct is written.
+
+The first struct in the buffer contains the type of command requested, that
+is whether it is configuring a pipe, a queue, or a scheduler. Then come the
+structs needed to configure the object, and finally there is the struct that
+marks the end of the buffer.
+
+To support the insertion of pipe and queue using the old syntax, when adding
+a pipe it's necessary to create a FIFO flowset and a FIFO scheduler, which
+have a number x + DN_PIPEOFFSET.
+
+Add a pipe
+----------
+A pipe is only a template for a link.
+If the pipe already exists, parameters are updated. If a delay profile exists
+it is deleted and a new one is created.
+If the pipe doesn't exist a new one is created. After the creation, the
+flowset unlinked list is scanned to see if there are some flowset that would
+be linked with this pipe. If so, these flowset will be of wf2q+ type (for
+compatibility) and a new wf2q+ scheduler is created now.
+
+Add a scheduler
+---------------
+If the scheduler already exists, and the type and the mask are the same, the
+scheduler is simply reconfigured calling the config_scheduler() scheduler
+function with the RECONFIGURE flag active.
+If the type or the mask differ, it is necessary to delete the old scheduler
+and create a new one.
+If the scheduler doesn't exists, a new one is created. If the scheduler has
+a mask, the hash table is created to store pointers to scheduler instances.
+When a new scheduler is created, it is necessary to scan the unlinked
+flowset list to search eventually flowset that would be linked with this
+scheduler number. If some are found, flowsets became of the type of this
+scheduler and they are configured properly.
+
+Add a flowset
+-------------
+Flowset pointers are stored in the system in two lists. The unlinked flowset
+list contains all flowsets that are not linked with a scheduler; the flowset
+list contains flowsets linked to a scheduler, which therefore have a type.
+When adding a new flowset, we first check whether the flowset exists (that is,
+it is in the flowset list). If it does not exist, a new flowset is created
+and added to the unlinked flowset list if the scheduler it should be linked
+to does not exist, or added to the flowset list and configured properly if
+the scheduler exists. If the flowset was already in the unlinked flowset
+list before being created, it is removed and deleted, and then recreated.
+If the flowset exists, its scheduler number and type must match the ones in
+memory for it to be reconfigured in place. If they do not match, the flowset
+is deleted and a new one is created. Actually, the flowset is not deleted
+immediately: it is removed from the flowset list and deleted later, because
+there could still be some queues that are using it.
+
+Listing of object
+=================
+The user can request a list of object present in dummynet through the command
+"ipfw [-v] pipe|queue [x] list|show"
+The kernel side of dummynet send a buffer to user side that contains all
+pipe, all scheduler, all flowset, plus all scheduler instances and all queues.
+The dummynet user land will format the output and show only the relevant
+information.
+The buffer sent starts with all pipes from the system. The entire struct
+dn_link is passed, except the delay_profile struct that is useless in user
+space. After pipes, all flowsets are written in the buffer. The struct that
+contains scheduler-specific flowset data is linked with the flowset by
+writing the 'obj' id of the extension into the 'alg_fs' pointer.
+Then schedulers are written. If a scheduler has one or more scheduler
+instances, these are linked to the parent scheduler by writing the id of the
+parent in the 'ptr_sched' pointer. If a scheduler instance has queues, they
+are written in the buffer and linked through the 'obj' and 'sched_inst'
+pointers.
+Finally, flowsets in the unlinked flowset list are written in the buffer, and
+then a struct gen is saved in the buffer to mark the last struct in the buffer.
+
+
+Delete of object
+================
+An object is usually removed by user through a command like
+"ipfw pipe|queue x delete". XXX sched?
+ipfw pass to the kernel a struct gen that contains the type and the number
+of the object to remove
+
+Delete of pipe x
+----------------
+A pipe can be deleted by the user through the command 'ipfw pipe x delete'.
+To delete a pipe, the pipe is removed from the pipe list, and then deleted.
+Also the scheduler associated with this pipe should be deleted.
+For compatibility with old dummynet syntax, the associated FIFO scheduler and
+FIFO flowset must be deleted.
+
+Delete of flowset x
+-------------------
+To remove a flowset, we must be sure that it is no longer referenced by any object.
+If the flowset to remove is in the unlinked flowset list, there is not any
+issue, the flowset can be safely removed calling a free() (the flowset
+extension is not yet created if the flowset is in this list).
+If the flowset is in the flowset list, first we remove from it so new packet
+are discarded when arrive. Next, the flowset is marked as delete.
+Now we must check if some queue is using this flowset.
+To do this, a counter (active_f) is provided. This counter indicate how many
+queues exist using this flowset.
+The active_f counter is automatically incremented when a queue is created
+and decremented when a queue is deleted.
+If the counter is 0, the flowset can be safely deleted, and the delete_alg_fs()
+scheduler function is called before deallocate memory.
+If the counter is not 0, the flowset remain in memory until the counter become
+zero. When a queue is delete (by dn_delete_queue() function) it is checked if
+the linked flowset is deleting and if so the counter is decrementing. If the
+counter reaches 0, the flowset is deleted.
+The deletion of a queue can be done only by the scheduler, or when the scheduler
+is destroyed.
+
+Delete of scheduler x
+---------------------
+To delete a scheduler we must be sure that no scheduler instance of this type
+is in the system_heap. To do so, a counter (inst_counter) is provided.
+This counter is managed by the system: it is incremented every time it is
+inserted in the system_heap, and decremented every time it is extracted from it.
+To delete the scheduler, first we remove it from the scheduler list, so new
+packet are discarded when they arrive, and mark the scheduler as deleting.
+
+If the counter is 0, we can remove the scheduler safely calling the
+really_deletescheduler() function. This function will scan all scheduler
+instances and call the delete_scheduler_instance() function that will delete
+the instance. When all instance are deleted, the scheduler template is
+deleted calling the delete_scheduler_template(). If the delay line associated
+with the scheduler is empty, it is deleted now, else it will be deleted when
+it becomes empty.
+If the counter was not 0, we wait for it. Every time the dummynet_task()
+function extract a scheduler from the system_heap, the counter is decremented.
+If the scheduler has the delete flag enabled the dequeue() is not called and
+delete_scheduler_instance() is called to delete the instance.
+Obviously this scheduler instance is no longer inserted in the system_heap.
+If the counter reaches 0, the delete_scheduler_template() function is called
+and all memory is released.
+NOTE: Flowsets that belong to this scheduler are not deleted, so if a new
+ scheduler with the same number is inserted will use these flowsets.
+ To do so, the best approach would be insert these flowset in the
+ unlinked flowset list, but doing this now will be very expensive.
+ So flowsets will remain in memory and linked with a scheduler that no
+ longer exists until a packet belonging to this flowset arrives. When
+ this packet arrives, the reconfigure() function is called because the
+ generation number mismatch with one contains in the flowset and so
+ the flowset will be moved into the flowset unlinked list, or will be
+ linked with the new scheduler if a new one was created.
+
+
+COMPATIBILITY WITH FREEBSD 7.2 AND FREEBSD 8 'IPFW' BINARY
+==========================================================
+Dummynet is not compatible with old ipfw binary because internal structs are
+changed. Moreover, the old ipfw binary is not compatible with new kernels
+because the struct that represents a firewall rule has changed. So, if a user
+install a new kernel on a FreeBSD 7.2, the ipfw (and possibly many other
+commands) will not work.
+New dummynet uses a new socket option: IP_DUMMYNET3, used for both set and get.
+The old option can be used to allow compatibility with the 'ipfw' binary of
+older version (tested with 7.2 and 8.0) of FreeBSD.
+Two file are provided for this purpose:
+- ip_dummynet_glue.c translates old dummynet requests to the new ones,
+- ip_fw_glue.c converts the rule format between 7.2 and 8 versions.
+Let see in detail these two files.
+
+IP_DUMMYNET_GLUE.C
+------------------
+The internal structs of new dummynet are very different from the original.
+Because there are some differences between dummynet in FreeBSD 7.2 and
+dummynet in FreeBSD 8 (the FreeBSD 8 version includes support for pipe delay
+profiles and the burst option), I have to include both header files. I copied
+revision 191715 (for version 7.2) and revision 196045 (for version 8)
+and I appended a number to each struct to mark them.
+
+The main function of this file is ip_dummynet_compat() that is called by
+ip_dn_ctl() when it receive a request of old socket option.
+
+A global variable ('is7') stores the version of 'ipfw' that FreeBSD is using.
+This variable is set every time a configuration request is made, because
+with this request we receive a buffer whose size depends on the ipfw version.
+Because in general the first action is a configuration, this variable is
+usually set accordingly. If the first action is a request to list pipes
+or queues, the system cannot know the version of ipfw, and we assume that
+version 7.2 is used. If the version is wrong, the output can be senseless, but
+the application should not crash.
+
+There are four request for old dummynet:
+- IP_DUMMYNET_FLUSH: the flush options have no parameter, so simply the
+ dummynet_flush() function is called;
+- IP_DUMMYNET_DEL: the delete option need to be translate.
+ It is only necessary to extract the number and the type of the object
+ (pipe or queue) to delete from the buffer received and build a new struct
+ gen contains the right parameters, then call the delete_object() function;
+- IP_DUMMYNET_CONFIGURE: the configure command receive a buffer depending of
+ the ipfw version. After the properly extraction of all data, that depends
+ by the ipfw version used, new structures are filled and then the dummynet
+ config_link() function is properly called. Note that the 7.2 version does
+ not support some parameter as burst or delay profile.
+- IP_DUMMYNET_GET: The get command should send to the ipfw the correct buffer
+ depending of its version. There are two function that build the
+ corrected buffer, ip_dummynet_get7() and ip_dummynet_get8(). These
+ functions reproduce the buffer exactly as 'ipfw' expects. The only difference
+ is that the weight parameter for a queue is no longer sent by dummynet and so
+ it is set to 0.
+ Moreover, because of the internal structure has changed, the bucket size
+ of a queue could not be correct, because now all flowset share the hash
+ table.
+ If the version of ipfw is wrong, the output could be senseless or truncated,
+ but the application should not crash.
+
+IP_FW_GLUE.C
+------------
+The ipfw binary is also used to add rules to the FreeBSD firewall. Because
+struct ip_fw changed from FreeBSD 7.2 to FreeBSD 8, it is necessary
+to write some glue code to allow using ipfw from FreeBSD 7.2 with the kernel
+provided with FreeBSD 8.
+This file contains two functions to convert a rule from FreeBSD 7.2 format to
+FreeBSD 8 format, and viceversa.
+The conversion should be done when a rule passes from userspace to kernel space
+and viceversa.
+I have to modify the ip_fw2.c file to manage these two case, and added a
+variable (is7) to store the ipfw version used, using an approach like the
+previous file:
+- when a new rule is added (option IP_FW_ADD) the is7 variable is set if the
+ size of the rule received corresponds to FreeBSD 7.2 ipfw version. If so, the
+ rule is converted to version 8 calling the function convert_rule_to_8().
+ Moreover, after the insertion of the rule, the rule is now reconverted to
+ version 7 because the ipfw binary will print it.
+- when the user request a list of rules (option IP_FW_GET) the is7 variable
+ should be set correctly because we suppose that a configure command was done,
+ else we suppose that the FreeBSD version is 8. The function ipfw_getrules()
+ in ip_fw2.c file return all rules, eventually converted to version 7 (if
+ the is7 is set) to the ipfw binary.
+The conversion of a rule is quite simple. The only difference between the
+two structures (struct ip_fw) is that in the new one there is a new field
+(uint32_t id). So, I copy the entire rule in a buffer and then copy the rule in
+the right position in the new (or old) struct. The size of commands is not
+changed, and the copy is done in a loop.
+
+How to configure dummynet
+=========================
+It is possible to configure dummynet through two main commands:
+'ipfw pipe' and 'ipfw queue'.
+To allow compatibility with old version, it is possible configure dummynet
+using the old command syntax. Doing so, obviously, it is only possible to
+configure a FIFO scheduler or a wf2q+ scheduler.
+A new command, 'ipfw pipe x config sched <type>' is supported to add a new
+scheduler to the system.
+
+- ipfw pipe x config ...
+ create a new pipe with the link parameters
+ create a new scheduler fifo (x + offset)
+ create a new flowset fifo (x + offset)
+ the mask is eventually stored in the FIFO scheduler
+
+- ipfw queue y config pipe x ...
+ create a new flowset y linked to sched x.
+ The type of flowset depends by the specified scheduler.
+ If the scheduler does not exist, this flowset is inserted in a special
+ list and will be not active.
+ If pipe x exists and sched does not exist, a new wf2q+ scheduler is
+ created and the flowset will be linked to this new scheduler (this is
+ done for compatibility with old syntax).
+
+- ipfw pipe x config sched <type> ...
+ create a new scheduler x of type <type>.
+ Search into the flowset unlinked list if there are some flowset that
+ should be linked with this new scheduler.
+
+- ipfw pipe x delete
+ delete the pipe x
+ delete the scheduler fifo (x + offset)
+ delete the scheduler x
+ delete the flowset fifo (x + offset)
+
+- ipfw queue x delete
+ delete the flowset x
+
+- ipfw sched x delete ///XXX
+ delete the scheduler x
+
+Follow now some examples to how configure dummynet:
+- Ex1:
+ ipfw pipe 10 config bw 1M delay 15 // create a pipe with band and delay
+ A FIFO flowset and scheduler is
+ also created
+ ipfw queue 5 config pipe 10 weight 56 // create a flowset. This flowset
+ will be of wf2q+ because a pipe 10
+ exists. Moreover, the wf2q+
+ scheduler is created now.
+- Ex2:
+ ipfw queue 5 config pipe 10 weight 56 // Create a flowset. Scheduler 10
+ does not exist, so this flowset
+ is inserted in the unlinked
+ flowset list.
+ ipfw pipe 10 config bw... // Create a pipe, a FIFO flowset and scheduler.
+ Because of a flowset with 'pipe 10' exists,
+ a wf2q+ scheduler is created now and that
+ flowset is linked with this scheduler.
+
+- Ex3:
+ ipfw pipe 10 config bw... // Create a pipe, a FIFO flowset and scheduler.
+ ipfw pipe 10 config sched rr // Create a scheduler of type RR, linked to
+ pipe 10
+ ipfw queue 5 config pipe 10 weight 56 // Create a flowset 5. This flowset
+ will belong to scheduler 10 and
+ it is of type RR
+
+- Ex4:
+ ipfw pipe 10 config sched rr // Create a scheduler of type RR, linked to
+ pipe 10 (not exist yet)
+ ipfw pipe 10 config bw... // Create a pipe, a FIFO flowset and scheduler.
+ ipfw queue 5 config pipe 10 weight 56 // Create a flowset 5.This flowset
+ will belong to scheduler 10 and
+ it is of type RR
+ ipfw pipe 10 config sched wf2q+ // Modify the type of scheduler 10. It
+ becomes a wf2q+ scheduler.
+ When a new packet of flowset 5 arrives,
+ the flowset 5 becomes to wf2q+ type.
+
+How to implement a new scheduler
+================================
+In dummynet, a scheduler algorithm is represented by two main structs, some
+functions and other minor structs.
+- A struct dn_sch_xyz (where xyz is the 'type' of scheduler algorithm
+ implemented) contains data relative to scheduler, as global parameter that
+ are common to all instances of the scheduler
+- A struct dn_sch_inst_xyz contains data relative to a single scheduler
+ instance, as local status variable depending for example by flows that
+ are linked with the scheduler, and so on.
+To add a scheduler to dummynet, the user should type a command like:
+'ipfw pipe x config sched <type> [mask ... ...]'
+This command creates a new struct dn_sch_xyz of type <type>, and
+store the optional parameter in that struct.
+
+The parameter mask determines how many scheduler instance of this
+scheduler may exist. For example, it is possible to divide traffic
+depending on the source port (or destination, or ip address...),
+so that every scheduler instance act as an independent scheduler.
+If the mask is not set, all traffic goes to the same instance.
+
+When a packet arrives to a scheduler, the system search the corrected
+scheduler instance, and if it does not exist it is created now (the
+struct dn_sch_inst_xyz is allocated by the system, and the scheduler
+fills the field correctly). It is a task of the scheduler to create
+the struct that contains all queues for a scheduler instance.
+Dummynet provides some function to create an hash table to store
+queues, but the schedule algorithm can choice the own struct.
+
+To link a flow to a scheduler, the user should type a command like:
+'ipfw queue z config pipe x [mask... ...]'
+
+This command creates a new 'dn_fs' struct that will be inserted
+in the system. If the scheduler x exists, this flowset will be
+linked to that scheduler and the flowset type becomes the same as
+the scheduler type. At this point, the function create_alg_fs_xyz()
+is called to allow storing any parameters for the flowset that
+depend on the scheduler (for example the 'weight' parameter for a wf2q+
+scheduler, or some priority...). A parameter mask can be used for
+a flowset. If the mask parameter is set, the scheduler instance can
+separate packets according to their flow id (src and dst ip, ports...)
+and assign them to separate queues. This is done by the scheduler,
+so it can ignore the mask if it wants.
+
+See now the two main structs:
+struct dn_sch_xyz {
+ struct gen g; /* important the name g */
+ /* global params */
+};
+struct dn_sch_inst_xyz {
+ struct gen g; /* important the name g */
+ /* params of the instance */
+};
+It is important to embed the struct gen as first member. The struct gen
+contains some values that the scheduler instance must fill (the 'type' of
+scheduler, the 'len' of the struct...)
+The function create_scheduler_xyz() should be implemented to initialize global
+parameters in the first struct, and if memory allocation is done it is
+mandatory to implement the delete_scheduler_template() function to free that
+memory.
+The function create_scheduler_instance_xyz() must be implemented even if the
+scheduler instance does not use extra parameters. In this function the struct
+gen fields must be filled with the correct information. The
+delete_scheduler_instance_xyz() function must be implemented if the instance
+has allocated some memory in the previous function.
+
+To store data belonging to a flowset the following struct is used:
+struct alg_fs_xyz {
+ struct gen g;
+ /* fill correctly the gen struct
+ g.subtype = DN_XYZ;
+ g.len = sizeof(struct alg_fs_xyz)
+ ...
+ */
+ /* params for the flow */
+};
+The create_alg_fs_xyz() function is mandatory, because it must fill the struct
+gen, but the delete_alg_fs_xyz() is mandatory only if the previous function
+has allocated some memory.
+
+A struct dn_queue contains packets belonging to a queue and some statistical
+data. The scheduler could have to store data in this struct, so it must define
+a dn_queue_xyz struct:
+struct dn_queue_xyz {
+ struct dn_queue q;
+ /* parameter for a queue */
+}
+
+All structures are allocated by the system. To do so, the scheduler must
+set the size of its structs in the scheduler descriptor:
+scheduler_size: sizeof(dn_sch_xyz)
+scheduler_i_size: sizeof(dn_sch_inst_xyz)
+flowset_size: sizeof(alg_fs_xyz)
+queue_size: sizeof(dn_queue_xyz);
+The scheduler_size could be 0, but the other structs must have at least a struct gen.
+
+
+After the definition of structs, it is necessary to implement the
+scheduler functions.
+
+- int (*config_scheduler)(char *command, void *sch, int reconfigure);
+ Configure a scheduler, or reconfigure if 'reconfigure' == 1.
+ This function performs additional allocation and initialization of global
+ parameter for this scheduler.
+ If memory is allocated here, the delete_scheduler_template() function
+ should be implemented to remove this memory.
+- int (*delete_scheduler_template)(void* sch);
+ Delete a scheduler template. This function is mandatory if the scheduler
+ uses extra data in addition to the struct dn_sch.
+- int (*create_scheduler_instance)(void *s);
+ Create a new scheduler instance. The system allocates the necessary memory
+ and the scheduler can access it using the 's' pointer.
+ The scheduler instance stores all queues, and to do this can use the
+ hash table provided by the system.
+- int (*delete_scheduler_instance)(void *s);
+ Delete a scheduler instance. It is important to free the memory allocated
+ by the create_scheduler_instance() function. The memory allocated by the
+ system is freed by the system itself. The struct that contains all queues
+ also has to be deleted.
+- int (*enqueue)(void *s, struct gen *f, struct mbuf *m,
+ struct ipfw_flow_id *id);
+ Called when a packet arrives. The packet 'm' belongs to the scheduler
+ instance 's', has a flowset 'f' and the flowid 'id' has already been
+ masked. The enqueue() must call the dn_queue_packet(q, m) function to really
+ enqueue the packet in the queue q. The queue 'q' is chosen by the scheduler
+ and if it does not exist should be created calling the dn_create_queue()
+ function. If the scheduler wants to drop the packet, it must call the
+ dn_drop_packet() function and then return 1.
+- struct mbuf * (*dequeue)(void *s);
+ Called when the timer expires (or when a packet arrives and the scheduler
+ instance is idle).
+ This function is called when at least one packet can be sent out. The
+ scheduler chooses the packet and returns it; if no packets are in the
+ scheduler instance, the function must return NULL.
+ Before returning a packet, it is important to call the function
+ dn_return_packet() to update some statistics of the queue and update the
+ queue counters.
+- int (*drain_queue)(void *s, int flag);
+ The system requests the scheduler to delete all queues that are not in
+ use, in order to free memory. The flag parameter indicates if a queue must
+ be deleted even if it is active.
+
+- int (*create_alg_fs)(char *command, struct gen *g, int reconfigure);
+ It is called when a flowset is linked with a scheduler. This is done
+ when the scheduler is defined, so we can know the type of flowset.
+ The function initializes the flowset parameters by parsing the command
+ line. The parameters will be stored in the g struct, which has the right
+ size allocated by the system. If the reconfigure flag is set, it means
+ that the flowset is being reconfigured.
+- int (*delete_alg_fs)(struct gen *f);
+ It is called when a flowset is being deleted. It must free the memory
+ allocated by the create_alg_fs() function.
+
+- int (*create_queue_alg)(struct dn_queue *q, struct gen *f);
+ Called when a queue is created. The function should link the queue
+ to the struct used by the scheduler instance to store all queues.
+- int (*delete_queue_alg)(struct dn_queue *q);
+ Called when a queue is being deleted. The function should remove extra data
+ and update the struct that contains all queues in the scheduler instance.
+
+The struct scheduler represents the scheduler descriptor that is passed to
+dummynet when a scheduler module is loaded.
+This struct contains the type of scheduler, the length of all structs and
+all function pointers.
+If a function is not implemented it should be initialized to NULL. Some
+functions are mandatory, others are mandatory only if some memory should be
+freed.
+Mandatory functions:
+- create_scheduler_instance()
+- enqueue()
+- dequeue()
+- create_alg_fs()
+- drain_queue()
+Optional functions:
+- config_scheduler()
+- create_queue_alg()
+Mandatory functions if the corresponding create...() has allocated memory:
+- delete_scheduler_template()
+- delete_scheduler_instance()
+- delete_alg_fs()
+- delete_queue_alg()
+
diff --git a/sys/netinet/ipfw/ip_dn_glue.c b/sys/netinet/ipfw/ip_dn_glue.c
new file mode 100644
index 0000000..c0df1fc
--- /dev/null
+++ b/sys/netinet/ipfw/ip_dn_glue.c
@@ -0,0 +1,845 @@
+/*-
+ * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ *
+ * Binary compatibility support for /sbin/ipfw RELENG_7 and RELENG_8
+ */
+
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/time.h>
+#include <sys/taskqueue.h>
+#include <net/if.h> /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */
+#include <netinet/in.h>
+#include <netinet/ip_var.h> /* ip_output(), IP_FORWARDING */
+#include <netinet/ip_fw.h>
+#include <netinet/ipfw/ip_fw_private.h>
+#include <netinet/ipfw/dn_heap.h>
+#include <netinet/ip_dummynet.h>
+#include <netinet/ipfw/ip_dn_private.h>
+#include <netinet/ipfw/dn_sched.h>
+
+/* FREEBSD7.2 ip_dummynet.h r191715*/
+
+/*
+ * Heap entry in the layout of the old ip_dummynet.h. In this file it is
+ * only referenced through the 'p' pointer of struct dn_heap7.
+ */
+struct dn_heap_entry7 {
+ int64_t key; /* sorting key. Topmost element is smallest one */
+ void *object; /* object pointer */
+};
+
+/*
+ * Heap header in the layout of the old ip_dummynet.h. It is embedded by
+ * value (three times) in both struct dn_pipe7 and struct dn_pipe8, so its
+ * size is part of the structure sizes that ip_dummynet_compat() uses to
+ * recognize old requests. Do not change the fields.
+ */
+struct dn_heap7 {
+ int size;
+ int elements;
+ int offset; /* XXX if > 0 this is the offset of direct ptr to obj */
+ struct dn_heap_entry7 *p; /* really an array of "size" entries */
+};
+
+/* Common to 7.2 and 8 */
+/*
+ * Old-style flowset, embedded as the 'fs' member of struct dn_pipe7 and
+ * struct dn_pipe8 and also exported on its own by dn_c_copy_fs().
+ * The layout is exchanged with old /sbin/ipfw binaries: do not reorder,
+ * resize or remove fields.
+ */
+struct dn_flow_set {
+ SLIST_ENTRY(dn_flow_set) next; /* linked list in a hash slot */
+
+ u_short fs_nr ; /* flow_set number */
+ u_short flags_fs;
+ /*
+ * Old flag values; convertflags2new() and convertflags2old() translate
+ * the first six to and from the new DN_* flags.
+ */
+#define DNOLD_HAVE_FLOW_MASK 0x0001
+#define DNOLD_IS_RED 0x0002
+#define DNOLD_IS_GENTLE_RED 0x0004
+#define DNOLD_QSIZE_IS_BYTES 0x0008 /* queue size is measured in bytes */
+#define DNOLD_NOERROR 0x0010 /* do not report ENOBUFS on drops */
+#define DNOLD_HAS_PROFILE 0x0020 /* the pipe has a delay profile. */
+#define DNOLD_IS_PIPE 0x4000
+#define DNOLD_IS_QUEUE 0x8000
+
+ struct dn_pipe7 *pipe ; /* pointer to parent pipe */
+ u_short parent_nr ; /* parent pipe#, 0 if local to a pipe */
+
+ int weight ; /* WFQ queue weight */
+ int qsize ; /* queue size in slots or bytes */
+ int plr ; /* pkt loss rate (2^31-1 means 100%) */
+
+ struct ipfw_flow_id flow_mask ;
+
+ /* hash table of queues onto this flow_set */
+ int rq_size ; /* number of slots */
+ int rq_elements ; /* active elements */
+ struct dn_flow_queue7 **rq; /* array of rq_size entries */
+
+ u_int32_t last_expired ; /* do not expire too frequently */
+ int backlogged ; /* #active queues for this flowset */
+
+ /* RED parameters */
+#define SCALE_RED 16
+#define SCALE(x) ( (x) << SCALE_RED )
+#define SCALE_VAL(x) ( (x) >> SCALE_RED )
+#define SCALE_MUL(x,y) ( ( (x) * (y) ) >> SCALE_RED )
+ int w_q ; /* queue weight (scaled) */
+ int max_th ; /* maximum threshold for queue (scaled) */
+ int min_th ; /* minimum threshold for queue (scaled) */
+ int max_p ; /* maximum value for p_b (scaled) */
+ u_int c_1 ; /* max_p/(max_th-min_th) (scaled) */
+ u_int c_2 ; /* max_p*min_th/(max_th-min_th) (scaled) */
+ u_int c_3 ; /* for GRED, (1-max_p)/max_th (scaled) */
+ u_int c_4 ; /* for GRED, 1 - 2*max_p (scaled) */
+ u_int * w_q_lookup ; /* lookup table for computing (1-w_q)^t */
+ u_int lookup_depth ; /* depth of lookup table */
+ int lookup_step ; /* granularity inside the lookup table */
+ int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */
+ int avg_pkt_size ; /* medium packet size */
+ int max_pkt_size ; /* max packet size */
+};
+SLIST_HEAD(dn_flow_set_head, dn_flow_set);
+
+/*
+ * Markers stored in the list-link field ('next') of exported records:
+ * dn_c_copy_pipe() writes DN_IS_PIPE, dn_c_copy_fs() writes DN_IS_QUEUE.
+ */
+#define DN_IS_PIPE 0x4000
+#define DN_IS_QUEUE 0x8000
+/*
+ * Per-flow queue record in the FreeBSD 7.2 layout. dn_c_copy_q() fills
+ * len, len_bytes, id and the statistics counters; the other fields are
+ * not written by this file.
+ */
+struct dn_flow_queue7 {
+ struct dn_flow_queue7 *next ;
+ struct ipfw_flow_id id ;
+
+ struct mbuf *head, *tail ; /* queue of packets */
+ u_int len ;
+ u_int len_bytes ;
+
+ u_long numbytes;
+
+ u_int64_t tot_pkts ; /* statistics counters */
+ u_int64_t tot_bytes ;
+ u_int32_t drops ;
+
+ int hash_slot ; /* debugging/diagnostic */
+
+ /* RED parameters */
+ int avg ; /* average queue length est. (scaled) */
+ int count ; /* arrivals since last RED drop */
+ int random ; /* random value (scaled) */
+ u_int32_t q_time; /* start of queue idle time */
+
+ /* WF2Q+ support */
+ struct dn_flow_set *fs ; /* parent flow set */
+ int heap_pos ; /* position (index) of struct in heap */
+ int64_t sched_time ; /* current time when queue enters ready_heap */
+
+ int64_t S,F ; /* start time, finish time */
+};
+
+/*
+ * Pipe in the FreeBSD 7.2 layout. ip_dummynet_compat() treats a request
+ * whose length equals sizeof(struct dn_pipe7) as coming from a 7.2 ipfw,
+ * so the size and field offsets must not change.
+ * 'next', 'pipe_nr', 'bandwidth' and 'delay' are accessed through this
+ * structure for FreeBSD 8 requests as well (see dn_c_copy_pipe() and
+ * dn_compat_config_pipe()).
+ */
+struct dn_pipe7 { /* a pipe */
+ SLIST_ENTRY(dn_pipe7) next; /* linked list in a hash slot */
+
+ int pipe_nr ; /* number */
+ int bandwidth; /* really, bytes/tick. */
+ int delay ; /* really, ticks */
+
+ struct mbuf *head, *tail ; /* packets in delay line */
+
+ /* WF2Q+ */
+ struct dn_heap7 scheduler_heap ; /* top extract - key Finish time*/
+ struct dn_heap7 not_eligible_heap; /* top extract- key Start time */
+ struct dn_heap7 idle_heap ; /* random extract - key Start=Finish time */
+
+ int64_t V ; /* virtual time */
+ int sum; /* sum of weights of all active sessions */
+
+ int numbytes;
+
+ int64_t sched_time ; /* time pipe was scheduled in ready_heap */
+
+ /*
+ * When the tx clock come from an interface (if_name[0] != '\0'), its name
+ * is stored below, whereas the ifp is filled when the rule is configured.
+ */
+ char if_name[IFNAMSIZ];
+ struct ifnet *ifp ;
+ int ready ; /* set if ifp != NULL and we got a signal from it */
+
+ struct dn_flow_set fs ; /* used with fixed-rate flows */
+};
+SLIST_HEAD(dn_pipe_head7, dn_pipe7);
+
+
+/* FREEBSD8 ip_dummynet.h r196045 */
+/*
+ * Per-flow queue record in the FreeBSD 8 layout.
+ * dn_c_copy_q() writes len, len_bytes and id through a struct
+ * dn_flow_queue7 pointer even for FreeBSD 8 clients, which relies on the
+ * leading fields (next ... len_bytes) being declared identically in the
+ * two structures.
+ */
+struct dn_flow_queue8 {
+ struct dn_flow_queue8 *next ;
+ struct ipfw_flow_id id ;
+
+ struct mbuf *head, *tail ; /* queue of packets */
+ u_int len ;
+ u_int len_bytes ;
+
+ uint64_t numbytes ; /* credit for transmission (dynamic queues) */
+ int64_t extra_bits; /* extra bits simulating unavailable channel */
+
+ u_int64_t tot_pkts ; /* statistics counters */
+ u_int64_t tot_bytes ;
+ u_int32_t drops ;
+
+ int hash_slot ; /* debugging/diagnostic */
+
+ /* RED parameters */
+ int avg ; /* average queue length est. (scaled) */
+ int count ; /* arrivals since last RED drop */
+ int random ; /* random value (scaled) */
+ int64_t idle_time; /* start of queue idle time */
+
+ /* WF2Q+ support */
+ struct dn_flow_set *fs ; /* parent flow set */
+ int heap_pos ; /* position (index) of struct in heap */
+ int64_t sched_time ; /* current time when queue enters ready_heap */
+
+ int64_t S,F ; /* start time, finish time */
+};
+
+/*
+ * Pipe in the FreeBSD 8 layout. ip_dummynet_compat() treats a request whose
+ * length equals sizeof(struct dn_pipe8) (or sizeof(struct dn_pipe_max8)) as
+ * coming from a FreeBSD 8 ipfw, so the size and field offsets must not
+ * change.
+ */
+struct dn_pipe8 { /* a pipe */
+ SLIST_ENTRY(dn_pipe8) next; /* linked list in a hash slot */
+
+ int pipe_nr ; /* number */
+ int bandwidth; /* really, bytes/tick. */
+ int delay ; /* really, ticks */
+
+ struct mbuf *head, *tail ; /* packets in delay line */
+
+ /* WF2Q+ */
+ struct dn_heap7 scheduler_heap ; /* top extract - key Finish time*/
+ struct dn_heap7 not_eligible_heap; /* top extract- key Start time */
+ struct dn_heap7 idle_heap ; /* random extract - key Start=Finish time */
+
+ int64_t V ; /* virtual time */
+ int sum; /* sum of weights of all active sessions */
+
+ /* Same as in dn_flow_queue, numbytes can become large */
+ int64_t numbytes; /* bits I can transmit (more or less). */
+ uint64_t burst; /* burst size, scaled: bits * hz */
+
+ int64_t sched_time ; /* time pipe was scheduled in ready_heap */
+ int64_t idle_time; /* start of pipe idle time */
+
+ char if_name[IFNAMSIZ];
+ struct ifnet *ifp ;
+ int ready ; /* set if ifp != NULL and we got a signal from it */
+
+ struct dn_flow_set fs ; /* used with fixed-rate flows */
+
+ /* fields to simulate a delay profile */
+#define ED_MAX_NAME_LEN 32
+ char name[ED_MAX_NAME_LEN];
+ int loss_level;
+ int samples_no;
+ /*
+ * dn_compat_config_profile() overwrites this pointer so that it
+ * refers to the sample array of the enclosing struct dn_pipe_max8.
+ */
+ int *samples;
+};
+
+/*
+ * FreeBSD 8 pipe followed by the storage for the delay profile samples.
+ * A request of this size is accepted by ip_dummynet_compat() as a
+ * FreeBSD 8 request; dn_compat_config_profile() reads the samples from
+ * the trailing array.
+ */
+#define ED_MAX_SAMPLES_NO 1024
+struct dn_pipe_max8 {
+ struct dn_pipe8 pipe;
+ int samples[ED_MAX_SAMPLES_NO];
+};
+SLIST_HEAD(dn_pipe_head8, dn_pipe8);
+
+/*
+ * Changes from 7.2 to 8:
+ * dn_pipe:
+ * numbytes from int to int64_t
+ * add burst (uint64_t)
+ * add idle_time (int64_t)
+ * add profile
+ * add struct dn_pipe_max
+ * add flag DN_HAS_PROFILE
+ *
+ * dn_flow_queue
+ * numbytes from u_long to uint64_t
+ * add extra_bits (int64_t)
+ * q_time from u_int32_t to int64_t and renamed to idle_time
+ *
+ * dn_flow_set unchanged
+ *
+ */
+
+/* NOTE:XXX copied from dummynet.c */
+#define O_NEXT(p, len) ((void *)((char *)p + len))
+
+/*
+ * Initialize an object header: store the given length, type and id, and
+ * reset the subtype to 0.
+ */
+static void
+oid_fill(struct dn_id *oid, int len, int type, uintptr_t id)
+{
+	oid->id = id;
+	oid->subtype = 0;
+	oid->type = type;
+	oid->len = len;
+}
+/*
+ * Reserve 'len' bytes at the current buffer position: write an object
+ * header of the given length and type (id 0) there, move the caller's
+ * cursor forward by 'len' bytes and return the position just reserved.
+ */
+static void *
+o_next(struct dn_id **o, int len, int type)
+{
+	struct dn_id *cur = *o;
+
+	oid_fill(cur, len, type, 0);
+	*o = (void *)((char *)cur + len);
+	return cur;
+}
+
+
+/*
+ * Sizes of the old userland pipe structures. ip_dummynet_compat() compares
+ * the length of each request against them to decide which ipfw version
+ * issued it.
+ */
+static size_t pipesize7 = sizeof(struct dn_pipe7);
+static size_t pipesize8 = sizeof(struct dn_pipe8);
+static size_t pipesizemax8 = sizeof(struct dn_pipe_max8);
+
+/* Indicate 'ipfw' version
+ * 1: from FreeBSD 7.2
+ * 0: from FreeBSD 8
+ * -1: unknown (for now it is unused)
+ *
+ * It is updated when a request (e.g. IP_DUMMYNET_DEL or
+ * IP_DUMMYNET_CONFIGURE) arrives whose length matches one of the pipe
+ * structure sizes above; otherwise the previous value is kept.
+ * NOTE: if an IP_DUMMYNET_GET arrives and the 'ipfw' version is unknown,
+ * it is supposed to be the FreeBSD 8 version.
+ */
+static int is7 = 0;
+
+/*
+ * Translate a set of old-style flowset flags (DNOLD_*) into the
+ * corresponding new-style flags (DN_*). Bits without a counterpart in the
+ * table below are dropped.
+ */
+static int
+convertflags2new(int src)
+{
+	static const struct {
+		int oldflag;
+		int newflag;
+	} map[] = {
+		{ DNOLD_HAVE_FLOW_MASK,	DN_HAVE_MASK },
+		{ DNOLD_QSIZE_IS_BYTES,	DN_QSIZE_BYTES },
+		{ DNOLD_NOERROR,	DN_NOERROR },
+		{ DNOLD_IS_RED,		DN_IS_RED },
+		{ DNOLD_IS_GENTLE_RED,	DN_IS_GENTLE_RED },
+		{ DNOLD_HAS_PROFILE,	DN_HAS_PROFILE },
+	};
+	int converted = 0;
+	unsigned int k;
+
+	for (k = 0; k < sizeof(map) / sizeof(map[0]); k++) {
+		if (src & map[k].oldflag)
+			converted |= map[k].newflag;
+	}
+
+	return converted;
+}
+
+/*
+ * Translate a set of new-style flowset flags (DN_*) into the
+ * corresponding old-style flags (DNOLD_*). Bits without a counterpart in
+ * the table below are dropped.
+ */
+static int
+convertflags2old(int src)
+{
+	static const struct {
+		int newflag;
+		int oldflag;
+	} map[] = {
+		{ DN_HAVE_MASK,		DNOLD_HAVE_FLOW_MASK },
+		{ DN_IS_RED,		DNOLD_IS_RED },
+		{ DN_IS_GENTLE_RED,	DNOLD_IS_GENTLE_RED },
+		{ DN_NOERROR,		DNOLD_NOERROR },
+		{ DN_HAS_PROFILE,	DNOLD_HAS_PROFILE },
+		{ DN_QSIZE_BYTES,	DNOLD_QSIZE_IS_BYTES },
+	};
+	int converted = 0;
+	unsigned int k;
+
+	for (k = 0; k < sizeof(map) / sizeof(map[0]); k++) {
+		if (src & map[k].newflag)
+			converted |= map[k].oldflag;
+	}
+
+	return converted;
+}
+
+/*
+ * Translate an old-style delete request into a DN_CMD_DELETE command and
+ * hand it to do_config().
+ *
+ * 'v' is the old pipe structure received from userland (7.2 or 8 layout,
+ * chosen by 'is7'). Exactly one of the pipe number and the flowset number
+ * must be non zero, otherwise EINVAL is returned: a pipe number selects a
+ * DN_LINK delete, a flowset number a DN_FS delete.
+ */
+static int
+dn_compat_del(void *v)
+{
+	struct dn_pipe7 *old7 = (struct dn_pipe7 *) v;
+	struct dn_pipe8 *old8 = (struct dn_pipe8 *) v;
+	struct {
+		struct dn_id oid;
+		uintptr_t a[1]; /* add more if we want a list */
+	} cmd;
+	int pipe_nr, fs_nr;
+
+	/* XXX DN_API_VERSION ??? */
+	oid_fill((void *)&cmd, sizeof(cmd), DN_CMD_DELETE, DN_API_VERSION);
+
+	/*
+	 * pipe_nr follows the list link in both layouts, so the 7.2 view
+	 * is good for either version; the embedded flowset is not.
+	 */
+	pipe_nr = old7->pipe_nr;
+	fs_nr = is7 ? old7->fs.fs_nr : old8->fs.fs_nr;
+
+	/* reject "neither set" as well as "both set" */
+	if ((pipe_nr == 0) == (fs_nr == 0))
+		return EINVAL;
+
+	if (pipe_nr == 0) { /* queue x delete */
+		cmd.oid.subtype = DN_FS;
+		cmd.a[0] = fs_nr;
+	} else { /* pipe x delete */
+		cmd.oid.subtype = DN_LINK;
+		cmd.a[0] = pipe_nr;
+	}
+
+	return do_config(&cmd, cmd.oid.len);
+}
+
+/*
+ * Fill the new-style flowset 'fs' from the old flowset embedded in the
+ * pipe structure 'v' (7.2 or 8 layout, chosen by 'is7').
+ * The RED thresholds are copied only when one of the RED flags is set.
+ * Always returns 0.
+ */
+static int
+dn_compat_config_queue(struct dn_fs *fs, void* v)
+{
+	struct dn_flow_set *old;
+
+	/* the embedded flowset is the last member of struct dn_pipe7 but not
+	 * of struct dn_pipe8, so it must be located per version */
+	old = is7 ? &((struct dn_pipe7 *)v)->fs : &((struct dn_pipe8 *)v)->fs;
+
+	/* identification */
+	fs->fs_nr = old->fs_nr;
+	fs->sched_nr = old->parent_nr;
+
+	/* queue parameters */
+	fs->flow_mask = old->flow_mask;
+	fs->buckets = old->rq_size;
+	fs->qsize = old->qsize;
+	fs->plr = old->plr;
+	fs->par[0] = old->weight;
+
+	fs->flags = convertflags2new(old->flags_fs);
+	if ((fs->flags & (DN_IS_GENTLE_RED | DN_IS_RED)) != 0) {
+		fs->w_q = old->w_q;
+		fs->max_th = old->max_th;
+		fs->min_th = old->min_th;
+		fs->max_p = old->max_p;
+	}
+
+	return 0;
+}
+
+/*
+ * Build the three new-style objects that make up an old-style pipe:
+ * the scheduler 'sch', the link 'p' and the flowset 'fs'.
+ * For old pipe number i the scheduler and link get number i, while the
+ * flowset gets number i + 2*DN_MAX_ID and refers to scheduler
+ * i + DN_MAX_ID. Bucket count and flow mask configured on the old flowset
+ * are moved to the scheduler. Always returns 0.
+ */
+static int
+dn_compat_config_pipe(struct dn_sch *sch, struct dn_link *p,
+	struct dn_fs *fs, void* v)
+{
+	struct dn_pipe7 *old7 = (struct dn_pipe7 *)v;
+	int nr = old7->pipe_nr;
+
+	sch->sched_nr = nr;
+	sch->oid.subtype = 0;
+
+	/* link: number, bandwidth and delay are common to 7 and 8 */
+	p->link_nr = nr;
+	p->bandwidth = old7->bandwidth;
+	p->delay = old7->delay;
+	if (!is7) {
+		/* FreeBSD 8 has burst */
+		p->burst = ((struct dn_pipe8 *)v)->burst;
+	}
+
+	/*
+	 * Fill the fifo flowset; dn_compat_config_queue() stores the old
+	 * numbers in fs_nr and sched_nr, so set ours only afterwards.
+	 */
+	dn_compat_config_queue(fs, v);
+	fs->fs_nr = nr + 2*DN_MAX_ID;
+	fs->sched_nr = nr + DN_MAX_ID;
+
+	/* Move scheduler related parameter from fs to sch */
+	sch->buckets = fs->buckets; /*XXX*/
+	fs->buckets = 0;
+	if ((fs->flags & DN_HAVE_MASK) == 0)
+		return 0;
+
+	sch->flags |= DN_HAVE_MASK;
+	fs->flags &= ~DN_HAVE_MASK;
+	sch->sched_mask = fs->flow_mask;
+	bzero(&fs->flow_mask, sizeof(struct ipfw_flow_id));
+
+	return 0;
+}
+
+/*
+ * Build the new-style delay profile 'pf' for link 'p' from a FreeBSD 8
+ * request.
+ *
+ * 'v' is read as a struct dn_pipe_max8: the samples are taken from the
+ * array that follows the pipe, so the caller must have received a request
+ * of that size. As a side effect the 'samples' pointer inside the request
+ * is overwritten to point at that array.
+ * Always returns 0.
+ */
+static int
+dn_compat_config_profile(struct dn_profile *pf, struct dn_link *p,
+	void *v)
+{
+	struct dn_pipe8 *p8 = (struct dn_pipe8 *)v;
+
+	/* the pointer value sent by userland is not usable in the kernel */
+	p8->samples = &(((struct dn_pipe_max8 *)p8)->samples[0]);
+
+	pf->link_nr = p->link_nr;
+	pf->loss_level = p8->loss_level;
+	/* XXX pf->bandwidth is not set from p->bandwidth: redundant? */
+	pf->samples_no = p8->samples_no;
+	/*
+	 * strncpy() leaves the destination unterminated when the source
+	 * fills it completely; the name comes from userland, so always
+	 * terminate it explicitly.
+	 */
+	strncpy(pf->name, p8->name, sizeof(pf->name) - 1);
+	pf->name[sizeof(pf->name) - 1] = '\0';
+	bcopy(p8->samples, pf->samples, sizeof(pf->samples));
+
+	return 0;
+}
+
+/*
+ * Translate an old-style configure request into a DN_CMD_CONFIGURE
+ * command buffer and pass it to do_config().
+ *
+ * If p->pipe_nr != 0 the command is 'pipe x config', so need to create
+ * the three main struct (plus a profile for FreeBSD 8 requests that carry
+ * samples), else only a flowset is created.
+ *
+ * 'v' is the old pipe structure received from userland (7.2 or 8 layout,
+ * chosen by 'is7'). Returns the error of the conversion helpers or of
+ * do_config().
+ */
+static int
+dn_compat_configure(void *v)
+{
+	struct dn_id *buf, *base;
+	struct dn_sch *sch;
+	struct dn_link *p;
+	struct dn_fs *fs;
+	struct dn_profile *pf;
+	int lmax;
+	int error;
+
+	struct dn_pipe7 *p7 = (struct dn_pipe7 *)v;
+	struct dn_pipe8 *p8 = (struct dn_pipe8 *)v;
+
+	/* room for the largest possible command */
+	lmax = sizeof(struct dn_id); /* command header */
+	lmax += sizeof(struct dn_sch) + sizeof(struct dn_link) +
+	    sizeof(struct dn_fs) + sizeof(struct dn_profile);
+
+	/*
+	 * 'base' keeps the start of the allocation, 'buf' is the cursor
+	 * advanced by o_next(). M_WAITOK allocations do not fail.
+	 */
+	base = buf = malloc(lmax, M_DUMMYNET, M_WAITOK | M_ZERO);
+	o_next(&buf, sizeof(struct dn_id), DN_CMD_CONFIG);
+	base->id = DN_API_VERSION;
+
+	/* pipe_nr is the same in p7 and p8 */
+	if (p7->pipe_nr != 0) { /* pipe config */
+		sch = o_next(&buf, sizeof(*sch), DN_SCH);
+		p = o_next(&buf, sizeof(*p), DN_LINK);
+		fs = o_next(&buf, sizeof(*fs), DN_FS);
+
+		error = dn_compat_config_pipe(sch, p, fs, v);
+		if (error)
+			goto done;
+		if (!is7 && p8->samples_no > 0) {
+			/* Add profiles*/
+			pf = o_next(&buf, sizeof(*pf), DN_PROFILE);
+			error = dn_compat_config_profile(pf, p, v);
+			if (error)
+				goto done;
+		}
+	} else { /* queue config */
+		fs = o_next(&buf, sizeof(*fs), DN_FS);
+		error = dn_compat_config_queue(fs, v);
+		if (error)
+			goto done;
+	}
+	error = do_config(base, (char *)buf - (char *)base);
+
+done:
+	/*
+	 * Release the allocation through its start address: 'buf' has been
+	 * moved forward and is no longer the pointer returned by malloc().
+	 */
+	free(base, M_DUMMYNET);
+	return error;
+}
+
+/*
+ * Estimate the number of bytes needed to export the current configuration
+ * in the old format, from the object counters in 'dn_cfg'.
+ */
+int
+dn_compat_calc_size(struct dn_parms dn_cfg)
+{
+	/* XXX use FreeBSD 8 struct size */
+	const size_t pipe_sz = sizeof(struct dn_pipe8);
+	const size_t fs_sz = sizeof(struct dn_flow_set);
+	const size_t queue_sz = sizeof(struct dn_flow_queue8);
+	int total = 0;
+
+	/* NOTE:
+	 * - half scheduler: schk_count/2
+	 * - all flowset: fsk_count
+	 * - all flowset queues: queue_count
+	 * - all pipe queue: si_count
+	 */
+	total += dn_cfg.schk_count * pipe_sz / 2;
+	total += dn_cfg.fsk_count * fs_sz;
+	total += dn_cfg.si_count * queue_sz;
+	total += dn_cfg.queue_count * queue_sz;
+
+	return total;
+}
+
+/*
+ * Export one flow ('_ni', a struct dn_flow) as an old-style queue record
+ * at the current output position of the copy_args 'arg', then advance the
+ * position by the size of the record for the client version ('is7').
+ * Only the queue lengths, the flow id and the statistics counters are
+ * written. Always returns 0.
+ */
+int
+dn_c_copy_q (void *_ni, void *arg)
+{
+	struct copy_args *a = arg;
+	struct dn_flow *ni = (struct dn_flow *)_ni;
+	struct dn_flow_queue7 *q7 = (struct dn_flow_queue7 *)*a->start;
+	struct dn_flow_queue8 *q8 = (struct dn_flow_queue8 *)*a->start;
+
+	/* XXX hash slot not set */
+	/* No difference between 7.2/8 for these fields */
+	q7->id = ni->fid;
+	q7->len_bytes = ni->len_bytes;
+	q7->len = ni->length;
+
+	/* the counters sit at different offsets in the two layouts */
+	if (!is7) {
+		q8->drops = ni->drops;
+		q8->tot_bytes = ni->tot_bytes;
+		q8->tot_pkts = ni->tot_pkts;
+		*a->start += sizeof(struct dn_flow_queue8);
+		return 0;
+	}
+
+	q7->drops = ni->drops;
+	q7->tot_bytes = ni->tot_bytes;
+	q7->tot_pkts = ni->tot_pkts;
+	*a->start += sizeof(struct dn_flow_queue7);
+	return 0;
+}
+
+/*
+ * Export scheduler 's' as an old-style pipe record at the current output
+ * position of 'a', then advance the position by the size of the pipe
+ * structure for the client version ('is7'). 'nq' is reported as the
+ * number of queues of the pipe. Always returns 0.
+ */
+int
+dn_c_copy_pipe(struct dn_schk *s, struct copy_args *a, int nq)
+{
+	struct dn_link *l = &s->link;
+	struct dn_fsk *f = s->fs;
+
+	struct dn_pipe7 *pipe7 = (struct dn_pipe7 *)*a->start;
+	struct dn_pipe8 *pipe8 = (struct dn_pipe8 *)*a->start;
+	struct dn_flow_set *fs;
+	int size = 0;
+
+	/* the embedded flowset is located differently in the two layouts */
+	if (is7) {
+		fs = &pipe7->fs;
+		size = sizeof(struct dn_pipe7);
+	} else {
+		fs = &pipe8->fs;
+		size = sizeof(struct dn_pipe8);
+	}
+
+	/* These 4 field are the same in pipe7 and pipe8 */
+	pipe7->next.sle_next = (struct dn_pipe7 *)DN_IS_PIPE;
+	pipe7->bandwidth = l->bandwidth;
+	pipe7->delay = l->delay;
+	pipe7->pipe_nr = l->link_nr - DN_MAX_ID;
+
+	if (!is7) {
+		if (s->profile) {
+			struct dn_profile *pf = s->profile;
+			/*
+			 * Bound the copy by the destination (not the
+			 * source) and make sure the exported name is
+			 * always NUL terminated.
+			 */
+			strncpy(pipe8->name, pf->name,
+			    sizeof(pipe8->name) - 1);
+			pipe8->name[sizeof(pipe8->name) - 1] = '\0';
+			pipe8->loss_level = pf->loss_level;
+			pipe8->samples_no = pf->samples_no;
+		}
+		pipe8->burst = div64(l->burst , 8 * hz);
+	}
+
+	/* mask and buckets are kept in the scheduler, see
+	 * dn_compat_config_pipe() */
+	fs->flow_mask = s->sch.sched_mask;
+	fs->rq_size = s->sch.buckets ? s->sch.buckets : 1;
+
+	fs->parent_nr = l->link_nr - DN_MAX_ID;
+	fs->qsize = f->fs.qsize;
+	fs->plr = f->fs.plr;
+	fs->w_q = f->fs.w_q;
+	fs->max_th = f->max_th;
+	fs->min_th = f->min_th;
+	fs->max_p = f->fs.max_p;
+	fs->rq_elements = nq;
+
+	fs->flags_fs = convertflags2old(f->fs.flags);
+
+	*a->start += size;
+	return 0;
+}
+
+
+/*
+ * Export the scheduler '_o' (a struct dn_schk) as an old-style pipe
+ * followed by its queue records.
+ *
+ * The space check uses the FreeBSD 8 structure sizes. Returns 1 without
+ * writing anything when the output buffer is too small, 0 otherwise.
+ */
+int
+dn_compat_copy_pipe(struct copy_args *a, void *_o)
+{
+	struct dn_schk *s = (struct dn_schk *)_o;
+	int pipe_size = sizeof(struct dn_pipe8);
+	int queue_size = sizeof(struct dn_flow_queue8);
+	int masked = (s->sch.flags & DN_HAVE_MASK) != 0;
+	int have = a->end - *a->start;
+	int n_queue; /* number of queues */
+	int need;
+
+	/*
+	 * With a mask 'siht' is scanned as a hash table of instances,
+	 * without a mask it is passed as the (at most one) instance itself.
+	 */
+	if (masked)
+		n_queue = dn_ht_entries(s->siht);
+	else
+		n_queue = s->siht ? 1 : 0;
+
+	/* one struct dn_pipe plus one queue record per instance */
+	need = pipe_size + queue_size * n_queue;
+	if (have < need) {
+		D("have %d < need %d", have, need);
+		return 1;
+	}
+
+	/* copy pipe */
+	dn_c_copy_pipe(s, a, n_queue);
+
+	/* copy queues */
+	if (masked)
+		dn_ht_scan(s->siht, dn_c_copy_q, a);
+	else if (s->siht)
+		dn_c_copy_q(s->siht, a);
+	return 0;
+}
+
+/*
+ * Export flowset 'f' as an old-style struct dn_flow_set at the current
+ * output position of 'a' and advance the position past it. 'nq' is
+ * reported as the number of queues of the flowset. Always returns 0.
+ */
+int
+dn_c_copy_fs(struct dn_fsk *f, struct copy_args *a, int nq)
+{
+	struct dn_flow_set *out = (struct dn_flow_set *)*a->start;
+
+	/* the list link carries the record type marker */
+	out->next.sle_next = (struct dn_flow_set *)DN_IS_QUEUE;
+
+	/* identification */
+	out->fs_nr = f->fs.fs_nr;
+	out->parent_nr = f->fs.sched_nr;
+	out->flags_fs = convertflags2old(f->fs.flags);
+
+	/* queue parameters */
+	out->weight = f->fs.par[0];
+	out->qsize = f->fs.qsize;
+	out->plr = f->fs.plr;
+	out->flow_mask = f->fs.flow_mask;
+	out->rq_size = (f->fs.buckets ? f->fs.buckets : 1);
+	out->rq_elements = nq;
+
+	/* RED parameters */
+	out->w_q = f->fs.w_q;
+	out->max_th = f->max_th;
+	out->min_th = f->min_th;
+	out->max_p = f->fs.max_p;
+
+	*a->start += sizeof(struct dn_flow_set);
+	return 0;
+}
+
+/*
+ * Export the flowset '_o' (a struct dn_fsk) as an old-style flowset
+ * followed by its queue records.
+ *
+ * The space check uses the FreeBSD 8 queue size. Returns 1 without
+ * writing anything when the output buffer is too small, 0 otherwise.
+ */
+int
+dn_compat_copy_queue(struct copy_args *a, void *_o)
+{
+	struct dn_fsk *fs = (struct dn_fsk *)_o;
+	int fs_size = sizeof(struct dn_flow_set);
+	int queue_size = sizeof(struct dn_flow_queue8);
+	int masked = (fs->fs.flags & DN_HAVE_MASK) != 0;
+	int have = a->end - *a->start;
+	int n_queue; /* number of queues */
+	int need;
+
+	/*
+	 * With a mask 'qht' is scanned as a hash table of queues, without a
+	 * mask it is passed as the (at most one) queue itself.
+	 */
+	if (masked)
+		n_queue = dn_ht_entries(fs->qht);
+	else
+		n_queue = fs->qht ? 1 : 0;
+
+	need = fs_size + queue_size * n_queue;
+	if (have < need) {
+		D("have < need");
+		return 1;
+	}
+
+	/* copy flowset */
+	dn_c_copy_fs(fs, a, n_queue);
+
+	/* copy queues */
+	if (masked)
+		dn_ht_scan(fs->qht, dn_c_copy_q, a);
+	else if (fs->qht)
+		dn_c_copy_q(fs->qht, a);
+
+	return 0;
+}
+
+/*
+ * Scan callback used to export objects in the old format.
+ *
+ * Depending on the type requested in the copy_args '_arg', '_o' is taken
+ * as a scheduler (DN_COMPAT_PIPE) or as a flowset (DN_COMPAT_QUEUE).
+ * Schedulers are exported only when their subtype is 1 and their number
+ * is above DN_MAX_ID, flowsets only when their number is below DN_MAX_ID;
+ * everything else is skipped.
+ * Returns DNHT_SCAN_END when the object did not fit in the output buffer,
+ * 0 otherwise.
+ */
+int
+copy_data_helper_compat(void *_o, void *_arg)
+{
+	struct copy_args *a = _arg;
+	int failed = 0;
+
+	if (a->type == DN_COMPAT_PIPE) {
+		struct dn_schk *s = _o;
+
+		/* copy pipe parameters, and if instance exists, copy
+		 * other parameters and eventually queues.
+		 */
+		if (s->sch.oid.subtype == 1 && s->sch.sched_nr > DN_MAX_ID)
+			failed = dn_compat_copy_pipe(a, _o);
+		/* else: not old type */
+	} else if (a->type == DN_COMPAT_QUEUE) {
+		struct dn_fsk *fs = _o;
+
+		if (fs->fs.fs_nr < DN_MAX_ID)
+			failed = dn_compat_copy_queue(a, _o);
+	}
+
+	return failed ? DNHT_SCAN_END : 0;
+}
+
+/*
+ * Main function to manage old requests.
+ *
+ * The length of the socket option data is used to guess the version of
+ * the ipfw binary: a length equal to the 7.2 pipe size sets 'is7', a
+ * length equal to one of the FreeBSD 8 sizes clears it, any other length
+ * leaves the previous guess in place.
+ *
+ * IP_DUMMYNET_DEL and IP_DUMMYNET_CONFIGURE copy the old structure in and
+ * hand it to dn_compat_del() / dn_compat_configure(); requests shorter
+ * than the structure those functions are going to read are rejected with
+ * EINVAL. IP_DUMMYNET_FLUSH issues a DN_CMD_FLUSH, IP_DUMMYNET_GET copies
+ * out the data produced by dummynet_get().
+ */
+int
+ip_dummynet_compat(struct sockopt *sopt)
+{
+	int error=0;
+	void *v = NULL;
+	struct dn_id oid;
+
+	/* Length of data, used to find the ipfw version... */
+	int len = sopt->sopt_valsize;
+
+	/* len can be 0 if command was dummynet_flush */
+	if (len == pipesize7) {
+		D("setting compatibility with FreeBSD 7.2");
+		is7 = 1;
+	}
+	else if (len == pipesize8 || len == pipesizemax8) {
+		D("setting compatibility with FreeBSD 8");
+		is7 = 0;
+	}
+
+	switch (sopt->sopt_name) {
+	default:
+		printf("dummynet: -- unknown option %d", sopt->sopt_name);
+		error = EINVAL;
+		break;
+
+	case IP_DUMMYNET_FLUSH:
+		oid_fill(&oid, sizeof(oid), DN_CMD_FLUSH, DN_API_VERSION);
+		do_config(&oid, oid.len);
+		break;
+
+	case IP_DUMMYNET_DEL:
+	case IP_DUMMYNET_CONFIGURE:
+		/*
+		 * The handlers read 'v' as a complete old pipe structure,
+		 * so a shorter request would make them read past the end
+		 * of the buffer allocated below.
+		 */
+		if (len < 0 || (size_t)len < (is7 ? pipesize7 : pipesize8)) {
+			error = EINVAL;
+			break;
+		}
+		v = malloc(len, M_TEMP, M_WAITOK);
+		error = sooptcopyin(sopt, v, len, len);
+		/*
+		 * dn_compat_configure() takes the delay profile samples
+		 * from the array that follows the pipe in struct
+		 * dn_pipe_max8: refuse a profile without that array.
+		 */
+		if (error == 0 && sopt->sopt_name == IP_DUMMYNET_CONFIGURE &&
+		    !is7 && ((struct dn_pipe8 *)v)->samples_no > 0 &&
+		    (size_t)len < pipesizemax8)
+			error = EINVAL;
+		if (error == 0) {
+			if (sopt->sopt_name == IP_DUMMYNET_DEL)
+				error = dn_compat_del(v);
+			else
+				error = dn_compat_configure(v);
+		}
+		/* free on every path, with the type used for the malloc */
+		free(v, M_TEMP);
+		break;
+
+	case IP_DUMMYNET_GET: {
+		void *buf;
+		int ret;
+		int original_size = sopt->sopt_valsize;
+		int size;
+
+		ret = dummynet_get(sopt, &buf);
+		if (ret)
+			return 0;//XXX ? the failure is not reported
+		/* dummynet_get() leaves the data size in sopt_valsize */
+		size = sopt->sopt_valsize;
+		sopt->sopt_valsize = original_size;
+		D("size=%d, buf=%p", size, buf);
+		ret = sooptcopyout(sopt, buf, size);
+		if (ret)
+			printf(" %s ERROR sooptcopyout\n", __FUNCTION__);
+		if (buf)
+			free(buf, M_DUMMYNET);
+	    }
+	}
+
+	return error;
+}
+
+
diff --git a/sys/netinet/ipfw/ip_dn_io.c b/sys/netinet/ipfw/ip_dn_io.c
new file mode 100644
index 0000000..70f14ca
--- /dev/null
+++ b/sys/netinet/ipfw/ip_dn_io.c
@@ -0,0 +1,788 @@
+/*-
+ * Copyright (c) 2010 Luigi Rizzo, Riccardo Panicucci, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Dummynet portions related to packet handling.
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <sys/sysctl.h>
+#include <net/if.h> /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */
+#include <net/netisr.h>
+#include <netinet/in.h>
+#include <netinet/ip.h> /* ip_len, ip_off */
+#include <netinet/ip_var.h> /* ip_output(), IP_FORWARDING */
+#include <netinet/ip_fw.h>
+#include <netinet/ipfw/ip_fw_private.h>
+#include <netinet/ipfw/dn_heap.h>
+#include <netinet/ip_dummynet.h>
+#include <netinet/ipfw/ip_dn_private.h>
+#include <netinet/ipfw/dn_sched.h>
+
+#include <netinet/if_ether.h> /* various ether_* routines */
+
+#include <netinet/ip6.h> /* for ip6_input, ip6_output prototypes */
+#include <netinet6/ip6_var.h>
+
+/*
+ * We keep a private variable for the simulation time, but we could
+ * probably use an existing one ("softticks" in sys/kern/kern_timeout.c)
+ * instead of dn_cfg.curr_time
+ */
+
+/* Global dummynet configuration and state, shared by all dummynet files. */
+struct dn_parms dn_cfg;
+
+/*
+ * Timekeeping statistics, updated by dummynet_task() at every run and
+ * exported read-only through sysctl.
+ */
+static long tick_last; /* Last tick duration (usec). */
+static long tick_delta; /* Last vs standard tick diff (usec). */
+static long tick_delta_sum; /* Accumulated tick difference (usec).*/
+static long tick_adjustment; /* Tick adjustments done. */
+static long tick_lost; /* Lost(coalesced) ticks number. */
+/* Adjusted vs non-adjusted curr_time difference (ticks). */
+static long tick_diff;
+
+/* Packet counters, exported read-only through sysctl. */
+static unsigned long io_pkt; /* packets passed to dummynet_io() */
+static unsigned long io_pkt_fast; /* packets handed back on the fast path */
+static unsigned long io_pkt_drop; /* packets dropped */
+
+/*
+ * We use a heap to store entities for which we have pending timer events.
+ * The heap is checked at every tick and all entities with expired events
+ * are extracted.
+ */
+
+MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap");
+
+/*
+ * Hook used by dummynet_send() to reinject DIR_FWD | PROTO_IFB packets;
+ * it is checked against NULL before use (if_bridge not loaded).
+ */
+extern void (*bridge_dn_p)(struct mbuf *, struct ifnet *);
+
+/*
+ * sysctl tree net.inet.ip.dummynet.
+ * Tunables are CTLFLAG_RW; the RED defaults, the time-adjustment
+ * values and the statistics are exported CTLFLAG_RD.
+ * NOTE(review): SYSBEGIN()/SYSEND are defined elsewhere, presumably
+ * as portability wrappers around the declarations -- confirm.
+ */
+#ifdef SYSCTL_NODE
+
+SYSBEGIN(f4)
+
+SYSCTL_DECL(_net_inet);
+SYSCTL_DECL(_net_inet_ip);
+SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet");
+
+/* parameters */
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size,
+ CTLFLAG_RW, &dn_cfg.hash_size, 0, "Default hash table size");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_slot_limit,
+ CTLFLAG_RW, &dn_cfg.slot_limit, 0,
+ "Upper limit in slots for pipe queue.");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_byte_limit,
+ CTLFLAG_RW, &dn_cfg.byte_limit, 0,
+ "Upper limit in bytes for pipe queue.");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, io_fast,
+ CTLFLAG_RW, &dn_cfg.io_fast, 0, "Enable fast dummynet io.");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug,
+ CTLFLAG_RW, &dn_cfg.debug, 0, "Dummynet debug level");
+/* expire == 0 disables the periodic drain done by dummynet_task() */
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire,
+ CTLFLAG_RW, &dn_cfg.expire, 0, "Expire empty queues/pipes");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire_cycle,
+ CTLFLAG_RD, &dn_cfg.expire_cycle, 0, "Expire cycle for queues/pipes");
+
+/* RED parameters */
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth,
+ CTLFLAG_RD, &dn_cfg.red_lookup_depth, 0, "Depth of RED lookup table");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size,
+ CTLFLAG_RD, &dn_cfg.red_avg_pkt_size, 0, "RED Medium packet size");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size,
+ CTLFLAG_RD, &dn_cfg.red_max_pkt_size, 0, "RED Max packet size");
+
+/* time adjustment */
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta,
+ CTLFLAG_RD, &tick_delta, 0, "Last vs standard tick difference (usec).");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta_sum,
+ CTLFLAG_RD, &tick_delta_sum, 0, "Accumulated tick difference (usec).");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_adjustment,
+ CTLFLAG_RD, &tick_adjustment, 0, "Tick adjustments done.");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_diff,
+ CTLFLAG_RD, &tick_diff, 0,
+ "Adjusted vs non-adjusted curr_time difference (ticks).");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_lost,
+ CTLFLAG_RD, &tick_lost, 0,
+ "Number of ticks coalesced by dummynet taskqueue.");
+
+/* statistics */
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, schk_count,
+ CTLFLAG_RD, &dn_cfg.schk_count, 0, "Number of schedulers");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, si_count,
+ CTLFLAG_RD, &dn_cfg.si_count, 0, "Number of scheduler instances");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, fsk_count,
+ CTLFLAG_RD, &dn_cfg.fsk_count, 0, "Number of flowsets");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, queue_count,
+ CTLFLAG_RD, &dn_cfg.queue_count, 0, "Number of queues");
+SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt,
+ CTLFLAG_RD, &io_pkt, 0,
+ "Number of packets passed to dummynet.");
+SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_fast,
+ CTLFLAG_RD, &io_pkt_fast, 0,
+ "Number of packets bypassed dummynet scheduler.");
+SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_drop,
+ CTLFLAG_RD, &io_pkt_drop, 0,
+ "Number of packets dropped by dummynet.");
+
+SYSEND
+
+#endif
+
+/* Forward declaration: used by dummynet_task() and dummynet_io() below. */
+static void dummynet_send(struct mbuf *);
+
+/*
+ * Packets processed by dummynet have an mbuf tag associated with
+ * them that carries their dummynet state.
+ * Outside dummynet, only the 'rule' field is relevant, and it must
+ * be at the beginning of the structure.
+ * The tag is created zeroed by tag_mbuf() and is turned into a rule
+ * tag (MTAG_IPFW_RULE) by dummynet_send() when the packet leaves.
+ */
+struct dn_pkt_tag {
+ struct ipfw_rule_ref rule; /* matching rule */
+
+ /* second part, dummynet specific */
+ int dn_dir; /* action when packet comes out.*/
+ /* see ip_fw_private.h */
+ uint64_t output_time; /* when the pkt is due for delivery*/
+ /* (same clock as dn_cfg.curr_time) */
+ struct ifnet *ifp; /* interface, for ip_output */
+ struct _ip6dn_args ip6opt; /* XXX ipv6 options */
+};
+
+/*
+ * Locate the dummynet state attached to packet m.
+ * The dummynet tag is expected to be the first tag on the mbuf; this
+ * is only verified when KASSERT is active.  The state is stored right
+ * after the m_tag header.
+ */
+static struct dn_pkt_tag *
+dn_tag_get(struct mbuf *m)
+{
+	struct m_tag *first;
+
+	first = m_tag_first(m);
+	KASSERT(first != NULL &&
+	    first->m_tag_cookie == MTAG_ABI_COMPAT &&
+	    first->m_tag_id == PACKET_TAG_DUMMYNET,
+	    ("packet on dummynet queue w/o dummynet tag!"));
+	return (struct dn_pkt_tag *)(first + 1);
+}
+
+/*
+ * Append packet m at the end of queue q.
+ * An empty queue is recognized by head == NULL; tail is not looked at
+ * in that case, so it does not need to be initialized.
+ */
+static inline void
+mq_append(struct mq *q, struct mbuf *m)
+{
+	/* slot that must point to the new packet */
+	struct mbuf **link;
+
+	link = (q->head == NULL) ? &q->head : &q->tail->m_nextpkt;
+	*link = m;
+	q->tail = m;
+	m->m_nextpkt = NULL;
+}
+
+/*
+ * Free a whole list of packets linked through m_nextpkt.
+ * Kept as a function so that any extra per-packet work has a single
+ * place to live.
+ */
+void dn_free_pkts(struct mbuf *mnext)
+{
+	struct mbuf *cur;
+
+	for (cur = mnext; cur != NULL; cur = mnext) {
+		/* fetch the successor before the packet goes away */
+		mnext = cur->m_nextpkt;
+		FREE_PKT(cur);
+	}
+}
+
+/*
+ * RED / Gentle-RED drop decision for a packet of 'len' bytes arriving
+ * on queue q.  Updates the RED state kept in the queue (avg, count,
+ * random) using the scaled parameters stored in the parent flowset.
+ * 'len' is only used when the queue size is measured in bytes.
+ * Returns 1 if the packet must be dropped, 0 if it can be accepted.
+ */
+static int
+red_drops (struct dn_queue *q, int len)
+{
+ /*
+ * RED algorithm
+ *
+ * RED calculates the average queue size (avg) using a low-pass filter
+ * with an exponential weighted (w_q) moving average:
+ * avg <- (1-w_q) * avg + w_q * q_size
+ * where q_size is the queue length (measured in bytes or packets).
+ *
+ * If q_size == 0, we compute the idle time for the link, and set
+ * avg = (1 - w_q)^(idle/s)
+ * where s is the time needed for transmitting a medium-sized packet.
+ *
+ * Now, if avg < min_th the packet is enqueued.
+ * If avg > max_th the packet is dropped. Otherwise, the packet is
+ * dropped with probability P function of avg.
+ */
+
+ struct dn_fsk *fs = q->fs;
+ int64_t p_b = 0;
+
+ /* Queue in bytes or packets? */
+ uint32_t q_size = (fs->fs.flags & DN_QSIZE_BYTES) ?
+ q->ni.len_bytes : q->ni.length;
+
+ /* Average queue size estimation. */
+ if (q_size != 0) {
+ /* Queue is not empty, avg <- avg + (q_size - avg) * w_q */
+ int diff = SCALE(q_size) - q->avg;
+ int64_t v = SCALE_MUL((int64_t)diff, (int64_t)fs->w_q);
+
+ q->avg += (int)v;
+ } else {
+ /*
+ * Queue is empty, find for how long the queue has been
+ * empty and use a lookup table for computing
+ * (1 - w_q)^(idle_time/s) where s is the time to send a
+ * (small) packet.
+ * XXX check wraps...
+ */
+ if (q->avg) {
+ u_int t = div64((dn_cfg.curr_time - q->q_time), fs->lookup_step);
+
+ /* idle for longer than the table covers: average decays to 0 */
+ q->avg = (t < fs->lookup_depth) ?
+ SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0;
+ }
+ }
+
+ /* Should i drop? */
+ if (q->avg < fs->min_th) {
+ /* restart the count; the next packet above min_th draws a new random */
+ q->count = -1;
+ return (0); /* accept packet */
+ }
+ if (q->avg >= fs->max_th) { /* average queue >= max threshold */
+ if (fs->fs.flags & DN_IS_GENTLE_RED) {
+ /*
+ * According to Gentle-RED, if avg is greater than
+ * max_th the packet is dropped with a probability
+ * p_b = c_3 * avg - c_4
+ * where c_3 = (1 - max_p) / max_th
+ * c_4 = 1 - 2 * max_p
+ */
+ p_b = SCALE_MUL((int64_t)fs->c_3, (int64_t)q->avg) -
+ fs->c_4;
+ } else {
+ q->count = -1;
+ return (1);
+ }
+ } else if (q->avg > fs->min_th) {
+ /*
+ * We compute p_b using the linear dropping function
+ * p_b = c_1 * avg - c_2
+ * where c_1 = max_p / (max_th - min_th)
+ * c_2 = max_p * min_th / (max_th - min_th)
+ */
+ p_b = SCALE_MUL((int64_t)fs->c_1, (int64_t)q->avg) - fs->c_2;
+ }
+ /* when avg == min_th exactly, p_b stays 0 */
+
+ /* in byte mode, weight the probability by the packet size */
+ if (fs->fs.flags & DN_QSIZE_BYTES)
+ p_b = div64((p_b * len) , fs->max_pkt_size);
+ /* count was -1: first packet above min_th, just draw a random value */
+ if (++q->count == 0)
+ q->random = random() & 0xffff;
+ else {
+ /*
+ * q->count counts packets arrived since last drop, so a greater
+ * value of q->count means a greater packet drop probability.
+ */
+ if (SCALE_MUL(p_b, SCALE((int64_t)q->count)) > q->random) {
+ q->count = 0;
+ /* After a drop we calculate a new random value. */
+ q->random = random() & 0xffff;
+ return (1); /* drop */
+ }
+ }
+ /* End of RED algorithm. */
+
+ return (0); /* accept */
+
+}
+
+/*
+ * Try to append packet m to queue q.
+ * The drop policy comes from the parent flowset (q->fs): forced drop
+ * requested by the caller ('drop' != 0), random loss (plr), RED, and
+ * finally the queue size limit, in bytes or in slots.
+ * Arrival counters of the queue and of its scheduler instance are
+ * updated before the drop decision; length counters only when the
+ * packet is actually queued.
+ * Return 0 on success, 1 on drop. The packet is consumed in both cases.
+ */
+int
+dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop)
+{
+	struct dn_fs *fsp;
+	struct dn_flow *si_ni;	/* stats for scheduler instance */
+	uint64_t pktlen;
+	int full;
+
+	if (q->fs == NULL || q->_si == NULL) {
+		printf("%s fs %p si %p, dropping\n",
+		    __FUNCTION__, q->fs, q->_si);
+		FREE_PKT(m);
+		return 1;
+	}
+	fsp = &q->fs->fs;
+	si_ni = &q->_si->ni;
+	pktlen = m->m_pkthdr.len;
+
+	/* Account for the arrival first, dropped packets included. */
+	q->ni.tot_bytes += pktlen;
+	q->ni.tot_pkts++;
+	si_ni->tot_bytes += pktlen;
+	si_ni->tot_pkts++;
+
+	/* Reasons to drop, in order. */
+	if (drop)
+		goto drop_pkt;
+	if (fsp->plr && random() < fsp->plr)
+		goto drop_pkt;
+	if ((fsp->flags & DN_IS_RED) && red_drops(q, m->m_pkthdr.len))
+		goto drop_pkt;
+	/* Size limit: strict '>' on bytes, '>=' on slots. */
+	if (fsp->flags & DN_QSIZE_BYTES)
+		full = (q->ni.len_bytes > fsp->qsize);
+	else
+		full = (q->ni.length >= fsp->qsize);
+	if (full)
+		goto drop_pkt;
+
+	mq_append(&q->mq, m);
+	q->ni.length++;
+	q->ni.len_bytes += pktlen;
+	si_ni->length++;
+	si_ni->len_bytes += pktlen;
+	return 0;
+
+drop_pkt:
+	io_pkt_drop++;
+	q->ni.drops++;
+	si_ni->drops++;
+	FREE_PKT(m);
+	return 1;
+}
+
+/*
+ * Move from the delay line to q all the packets whose output_time is
+ * not later than 'now'.  If a packet that is not yet due is found at
+ * the head, the delay line is put in the event heap, keyed by that
+ * packet's output_time; oid.subtype records whether the delay line is
+ * in the heap (1) or not (0).
+ * Runs under scheduler lock.
+ */
+static void
+transmit_event(struct mq *q, struct delay_line *dline, uint64_t now)
+{
+	struct mbuf *head;
+	struct dn_pkt_tag *tag = NULL;
+
+	dline->oid.subtype = 0;	/* not in heap */
+	for (;;) {
+		head = dline->mq.head;
+		if (head == NULL)
+			return;	/* nothing left, stay out of the heap */
+		tag = dn_tag_get(head);
+		if (!DN_KEY_LEQ(tag->output_time, now))
+			break;	/* first packet not due yet */
+		dline->mq.head = head->m_nextpkt;
+		mq_append(q, head);
+	}
+	/* leftover packets: come back when the first one is due */
+	dline->oid.subtype = 1;	/* in heap */
+	heap_insert(&dn_cfg.evheap, tag->output_time, dline);
+}
+
+/*
+ * Convert the additional MAC overheads/delays of the scheduler's
+ * delay profile into an equivalent number of bits for the link
+ * bandwidth.  One sample is picked at random; the samples are in
+ * milliseconds, hence the division by 1000.
+ * A sample whose index is at or above loss_level also marks the
+ * packet to be dropped on output (dn_dir = DIR_DROP).
+ * Returns 0 when there is no profile or the profile has no samples.
+ */
+static uint64_t
+extra_bits(struct mbuf *m, struct dn_schk *s)
+{
+	struct dn_profile *prof = s->profile;
+	struct dn_pkt_tag *dt;
+	uint64_t nbits;
+	int idx;
+
+	if (prof == NULL || prof->samples_no == 0)
+		return 0;
+	idx = random() % prof->samples_no;
+	nbits = div64((uint64_t)prof->samples[idx] * s->link.bandwidth, 1000);
+	if (idx < prof->loss_level)
+		return nbits;
+
+	/* sample in the loss region: the packet is still charged */
+	dt = dn_tag_get(m);
+	if (dt)
+		dt->dn_dir = DIR_DROP;
+	return nbits;
+}
+
+/*
+ * Send traffic from a scheduler instance due by 'now'.
+ * Packets are dequeued from the scheduler as long as the instance has
+ * non-negative credit, and moved to the instance's delay line.
+ * If the delay line was empty on entry and something was dequeued,
+ * packets already due are moved to q by transmit_event().
+ * q may be NULL, in which case a local queue is used.
+ * Return a pointer to the head of the queue.
+ */
+static struct mbuf *
+serve_sched(struct mq *q, struct dn_sch_inst *si, uint64_t now)
+{
+ struct mq def_q;
+ struct dn_schk *s = si->sched;
+ struct mbuf *m = NULL;
+ int delay_line_idle = (si->dline.mq.head == NULL);
+ int done, bw;
+
+ if (q == NULL) {
+ /* only head is set: mq_append() ignores tail on an empty queue */
+ q = &def_q;
+ q->head = NULL;
+ }
+
+ bw = s->link.bandwidth;
+ si->kflags &= ~DN_ACTIVE;
+
+ /* credit grows with elapsed time; bw == 0 means no rate limit */
+ if (bw > 0)
+ si->credit += (now - si->sched_time) * bw;
+ else
+ si->credit = 0;
+ si->sched_time = now;
+ done = 0;
+ while (si->credit >= 0 && (m = s->fp->dequeue(si)) != NULL) {
+ uint64_t len_scaled;
+ done++;
+ /*
+ * Cost of the packet in credit units: bits scaled by hz.
+ * NOTE(review): presumably bandwidth is in bit/s and time in
+ * ticks, which makes credit bits * hz -- confirm.
+ */
+ len_scaled = (bw == 0) ? 0 : hz *
+ (m->m_pkthdr.len * 8 + extra_bits(m, s));
+ si->credit -= len_scaled;
+ /* Move packet in the delay line */
+ dn_tag_get(m)->output_time += s->link.delay ;
+ mq_append(&si->dline.mq, m);
+ }
+ /*
+ * If credit >= 0 the instance is idle, mark time.
+ * Otherwise put back in the heap, and adjust the output
+ * time of the last inserted packet, m, which was too early.
+ */
+ if (si->credit >= 0) {
+ si->idle_time = now;
+ } else {
+ uint64_t t;
+ KASSERT (bw > 0, ("bw=0 and credit<0 ?"));
+ /* ticks needed to bring credit back to >= 0, rounded up */
+ t = div64(bw - 1 - si->credit, bw);
+ if (m)
+ dn_tag_get(m)->output_time += t;
+ si->kflags |= DN_ACTIVE;
+ heap_insert(&dn_cfg.evheap, now + t, si);
+ }
+ /* a non-idle delay line is left alone here */
+ if (delay_line_idle && done)
+ transmit_event(q, &si->dline, now);
+ return q->head;
+}
+
+/*
+ * The timer handler for dummynet. Time is computed in ticks,
+ * but the code is tolerant to the actual rate at which this is called.
+ * Once complete, the function reschedules itself for the next tick.
+ *
+ * 'context' is unused; 'pending' is used to count coalesced ticks
+ * (pending - 1 is added to tick_lost).
+ * Packets that become due are collected under the lock and sent with
+ * dummynet_send() after the lock is released.
+ */
+void
+dummynet_task(void *context, int pending)
+{
+ struct timeval t;
+ struct mq q = { NULL, NULL }; /* queue to accumulate results */
+
+ DN_BH_WLOCK();
+
+ /* Update number of lost(coalesced) ticks. */
+ tick_lost += pending - 1;
+
+ getmicrouptime(&t);
+ /* Last tick duration (usec). */
+ tick_last = (t.tv_sec - dn_cfg.prev_t.tv_sec) * 1000000 +
+ (t.tv_usec - dn_cfg.prev_t.tv_usec);
+ /* Last tick vs standard tick difference (usec). */
+ tick_delta = (tick_last * hz - 1000000) / hz;
+ /* Accumulated tick difference (usec). */
+ tick_delta_sum += tick_delta;
+
+ dn_cfg.prev_t = t;
+
+ /*
+ * Adjust curr_time if the accumulated tick difference is
+ * greater than the 'standard' tick. Since curr_time should
+ * be monotonically increasing, we do positive adjustments
+ * as required, and throttle curr_time in case of negative
+ * adjustment.
+ */
+ dn_cfg.curr_time++;
+ if (tick_delta_sum - tick >= 0) {
+ /* running late: skip ahead by the whole ticks accumulated */
+ int diff = tick_delta_sum / tick;
+
+ dn_cfg.curr_time += diff;
+ tick_diff += diff;
+ tick_delta_sum %= tick;
+ tick_adjustment++;
+ } else if (tick_delta_sum + tick <= 0) {
+ /* running early: undo the increment above, time stands still */
+ dn_cfg.curr_time--;
+ tick_diff--;
+ tick_delta_sum += tick;
+ tick_adjustment++;
+ }
+
+ /* serve pending events, accumulate in q */
+ for (;;) {
+ struct dn_id *p; /* generic parameter to handler */
+
+ /* stop when the heap is empty or its top is still in the future */
+ if (dn_cfg.evheap.elements == 0 ||
+ DN_KEY_LT(dn_cfg.curr_time, HEAP_TOP(&dn_cfg.evheap)->key))
+ break;
+ p = HEAP_TOP(&dn_cfg.evheap)->object;
+ heap_extract(&dn_cfg.evheap, NULL);
+
+ /* the object starts with a dn_id whose type tells what it is */
+ if (p->type == DN_SCH_I) {
+ serve_sched(&q, (struct dn_sch_inst *)p, dn_cfg.curr_time);
+ } else { /* extracted a delay line */
+ transmit_event(&q, (struct delay_line *)p, dn_cfg.curr_time);
+ }
+ }
+ /* every 'expire' runs, drain idle objects (disabled if expire == 0) */
+ if (dn_cfg.expire && ++dn_cfg.expire_cycle >= dn_cfg.expire) {
+ dn_cfg.expire_cycle = 0;
+ dn_drain_scheduler();
+ dn_drain_queue();
+ }
+
+ DN_BH_WUNLOCK();
+ dn_reschedule();
+ if (q.head != NULL)
+ dummynet_send(q.head);
+}
+
+/*
+ * Forward a chain of packets (linked through m_nextpkt) to the proper
+ * destination.  For each packet the direction and interface are read
+ * from the dummynet tag, the tag is renamed to carry reinject info
+ * (MTAG_IPFW_RULE), and the packet is handed to the matching input or
+ * output routine, or freed.  A packet without tags is dropped.
+ * This runs outside the dummynet lock.
+ */
+static void
+dummynet_send(struct mbuf *m)
+{
+	struct mbuf *n;
+
+	for (; m != NULL; m = n) {
+		struct ifnet *ifp = NULL;	/* gcc 3.4.6 complains */
+		struct m_tag *tag;
+		int dst;
+
+		/* detach the packet from the chain before passing it on */
+		n = m->m_nextpkt;
+		m->m_nextpkt = NULL;
+		tag = m_tag_first(m);
+		if (tag == NULL) { /* should not happen */
+			dst = DIR_DROP;
+		} else {
+			struct dn_pkt_tag *pkt = dn_tag_get(m);
+			/* extract the dummynet info, rename the tag
+			 * to carry reinject info.
+			 */
+			dst = pkt->dn_dir;
+			ifp = pkt->ifp;
+			tag->m_tag_cookie = MTAG_IPFW_RULE;
+			tag->m_tag_id = 0;
+		}
+
+		switch (dst) {
+		case DIR_OUT:
+			SET_HOST_IPLEN(mtod(m, struct ip *));
+			ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL);
+			break;
+
+		case DIR_IN:
+			/* put header in network format for ip_input() */
+			//SET_NET_IPLEN(mtod(m, struct ip *));
+			netisr_dispatch(NETISR_IP, m);
+			break;
+
+#ifdef INET6
+		case DIR_IN | PROTO_IPV6:
+			netisr_dispatch(NETISR_IPV6, m);
+			break;
+
+		case DIR_OUT | PROTO_IPV6:
+			/*
+			 * No SET_HOST_IPLEN() here: the buffer holds an
+			 * IPv6 header, which has no ip_len/ip_off fields
+			 * to convert; treating it as a struct ip would
+			 * touch unrelated bytes wherever the macro is not
+			 * a no-op.
+			 */
+			ip6_output(m, NULL, NULL, IPV6_FORWARDING, NULL, NULL, NULL);
+			break;
+#endif
+
+		case DIR_FWD | PROTO_IFB: /* DN_TO_IFB_FWD: */
+			if (bridge_dn_p != NULL)
+				((*bridge_dn_p)(m, ifp));
+			else
+				printf("dummynet: if_bridge not loaded\n");
+
+			break;
+
+		case DIR_IN | PROTO_LAYER2: /* DN_TO_ETH_DEMUX: */
+			/*
+			 * The Ethernet code assumes the Ethernet header is
+			 * contiguous in the first mbuf header.
+			 * Ensure this is true.
+			 */
+			if (m->m_len < ETHER_HDR_LEN &&
+			    (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) {
+				printf("dummynet/ether: pullup failed, "
+				    "dropping packet\n");
+				break;
+			}
+			ether_demux(m->m_pkthdr.rcvif, m);
+			break;
+
+		case DIR_OUT | PROTO_LAYER2: /* DN_TO_ETH_OUT: */
+			ether_output_frame(ifp, m);
+			break;
+
+		case DIR_DROP:
+			/* drop the packet after some time */
+			FREE_PKT(m);
+			break;
+
+		default:
+			printf("dummynet: bad switch %d!\n", dst);
+			FREE_PKT(m);
+			break;
+		}
+	}
+}
+
+/*
+ * Attach a zeroed dummynet tag to packet m and fill it with the
+ * matching rule (only the IPFW_ONEPASS bit of rule.info is kept),
+ * the direction, the output interface and the current time.
+ * Returns 0 on success, 1 if the tag cannot be allocated.
+ */
+static inline int
+tag_mbuf(struct mbuf *m, int dir, struct ip_fw_args *fwa)
+{
+	struct m_tag *t;
+	struct dn_pkt_tag *info;
+
+	t = m_tag_get(PACKET_TAG_DUMMYNET, sizeof(*info), M_NOWAIT | M_ZERO);
+	if (t == NULL)
+		return 1;		/* Cannot allocate packet header. */
+	m_tag_prepend(m, t);		/* Attach to mbuf chain. */
+
+	/* the dummynet state follows the m_tag header */
+	info = (struct dn_pkt_tag *)(t + 1);
+	info->rule = fwa->rule;
+	info->rule.info &= IPFW_ONEPASS;	/* only keep this info */
+	info->dn_dir = dir;
+	info->ifp = fwa->oif;
+	/* info->output_time is updated as we move through */
+	info->output_time = dn_cfg.curr_time;
+	return 0;
+}
+
+
+/*
+ * dummynet hook for packets.
+ * We use the argument to locate the flowset fs and the sched_set sch
+ * associated to it. Then we apply flow_mask and sched_mask to
+ * determine the queue and scheduler instances.
+ *
+ * dir where shall we send the packet after dummynet.
+ * *m0 the mbuf with the packet
+ * fwa ipfw arguments: rule.info selects the pipe/queue, f_id is the
+ * flow id, oif is the interface saved in the tag.
+ *
+ * On return *m0 is NULL if the packet was queued, sent or dropped here;
+ * it is left untouched only on the fast path, when the packet just
+ * enqueued comes straight out of the scheduler and is not layer 2.
+ * Returns 0 in all cases except a drop, which returns ENOBUFS unless
+ * the flowset has DN_NOERROR set.
+ */
+int
+dummynet_io(struct mbuf **m0, int dir, struct ip_fw_args *fwa)
+{
+ struct mbuf *m = *m0;
+ struct dn_fsk *fs = NULL;
+ struct dn_sch_inst *si;
+ struct dn_queue *q = NULL; /* default */
+
+ /* pipes are looked up in fshash at an offset of 2*DN_MAX_ID */
+ int fs_id = (fwa->rule.info & IPFW_INFO_MASK) +
+ ((fwa->rule.info & IPFW_IS_PIPE) ? 2*DN_MAX_ID : 0);
+ DN_BH_WLOCK();
+ io_pkt++;
+ /* we could actually tag outside the lock, but who cares... */
+ if (tag_mbuf(m, dir, fwa))
+ goto dropit;
+ if (dn_cfg.busy) {
+ /* if the upper half is busy doing something expensive,
+ * lets queue the packet and move forward
+ */
+ mq_append(&dn_cfg.pending, m);
+ m = *m0 = NULL; /* consumed */
+ goto done; /* already active, nothing to do */
+ }
+ /* XXX locate_flowset could be optimised with a direct ref. */
+ fs = dn_ht_find(dn_cfg.fshash, fs_id, 0, NULL);
+ if (fs == NULL)
+ goto dropit; /* This queue/pipe does not exist! */
+ if (fs->sched == NULL) /* should not happen */
+ goto dropit;
+ /* find scheduler instance, possibly applying sched_mask */
+ si = ipdn_si_find(fs->sched, &(fwa->f_id));
+ if (si == NULL)
+ goto dropit;
+ /*
+ * If the scheduler supports multiple queues, find the right one
+ * (otherwise it will be ignored by enqueue).
+ */
+ if (fs->sched->fp->flags & DN_MULTIQUEUE) {
+ q = ipdn_q_find(fs, si, &(fwa->f_id));
+ if (q == NULL)
+ goto dropit;
+ }
+ if (fs->sched->fp->enqueue(si, q, m)) {
+ printf("%s dropped by enqueue\n", __FUNCTION__);
+ /* packet was dropped by enqueue() */
+ /* enqueue() already consumed the mbuf: do not free it again */
+ m = *m0 = NULL;
+ goto dropit;
+ }
+
+ if (si->kflags & DN_ACTIVE) {
+ m = *m0 = NULL; /* consumed */
+ goto done; /* already active, nothing to do */
+ }
+
+ /* compute the initial allowance */
+ {
+ struct dn_link *p = &fs->sched->link;
+ si->credit = dn_cfg.io_fast ? p->bandwidth : 0;
+ if (p->burst) {
+ /* credit accumulated while idle, capped at the configured burst */
+ uint64_t burst = (dn_cfg.curr_time - si->idle_time) * p->bandwidth;
+ if (burst > p->burst)
+ burst = p->burst;
+ si->credit += burst;
+ }
+ }
+ /* pass through scheduler and delay line */
+ m = serve_sched(NULL, si, dn_cfg.curr_time);
+
+ /* optimization -- pass it back to ipfw for immediate send */
+ /* XXX Don't call dummynet_send() if scheduler return the packet
+ * just enqueued. This avoid a lock order reversal.
+ *
+ */
+ if (/*dn_cfg.io_fast &&*/ m == *m0 && (dir & PROTO_LAYER2) == 0 ) {
+ /* fast io */
+ io_pkt_fast++;
+ if (m->m_nextpkt != NULL) {
+ printf("dummynet: fast io: pkt chain detected!\n");
+ m->m_nextpkt = NULL;
+ }
+ /* the caller keeps *m0 and sends it; nothing for dummynet_send() */
+ m = NULL;
+ } else {
+ *m0 = NULL;
+ }
+done:
+ DN_BH_WUNLOCK();
+ if (m)
+ dummynet_send(m);
+ return 0;
+
+dropit:
+ io_pkt_drop++;
+ DN_BH_WUNLOCK();
+ if (m)
+ FREE_PKT(m);
+ *m0 = NULL;
+ /* fs is NULL if we dropped before the flowset lookup succeeded */
+ return (fs && (fs->fs.flags & DN_NOERROR)) ? 0 : ENOBUFS;
+}
diff --git a/sys/netinet/ipfw/ip_dn_private.h b/sys/netinet/ipfw/ip_dn_private.h
new file mode 100644
index 0000000..270f188
--- /dev/null
+++ b/sys/netinet/ipfw/ip_dn_private.h
@@ -0,0 +1,402 @@
+/*-
+ * Copyright (c) 2010 Luigi Rizzo, Riccardo Panicucci, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * internal dummynet APIs.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IP_DN_PRIVATE_H
+#define _IP_DN_PRIVATE_H
+
+/* debugging support
+ * use ND() to remove debugging, D() to print a line,
+ * DX(level, ...) to print above a certain level
+ * If you redefine D() you are expected to redefine all.
+ *
+ * D() prefixes the message with the name of the calling function and
+ * appends a newline.  DX() prints only when dn_cfg.debug is strictly
+ * greater than the given level.
+ */
+#ifndef D
+#define ND(fmt, ...) do {} while (0)
+#define D1(fmt, ...) do {} while (0)
+#define D(fmt, ...) printf("%-10s " fmt "\n",	\
+	__FUNCTION__, ## __VA_ARGS__)
+/* 'lev' is parenthesized so that any expression can be passed safely */
+#define DX(lev, fmt, ...) do {		\
+	if (dn_cfg.debug > (lev)) D(fmt, ## __VA_ARGS__); } while (0)
+#endif
+
+MALLOC_DECLARE(M_DUMMYNET);
+
+/* how to release a packet; can be overridden before including this file */
+#ifndef FREE_PKT
+#define FREE_PKT(m) m_freem(m)
+#endif
+
+/*
+ * 64-bit division helper.  Both operands are converted to int64_t,
+ * so the division is signed.
+ */
+#ifndef __linux__
+#define div64(a, b) ((int64_t)(a) / (int64_t)(b))
+#endif
+
+/*
+ * Locking macros.
+ * Two mutexes are created and destroyed (uh_mtx and bh_mtx), but see
+ * the note on the DN_BH_* macros below.
+ */
+#define DN_LOCK_INIT() do { \
+ mtx_init(&dn_cfg.uh_mtx, "dn_uh", NULL, MTX_DEF); \
+ mtx_init(&dn_cfg.bh_mtx, "dn_bh", NULL, MTX_DEF); \
+ } while (0)
+#define DN_LOCK_DESTROY() do { \
+ mtx_destroy(&dn_cfg.uh_mtx); \
+ mtx_destroy(&dn_cfg.bh_mtx); \
+ } while (0)
+#if 0 /* not used yet */
+#define DN_UH_RLOCK() mtx_lock(&dn_cfg.uh_mtx)
+#define DN_UH_RUNLOCK() mtx_unlock(&dn_cfg.uh_mtx)
+#define DN_UH_WLOCK() mtx_lock(&dn_cfg.uh_mtx)
+#define DN_UH_WUNLOCK() mtx_unlock(&dn_cfg.uh_mtx)
+#define DN_UH_LOCK_ASSERT() mtx_assert(&dn_cfg.uh_mtx, MA_OWNED)
+#endif
+
+/*
+ * NOTE: the bottom-half macros operate on uh_mtx, not on bh_mtx, and
+ * the read and write variants are identical: as long as the DN_UH_*
+ * macros above are disabled, a single mutex serializes everything.
+ */
+#define DN_BH_RLOCK() mtx_lock(&dn_cfg.uh_mtx)
+#define DN_BH_RUNLOCK() mtx_unlock(&dn_cfg.uh_mtx)
+#define DN_BH_WLOCK() mtx_lock(&dn_cfg.uh_mtx)
+#define DN_BH_WUNLOCK() mtx_unlock(&dn_cfg.uh_mtx)
+#define DN_BH_LOCK_ASSERT() mtx_assert(&dn_cfg.uh_mtx, MA_OWNED)
+
+/* List head types for the kernel objects declared below. */
+SLIST_HEAD(dn_schk_head, dn_schk);
+SLIST_HEAD(dn_sch_inst_head, dn_sch_inst);
+SLIST_HEAD(dn_fsk_head, dn_fsk);
+SLIST_HEAD(dn_queue_head, dn_queue);
+SLIST_HEAD(dn_alg_head, dn_alg);
+
+/*
+ * Packets are chained through m_nextpkt. The queue is empty when
+ * head == NULL; tail is meaningful only when the queue is not empty.
+ */
+struct mq { /* a basic queue of packets*/
+ struct mbuf *head, *tail;
+};
+
+/*
+ * Initialize the header of a dummynet object: store the given type
+ * and length, and clear the subtype.
+ */
+static inline void
+set_oid(struct dn_id *o, int type, int len)
+{
+	o->type = type;
+	o->len = len;
+	o->subtype = 0;
+}
+
+/*
+ * configuration and global data for a dummynet instance
+ *
+ * When a configuration is modified from userland, 'id' is incremented
+ * so we can use the value to check for stale pointers.
+ */
+struct dn_parms {
+ uint32_t id; /* configuration version */
+
+ /* defaults (sysctl-accessible) */
+ int red_lookup_depth;
+ int red_avg_pkt_size;
+ int red_max_pkt_size;
+ int hash_size;
+ int max_hash_size;
+ long byte_limit; /* max queue sizes */
+ long slot_limit;
+
+ int io_fast;
+ int debug; /* debug level, see DX() */
+
+ /* timekeeping */
+ struct timeval prev_t; /* last time dummynet_task ran */
+ struct dn_heap evheap; /* scheduled events */
+
+ /* counters of objects -- used for reporting space */
+ int schk_count;
+ int si_count;
+ int fsk_count;
+ int queue_count;
+
+ /* ticks and other stuff */
+ uint64_t curr_time; /* simulation time, advanced by dummynet_task */
+ /* flowsets and schedulers are in hash tables, with 'hash_size'
+ * buckets. fshash is looked up at every packet arrival
+ * so better be generous if we expect many entries.
+ */
+ struct dn_ht *fshash;
+ struct dn_ht *schedhash;
+ /* list of flowsets without a scheduler -- use sch_chain */
+ struct dn_fsk_head fsu; /* list of unlinked flowsets */
+ struct dn_alg_head schedlist; /* list of algorithms */
+
+ /* Store the fs/sch to scan when draining. The value is the
+ * bucket number of the hash table. Expire can be disabled
+ * with net.inet.ip.dummynet.expire=0, or it happens every
+ * expire ticks.
+ **/
+ int drain_fs;
+ int drain_sch;
+ uint32_t expire;
+ uint32_t expire_cycle; /* tick count */
+
+ /* if the upper half is busy doing something long,
+ * can set the busy flag and we will enqueue packets in
+ * a queue for later processing.
+ */
+ int busy;
+ struct mq pending; /* packets parked while busy is set */
+
+#ifdef _KERNEL
+ /*
+ * This file is normally used in the kernel, unless we do
+ * some userland tests, in which case we do not need a mtx.
+ * uh_mtx arbitrates between system calls and also
+ * protects fshash, schedhash and fsunlinked.
+ * These structures are readonly for the lower half.
+ * bh_mtx protects all other structures which may be
+ * modified upon packet arrivals
+ *
+ * NOTE: this is the intended design; the DN_BH_* macros in this
+ * file currently lock uh_mtx.
+ */
+#if defined( __linux__ ) || defined( _WIN32 )
+ spinlock_t uh_mtx;
+ spinlock_t bh_mtx;
+#else
+ struct mtx uh_mtx;
+ struct mtx bh_mtx;
+#endif
+
+#endif /* _KERNEL */
+};
+
+/*
+ * Delay line, contains all packets on output from a link.
+ * Every scheduler instance has one.
+ * It can be stored in the event heap: the handler tells it apart from
+ * a scheduler instance through oid.type, and oid.subtype is used as
+ * an "in heap" flag (1 when in the heap, 0 otherwise).
+ */
+struct delay_line {
+ struct dn_id oid;
+ struct dn_sch_inst *si;
+ struct mq mq; /* packets, in order of output_time */
+};
+
+/*
+ * The kernel side of a flowset. It is linked in a hash table
+ * of flowsets, and in a list of children of their parent scheduler.
+ * qht is either the queue or (if HAVE_MASK) a hash table queues.
+ * Note that the mask to use is the (flow_mask|sched_mask), which
+ * changes as we attach/detach schedulers. So we store it here.
+ *
+ * XXX If we want to add scheduler-specific parameters, we need to
+ * put them in external storage because the scheduler may not be
+ * available when the fsk is created.
+ */
+struct dn_fsk { /* kernel side of a flowset */
+ struct dn_fs fs;
+ SLIST_ENTRY(dn_fsk) fsk_next; /* hash chain for fshash */
+
+ struct ipfw_flow_id fsk_mask;
+
+ /* qht is a hash table of queues, or just a single queue
+ * a bit in fs.flags tells us which one
+ */
+ struct dn_ht *qht;
+ struct dn_schk *sched; /* Sched we are linked to */
+ SLIST_ENTRY(dn_fsk) sch_chain; /* list of fsk attached to sched */
+
+ /* bucket index used by drain routine to drain queues for this
+ * flowset
+ */
+ int drain_bucket;
+ /* Parameters related to RED / GRED, read by red_drops() */
+ /* original values are in dn_fs*/
+ int w_q ; /* queue weight (scaled) */
+ int max_th ; /* maximum threshold for queue (scaled) */
+ int min_th ; /* minimum threshold for queue (scaled) */
+ int max_p ; /* maximum value for p_b (scaled) */
+
+ u_int c_1 ; /* max_p/(max_th-min_th) (scaled) */
+ u_int c_2 ; /* max_p*min_th/(max_th-min_th) (scaled) */
+ u_int c_3 ; /* for GRED, (1-max_p)/max_th (scaled) */
+ u_int c_4 ; /* for GRED, 1 - 2*max_p (scaled) */
+ u_int * w_q_lookup ; /* lookup table for computing (1-w_q)^t */
+ u_int lookup_depth ; /* depth of lookup table */
+ int lookup_step ; /* granularity inside the lookup table */
+ int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */
+ int avg_pkt_size ; /* medium packet size */
+ int max_pkt_size ; /* max packet size */
+};
+
+/*
+ * A queue is created as a child of a flowset unless it belongs to
+ * a !MULTIQUEUE scheduler. It is normally in a hash table in the
+ * flowset. fs always points to the parent flowset.
+ * si normally points to the sch_inst, unless the flowset has been
+ * detached from the scheduler -- in this case si == NULL and we
+ * should not enqueue.
+ * dn_enqueue() drops the packet if either fs or _si is NULL.
+ */
+struct dn_queue {
+ struct dn_flow ni; /* oid, flow_id, stats */
+ struct mq mq; /* packets queue */
+ struct dn_sch_inst *_si; /* owner scheduler instance */
+ SLIST_ENTRY(dn_queue) q_next; /* hash chain list for qht */
+ struct dn_fsk *fs; /* parent flowset. */
+
+ /* RED parameters */
+ int avg; /* average queue length est. (scaled) */
+ int count; /* arrivals since last RED drop */
+ /* (-1 while avg is below min_th) */
+ int random; /* random value (scaled) */
+ uint64_t q_time; /* start of queue idle time */
+
+};
+
+/*
+ * The kernel side of a scheduler. Contains the userland config,
+ * a link, pointer to extra config arguments from command line,
+ * kernel flags, and a pointer to the scheduler methods.
+ * It is stored in a hash table, and holds a list of all
+ * flowsets and scheduler instances.
+ * XXX sch must be at the beginning, see schk_hash().
+ */
+struct dn_schk {
+ struct dn_sch sch;
+ struct dn_alg *fp; /* Pointer to scheduler functions */
+ /* (enqueue, dequeue, flags) */
+ struct dn_link link; /* The link, embedded */
+ /* (bandwidth, delay, burst) */
+ struct dn_profile *profile; /* delay profile, if any */
+ struct dn_id *cfg; /* extra config arguments */
+
+ SLIST_ENTRY(dn_schk) schk_next; /* hash chain for schedhash */
+
+ struct dn_fsk_head fsk_list; /* all fsk linked to me */
+ struct dn_fsk *fs; /* Flowset for !MULTIQUEUE */
+
+ /* bucket index used by the drain routine to drain the scheduler
+ * instance for this flowset.
+ */
+ int drain_bucket;
+
+ /* Hash table of all instances (through sch.sched_mask)
+ * or single instance if no mask. Always valid.
+ */
+ struct dn_ht *siht;
+};
+
+
+/*
+ * Scheduler instance.
+ * Contains variables and all queues relative to this instance.
+ * This struct is created at runtime.
+ */
+struct dn_sch_inst {
+ struct dn_flow ni; /* oid, flowid and stats */
+ SLIST_ENTRY(dn_sch_inst) si_next; /* hash chain for siht */
+ struct delay_line dline;
+ struct dn_schk *sched; /* the template */
+ int kflags; /* DN_ACTIVE */
+
+ int64_t credit; /* bits I can transmit (more or less). */
+ uint64_t sched_time; /* time link was scheduled in ready_heap */
+ uint64_t idle_time; /* start of scheduler instance idle time */
+
+ /* q_count is the number of queues that this instance is using.
+ * The counter is incremented or decremented when
+ * a reference from the queue is created or deleted.
+ * It is used to make sure that a scheduler instance can be safely
+ * deleted by the drain routine. See notes below.
+ */
+ int q_count;
+
+};
+
+/*
+ * NOTE about object drain.
+ * The system will automatically (XXX check when) drain queues and
+ * scheduler instances when they are idle.
+ * A queue is idle when it has no packets; an instance is idle when
+ * it is not in the evheap heap, and the corresponding delay line is empty.
+ * A queue can be safely deleted when it is idle because the scheduler
+ * function xxx_free_queue() will remove any references to it.
+ * An instance can be only deleted when no queues reference it. To be sure
+ * of that, a counter (q_count) stores the number of queues that are pointing
+ * to the instance.
+ *
+ * XXX
+ * Order of scan:
+ * - take all flowsets in a bucket of the flowset hash table
+ * - take all queues in a bucket for the flowset
+ * - increment the queue bucket
+ * - scan next flowset bucket
+ * Nothing is done if a bucket contains no entries.
+ *
+ * The same scheme is used for scheduler instances.
+ */
+
+
+/* kernel-side flags. Linux has DN_DELETE in fcntl.h
+ * (hence, presumably, the name DN_DESTROY used here).
+ * Each value is a bit mask; code tests them with '&',
+ * e.g. (flags & DN_DESTROY).
+ */
+enum {
+ /* 1 and 2 are reserved for the SCAN flags */
+ DN_DESTROY = 0x0004, /* destroy: dn_delete_queue() then also frees
+ * the queue and its pending packets */
+ DN_DELETE_FS = 0x0008, /* destroy flowset */
+ DN_DETACH = 0x0010, /* NOTE(review): undocumented; presumably
+ * "detach from the scheduler" -- confirm */
+ DN_ACTIVE = 0x0020, /* object is in evheap */
+ DN_F_DLINE = 0x0040, /* object is a delay line */
+ DN_F_SCHI = 0x00C0, /* object is a sched.instance; the value is
+ * 0x0080 | DN_F_DLINE, so a test of the
+ * DN_F_DLINE bit alone also matches it */
+ DN_QHT_IS_Q = 0x0100, /* in flowset, qht is a single queue */
+};
+
+extern struct dn_parms dn_cfg;
+
+int dummynet_io(struct mbuf **, int , struct ip_fw_args *);
+void dummynet_task(void *context, int pending);
+void dn_reschedule(void);
+
+struct dn_queue *ipdn_q_find(struct dn_fsk *, struct dn_sch_inst *,
+ struct ipfw_flow_id *);
+struct dn_sch_inst *ipdn_si_find(struct dn_schk *, struct ipfw_flow_id *);
+
+/*
+ * copy_range is a template for requests for ranges of pipes/queues/scheds.
+ * The number of ranges is variable and can be derived from o.len.
+ * As a default, we use a small number of entries so that the struct
+ * fits easily on the stack and is sufficient for most common requests.
+ */
+#define DEFAULT_RANGES 5
+struct copy_range {
+ struct dn_id o;
+ uint32_t r[ 2 * DEFAULT_RANGES ];
+};
+
+struct copy_args {
+ char **start;
+ char *end;
+ int flags;
+ int type;
+ struct copy_range *extra; /* extra filtering */
+};
+
+struct sockopt;
+int ip_dummynet_compat(struct sockopt *sopt);
+int dummynet_get(struct sockopt *sopt, void **compat);
+int dn_c_copy_q (void *_ni, void *arg);
+int dn_c_copy_pipe(struct dn_schk *s, struct copy_args *a, int nq);
+int dn_c_copy_fs(struct dn_fsk *f, struct copy_args *a, int nq);
+int dn_compat_copy_queue(struct copy_args *a, void *_o);
+int dn_compat_copy_pipe(struct copy_args *a, void *_o);
+int copy_data_helper_compat(void *_o, void *_arg);
+int dn_compat_calc_size(struct dn_parms dn_cfg);
+int do_config(void *p, int l);
+
+/* functions to drain idle objects */
+void dn_drain_scheduler(void);
+void dn_drain_queue(void);
+
+#endif /* _IP_DN_PRIVATE_H */
diff --git a/sys/netinet/ipfw/ip_dummynet.c b/sys/netinet/ipfw/ip_dummynet.c
index e961a55..d7073eb 100644
--- a/sys/netinet/ipfw/ip_dummynet.c
+++ b/sys/netinet/ipfw/ip_dummynet.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa
+ * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa
* Portions Copyright (c) 2000 Akamba Corp.
* All rights reserved
*
@@ -28,34 +28,12 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
-#define DUMMYNET_DEBUG
-
-#include "opt_inet6.h"
-
/*
- * This module implements IP dummynet, a bandwidth limiter/delay emulator
- * used in conjunction with the ipfw package.
- * Description of the data structures used is in ip_dummynet.h
- * Here you mainly find the following blocks of code:
- * + variable declarations;
- * + heap management functions;
- * + scheduler and dummynet functions;
- * + configuration and initialization.
- *
- * NOTA BENE: critical sections are protected by the "dummynet lock".
- *
- * Most important Changes:
- *
- * 011004: KLDable
- * 010124: Fixed WF2Q behaviour
- * 010122: Fixed spl protection.
- * 000601: WF2Q support
- * 000106: large rewrite, use heaps to handle very many pipes.
- * 980513: initial release
- *
- * include files marked with XXX are probably not needed
+ * Configuration and internal object management for dummynet.
*/
+#include "opt_inet6.h"
+
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
@@ -69,2210 +47,2115 @@ __FBSDID("$FreeBSD$");
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/time.h>
-#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <net/if.h> /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */
-#include <net/netisr.h>
#include <netinet/in.h>
-#include <netinet/ip.h> /* ip_len, ip_off */
+#include <netinet/ip_var.h> /* ip_output(), IP_FORWARDING */
#include <netinet/ip_fw.h>
+#include <netinet/ipfw/ip_fw_private.h>
+#include <netinet/ipfw/dn_heap.h>
#include <netinet/ip_dummynet.h>
-#include <netinet/ip_var.h> /* ip_output(), IP_FORWARDING */
-
-#include <netinet/if_ether.h> /* various ether_* routines */
-
-#include <netinet/ip6.h> /* for ip6_input, ip6_output prototypes */
-#include <netinet6/ip6_var.h>
+#include <netinet/ipfw/ip_dn_private.h>
+#include <netinet/ipfw/dn_sched.h>
+
+/* which objects to copy */
+#define DN_C_LINK 0x01
+#define DN_C_SCH 0x02
+#define DN_C_FLOW 0x04
+#define DN_C_FS 0x08
+#define DN_C_QUEUE 0x10
+
+/* we use this argument in case of a schk_new */
+struct schk_new_arg {
+ struct dn_alg *fp;
+ struct dn_sch *sch;
+};
-/*
- * We keep a private variable for the simulation time, but we could
- * probably use an existing one ("softticks" in sys/kern/kern_timeout.c)
- */
-static dn_key curr_time = 0 ; /* current simulation time */
+/*---- callout hooks. ----*/
+static struct callout dn_timeout;
+static struct task dn_task;
+static struct taskqueue *dn_tq = NULL;
-static int dn_hash_size = 64 ; /* default hash size */
+static void
+dummynet(void * __unused unused)
+{
-/* statistics on number of queue searches and search steps */
-static long searches, search_steps ;
-static int pipe_expire = 1 ; /* expire queue if empty */
-static int dn_max_ratio = 16 ; /* max queues/buckets ratio */
+ taskqueue_enqueue(dn_tq, &dn_task);
+}
-static long pipe_slot_limit = 100; /* Foot shooting limit for pipe queues. */
-static long pipe_byte_limit = 1024 * 1024;
+void
+dn_reschedule(void)
+{
+ callout_reset(&dn_timeout, 1, dummynet, NULL);
+}
+/*----- end of callout hooks -----*/
-static int red_lookup_depth = 256; /* RED - default lookup table depth */
-static int red_avg_pkt_size = 512; /* RED - default medium packet size */
-static int red_max_pkt_size = 1500; /* RED - default max packet size */
+/* Return a scheduler descriptor given the type or name. */
+static struct dn_alg *
+find_sched_type(int type, char *name)
+{
+ struct dn_alg *d;
-static struct timeval prev_t, t;
-static long tick_last; /* Last tick duration (usec). */
-static long tick_delta; /* Last vs standard tick diff (usec). */
-static long tick_delta_sum; /* Accumulated tick difference (usec).*/
-static long tick_adjustment; /* Tick adjustments done. */
-static long tick_lost; /* Lost(coalesced) ticks number. */
-/* Adjusted vs non-adjusted curr_time difference (ticks). */
-static long tick_diff;
+ SLIST_FOREACH(d, &dn_cfg.schedlist, next) {
+ if (d->type == type || (name && !strcmp(d->name, name)))
+ return d;
+ }
+ return NULL; /* not found */
+}
-static int io_fast;
-static unsigned long io_pkt;
-static unsigned long io_pkt_fast;
-static unsigned long io_pkt_drop;
+int
+ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg)
+{
+ int oldv = *v; /* remember the incoming value for the report below */
+ const char *op = NULL;
+ if (oldv < lo) {
+ *v = dflt; /* too small: replace with dflt as given (not with lo;
+ * dflt itself is not range-checked here) */
+ op = "Bump";
+ } else if (oldv > hi) {
+ *v = hi; /* too large: clamp to the upper bound */
+ op = "Clamp";
+ } else
+ return *v; /* already within [lo, hi]: unchanged, nothing printed */
+ if (op && msg) /* op is always set here; a NULL msg suppresses the report */
+ printf("%s %s to %d (was %d)\n", op, msg, *v, oldv);
+ return *v; /* the adjusted value, also stored through v */
+}
+/*---- flow_id mask, hash and compare functions ---*/
/*
- * Three heaps contain queues and pipes that the scheduler handles:
- *
- * ready_heap contains all dn_flow_queue related to fixed-rate pipes.
- *
- * wfq_ready_heap contains the pipes associated with WF2Q flows
- *
- * extract_heap contains pipes associated with delay lines.
- *
+ * The flow_id includes the 5-tuple, the queue/pipe number
+ * which we store in the extra area in host order,
+ * and for ipv6 also the flow_id6.
+ * XXX see if we want the tos byte (can store in 'flags')
*/
+static struct ipfw_flow_id *
+flow_id_mask(struct ipfw_flow_id *mask, struct ipfw_flow_id *id)
+{
+ int is_v6 = IS_IP6_FLOW_ID(id);
-MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap");
-
-static struct dn_heap ready_heap, extract_heap, wfq_ready_heap ;
-
-static int heap_init(struct dn_heap *h, int size);
-static int heap_insert (struct dn_heap *h, dn_key key1, void *p);
-static void heap_extract(struct dn_heap *h, void *obj);
-static void transmit_event(struct dn_pipe *pipe, struct mbuf **head,
- struct mbuf **tail);
-static void ready_event(struct dn_flow_queue *q, struct mbuf **head,
- struct mbuf **tail);
-static void ready_event_wfq(struct dn_pipe *p, struct mbuf **head,
- struct mbuf **tail);
-
-#define HASHSIZE 16
-#define HASH(num) ((((num) >> 8) ^ ((num) >> 4) ^ (num)) & 0x0f)
-static struct dn_pipe_head pipehash[HASHSIZE]; /* all pipes */
-static struct dn_flow_set_head flowsethash[HASHSIZE]; /* all flowsets */
-
-static struct callout dn_timeout;
-
-extern void (*bridge_dn_p)(struct mbuf *, struct ifnet *);
-
-#ifdef SYSCTL_NODE
-SYSCTL_DECL(_net_inet);
-SYSCTL_DECL(_net_inet_ip);
-
-SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet");
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size,
- CTLFLAG_RW, &dn_hash_size, 0, "Default hash table size");
-#if 0 /* curr_time is 64 bit */
-SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, curr_time,
- CTLFLAG_RD, &curr_time, 0, "Current tick");
-#endif
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, ready_heap,
- CTLFLAG_RD, &ready_heap.size, 0, "Size of ready heap");
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, extract_heap,
- CTLFLAG_RD, &extract_heap.size, 0, "Size of extract heap");
-SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, searches,
- CTLFLAG_RD, &searches, 0, "Number of queue searches");
-SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, search_steps,
- CTLFLAG_RD, &search_steps, 0, "Number of queue search steps");
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire,
- CTLFLAG_RW, &pipe_expire, 0, "Expire queue if empty");
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, max_chain_len,
- CTLFLAG_RW, &dn_max_ratio, 0,
- "Max ratio between dynamic queues and buckets");
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth,
- CTLFLAG_RD, &red_lookup_depth, 0, "Depth of RED lookup table");
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size,
- CTLFLAG_RD, &red_avg_pkt_size, 0, "RED Medium packet size");
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size,
- CTLFLAG_RD, &red_max_pkt_size, 0, "RED Max packet size");
-SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta,
- CTLFLAG_RD, &tick_delta, 0, "Last vs standard tick difference (usec).");
-SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta_sum,
- CTLFLAG_RD, &tick_delta_sum, 0, "Accumulated tick difference (usec).");
-SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_adjustment,
- CTLFLAG_RD, &tick_adjustment, 0, "Tick adjustments done.");
-SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_diff,
- CTLFLAG_RD, &tick_diff, 0,
- "Adjusted vs non-adjusted curr_time difference (ticks).");
-SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_lost,
- CTLFLAG_RD, &tick_lost, 0,
- "Number of ticks coalesced by dummynet taskqueue.");
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, io_fast,
- CTLFLAG_RW, &io_fast, 0, "Enable fast dummynet io.");
-SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt,
- CTLFLAG_RD, &io_pkt, 0,
- "Number of packets passed to dummynet.");
-SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_fast,
- CTLFLAG_RD, &io_pkt_fast, 0,
- "Number of packets bypassed dummynet scheduler.");
-SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_drop,
- CTLFLAG_RD, &io_pkt_drop, 0,
- "Number of packets dropped by dummynet.");
-SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_slot_limit,
- CTLFLAG_RW, &pipe_slot_limit, 0, "Upper limit in slots for pipe queue.");
-SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_byte_limit,
- CTLFLAG_RW, &pipe_byte_limit, 0, "Upper limit in bytes for pipe queue.");
-#endif
-
-#ifdef DUMMYNET_DEBUG
-int dummynet_debug = 0;
-#ifdef SYSCTL_NODE
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, CTLFLAG_RW, &dummynet_debug,
- 0, "control debugging printfs");
-#endif
-#define DPRINTF(X) if (dummynet_debug) printf X
-#else
-#define DPRINTF(X)
-#endif
-
-static struct task dn_task;
-static struct taskqueue *dn_tq = NULL;
-static void dummynet_task(void *, int);
-
-static struct mtx dummynet_mtx;
-#define DUMMYNET_LOCK_INIT() \
- mtx_init(&dummynet_mtx, "dummynet", NULL, MTX_DEF)
-#define DUMMYNET_LOCK_DESTROY() mtx_destroy(&dummynet_mtx)
-#define DUMMYNET_LOCK() mtx_lock(&dummynet_mtx)
-#define DUMMYNET_UNLOCK() mtx_unlock(&dummynet_mtx)
-#define DUMMYNET_LOCK_ASSERT() mtx_assert(&dummynet_mtx, MA_OWNED)
-
-static int config_pipe(struct dn_pipe *p);
-static int ip_dn_ctl(struct sockopt *sopt);
-
-static void dummynet(void *);
-static void dummynet_flush(void);
-static void dummynet_send(struct mbuf *);
-void dummynet_drain(void);
-static int dummynet_io(struct mbuf **, int , struct ip_fw_args *);
+ id->dst_port &= mask->dst_port;
+ id->src_port &= mask->src_port;
+ id->proto &= mask->proto;
+ id->extra &= mask->extra;
+ if (is_v6) {
+ APPLY_MASK(&id->dst_ip6, &mask->dst_ip6);
+ APPLY_MASK(&id->src_ip6, &mask->src_ip6);
+ id->flow_id6 &= mask->flow_id6;
+ } else {
+ id->dst_ip &= mask->dst_ip;
+ id->src_ip &= mask->src_ip;
+ }
+ return id;
+}
-/*
- * Flow queue is idle if:
- * 1) it's empty for at least 1 tick
- * 2) it has invalid timestamp (WF2Q case)
- * 3) parent pipe has no 'exhausted' burst.
- */
-#define QUEUE_IS_IDLE(q) ((q)->head == NULL && (q)->S == (q)->F + 1 && \
- curr_time > (q)->idle_time + 1 && \
- ((q)->numbytes + (curr_time - (q)->idle_time - 1) * \
- (q)->fs->pipe->bandwidth >= (q)->fs->pipe->burst))
+/* computes an OR of two masks, result in dst and also returned */
+static struct ipfw_flow_id *
+flow_id_or(struct ipfw_flow_id *src, struct ipfw_flow_id *dst)
+{
+ int is_v6 = IS_IP6_FLOW_ID(dst);
-/*
- * Heap management functions.
- *
- * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2.
- * Some macros help finding parent/children so we can optimize them.
- *
- * heap_init() is called to expand the heap when needed.
- * Increment size in blocks of 16 entries.
- * XXX failure to allocate a new element is a pretty bad failure
- * as we basically stall a whole queue forever!!
- * Returns 1 on error, 0 on success
- */
-#define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 )
-#define HEAP_LEFT(x) ( 2*(x) + 1 )
-#define HEAP_IS_LEFT(x) ( (x) & 1 )
-#define HEAP_RIGHT(x) ( 2*(x) + 2 )
-#define HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; }
-#define HEAP_INCREMENT 15
+ dst->dst_port |= src->dst_port;
+ dst->src_port |= src->src_port;
+ dst->proto |= src->proto;
+ dst->extra |= src->extra;
+ if (is_v6) {
+#define OR_MASK(_d, _s) \
+ (_d)->__u6_addr.__u6_addr32[0] |= (_s)->__u6_addr.__u6_addr32[0]; \
+ (_d)->__u6_addr.__u6_addr32[1] |= (_s)->__u6_addr.__u6_addr32[1]; \
+ (_d)->__u6_addr.__u6_addr32[2] |= (_s)->__u6_addr.__u6_addr32[2]; \
+ (_d)->__u6_addr.__u6_addr32[3] |= (_s)->__u6_addr.__u6_addr32[3];
+ OR_MASK(&dst->dst_ip6, &src->dst_ip6);
+ OR_MASK(&dst->src_ip6, &src->src_ip6);
+#undef OR_MASK
+ dst->flow_id6 |= src->flow_id6;
+ } else {
+ dst->dst_ip |= src->dst_ip;
+ dst->src_ip |= src->src_ip;
+ }
+ return dst;
+}
static int
-heap_init(struct dn_heap *h, int new_size)
+nonzero_mask(struct ipfw_flow_id *m)
{
- struct dn_heap_entry *p;
+ if (m->dst_port || m->src_port || m->proto || m->extra)
+ return 1;
+ if (IS_IP6_FLOW_ID(m)) {
+ return
+ m->dst_ip6.__u6_addr.__u6_addr32[0] ||
+ m->dst_ip6.__u6_addr.__u6_addr32[1] ||
+ m->dst_ip6.__u6_addr.__u6_addr32[2] ||
+ m->dst_ip6.__u6_addr.__u6_addr32[3] ||
+ m->src_ip6.__u6_addr.__u6_addr32[0] ||
+ m->src_ip6.__u6_addr.__u6_addr32[1] ||
+ m->src_ip6.__u6_addr.__u6_addr32[2] ||
+ m->src_ip6.__u6_addr.__u6_addr32[3] ||
+ m->flow_id6;
+ } else {
+ return m->dst_ip || m->src_ip;
+ }
+}
- if (h->size >= new_size ) {
- printf("dummynet: %s, Bogus call, have %d want %d\n", __func__,
- h->size, new_size);
- return 0 ;
- }
- new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT ;
- p = malloc(new_size * sizeof(*p), M_DUMMYNET, M_NOWAIT);
- if (p == NULL) {
- printf("dummynet: %s, resize %d failed\n", __func__, new_size );
- return 1 ; /* error */
- }
- if (h->size > 0) {
- bcopy(h->p, p, h->size * sizeof(*p) );
- free(h->p, M_DUMMYNET);
+/* XXX we may want a better hash function */
+static uint32_t
+flow_id_hash(struct ipfw_flow_id *id)
+{
+ uint32_t i;
+
+ if (IS_IP6_FLOW_ID(id)) {
+ uint32_t *d = (uint32_t *)&id->dst_ip6;
+ uint32_t *s = (uint32_t *)&id->src_ip6;
+ i = (d[0] ) ^ (d[1]) ^
+ (d[2] ) ^ (d[3]) ^
+ (d[0] >> 15) ^ (d[1] >> 15) ^
+ (d[2] >> 15) ^ (d[3] >> 15) ^
+ (s[0] << 1) ^ (s[1] << 1) ^
+ (s[2] << 1) ^ (s[3] << 1) ^
+ (s[0] << 16) ^ (s[1] << 16) ^
+ (s[2] << 16) ^ (s[3] << 16) ^
+ (id->dst_port << 1) ^ (id->src_port) ^
+ (id->extra) ^
+ (id->proto ) ^ (id->flow_id6);
+ } else {
+ i = (id->dst_ip) ^ (id->dst_ip >> 15) ^
+ (id->src_ip << 1) ^ (id->src_ip >> 16) ^
+ (id->extra) ^
+ (id->dst_port << 1) ^ (id->src_port) ^ (id->proto);
}
- h->p = p ;
- h->size = new_size ;
- return 0 ;
+ return i;
}
-/*
- * Insert element in heap. Normally, p != NULL, we insert p in
- * a new position and bubble up. If p == NULL, then the element is
- * already in place, and key is the position where to start the
- * bubble-up.
- * Returns 1 on failure (cannot allocate new heap entry)
- *
- * If offset > 0 the position (index, int) of the element in the heap is
- * also stored in the element itself at the given offset in bytes.
- */
-#define SET_OFFSET(heap, node) \
- if (heap->offset > 0) \
- *((int *)((char *)(heap->p[node].object) + heap->offset)) = node ;
-/*
- * RESET_OFFSET is used for sanity checks. It sets offset to an invalid value.
- */
-#define RESET_OFFSET(heap, node) \
- if (heap->offset > 0) \
- *((int *)((char *)(heap->p[node].object) + heap->offset)) = -1 ;
+/* Like bcmp, returns 0 if ids match, 1 otherwise. */
static int
-heap_insert(struct dn_heap *h, dn_key key1, void *p)
-{
- int son = h->elements ;
-
- if (p == NULL) /* data already there, set starting point */
- son = key1 ;
- else { /* insert new element at the end, possibly resize */
- son = h->elements ;
- if (son == h->size) /* need resize... */
- if (heap_init(h, h->elements+1) )
- return 1 ; /* failure... */
- h->p[son].object = p ;
- h->p[son].key = key1 ;
- h->elements++ ;
- }
- while (son > 0) { /* bubble up */
- int father = HEAP_FATHER(son) ;
- struct dn_heap_entry tmp ;
-
- if (DN_KEY_LT( h->p[father].key, h->p[son].key ) )
- break ; /* found right position */
- /* son smaller than father, swap and repeat */
- HEAP_SWAP(h->p[son], h->p[father], tmp) ;
- SET_OFFSET(h, son);
- son = father ;
- }
- SET_OFFSET(h, son);
- return 0 ;
+flow_id_cmp(struct ipfw_flow_id *id1, struct ipfw_flow_id *id2)
+{
+ int is_v6 = IS_IP6_FLOW_ID(id1);
+
+ if (!is_v6) {
+ if (IS_IP6_FLOW_ID(id2))
+ return 1; /* different address families */
+
+ return (id1->dst_ip == id2->dst_ip &&
+ id1->src_ip == id2->src_ip &&
+ id1->dst_port == id2->dst_port &&
+ id1->src_port == id2->src_port &&
+ id1->proto == id2->proto &&
+ id1->extra == id2->extra) ? 0 : 1;
+ }
+ /* the ipv6 case */
+ return (
+ !bcmp(&id1->dst_ip6,&id2->dst_ip6, sizeof(id1->dst_ip6)) &&
+ !bcmp(&id1->src_ip6,&id2->src_ip6, sizeof(id1->src_ip6)) &&
+ id1->dst_port == id2->dst_port &&
+ id1->src_port == id2->src_port &&
+ id1->proto == id2->proto &&
+ id1->extra == id2->extra &&
+ id1->flow_id6 == id2->flow_id6) ? 0 : 1;
}
+/*--------- end of flow-id mask, hash and compare ---------*/
-/*
- * remove top element from heap, or obj if obj != NULL
+/*--- support functions for the qht hashtable ----
+ * Entries are hashed by flow-id
*/
-static void
-heap_extract(struct dn_heap *h, void *obj)
+static uint32_t
+q_hash(uintptr_t key, int flags, void *arg)
{
- int child, father, max = h->elements - 1 ;
+ /* compute the hash slot from the flow id */
+ struct ipfw_flow_id *id = (flags & DNHT_KEY_IS_OBJ) ?
+ &((struct dn_queue *)key)->ni.fid :
+ (struct ipfw_flow_id *)key;
- if (max < 0) {
- printf("dummynet: warning, extract from empty heap 0x%p\n", h);
- return ;
- }
- father = 0 ; /* default: move up smallest child */
- if (obj != NULL) { /* extract specific element, index is at offset */
- if (h->offset <= 0)
- panic("dummynet: heap_extract from middle not supported on this heap!!!\n");
- father = *((int *)((char *)obj + h->offset)) ;
- if (father < 0 || father >= h->elements) {
- printf("dummynet: heap_extract, father %d out of bound 0..%d\n",
- father, h->elements);
- panic("dummynet: heap_extract");
+ return flow_id_hash(id);
+}
+
+static int
+q_match(void *obj, uintptr_t key, int flags, void *arg)
+{
+ struct dn_queue *o = (struct dn_queue *)obj;
+ struct ipfw_flow_id *id2;
+
+ if (flags & DNHT_KEY_IS_OBJ) {
+ /* key is a queue object: compare against its flow id */
+ id2 = &((struct dn_queue *)key)->ni.fid;
+ } else {
+ id2 = (struct ipfw_flow_id *)key;
}
- }
- RESET_OFFSET(h, father);
- child = HEAP_LEFT(father) ; /* left child */
- while (child <= max) { /* valid entry */
- if (child != max && DN_KEY_LT(h->p[child+1].key, h->p[child].key) )
- child = child+1 ; /* take right child, otherwise left */
- h->p[father] = h->p[child] ;
- SET_OFFSET(h, father);
- father = child ;
- child = HEAP_LEFT(child) ; /* left child for next loop */
- }
- h->elements-- ;
- if (father != max) {
- /*
- * Fill hole with last entry and bubble up, reusing the insert code
- */
- h->p[father] = h->p[max] ;
- heap_insert(h, father, NULL); /* this one cannot fail */
- }
+ return (0 == flow_id_cmp(&o->ni.fid, id2));
}
-#if 0
/*
- * change object position and update references
- * XXX this one is never used!
+ * create a new queue instance for the given 'key'.
*/
-static void
-heap_move(struct dn_heap *h, dn_key new_key, void *object)
-{
- int temp;
- int i ;
- int max = h->elements-1 ;
- struct dn_heap_entry buf ;
-
- if (h->offset <= 0)
- panic("cannot move items on this heap");
-
- i = *((int *)((char *)object + h->offset));
- if (DN_KEY_LT(new_key, h->p[i].key) ) { /* must move up */
- h->p[i].key = new_key ;
- for (; i>0 && DN_KEY_LT(new_key, h->p[(temp = HEAP_FATHER(i))].key) ;
- i = temp ) { /* bubble up */
- HEAP_SWAP(h->p[i], h->p[temp], buf) ;
- SET_OFFSET(h, i);
- }
- } else { /* must move down */
- h->p[i].key = new_key ;
- while ( (temp = HEAP_LEFT(i)) <= max ) { /* found left child */
- if ((temp != max) && DN_KEY_GT(h->p[temp].key, h->p[temp+1].key))
- temp++ ; /* select child with min key */
- if (DN_KEY_GT(new_key, h->p[temp].key)) { /* go down */
- HEAP_SWAP(h->p[i], h->p[temp], buf) ;
- SET_OFFSET(h, i);
- } else
- break ;
- i = temp ;
+static void *
+q_new(uintptr_t key, int flags, void *arg)
+{
+ struct dn_queue *q, *template = arg;
+ struct dn_fsk *fs = template->fs;
+ int size = sizeof(*q) + fs->sched->fp->q_datalen;
+
+ q = malloc(size, M_DUMMYNET, M_NOWAIT | M_ZERO);
+ if (q == NULL) {
+ D("no memory for new queue");
+ return NULL;
}
- }
- SET_OFFSET(h, i);
+
+ set_oid(&q->ni.oid, DN_QUEUE, size);
+ if (fs->fs.flags & DN_QHT_HASH)
+ q->ni.fid = *(struct ipfw_flow_id *)key;
+ q->fs = fs;
+ q->_si = template->_si;
+ q->_si->q_count++;
+
+ if (fs->sched->fp->new_queue)
+ fs->sched->fp->new_queue(q);
+ dn_cfg.queue_count++;
+ return q;
}
-#endif /* heap_move, unused */
/*
- * heapify() will reorganize data inside an array to maintain the
- * heap property. It is needed when we delete a bunch of entries.
+ * Notify schedulers that a queue is going away.
+ * If (flags & DN_DESTROY), also free the packets.
+ * The version for callbacks is called q_delete_cb().
*/
static void
-heapify(struct dn_heap *h)
+dn_delete_queue(struct dn_queue *q, int flags)
{
- int i ;
+ struct dn_fsk *fs = q->fs;
+
+ // D("fs %p si %p\n", fs, q->_si);
+ /* notify the parent scheduler that the queue is going away */
+ if (fs && fs->sched->fp->free_queue)
+ fs->sched->fp->free_queue(q);
+ q->_si->q_count--;
+ q->_si = NULL;
+ if (flags & DN_DESTROY) {
+ if (q->mq.head)
+ dn_free_pkts(q->mq.head);
+ bzero(q, sizeof(*q)); // safety
+ free(q, M_DUMMYNET);
+ dn_cfg.queue_count--;
+ }
+}
- for (i = 0 ; i < h->elements ; i++ )
- heap_insert(h, i , NULL) ;
+static int
+q_delete_cb(void *q, void *arg)
+{
+ int flags = (int)(uintptr_t)arg;
+ dn_delete_queue(q, flags);
+ return (flags & DN_DESTROY) ? DNHT_SCAN_DEL : 0;
}
/*
- * cleanup the heap and free data structure
+ * calls dn_delete_queue/q_delete_cb on all queues,
+ * which notifies the parent scheduler and possibly drains packets.
+ * flags & DN_DESTROY: drains queues and destroy qht;
*/
static void
-heap_free(struct dn_heap *h)
+qht_delete(struct dn_fsk *fs, int flags)
{
- if (h->size >0 )
- free(h->p, M_DUMMYNET);
- bzero(h, sizeof(*h) );
+ ND("fs %d start flags %d qht %p",
+ fs->fs.fs_nr, flags, fs->qht);
+ if (!fs->qht)
+ return;
+ if (fs->fs.flags & DN_QHT_HASH) {
+ dn_ht_scan(fs->qht, q_delete_cb, (void *)(uintptr_t)flags);
+ if (flags & DN_DESTROY) {
+ dn_ht_free(fs->qht, 0);
+ fs->qht = NULL;
+ }
+ } else {
+ dn_delete_queue((struct dn_queue *)(fs->qht), flags);
+ if (flags & DN_DESTROY)
+ fs->qht = NULL;
+ }
}
/*
- * --- end of heap management functions ---
- */
-
-/*
- * Return the mbuf tag holding the dummynet state. As an optimization
- * this is assumed to be the first tag on the list. If this turns out
- * wrong we'll need to search the list.
+ * Find and possibly create the queue for a MULTIQUEUE scheduler.
+ * We never call it for !MULTIQUEUE (the queue is in the sch_inst).
*/
-static struct dn_pkt_tag *
-dn_tag_get(struct mbuf *m)
+struct dn_queue *
+ipdn_q_find(struct dn_fsk *fs, struct dn_sch_inst *si,
+ struct ipfw_flow_id *id)
{
- struct m_tag *mtag = m_tag_first(m);
- KASSERT(mtag != NULL &&
- mtag->m_tag_cookie == MTAG_ABI_COMPAT &&
- mtag->m_tag_id == PACKET_TAG_DUMMYNET,
- ("packet on dummynet queue w/o dummynet tag!"));
- return (struct dn_pkt_tag *)(mtag+1);
+ struct dn_queue template;
+
+ template._si = si;
+ template.fs = fs;
+
+ if (fs->fs.flags & DN_QHT_HASH) {
+ struct ipfw_flow_id masked_id;
+ if (fs->qht == NULL) {
+ fs->qht = dn_ht_init(NULL, fs->fs.buckets,
+ offsetof(struct dn_queue, q_next),
+ q_hash, q_match, q_new);
+ if (fs->qht == NULL)
+ return NULL;
+ }
+ masked_id = *id;
+ flow_id_mask(&fs->fsk_mask, &masked_id);
+ return dn_ht_find(fs->qht, (uintptr_t)&masked_id,
+ DNHT_INSERT, &template);
+ } else {
+ if (fs->qht == NULL)
+ fs->qht = q_new(0, 0, &template);
+ return (struct dn_queue *)fs->qht;
+ }
}
+/*--- end of queue hash table ---*/
-/*
- * Scheduler functions:
- *
- * transmit_event() is called when the delay-line needs to enter
- * the scheduler, either because of existing pkts getting ready,
- * or new packets entering the queue. The event handled is the delivery
- * time of the packet.
- *
- * ready_event() does something similar with fixed-rate queues, and the
- * event handled is the finish time of the head pkt.
+/*--- support functions for the sch_inst hashtable ----
*
- * wfq_ready_event() does something similar with WF2Q queues, and the
- * event handled is the start time of the head pkt.
- *
- * In all cases, we make sure that the data structures are consistent
- * before passing pkts out, because this might trigger recursive
- * invocations of the procedures.
+ * These are hashed by flow-id
*/
-static void
-transmit_event(struct dn_pipe *pipe, struct mbuf **head, struct mbuf **tail)
+static uint32_t
+si_hash(uintptr_t key, int flags, void *arg)
{
- struct mbuf *m;
- struct dn_pkt_tag *pkt;
+ /* compute the hash slot from the flow id */
+ struct ipfw_flow_id *id = (flags & DNHT_KEY_IS_OBJ) ?
+ &((struct dn_sch_inst *)key)->ni.fid :
+ (struct ipfw_flow_id *)key;
- DUMMYNET_LOCK_ASSERT();
+ return flow_id_hash(id);
+}
- while ((m = pipe->head) != NULL) {
- pkt = dn_tag_get(m);
- if (!DN_KEY_LEQ(pkt->output_time, curr_time))
- break;
+static int
+si_match(void *obj, uintptr_t key, int flags, void *arg)
+{
+ struct dn_sch_inst *o = obj;
+ struct ipfw_flow_id *id2;
- pipe->head = m->m_nextpkt;
- if (*tail != NULL)
- (*tail)->m_nextpkt = m;
- else
- *head = m;
- *tail = m;
+ id2 = (flags & DNHT_KEY_IS_OBJ) ?
+ &((struct dn_sch_inst *)key)->ni.fid :
+ (struct ipfw_flow_id *)key;
+ return flow_id_cmp(&o->ni.fid, id2) == 0;
+}
+
+/*
+ * create a new instance for the given 'key'
+ * Allocate memory for instance, delay line and scheduler private data.
+ */
+static void *
+si_new(uintptr_t key, int flags, void *arg)
+{
+ struct dn_schk *s = arg;
+ struct dn_sch_inst *si;
+ int l = sizeof(*si) + s->fp->si_datalen;
+
+ si = malloc(l, M_DUMMYNET, M_NOWAIT | M_ZERO);
+ if (si == NULL)
+ goto error;
+ /* Set length only for the part passed up to userland. */
+ set_oid(&si->ni.oid, DN_SCH_I, sizeof(struct dn_flow));
+ set_oid(&(si->dline.oid), DN_DELAY_LINE,
+ sizeof(struct delay_line));
+ /* mark si and dline as outside the event queue */
+ si->ni.oid.id = si->dline.oid.id = -1;
+
+ si->sched = s;
+ si->dline.si = si;
+
+ if (s->fp->new_sched && s->fp->new_sched(si)) {
+ D("new_sched error");
+ goto error;
}
- if (*tail != NULL)
- (*tail)->m_nextpkt = NULL;
+ if (s->sch.flags & DN_HAVE_MASK)
+ si->ni.fid = *(struct ipfw_flow_id *)key;
- /* If there are leftover packets, put into the heap for next event. */
- if ((m = pipe->head) != NULL) {
- pkt = dn_tag_get(m);
- /*
- * XXX Should check errors on heap_insert, by draining the
- * whole pipe p and hoping in the future we are more successful.
- */
- heap_insert(&extract_heap, pkt->output_time, pipe);
+ dn_cfg.si_count++;
+ return si;
+
+error:
+ if (si) {
+ bzero(si, sizeof(*si)); // safety
+ free(si, M_DUMMYNET);
}
+ return NULL;
}
-#define div64(a, b) ((int64_t)(a) / (int64_t)(b))
-#define DN_TO_DROP 0xffff
/*
- * Compute how many ticks we have to wait before being able to send
- * a packet. This is computed as the "wire time" for the packet
- * (length + extra bits), minus the credit available, scaled to ticks.
- * Check that the result is not be negative (it could be if we have
- * too much leftover credit in q->numbytes).
+ * Callback from siht to delete all scheduler instances. Remove
+ * si and delay line from the system heap, destroy all queues.
+ * We assume that all flowsets have been notified and do not
+ * point to us anymore.
*/
-static inline dn_key
-set_ticks(struct mbuf *m, struct dn_flow_queue *q, struct dn_pipe *p)
+static int
+si_destroy(void *_si, void *arg)
{
- int64_t ret;
-
- ret = div64( (m->m_pkthdr.len * 8 + q->extra_bits) * hz
- - q->numbytes + p->bandwidth - 1 , p->bandwidth);
-#if 0
- printf("%s %d extra_bits %d numb %d ret %d\n",
- __FUNCTION__, __LINE__,
- (int)(q->extra_bits & 0xffffffff),
- (int)(q->numbytes & 0xffffffff),
- (int)(ret & 0xffffffff));
-#endif
- if (ret < 0)
- ret = 0;
- return ret;
+ struct dn_sch_inst *si = _si;
+ struct dn_schk *s = si->sched;
+ struct delay_line *dl = &si->dline;
+
+ if (dl->oid.subtype) /* remove delay line from event heap */
+ heap_extract(&dn_cfg.evheap, dl);
+ dn_free_pkts(dl->mq.head); /* drain delay line */
+ if (si->kflags & DN_ACTIVE) /* remove si from event heap */
+ heap_extract(&dn_cfg.evheap, si);
+ if (s->fp->free_sched)
+ s->fp->free_sched(si);
+ bzero(si, sizeof(*si)); /* safety */
+ free(si, M_DUMMYNET);
+ dn_cfg.si_count--;
+ return DNHT_SCAN_DEL;
}
/*
- * Convert the additional MAC overheads/delays into an equivalent
- * number of bits for the given data rate. The samples are in milliseconds
- * so we need to divide by 1000.
+ * Find the scheduler instance for this packet. If we need to apply
+ * a mask, do it on a local copy of the flow_id to preserve the original.
+ * Assume siht is always initialized if we have a mask.
*/
-static dn_key
-compute_extra_bits(struct mbuf *pkt, struct dn_pipe *p)
+struct dn_sch_inst *
+ipdn_si_find(struct dn_schk *s, struct ipfw_flow_id *id)
{
- int index;
- dn_key extra_bits;
- if (!p->samples || p->samples_no == 0)
- return 0;
- index = random() % p->samples_no;
- extra_bits = ((dn_key)p->samples[index] * p->bandwidth) / 1000;
- if (index >= p->loss_level) {
- struct dn_pkt_tag *dt = dn_tag_get(pkt);
- if (dt)
- dt->dn_dir = DN_TO_DROP;
+ if (s->sch.flags & DN_HAVE_MASK) {
+ struct ipfw_flow_id id_t = *id;
+ flow_id_mask(&s->sch.sched_mask, &id_t);
+ return dn_ht_find(s->siht, (uintptr_t)&id_t,
+ DNHT_INSERT, s);
}
- return extra_bits;
+ if (!s->siht)
+ s->siht = si_new(0, 0, s);
+ return (struct dn_sch_inst *)s->siht;
}
-static void
-free_pipe(struct dn_pipe *p)
+/* callback to flush credit for the scheduler instance */
+static int
+si_reset_credit(void *_si, void *arg)
{
- if (p->samples)
- free(p->samples, M_DUMMYNET);
- free(p, M_DUMMYNET);
+ struct dn_sch_inst *si = _si;
+ struct dn_link *p = &si->sched->link;
+
+ si->credit = p->burst + (dn_cfg.io_fast ? p->bandwidth : 0);
+ return 0;
}
-/*
- * extract pkt from queue, compute output time (could be now)
- * and put into delay line (p_queue)
- */
static void
-move_pkt(struct mbuf *pkt, struct dn_flow_queue *q, struct dn_pipe *p,
- int len)
+schk_reset_credit(struct dn_schk *s)
{
- struct dn_pkt_tag *dt = dn_tag_get(pkt);
-
- q->head = pkt->m_nextpkt ;
- q->len-- ;
- q->len_bytes -= len ;
-
- dt->output_time = curr_time + p->delay ;
-
- if (p->head == NULL)
- p->head = pkt;
- else
- p->tail->m_nextpkt = pkt;
- p->tail = pkt;
- p->tail->m_nextpkt = NULL;
+ if (s->sch.flags & DN_HAVE_MASK)
+ dn_ht_scan(s->siht, si_reset_credit, NULL);
+ else if (s->siht)
+ si_reset_credit(s->siht, NULL);
}
+/*---- end of sch_inst hashtable ---------------------*/
-/*
- * ready_event() is invoked every time the queue must enter the
- * scheduler, either because the first packet arrives, or because
- * a previously scheduled event fired.
- * On invokation, drain as many pkts as possible (could be 0) and then
- * if there are leftover packets reinsert the pkt in the scheduler.
+/*-------------------------------------------------------
+ * flowset hash (fshash) support. Entries are hashed by fs_nr.
+ * New allocations are put in the fsunlinked list, from which
+ * they are removed when they point to a specific scheduler.
*/
-static void
-ready_event(struct dn_flow_queue *q, struct mbuf **head, struct mbuf **tail)
+static uint32_t
+fsk_hash(uintptr_t key, int flags, void *arg)
{
- struct mbuf *pkt;
- struct dn_pipe *p = q->fs->pipe;
- int p_was_empty;
-
- DUMMYNET_LOCK_ASSERT();
+ uint32_t i = !(flags & DNHT_KEY_IS_OBJ) ? key :
+ ((struct dn_fsk *)key)->fs.fs_nr;
- if (p == NULL) {
- printf("dummynet: ready_event- pipe is gone\n");
- return;
- }
- p_was_empty = (p->head == NULL);
+ return ( (i>>8)^(i>>4)^i );
+}
- /*
- * Schedule fixed-rate queues linked to this pipe:
- * account for the bw accumulated since last scheduling, then
- * drain as many pkts as allowed by q->numbytes and move to
- * the delay line (in p) computing output time.
- * bandwidth==0 (no limit) means we can drain the whole queue,
- * setting len_scaled = 0 does the job.
- */
- q->numbytes += (curr_time - q->sched_time) * p->bandwidth;
- while ((pkt = q->head) != NULL) {
- int len = pkt->m_pkthdr.len;
- dn_key len_scaled = p->bandwidth ? len*8*hz
- + q->extra_bits*hz
- : 0;
-
- if (DN_KEY_GT(len_scaled, q->numbytes))
- break;
- q->numbytes -= len_scaled;
- move_pkt(pkt, q, p, len);
- if (q->head)
- q->extra_bits = compute_extra_bits(q->head, p);
- }
- /*
- * If we have more packets queued, schedule next ready event
- * (can only occur when bandwidth != 0, otherwise we would have
- * flushed the whole queue in the previous loop).
- * To this purpose we record the current time and compute how many
- * ticks to go for the finish time of the packet.
- */
- if ((pkt = q->head) != NULL) { /* this implies bandwidth != 0 */
- dn_key t = set_ticks(pkt, q, p); /* ticks i have to wait */
+static int
+fsk_match(void *obj, uintptr_t key, int flags, void *arg)
+{
+ struct dn_fsk *fs = obj;
+ int i = !(flags & DNHT_KEY_IS_OBJ) ? key :
+ ((struct dn_fsk *)key)->fs.fs_nr;
- q->sched_time = curr_time;
- heap_insert(&ready_heap, curr_time + t, (void *)q);
- /*
- * XXX Should check errors on heap_insert, and drain the whole
- * queue on error hoping next time we are luckier.
- */
- } else /* RED needs to know when the queue becomes empty. */
- q->idle_time = curr_time;
+ return (fs->fs.fs_nr == i);
+}
- /*
- * If the delay line was empty call transmit_event() now.
- * Otherwise, the scheduler will take care of it.
- */
- if (p_was_empty)
- transmit_event(p, head, tail);
+static void *
+fsk_new(uintptr_t key, int flags, void *arg)
+{
+ struct dn_fsk *fs;
+
+ fs = malloc(sizeof(*fs), M_DUMMYNET, M_NOWAIT | M_ZERO); /* zero-filled; result is checked below */
+ if (fs) {
+ set_oid(&fs->fs.oid, DN_FS, sizeof(fs->fs));
+ dn_cfg.fsk_count++; /* global count of allocated flowsets */
+ fs->drain_bucket = 0;
+ SLIST_INSERT_HEAD(&dn_cfg.fsu, fs, sch_chain); /* new flowsets start on the unlinked list */
+ }
+ return fs; /* NULL if the allocation failed */
 }
/*
- * Called when we can transmit packets on WF2Q queues. Take pkts out of
- * the queues at their start time, and enqueue into the delay line.
- * Packets are drained until p->numbytes < 0. As long as
- * len_scaled >= p->numbytes, the packet goes into the delay line
- * with a deadline p->delay. For the last packet, if p->numbytes < 0,
- * there is an additional delay.
+ * detach flowset from its current scheduler. Flags as follows:
+ * DN_DETACH removes from the fsk_list
+ * DN_DESTROY deletes individual queues
+ * DN_DELETE_FS destroys the flowset (otherwise goes in unlinked).
*/
static void
-ready_event_wfq(struct dn_pipe *p, struct mbuf **head, struct mbuf **tail)
+fsk_detach(struct dn_fsk *fs, int flags)
{
- int p_was_empty = (p->head == NULL);
- struct dn_heap *sch = &(p->scheduler_heap);
- struct dn_heap *neh = &(p->not_eligible_heap);
-
- DUMMYNET_LOCK_ASSERT();
-
- if (p->if_name[0] == 0) /* tx clock is simulated */
- p->numbytes += (curr_time - p->sched_time) * p->bandwidth;
- else { /*
- * tx clock is for real,
- * the ifq must be empty or this is a NOP.
- */
- if (p->ifp && p->ifp->if_snd.ifq_head != NULL)
- return;
- else {
- DPRINTF(("dummynet: pipe %d ready from %s --\n",
- p->pipe_nr, p->if_name));
- }
+ if (flags & DN_DELETE_FS)
+ flags |= DN_DESTROY;
+ ND("fs %d from sched %d flags %s %s %s",
+ fs->fs.fs_nr, fs->fs.sched_nr,
+ (flags & DN_DELETE_FS) ? "DEL_FS":"",
+ (flags & DN_DESTROY) ? "DEL":"",
+ (flags & DN_DETACH) ? "DET":"");
+ if (flags & DN_DETACH) { /* detach from the list */
+ struct dn_fsk_head *h;
+ h = fs->sched ? &fs->sched->fsk_list : &dn_cfg.fsu;
+ SLIST_REMOVE(h, fs, dn_fsk, sch_chain);
}
-
- /*
- * While we have backlogged traffic AND credit, we need to do
- * something on the queue.
+ /* Free the RED parameters, they will be recomputed on
+ * subsequent attach if needed.
*/
- while (p->numbytes >= 0 && (sch->elements > 0 || neh->elements > 0)) {
- if (sch->elements > 0) {
- /* Have some eligible pkts to send out. */
- struct dn_flow_queue *q = sch->p[0].object;
- struct mbuf *pkt = q->head;
- struct dn_flow_set *fs = q->fs;
- uint64_t len = pkt->m_pkthdr.len;
- int len_scaled = p->bandwidth ? len * 8 * hz : 0;
-
- heap_extract(sch, NULL); /* Remove queue from heap. */
- p->numbytes -= len_scaled;
- move_pkt(pkt, q, p, len);
-
- p->V += (len << MY_M) / p->sum; /* Update V. */
- q->S = q->F; /* Update start time. */
- if (q->len == 0) {
- /* Flow not backlogged any more. */
- fs->backlogged--;
- heap_insert(&(p->idle_heap), q->F, q);
- } else {
- /* Still backlogged. */
-
- /*
- * Update F and position in backlogged queue,
- * then put flow in not_eligible_heap
- * (we will fix this later).
- */
- len = (q->head)->m_pkthdr.len;
- q->F += (len << MY_M) / (uint64_t)fs->weight;
- if (DN_KEY_LEQ(q->S, p->V))
- heap_insert(neh, q->S, q);
- else
- heap_insert(sch, q->F, q);
- }
- }
- /*
- * Now compute V = max(V, min(S_i)). Remember that all elements
- * in sch have by definition S_i <= V so if sch is not empty,
- * V is surely the max and we must not update it. Conversely,
- * if sch is empty we only need to look at neh.
- */
- if (sch->elements == 0 && neh->elements > 0)
- p->V = MAX64(p->V, neh->p[0].key);
- /* Move from neh to sch any packets that have become eligible */
- while (neh->elements > 0 && DN_KEY_LEQ(neh->p[0].key, p->V)) {
- struct dn_flow_queue *q = neh->p[0].object;
- heap_extract(neh, NULL);
- heap_insert(sch, q->F, q);
- }
-
- if (p->if_name[0] != '\0') { /* Tx clock is from a real thing */
- p->numbytes = -1; /* Mark not ready for I/O. */
- break;
- }
- }
- if (sch->elements == 0 && neh->elements == 0 && p->numbytes >= 0) {
- p->idle_time = curr_time;
- /*
- * No traffic and no events scheduled.
- * We can get rid of idle-heap.
- */
- if (p->idle_heap.elements > 0) {
- int i;
-
- for (i = 0; i < p->idle_heap.elements; i++) {
- struct dn_flow_queue *q;
-
- q = p->idle_heap.p[i].object;
- q->F = 0;
- q->S = q->F + 1;
- }
- p->sum = 0;
- p->V = 0;
- p->idle_heap.elements = 0;
- }
- }
- /*
- * If we are getting clocks from dummynet (not a real interface) and
- * If we are under credit, schedule the next ready event.
- * Also fix the delivery time of the last packet.
- */
- if (p->if_name[0]==0 && p->numbytes < 0) { /* This implies bw > 0. */
- dn_key t = 0; /* Number of ticks i have to wait. */
-
- if (p->bandwidth > 0)
- t = (p->bandwidth - 1 - p->numbytes) / p->bandwidth;
- dn_tag_get(p->tail)->output_time += t;
- p->sched_time = curr_time;
- heap_insert(&wfq_ready_heap, curr_time + t, (void *)p);
- /*
- * XXX Should check errors on heap_insert, and drain the whole
- * queue on error hoping next time we are luckier.
- */
+ if (fs->w_q_lookup)
+ free(fs->w_q_lookup, M_DUMMYNET);
+ fs->w_q_lookup = NULL;
+ qht_delete(fs, flags);
+ if (fs->sched && fs->sched->fp->free_fsk)
+ fs->sched->fp->free_fsk(fs);
+ fs->sched = NULL;
+ if (flags & DN_DELETE_FS) {
+ bzero(fs, sizeof(*fs)); /* safety: wipe the whole struct, not just pointer-size bytes */
+ free(fs, M_DUMMYNET);
+ dn_cfg.fsk_count--;
+ } else {
+ SLIST_INSERT_HEAD(&dn_cfg.fsu, fs, sch_chain); /* not deleted: park it on the unlinked list */
+ }
-
- /*
- * If the delay line was empty call transmit_event() now.
- * Otherwise, the scheduler will take care of it.
- */
- if (p_was_empty)
- transmit_event(p, head, tail);
}
/*
- * This is called one tick, after previous run. It is used to
- * schedule next run.
+ * Detach or destroy all flowsets in a list.
+ * flags specifies what to do:
+ * DN_DESTROY: flush all queues
+ * DN_DELETE_FS: DN_DESTROY + destroy flowset
+ * DN_DELETE_FS implies DN_DESTROY
*/
static void
-dummynet(void * __unused unused)
+fsk_detach_list(struct dn_fsk_head *h, int flags)
{
-
- taskqueue_enqueue(dn_tq, &dn_task);
+ struct dn_fsk *fs;
+ int n = 0; /* only for stats */
+
+ ND("head %p flags %x", h, flags);
+ while ((fs = SLIST_FIRST(h))) {
+ SLIST_REMOVE_HEAD(h, sch_chain);
+ n++;
+ fsk_detach(fs, flags);
+ }
+ ND("done %d flowsets", n);
}
/*
- * The main dummynet processing function.
+ * called on 'queue X delete' -- removes the flowset from fshash,
+ * deletes all queues for the flowset, and removes the flowset.
*/
-static void
-dummynet_task(void *context, int pending)
+static int
+delete_fs(int i, int locked)
{
- struct mbuf *head = NULL, *tail = NULL;
- struct dn_pipe *pipe;
- struct dn_heap *heaps[3];
- struct dn_heap *h;
- void *p; /* generic parameter to handler */
- int i;
-
- DUMMYNET_LOCK();
-
- heaps[0] = &ready_heap; /* fixed-rate queues */
- heaps[1] = &wfq_ready_heap; /* wfq queues */
- heaps[2] = &extract_heap; /* delay line */
-
- /* Update number of lost(coalesced) ticks. */
- tick_lost += pending - 1;
-
- getmicrouptime(&t);
- /* Last tick duration (usec). */
- tick_last = (t.tv_sec - prev_t.tv_sec) * 1000000 +
- (t.tv_usec - prev_t.tv_usec);
- /* Last tick vs standard tick difference (usec). */
- tick_delta = (tick_last * hz - 1000000) / hz;
- /* Accumulated tick difference (usec). */
- tick_delta_sum += tick_delta;
-
- prev_t = t;
-
- /*
- * Adjust curr_time if accumulated tick difference greater than
- * 'standard' tick. Since curr_time should be monotonically increasing,
- * we do positive adjustment as required and throttle curr_time in
- * case of negative adjustment.
- */
- curr_time++;
- if (tick_delta_sum - tick >= 0) {
- int diff = tick_delta_sum / tick;
-
- curr_time += diff;
- tick_diff += diff;
- tick_delta_sum %= tick;
- tick_adjustment++;
- } else if (tick_delta_sum + tick <= 0) {
- curr_time--;
- tick_diff--;
- tick_delta_sum += tick;
- tick_adjustment++;
- }
-
- for (i = 0; i < 3; i++) {
- h = heaps[i];
- while (h->elements > 0 && DN_KEY_LEQ(h->p[0].key, curr_time)) {
- if (h->p[0].key > curr_time)
- printf("dummynet: warning, "
- "heap %d is %d ticks late\n",
- i, (int)(curr_time - h->p[0].key));
- /* store a copy before heap_extract */
- p = h->p[0].object;
- /* need to extract before processing */
- heap_extract(h, NULL);
- if (i == 0)
- ready_event(p, &head, &tail);
- else if (i == 1) {
- struct dn_pipe *pipe = p;
- if (pipe->if_name[0] != '\0')
- printf("dummynet: bad ready_event_wfq "
- "for pipe %s\n", pipe->if_name);
- else
- ready_event_wfq(p, &head, &tail);
- } else
- transmit_event(p, &head, &tail);
- }
- }
-
- /* Sweep pipes trying to expire idle flow_queues. */
- for (i = 0; i < HASHSIZE; i++)
- SLIST_FOREACH(pipe, &pipehash[i], next)
- if (pipe->idle_heap.elements > 0 &&
- DN_KEY_LT(pipe->idle_heap.p[0].key, pipe->V)) {
- struct dn_flow_queue *q =
- pipe->idle_heap.p[0].object;
-
- heap_extract(&(pipe->idle_heap), NULL);
- /* Mark timestamp as invalid. */
- q->S = q->F + 1;
- pipe->sum -= q->fs->weight;
- }
-
- DUMMYNET_UNLOCK();
-
- if (head != NULL)
- dummynet_send(head);
-
- callout_reset(&dn_timeout, 1, dummynet, NULL);
+ struct dn_fsk *fs;
+ int err = 0;
+
+ if (!locked)
+ DN_BH_WLOCK();
+ fs = dn_ht_find(dn_cfg.fshash, i, DNHT_REMOVE, NULL);
+ ND("fs %d found %p", i, fs);
+ if (fs) {
+ fsk_detach(fs, DN_DETACH | DN_DELETE_FS);
+ err = 0;
+ } else
+ err = EINVAL;
+ if (!locked)
+ DN_BH_WUNLOCK();
+ return err;
}
-static void
-dummynet_send(struct mbuf *m)
-{
- struct dn_pkt_tag *pkt;
- struct mbuf *n;
- struct ip *ip;
-
- for (; m != NULL; m = n) {
- n = m->m_nextpkt;
- m->m_nextpkt = NULL;
- pkt = dn_tag_get(m);
- switch (pkt->dn_dir) {
- case DN_TO_IP_OUT:
- ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL);
- break ;
- case DN_TO_IP_IN :
- ip = mtod(m, struct ip *);
- ip->ip_len = htons(ip->ip_len);
- ip->ip_off = htons(ip->ip_off);
- netisr_dispatch(NETISR_IP, m);
- break;
-#ifdef INET6
- case DN_TO_IP6_IN:
- netisr_dispatch(NETISR_IPV6, m);
- break;
+/*----- end of flowset hashtable support -------------*/
- case DN_TO_IP6_OUT:
- ip6_output(m, NULL, NULL, IPV6_FORWARDING, NULL, NULL, NULL);
- break;
-#endif
- case DN_TO_IFB_FWD:
- if (bridge_dn_p != NULL)
- ((*bridge_dn_p)(m, pkt->ifp));
- else
- printf("dummynet: if_bridge not loaded\n");
-
- break;
- case DN_TO_ETH_DEMUX:
- /*
- * The Ethernet code assumes the Ethernet header is
- * contiguous in the first mbuf header.
- * Insure this is true.
- */
- if (m->m_len < ETHER_HDR_LEN &&
- (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) {
- printf("dummynet/ether: pullup failed, "
- "dropping packet\n");
- break;
- }
- ether_demux(m->m_pkthdr.rcvif, m);
- break;
- case DN_TO_ETH_OUT:
- ether_output_frame(pkt->ifp, m);
- break;
-
- case DN_TO_DROP:
- /* drop the packet after some time */
- m_freem(m);
- break;
-
- default:
- printf("dummynet: bad switch %d!\n", pkt->dn_dir);
- m_freem(m);
- break;
- }
- }
+/*------------------------------------------------------------
+ * Scheduler hash. When searching by index we pass sched_nr,
+ * otherwise we pass struct dn_sch * which is the first field in
+ * struct dn_schk so we can cast between the two. We use this trick
+ * because in the create phase (but it should be fixed).
+ */
+static uint32_t
+schk_hash(uintptr_t key, int flags, void *_arg)
+{
+ uint32_t i = !(flags & DNHT_KEY_IS_OBJ) ? key : /* key is the number itself ... */
+ ((struct dn_schk *)key)->sch.sched_nr; /* ... or a pointer to an object holding it */
+ return ( (i>>8)^(i>>4)^i ); /* XOR shifted copies so higher bits affect the low ones */
 }
-/*
- * Unconditionally expire empty queues in case of shortage.
- * Returns the number of queues freed.
- */
static int
-expire_queues(struct dn_flow_set *fs)
-{
- struct dn_flow_queue *q, *prev ;
- int i, initial_elements = fs->rq_elements ;
-
- if (fs->last_expired == time_uptime)
- return 0 ;
- fs->last_expired = time_uptime ;
- for (i = 0 ; i <= fs->rq_size ; i++) /* last one is overflow */
- for (prev=NULL, q = fs->rq[i] ; q != NULL ; )
- if (!QUEUE_IS_IDLE(q)) {
- prev = q ;
- q = q->next ;
- } else { /* entry is idle, expire it */
- struct dn_flow_queue *old_q = q ;
-
- if (prev != NULL)
- prev->next = q = q->next ;
- else
- fs->rq[i] = q = q->next ;
- fs->rq_elements-- ;
- free(old_q, M_DUMMYNET);
- }
- return initial_elements - fs->rq_elements ;
+schk_match(void *obj, uintptr_t key, int flags, void *_arg)
+{
+ struct dn_schk *s = (struct dn_schk *)obj;
+ int i = !(flags & DNHT_KEY_IS_OBJ) ? key :
+ ((struct dn_schk *)key)->sch.sched_nr;
+ return (s->sch.sched_nr == i);
}
/*
- * If room, create a new queue and put at head of slot i;
- * otherwise, create or use the default queue.
+ * Create the entry and initialize with the sched hash if needed.
+ * Leave s->fp unset so we can tell whether a dn_ht_find() returns
+ * a new object or a previously existing one.
*/
-static struct dn_flow_queue *
-create_queue(struct dn_flow_set *fs, int i)
+static void *
+schk_new(uintptr_t key, int flags, void *arg)
{
- struct dn_flow_queue *q;
-
- if (fs->rq_elements > fs->rq_size * dn_max_ratio &&
- expire_queues(fs) == 0) {
- /* No way to get room, use or create overflow queue. */
- i = fs->rq_size;
- if (fs->rq[i] != NULL)
- return fs->rq[i];
- }
- q = malloc(sizeof(*q), M_DUMMYNET, M_NOWAIT | M_ZERO);
- if (q == NULL) {
- printf("dummynet: sorry, cannot allocate queue for new flow\n");
- return (NULL);
+ struct schk_new_arg *a = arg;
+ struct dn_schk *s;
+ int l = sizeof(*s) +a->fp->schk_datalen;
+
+ s = malloc(l, M_DUMMYNET, M_NOWAIT | M_ZERO);
+ if (s == NULL)
+ return NULL;
+ set_oid(&s->link.oid, DN_LINK, sizeof(s->link));
+ s->sch = *a->sch; // copy initial values
+ s->link.link_nr = s->sch.sched_nr;
+ SLIST_INIT(&s->fsk_list);
+ /* initialize the hash table or create the single instance */
+ s->fp = a->fp; /* si_new needs this */
+ s->drain_bucket = 0;
+ if (s->sch.flags & DN_HAVE_MASK) {
+ s->siht = dn_ht_init(NULL, s->sch.buckets,
+ offsetof(struct dn_sch_inst, si_next),
+ si_hash, si_match, si_new);
+ if (s->siht == NULL) {
+ free(s, M_DUMMYNET);
+ return NULL;
+ }
}
- q->fs = fs;
- q->hash_slot = i;
- q->next = fs->rq[i];
- q->S = q->F + 1; /* hack - mark timestamp as invalid. */
- q->numbytes = fs->pipe->burst + (io_fast ? fs->pipe->bandwidth : 0);
- fs->rq[i] = q;
- fs->rq_elements++;
- return (q);
+ s->fp = NULL; /* mark as a new scheduler */
+ dn_cfg.schk_count++;
+ return s;
}
/*
- * Given a flow_set and a pkt in last_pkt, find a matching queue
- * after appropriate masking. The queue is moved to front
- * so that further searches take less time.
+ * Callback for sched delete. Notify all attached flowsets to
+ * detach from the scheduler, destroy the internal flowset, and
+ * all instances. The scheduler goes away too.
+ * arg is 0 (only detach flowsets and destroy instances)
+ * DN_DESTROY (detach & delete queues, delete schk)
+ * or DN_DELETE_FS (delete queues and flowsets, delete schk)
*/
-static struct dn_flow_queue *
-find_queue(struct dn_flow_set *fs, struct ipfw_flow_id *id)
-{
- int i = 0 ; /* we need i and q for new allocations */
- struct dn_flow_queue *q, *prev;
- int is_v6 = IS_IP6_FLOW_ID(id);
-
- if ( !(fs->flags_fs & DN_HAVE_FLOW_MASK) )
- q = fs->rq[0] ;
- else {
- /* first, do the masking, then hash */
- id->dst_port &= fs->flow_mask.dst_port ;
- id->src_port &= fs->flow_mask.src_port ;
- id->proto &= fs->flow_mask.proto ;
- id->flags = 0 ; /* we don't care about this one */
- if (is_v6) {
- APPLY_MASK(&id->dst_ip6, &fs->flow_mask.dst_ip6);
- APPLY_MASK(&id->src_ip6, &fs->flow_mask.src_ip6);
- id->flow_id6 &= fs->flow_mask.flow_id6;
-
- i = ((id->dst_ip6.__u6_addr.__u6_addr32[0]) & 0xffff)^
- ((id->dst_ip6.__u6_addr.__u6_addr32[1]) & 0xffff)^
- ((id->dst_ip6.__u6_addr.__u6_addr32[2]) & 0xffff)^
- ((id->dst_ip6.__u6_addr.__u6_addr32[3]) & 0xffff)^
-
- ((id->dst_ip6.__u6_addr.__u6_addr32[0] >> 15) & 0xffff)^
- ((id->dst_ip6.__u6_addr.__u6_addr32[1] >> 15) & 0xffff)^
- ((id->dst_ip6.__u6_addr.__u6_addr32[2] >> 15) & 0xffff)^
- ((id->dst_ip6.__u6_addr.__u6_addr32[3] >> 15) & 0xffff)^
-
- ((id->src_ip6.__u6_addr.__u6_addr32[0] << 1) & 0xfffff)^
- ((id->src_ip6.__u6_addr.__u6_addr32[1] << 1) & 0xfffff)^
- ((id->src_ip6.__u6_addr.__u6_addr32[2] << 1) & 0xfffff)^
- ((id->src_ip6.__u6_addr.__u6_addr32[3] << 1) & 0xfffff)^
-
- ((id->src_ip6.__u6_addr.__u6_addr32[0] << 16) & 0xffff)^
- ((id->src_ip6.__u6_addr.__u6_addr32[1] << 16) & 0xffff)^
- ((id->src_ip6.__u6_addr.__u6_addr32[2] << 16) & 0xffff)^
- ((id->src_ip6.__u6_addr.__u6_addr32[3] << 16) & 0xffff)^
-
- (id->dst_port << 1) ^ (id->src_port) ^
- (id->proto ) ^
- (id->flow_id6);
- } else {
- id->dst_ip &= fs->flow_mask.dst_ip ;
- id->src_ip &= fs->flow_mask.src_ip ;
-
- i = ( (id->dst_ip) & 0xffff ) ^
- ( (id->dst_ip >> 15) & 0xffff ) ^
- ( (id->src_ip << 1) & 0xffff ) ^
- ( (id->src_ip >> 16 ) & 0xffff ) ^
- (id->dst_port << 1) ^ (id->src_port) ^
- (id->proto );
- }
- i = i % fs->rq_size ;
- /* finally, scan the current list for a match */
- searches++ ;
- for (prev=NULL, q = fs->rq[i] ; q ; ) {
- search_steps++;
- if (is_v6 &&
- IN6_ARE_ADDR_EQUAL(&id->dst_ip6,&q->id.dst_ip6) &&
- IN6_ARE_ADDR_EQUAL(&id->src_ip6,&q->id.src_ip6) &&
- id->dst_port == q->id.dst_port &&
- id->src_port == q->id.src_port &&
- id->proto == q->id.proto &&
- id->flags == q->id.flags &&
- id->flow_id6 == q->id.flow_id6)
- break ; /* found */
-
- if (!is_v6 && id->dst_ip == q->id.dst_ip &&
- id->src_ip == q->id.src_ip &&
- id->dst_port == q->id.dst_port &&
- id->src_port == q->id.src_port &&
- id->proto == q->id.proto &&
- id->flags == q->id.flags)
- break ; /* found */
-
- /* No match. Check if we can expire the entry */
- if (pipe_expire && QUEUE_IS_IDLE(q)) {
- /* entry is idle and not in any heap, expire it */
- struct dn_flow_queue *old_q = q ;
-
- if (prev != NULL)
- prev->next = q = q->next ;
- else
- fs->rq[i] = q = q->next ;
- fs->rq_elements-- ;
- free(old_q, M_DUMMYNET);
- continue ;
- }
- prev = q ;
- q = q->next ;
- }
- if (q && prev != NULL) { /* found and not in front */
- prev->next = q->next ;
- q->next = fs->rq[i] ;
- fs->rq[i] = q ;
+static int
+schk_delete_cb(void *obj, void *arg)
+{
+ struct dn_schk *s = obj;
+#if 0
+ int a = (int)arg;
+ ND("sched %d arg %s%s",
+ s->sch.sched_nr,
+ a&DN_DESTROY ? "DEL ":"",
+ a&DN_DELETE_FS ? "DEL_FS":"");
+#endif
+ fsk_detach_list(&s->fsk_list, arg ? DN_DESTROY : 0);
+ /* no more flowset pointing to us now */
+ if (s->sch.flags & DN_HAVE_MASK)
+ dn_ht_scan(s->siht, si_destroy, NULL);
+ else if (s->siht)
+ si_destroy(s->siht, NULL);
+ if (s->profile) {
+ free(s->profile, M_DUMMYNET);
+ s->profile = NULL;
}
- }
- if (q == NULL) { /* no match, need to allocate a new entry */
- q = create_queue(fs, i);
- if (q != NULL)
- q->id = *id ;
- }
- return q ;
+ s->siht = NULL;
+ if (s->fp->destroy)
+ s->fp->destroy(s);
+ bzero(s, sizeof(*s)); // safety
+ free(obj, M_DUMMYNET);
+ dn_cfg.schk_count--;
+ return DNHT_SCAN_DEL;
}
+/*
+ * called on a 'sched X delete' command. Deletes a single scheduler.
+ * This is done by removing from the schedhash, unlinking all
+ * flowsets and deleting their traffic.
+ */
static int
-red_drops(struct dn_flow_set *fs, struct dn_flow_queue *q, int len)
+delete_schk(int i)
{
- /*
- * RED algorithm
- *
- * RED calculates the average queue size (avg) using a low-pass filter
- * with an exponential weighted (w_q) moving average:
- * avg <- (1-w_q) * avg + w_q * q_size
- * where q_size is the queue length (measured in bytes or * packets).
- *
- * If q_size == 0, we compute the idle time for the link, and set
- * avg = (1 - w_q)^(idle/s)
- * where s is the time needed for transmitting a medium-sized packet.
- *
- * Now, if avg < min_th the packet is enqueued.
- * If avg > max_th the packet is dropped. Otherwise, the packet is
- * dropped with probability P function of avg.
- */
-
- int64_t p_b = 0;
-
- /* Queue in bytes or packets? */
- u_int q_size = (fs->flags_fs & DN_QSIZE_IS_BYTES) ?
- q->len_bytes : q->len;
-
- DPRINTF(("\ndummynet: %d q: %2u ", (int)curr_time, q_size));
-
- /* Average queue size estimation. */
- if (q_size != 0) {
- /* Queue is not empty, avg <- avg + (q_size - avg) * w_q */
- int diff = SCALE(q_size) - q->avg;
- int64_t v = SCALE_MUL((int64_t)diff, (int64_t)fs->w_q);
+ struct dn_schk *s;
+
+ s = dn_ht_find(dn_cfg.schedhash, i, DNHT_REMOVE, NULL);
+ ND("%d %p", i, s);
+ if (!s)
+ return EINVAL;
+ delete_fs(i + DN_MAX_ID, 1); /* first delete internal fs */
+ /* then detach flowsets, delete traffic */
+ schk_delete_cb(s, (void*)(uintptr_t)DN_DESTROY);
+ return 0;
+}
+/*--- end of schk hashtable support ---*/
- q->avg += (int)v;
- } else {
- /*
- * Queue is empty, find for how long the queue has been
- * empty and use a lookup table for computing
- * (1 - * w_q)^(idle_time/s) where s is the time to send a
- * (small) packet.
- * XXX check wraps...
- */
- if (q->avg) {
- u_int t = (curr_time - q->idle_time) / fs->lookup_step;
+static int
+copy_obj(char **start, char *end, void *_o, const char *msg, int i)
+{
+ struct dn_id *o = _o;
+ int have = end - *start;
- q->avg = (t < fs->lookup_depth) ?
- SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0;
- }
- }
- DPRINTF(("dummynet: avg: %u ", SCALE_VAL(q->avg)));
-
- /* Should i drop? */
- if (q->avg < fs->min_th) {
- q->count = -1;
- return (0); /* accept packet */
- }
- if (q->avg >= fs->max_th) { /* average queue >= max threshold */
- if (fs->flags_fs & DN_IS_GENTLE_RED) {
- /*
- * According to Gentle-RED, if avg is greater than
- * max_th the packet is dropped with a probability
- * p_b = c_3 * avg - c_4
- * where c_3 = (1 - max_p) / max_th
- * c_4 = 1 - 2 * max_p
- */
- p_b = SCALE_MUL((int64_t)fs->c_3, (int64_t)q->avg) -
- fs->c_4;
- } else {
- q->count = -1;
- DPRINTF(("dummynet: - drop"));
- return (1);
- }
- } else if (q->avg > fs->min_th) {
- /*
- * We compute p_b using the linear dropping function
- * p_b = c_1 * avg - c_2
- * where c_1 = max_p / (max_th - min_th)
- * c_2 = max_p * min_th / (max_th - min_th)
- */
- p_b = SCALE_MUL((int64_t)fs->c_1, (int64_t)q->avg) - fs->c_2;
+ if (have < o->len || o->len == 0 || o->type == 0) {
+ D("(WARN) type %d %s %d have %d need %d",
+ o->type, msg, i, have, o->len);
+ return 1;
}
-
- if (fs->flags_fs & DN_QSIZE_IS_BYTES)
- p_b = (p_b * len) / fs->max_pkt_size;
- if (++q->count == 0)
- q->random = random() & 0xffff;
- else {
- /*
- * q->count counts packets arrived since last drop, so a greater
- * value of q->count means a greater packet drop probability.
- */
- if (SCALE_MUL(p_b, SCALE((int64_t)q->count)) > q->random) {
- q->count = 0;
- DPRINTF(("dummynet: - red drop"));
- /* After a drop we calculate a new random value. */
- q->random = random() & 0xffff;
- return (1); /* drop */
- }
+ ND("type %d %s %d len %d", o->type, msg, i, o->len);
+ bcopy(_o, *start, o->len);
+ if (o->type == DN_LINK) {
+ /* Adjust burst parameter for link */
+ struct dn_link *l = (struct dn_link *)*start;
+ l->burst = div64(l->burst, 8 * hz);
+ } else if (o->type == DN_SCH) {
+ /* Set id->id to the number of instances */
+ struct dn_schk *s = _o;
+ struct dn_id *id = (struct dn_id *)(*start);
+ id->id = (s->sch.flags & DN_HAVE_MASK) ?
+ dn_ht_entries(s->siht) : (s->siht ? 1 : 0);
}
- /* End of RED algorithm. */
-
- return (0); /* accept */
+ *start += o->len;
+ return 0;
}
-static __inline struct dn_flow_set *
-locate_flowset(int fs_nr)
+/* Specific function to copy a queue.
+ * Copies only the user-visible part of a queue (which is in
+ * a struct dn_flow), and sets len accordingly.
+ */
+static int
+copy_obj_q(char **start, char *end, void *_o, const char *msg, int i)
{
- struct dn_flow_set *fs;
-
- SLIST_FOREACH(fs, &flowsethash[HASH(fs_nr)], next)
- if (fs->fs_nr == fs_nr)
- return (fs);
-
- return (NULL);
+ struct dn_id *o = _o;
+ int have = end - *start;
+ int len = sizeof(struct dn_flow); /* see above comment */
+
+ if (have < len || o->len == 0 || o->type != DN_QUEUE) {
+ D("ERROR type %d %s %d have %d need %d",
+ o->type, msg, i, have, len);
+ return 1;
+ }
+ ND("type %d %s %d len %d", o->type, msg, i, len);
+ bcopy(_o, *start, len);
+ ((struct dn_id*)(*start))->len = len;
+ *start += len;
+ return 0;
}
-static __inline struct dn_pipe *
-locate_pipe(int pipe_nr)
+static int
+copy_q_cb(void *obj, void *arg)
{
- struct dn_pipe *pipe;
-
- SLIST_FOREACH(pipe, &pipehash[HASH(pipe_nr)], next)
- if (pipe->pipe_nr == pipe_nr)
- return (pipe);
-
- return (NULL);
+ struct dn_queue *q = obj;
+ struct copy_args *a = arg;
+ struct dn_flow *ni = (struct dn_flow *)(*a->start);
+ if (copy_obj_q(a->start, a->end, &q->ni, "queue", -1))
+ return DNHT_SCAN_END;
+ ni->oid.type = DN_FLOW; /* override the DN_QUEUE */
+ ni->oid.id = si_hash((uintptr_t)&ni->fid, 0, NULL);
+ return 0;
}
-/*
- * dummynet hook for packets. Below 'pipe' is a pipe or a queue
- * depending on whether WF2Q or fixed bw is used.
- *
- * pipe_nr pipe or queue the packet is destined for.
- * dir where shall we send the packet after dummynet.
- * m the mbuf with the packet
- * ifp the 'ifp' parameter from the caller.
- * NULL in ip_input, destination interface in ip_output,
- * rule matching rule, in case of multiple passes
- */
static int
-dummynet_io(struct mbuf **m0, int dir, struct ip_fw_args *fwa)
-{
- struct mbuf *m = *m0, *head = NULL, *tail = NULL;
- struct dn_pkt_tag *pkt;
- struct m_tag *mtag;
- struct dn_flow_set *fs = NULL;
- struct dn_pipe *pipe;
- uint64_t len = m->m_pkthdr.len;
- struct dn_flow_queue *q = NULL;
- int is_pipe;
- ipfw_insn *cmd = ACTION_PTR(fwa->rule);
-
- KASSERT(m->m_nextpkt == NULL,
- ("dummynet_io: mbuf queue passed to dummynet"));
-
- if (cmd->opcode == O_LOG)
- cmd += F_LEN(cmd);
- if (cmd->opcode == O_ALTQ)
- cmd += F_LEN(cmd);
- if (cmd->opcode == O_TAG)
- cmd += F_LEN(cmd);
- is_pipe = (cmd->opcode == O_PIPE);
-
- DUMMYNET_LOCK();
- io_pkt++;
- /*
- * This is a dummynet rule, so we expect an O_PIPE or O_QUEUE rule.
- *
- * XXXGL: probably the pipe->fs and fs->pipe logic here
- * below can be simplified.
- */
- if (is_pipe) {
- pipe = locate_pipe(fwa->cookie);
- if (pipe != NULL)
- fs = &(pipe->fs);
- } else
- fs = locate_flowset(fwa->cookie);
-
- if (fs == NULL)
- goto dropit; /* This queue/pipe does not exist! */
- pipe = fs->pipe;
- if (pipe == NULL) { /* Must be a queue, try find a matching pipe. */
- pipe = locate_pipe(fs->parent_nr);
- if (pipe != NULL)
- fs->pipe = pipe;
- else {
- printf("dummynet: no pipe %d for queue %d, drop pkt\n",
- fs->parent_nr, fs->fs_nr);
- goto dropit;
- }
- }
- q = find_queue(fs, &(fwa->f_id));
- if (q == NULL)
- goto dropit; /* Cannot allocate queue. */
-
- /* Update statistics, then check reasons to drop pkt. */
- q->tot_bytes += len;
- q->tot_pkts++;
- if (fs->plr && random() < fs->plr)
- goto dropit; /* Random pkt drop. */
- if (fs->flags_fs & DN_QSIZE_IS_BYTES) {
- if (q->len_bytes > fs->qsize)
- goto dropit; /* Queue size overflow. */
- } else {
- if (q->len >= fs->qsize)
- goto dropit; /* Queue count overflow. */
- }
- if (fs->flags_fs & DN_IS_RED && red_drops(fs, q, len))
- goto dropit;
-
- /* XXX expensive to zero, see if we can remove it. */
- mtag = m_tag_get(PACKET_TAG_DUMMYNET,
- sizeof(struct dn_pkt_tag), M_NOWAIT | M_ZERO);
- if (mtag == NULL)
- goto dropit; /* Cannot allocate packet header. */
- m_tag_prepend(m, mtag); /* Attach to mbuf chain. */
-
- pkt = (struct dn_pkt_tag *)(mtag + 1);
- /*
- * Ok, i can handle the pkt now...
- * Build and enqueue packet + parameters.
- */
- pkt->rule = fwa->rule;
- pkt->rule_id = fwa->rule_id;
- pkt->chain_id = fwa->chain_id;
- pkt->dn_dir = dir;
-
- pkt->ifp = fwa->oif;
-
- if (q->head == NULL)
- q->head = m;
+copy_q(struct copy_args *a, struct dn_fsk *fs, int flags)
+{
+ if (!fs->qht)
+ return 0;
+ if (fs->fs.flags & DN_QHT_HASH)
+ dn_ht_scan(fs->qht, copy_q_cb, a);
else
- q->tail->m_nextpkt = m;
- q->tail = m;
- q->len++;
- q->len_bytes += len;
-
- if (q->head != m) /* Flow was not idle, we are done. */
- goto done;
-
- if (is_pipe) { /* Fixed rate queues. */
- if (q->idle_time < curr_time) {
- /* Calculate available burst size. */
- q->numbytes +=
- (curr_time - q->idle_time - 1) * pipe->bandwidth;
- if (q->numbytes > pipe->burst)
- q->numbytes = pipe->burst;
- if (io_fast)
- q->numbytes += pipe->bandwidth;
- }
- } else { /* WF2Q. */
- if (pipe->idle_time < curr_time) {
- /* Calculate available burst size. */
- pipe->numbytes +=
- (curr_time - pipe->idle_time - 1) * pipe->bandwidth;
- if (pipe->numbytes > 0 && pipe->numbytes > pipe->burst)
- pipe->numbytes = pipe->burst;
- if (io_fast)
- pipe->numbytes += pipe->bandwidth;
- }
- pipe->idle_time = curr_time;
- }
- /* Necessary for both: fixed rate & WF2Q queues. */
- q->idle_time = curr_time;
-
- /*
- * If we reach this point the flow was previously idle, so we need
- * to schedule it. This involves different actions for fixed-rate or
- * WF2Q queues.
- */
- if (is_pipe) {
- /* Fixed-rate queue: just insert into the ready_heap. */
- dn_key t = 0;
-
- if (pipe->bandwidth) {
- q->extra_bits = compute_extra_bits(m, pipe);
- t = set_ticks(m, q, pipe);
- }
- q->sched_time = curr_time;
- if (t == 0) /* Must process it now. */
- ready_event(q, &head, &tail);
- else
- heap_insert(&ready_heap, curr_time + t , q);
- } else {
- /*
- * WF2Q. First, compute start time S: if the flow was
- * idle (S = F + 1) set S to the virtual time V for the
- * controlling pipe, and update the sum of weights for the pipe;
- * otherwise, remove flow from idle_heap and set S to max(F,V).
- * Second, compute finish time F = S + len / weight.
- * Third, if pipe was idle, update V = max(S, V).
- * Fourth, count one more backlogged flow.
- */
- if (DN_KEY_GT(q->S, q->F)) { /* Means timestamps are invalid. */
- q->S = pipe->V;
- pipe->sum += fs->weight; /* Add weight of new queue. */
- } else {
- heap_extract(&(pipe->idle_heap), q);
- q->S = MAX64(q->F, pipe->V);
- }
- q->F = q->S + (len << MY_M) / (uint64_t)fs->weight;
-
- if (pipe->not_eligible_heap.elements == 0 &&
- pipe->scheduler_heap.elements == 0)
- pipe->V = MAX64(q->S, pipe->V);
- fs->backlogged++;
- /*
- * Look at eligibility. A flow is not eligibile if S>V (when
- * this happens, it means that there is some other flow already
- * scheduled for the same pipe, so the scheduler_heap cannot be
- * empty). If the flow is not eligible we just store it in the
- * not_eligible_heap. Otherwise, we store in the scheduler_heap
- * and possibly invoke ready_event_wfq() right now if there is
- * leftover credit.
- * Note that for all flows in scheduler_heap (SCH), S_i <= V,
- * and for all flows in not_eligible_heap (NEH), S_i > V.
- * So when we need to compute max(V, min(S_i)) forall i in
- * SCH+NEH, we only need to look into NEH.
- */
- if (DN_KEY_GT(q->S, pipe->V)) { /* Not eligible. */
- if (pipe->scheduler_heap.elements == 0)
- printf("dummynet: ++ ouch! not eligible but empty scheduler!\n");
- heap_insert(&(pipe->not_eligible_heap), q->S, q);
- } else {
- heap_insert(&(pipe->scheduler_heap), q->F, q);
- if (pipe->numbytes >= 0) { /* Pipe is idle. */
- if (pipe->scheduler_heap.elements != 1)
- printf("dummynet: OUCH! pipe should have been idle!\n");
- DPRINTF(("dummynet: waking up pipe %d at %d\n",
- pipe->pipe_nr, (int)(q->F >> MY_M)));
- pipe->sched_time = curr_time;
- ready_event_wfq(pipe, &head, &tail);
- }
- }
- }
-done:
- if (head == m && dir != DN_TO_IFB_FWD && dir != DN_TO_ETH_DEMUX &&
- dir != DN_TO_ETH_OUT) { /* Fast io. */
- io_pkt_fast++;
- if (m->m_nextpkt != NULL)
- printf("dummynet: fast io: pkt chain detected!\n");
- head = m->m_nextpkt = NULL;
- } else
- *m0 = NULL; /* Normal io. */
-
- DUMMYNET_UNLOCK();
- if (head != NULL)
- dummynet_send(head);
- return (0);
-
-dropit:
- io_pkt_drop++;
- if (q)
- q->drops++;
- DUMMYNET_UNLOCK();
- m_freem(m);
- *m0 = NULL;
- return ((fs && (fs->flags_fs & DN_NOERROR)) ? 0 : ENOBUFS);
+ copy_q_cb(fs->qht, a);
+ return 0;
}
/*
- * Below, the rt_unref is only needed when (pkt->dn_dir == DN_TO_IP_OUT)
- * Doing this would probably save us the initial bzero of dn_pkt
+ * This routine only copies the initial part of a profile ? XXX
+ *
+ * Appends profile p to the buffer described by a. Only the first
+ * sizeof(struct dn_profile) - ED_MAX_SAMPLES_NO*sizeof(int) bytes are
+ * copied, and the len field at the start of the copy is set to that
+ * size. A NULL p is not an error: nothing is copied and 0 is returned.
+ * Returns 1 if the buffer is too small, 0 otherwise.
*/
-#define DN_FREE_PKT(_m) do { \
- m_freem(_m); \
-} while (0)
-
-/*
- * Dispose all packets and flow_queues on a flow_set.
- * If all=1, also remove red lookup table and other storage,
- * including the descriptor itself.
- * For the one in dn_pipe MUST also cleanup ready_heap...
- */
-static void
-purge_flow_set(struct dn_flow_set *fs, int all)
+static int
+copy_profile(struct copy_args *a, struct dn_profile *p)
{
- struct dn_flow_queue *q, *qn;
- int i;
-
- DUMMYNET_LOCK_ASSERT();
-
- for (i = 0; i <= fs->rq_size; i++) {
- for (q = fs->rq[i]; q != NULL; q = qn) {
- struct mbuf *m, *mnext;
+ int have = a->end - *a->start; /* bytes still free in the buffer */
+ /* XXX here we check for max length */
+ int profile_len = sizeof(struct dn_profile) -
+ ED_MAX_SAMPLES_NO*sizeof(int);
- mnext = q->head;
- while ((m = mnext) != NULL) {
- mnext = m->m_nextpkt;
- DN_FREE_PKT(m);
- }
- qn = q->next;
- free(q, M_DUMMYNET);
- }
- fs->rq[i] = NULL;
+ if (p == NULL)
+ return 0;
+ if (have < profile_len) {
+ D("error have %d need %d", have, profile_len);
+ return 1;
}
+ bcopy(p, *a->start, profile_len);
+ ((struct dn_id *)(*a->start))->len = profile_len; /* advertise the shortened length in the copy */
+ *a->start += profile_len;
+ return 0;
+}
- fs->rq_elements = 0;
- if (all) {
- /* RED - free lookup table. */
- if (fs->w_q_lookup != NULL)
- free(fs->w_q_lookup, M_DUMMYNET);
- if (fs->rq != NULL)
- free(fs->rq, M_DUMMYNET);
- /* If this fs is not part of a pipe, free it. */
- if (fs->pipe == NULL || fs != &(fs->pipe->fs))
- free(fs, M_DUMMYNET);
+static int
+copy_flowset(struct copy_args *a, struct dn_fsk *fs, int flags)
+{ /*
+ * Append the struct dn_fs embedded in fs to the buffer described
+ * by a. A NULL fs is skipped and reported as success.
+ * The oid.id of the copy is overwritten with a queue count:
+ * dn_ht_entries(fs->qht) when DN_QHT_HASH is set, otherwise 1 or 0
+ * depending on whether fs->qht is set.
+ * A non-zero flags also appends the queues through copy_q().
+ * Returns DNHT_SCAN_END if copy_obj() fails, 0 otherwise.
+ */
+ struct dn_fs *ufs = (struct dn_fs *)(*a->start); /* will address the copy once copy_obj() succeeds */
+ if (!fs)
+ return 0;
+ ND("flowset %d", fs->fs.fs_nr);
+ if (copy_obj(a->start, a->end, &fs->fs, "flowset", fs->fs.fs_nr))
+ return DNHT_SCAN_END;
+ ufs->oid.id = (fs->fs.flags & DN_QHT_HASH) ?
+ dn_ht_entries(fs->qht) : (fs->qht ? 1 : 0);
+ if (flags) { /* copy queues */
+ copy_q(a, fs, 0); /* result not checked */
}
+ return 0;
}
-/*
- * Dispose all packets queued on a pipe (not a flow_set).
- * Also free all resources associated to a pipe, which is about
- * to be deleted.
- */
-static void
-purge_pipe(struct dn_pipe *pipe)
+static int
+copy_si_cb(void *obj, void *arg)
{ /*
+ * Export one scheduler instance: obj is the struct dn_sch_inst,
+ * arg the struct copy_args describing the output buffer.
+ * copy_si() hands this function to dn_ht_scan() and also calls it
+ * directly. Returns DNHT_SCAN_END if the copy fails, 0 otherwise.
+ */
- struct mbuf *m, *mnext;
-
- purge_flow_set( &(pipe->fs), 1 );
-
- mnext = pipe->head;
- while ((m = mnext) != NULL) {
- mnext = m->m_nextpkt;
- DN_FREE_PKT(m);
- }
+ struct dn_sch_inst *si = obj;
+ struct copy_args *a = arg;
+ struct dn_flow *ni = (struct dn_flow *)(*a->start); /* taken before the copy advances *a->start, so it addresses the copy */
+ if (copy_obj(a->start, a->end, &si->ni, "inst",
+ si->sched->sch.sched_nr))
+ return DNHT_SCAN_END;
+ ni->oid.type = DN_FLOW; /* override the DN_SCH_I */
+ ni->oid.id = si_hash((uintptr_t)si, DNHT_KEY_IS_OBJ, NULL); /* id of the copy is the hash of the instance */
+ return 0;
+}
- heap_free( &(pipe->scheduler_heap) );
- heap_free( &(pipe->not_eligible_heap) );
- heap_free( &(pipe->idle_heap) );
+static int
+copy_si(struct copy_args *a, struct dn_schk *s, int flags)
+{ /*
+ * Export the scheduler instance(s) of s through copy_si_cb().
+ * With DN_HAVE_MASK set, s->siht is scanned as a hash table;
+ * otherwise a non-NULL s->siht is passed to the callback as the
+ * single instance. flags is unused and the result of the
+ * scan/callback is not checked: the function always returns 0.
+ */
+ if (s->sch.flags & DN_HAVE_MASK)
+ dn_ht_scan(s->siht, copy_si_cb, a);
+ else if (s->siht)
+ copy_si_cb(s->siht, a);
+ return 0;
}
/*
- * Delete all pipes and heaps returning memory. Must also
- * remove references from all ipfw rules to all pipes.
+ * compute a list of children of a scheduler and copy up
+ *
+ * Emits one object made of a struct dn_id header (type DN_TEXT)
+ * followed by the fs_nr, as uint32_t, of every flowset on
+ * s->fsk_list whose number is below DN_MAX_ID. flags is unused.
+ * Returns DNHT_SCAN_END if the buffer is too small, 0 otherwise.
+ * NOTE(review): only len and type of the header are written here;
+ * the other dn_id fields keep whatever the buffer held -- presumably
+ * the caller zeroes the buffer, to be confirmed.
*/
-static void
-dummynet_flush(void)
+static int
+copy_fsk_list(struct copy_args *a, struct dn_schk *s, int flags)
{
- struct dn_pipe *pipe, *pipe1;
- struct dn_flow_set *fs, *fs1;
- int i;
-
- DUMMYNET_LOCK();
- /* Free heaps so we don't have unwanted events. */
- heap_free(&ready_heap);
- heap_free(&wfq_ready_heap);
- heap_free(&extract_heap);
+ struct dn_fsk *fs;
+ struct dn_id *o;
+ uint32_t *p;
+
+ int n = 0, space = sizeof(*o);
+ SLIST_FOREACH(fs, &s->fsk_list, sch_chain) { /* first pass: count the entries to size the object */
+ if (fs->fs.fs_nr < DN_MAX_ID)
+ n++;
+ }
+ space += n * sizeof(uint32_t);
+ DX(3, "sched %d has %d flowsets", s->sch.sched_nr, n);
+ if (a->end - *(a->start) < space)
+ return DNHT_SCAN_END;
+ o = (struct dn_id *)(*(a->start));
+ o->len = space;
+ *a->start += o->len;
+ o->type = DN_TEXT;
+ p = (uint32_t *)(o+1); /* payload starts right after the header */
+ SLIST_FOREACH(fs, &s->fsk_list, sch_chain) /* second pass: same filter as the count above */
+ if (fs->fs.fs_nr < DN_MAX_ID)
+ *p++ = fs->fs.fs_nr;
+ return 0;
+}
- /*
- * Now purge all queued pkts and delete all pipes.
- *
- * XXXGL: can we merge the for(;;) cycles into one or not?
- */
- for (i = 0; i < HASHSIZE; i++)
- SLIST_FOREACH_SAFE(fs, &flowsethash[i], next, fs1) {
- SLIST_REMOVE(&flowsethash[i], fs, dn_flow_set, next);
- purge_flow_set(fs, 1);
+static int
+copy_data_helper(void *_o, void *_arg)
+{ /*
+ * Decide whether object _o must be exported and, if so, copy it
+ * (and the related objects selected by a->flags) to the buffer
+ * described by _arg, a struct copy_args.
+ * a->type tells what _o is: a struct dn_schk for DN_LINK/DN_SCH,
+ * a struct dn_fsk for DN_FS; any other type is ignored.
+ * a->extra carries pairs of uint32_t [low, high] ranges, and an
+ * object is exported only if its number falls in one of them.
+ * Returns DNHT_SCAN_END as soon as a checked copy fails,
+ * 0 otherwise (including when the object is skipped).
+ */
+ struct copy_args *a = _arg;
+ uint32_t *r = a->extra->r; /* start of first range */
+ uint32_t *lim; /* first invalid pointer */
+ int n;
+
+ lim = (uint32_t *)((char *)(a->extra) + a->extra->o.len);
+
+ if (a->type == DN_LINK || a->type == DN_SCH) {
+ /* pipe|sched show, we receive a dn_schk */
+ struct dn_schk *s = _o;
+
+ n = s->sch.sched_nr;
+ if (a->type == DN_SCH && n >= DN_MAX_ID)
+ return 0; /* not a scheduler */
+ if (a->type == DN_LINK && n <= DN_MAX_ID)
+ return 0; /* not a pipe */
+
+ /* see if the object is within one of our ranges */
+ for (;r < lim; r += 2) { /* r[0] is the low end, r[1] the high end of a range */
+ if (n < r[0] || n > r[1])
+ continue;
+ /* Found a valid entry, copy and we are done */
+ if (a->flags & DN_C_LINK) {
+ if (copy_obj(a->start, a->end,
+ &s->link, "link", n))
+ return DNHT_SCAN_END;
+ if (copy_profile(a, s->profile))
+ return DNHT_SCAN_END;
+ if (copy_flowset(a, s->fs, 0))
+ return DNHT_SCAN_END;
+ }
+ if (a->flags & DN_C_SCH) {
+ if (copy_obj(a->start, a->end,
+ &s->sch, "sched", n))
+ return DNHT_SCAN_END;
+ /* list all attached flowsets */
+ if (copy_fsk_list(a, s, 0))
+ return DNHT_SCAN_END;
+ }
+ if (a->flags & DN_C_FLOW)
+ copy_si(a, s, 0); /* result not checked */
+ break;
}
- for (i = 0; i < HASHSIZE; i++)
- SLIST_FOREACH_SAFE(pipe, &pipehash[i], next, pipe1) {
- SLIST_REMOVE(&pipehash[i], pipe, dn_pipe, next);
- purge_pipe(pipe);
- free_pipe(pipe);
+ } else if (a->type == DN_FS) {
+ /* queue show, skip internal flowsets */
+ struct dn_fsk *fs = _o;
+
+ n = fs->fs.fs_nr;
+ if (n >= DN_MAX_ID)
+ return 0;
+ /* see if the object is within one of our ranges */
+ for (;r < lim; r += 2) {
+ if (n < r[0] || n > r[1])
+ continue;
+ if (copy_flowset(a, fs, 0))
+ return DNHT_SCAN_END;
+ copy_q(a, fs, 0); /* result not checked */
+ break; /* we are done */
}
- DUMMYNET_UNLOCK();
+ }
+ return 0;
+}
+
+static inline struct dn_schk *
+locate_scheduler(int i)
+{ /* Look up scheduler number i in dn_cfg.schedhash; the lookup passes flags 0, so nothing is inserted or removed. */
+ return dn_ht_find(dn_cfg.schedhash, i, 0, NULL);
}
/*
- * setup RED parameters
+ * Compute the RED (random early detection) state of flowset fs from
+ * its user-supplied configuration in fs->fs.
+ * red parameters are in fixed point arithmetic.
+ *
+ * Derives lookup_step and lookup_weight from w_q and from the
+ * bandwidth of the link of fs->sched, sets the scaled thresholds and
+ * the c_1..c_4 coefficients, and rebuilds the w_q_lookup table.
+ *
+ * Returns 0 on success, EINVAL if w_q is 0, if gentle RED is requested
+ * with max_th 0, or if dn_cfg.red_lookup_depth is 0, and ENOSPC if the
+ * lookup table cannot be allocated. On failure DN_IS_RED and
+ * DN_IS_GENTLE_RED are cleared from fs->fs.flags and no lookup table
+ * is left attached to fs.
*/
static int
-config_red(struct dn_flow_set *p, struct dn_flow_set *x)
+config_red(struct dn_fsk *fs)
{
- int i;
-
- x->w_q = p->w_q;
- x->min_th = SCALE(p->min_th);
- x->max_th = SCALE(p->max_th);
- x->max_p = p->max_p;
-
- x->c_1 = p->max_p / (p->max_th - p->min_th);
- x->c_2 = SCALE_MUL(x->c_1, SCALE(p->min_th));
-
- if (x->flags_fs & DN_IS_GENTLE_RED) {
- x->c_3 = (SCALE(1) - p->max_p) / p->max_th;
- x->c_4 = SCALE(1) - 2 * p->max_p;
+ int64_t s, idle, weight, w0;
+ int t, i;
+
+ fs->w_q = fs->fs.w_q;
+ fs->max_p = fs->fs.max_p;
+ D("called");
+ /*
+ * w_q and, for gentle RED, max_th are used as divisors below:
+ * refuse a zero value here instead of faulting on the division.
+ */
+ if (fs->w_q == 0 ||
+ ((fs->fs.flags & DN_IS_GENTLE_RED) && fs->fs.max_th == 0)) {
+ D("invalid RED parameters");
+ if (fs->w_q_lookup) {
+ free(fs->w_q_lookup, M_DUMMYNET);
+ fs->w_q_lookup = NULL;
+ }
+ fs->fs.flags &= ~DN_IS_RED;
+ fs->fs.flags &= ~DN_IS_GENTLE_RED;
+ return (EINVAL);
+ }
+ /* Doing stuff that was in userland */
+ i = fs->sched->link.bandwidth;
+ s = (i <= 0) ? 0 :
+ hz * dn_cfg.red_avg_pkt_size * 8 * SCALE(1) / i;
+
+ idle = div64((s * 3) , fs->w_q); /* s, fs->w_q scaled; idle not scaled */
+ /*
+ * A zero red_lookup_depth is reported further down, once the old
+ * table has been released; just avoid dividing by it here.
+ */
+ fs->lookup_step = (dn_cfg.red_lookup_depth == 0) ? 0 :
+ div64(idle , dn_cfg.red_lookup_depth);
+ /* fs->lookup_step not scaled, */
+ if (!fs->lookup_step)
+ fs->lookup_step = 1;
+ w0 = weight = SCALE(1) - fs->w_q; //fs->w_q scaled
+
+ for (t = fs->lookup_step; t > 1; --t)
+ weight = SCALE_MUL(weight, w0);
+ fs->lookup_weight = (int)(weight); // scaled
+
+ /* Now doing stuff that was in kernel land */
+ fs->min_th = SCALE(fs->fs.min_th);
+ fs->max_th = SCALE(fs->fs.max_th);
+
+ /* equal thresholds leave no interval to spread max_p over */
+ if (fs->fs.max_th == fs->fs.min_th)
+ fs->c_1 = fs->max_p;
+ else
+ fs->c_1 = fs->max_p / (fs->fs.max_th - fs->fs.min_th);
+ fs->c_2 = SCALE_MUL(fs->c_1, SCALE(fs->fs.min_th));
+
+ if (fs->fs.flags & DN_IS_GENTLE_RED) {
+ /* max_th != 0 was checked on entry */
+ fs->c_3 = (SCALE(1) - fs->max_p) / fs->fs.max_th;
+ fs->c_4 = SCALE(1) - 2 * fs->max_p;
}
/* If the lookup table already exist, free and create it again. */
- if (x->w_q_lookup) {
- free(x->w_q_lookup, M_DUMMYNET);
- x->w_q_lookup = NULL;
+ if (fs->w_q_lookup) {
+ free(fs->w_q_lookup, M_DUMMYNET);
+ fs->w_q_lookup = NULL;
}
- if (red_lookup_depth == 0) {
- printf("\ndummynet: net.inet.ip.dummynet.red_lookup_depth"
- "must be > 0\n");
+ if (dn_cfg.red_lookup_depth == 0) {
+ /* note the blank closing the first literal: the two are concatenated */
+ printf("\ndummynet: net.inet.ip.dummynet.red_lookup_depth "
+ "must be > 0\n");
- free(x, M_DUMMYNET);
+ fs->fs.flags &= ~DN_IS_RED;
+ fs->fs.flags &= ~DN_IS_GENTLE_RED;
return (EINVAL);
}
- x->lookup_depth = red_lookup_depth;
- x->w_q_lookup = (u_int *)malloc(x->lookup_depth * sizeof(int),
+ fs->lookup_depth = dn_cfg.red_lookup_depth;
+ fs->w_q_lookup = (u_int *)malloc(fs->lookup_depth * sizeof(int),
M_DUMMYNET, M_NOWAIT);
- if (x->w_q_lookup == NULL) {
+ if (fs->w_q_lookup == NULL) {
printf("dummynet: sorry, cannot allocate red lookup table\n");
- free(x, M_DUMMYNET);
+ fs->fs.flags &= ~DN_IS_RED;
+ fs->fs.flags &= ~DN_IS_GENTLE_RED;
return(ENOSPC);
}
/* Fill the lookup table with (1 - w_q)^x */
- x->lookup_step = p->lookup_step;
- x->lookup_weight = p->lookup_weight;
- x->w_q_lookup[0] = SCALE(1) - x->w_q;
-
- for (i = 1; i < x->lookup_depth; i++)
- x->w_q_lookup[i] =
- SCALE_MUL(x->w_q_lookup[i - 1], x->lookup_weight);
+ fs->w_q_lookup[0] = SCALE(1) - fs->w_q;
+
+ for (i = 1; i < fs->lookup_depth; i++)
+ fs->w_q_lookup[i] =
+ SCALE_MUL(fs->w_q_lookup[i - 1], fs->lookup_weight);
+
+ /* out-of-range sysctl values are replaced by defaults, globally */
+ if (dn_cfg.red_avg_pkt_size < 1)
+ dn_cfg.red_avg_pkt_size = 512;
+ fs->avg_pkt_size = dn_cfg.red_avg_pkt_size;
+ if (dn_cfg.red_max_pkt_size < 1)
+ dn_cfg.red_max_pkt_size = 1500;
+ fs->max_pkt_size = dn_cfg.red_max_pkt_size;
+ D("exit");
+ return 0;
+}
- if (red_avg_pkt_size < 1)
- red_avg_pkt_size = 512;
- x->avg_pkt_size = red_avg_pkt_size;
- if (red_max_pkt_size < 1)
- red_max_pkt_size = 1500;
- x->max_pkt_size = red_max_pkt_size;
- return (0);
+/* Scan all flowsets attached to scheduler s (its fsk_list) and
+ * recompute the RED state, through config_red(), of those that have
+ * DN_IS_RED set. The result of config_red() is not checked.
+ */
+static void
+update_red(struct dn_schk *s)
+{
+ struct dn_fsk *fs;
+ SLIST_FOREACH(fs, &s->fsk_list, sch_chain) {
+ if (fs && (fs->fs.flags & DN_IS_RED))
+ config_red(fs);
+ }
}
-static int
-alloc_hash(struct dn_flow_set *x, struct dn_flow_set *pfs)
-{
- if (x->flags_fs & DN_HAVE_FLOW_MASK) { /* allocate some slots */
- int l = pfs->rq_size;
-
- if (l == 0)
- l = dn_hash_size;
- if (l < 4)
- l = 4;
- else if (l > DN_MAX_HASH_SIZE)
- l = DN_MAX_HASH_SIZE;
- x->rq_size = l;
- } else /* one is enough for null mask */
- x->rq_size = 1;
- x->rq = malloc((1 + x->rq_size) * sizeof(struct dn_flow_queue *),
- M_DUMMYNET, M_NOWAIT | M_ZERO);
- if (x->rq == NULL) {
- printf("dummynet: sorry, cannot allocate queue\n");
- return (ENOMEM);
- }
- x->rq_elements = 0;
- return 0 ;
+/* attach flowset to scheduler s, possibly requeue
+ *
+ * Moves fs from the list of unlinked flowsets (dn_cfg.fsu) to
+ * s->fsk_list, lets the scheduler type see it through its optional
+ * new_fsk hook, recomputes fsk_mask and the DN_QHT_HASH flag, and
+ * reconfigures RED when DN_IS_RED is set.
+ */
+static void
+fsk_attach(struct dn_fsk *fs, struct dn_schk *s)
+{
+ ND("remove fs %d from fsunlinked, link to sched %d",
+ fs->fs.fs_nr, s->sch.sched_nr);
+ SLIST_REMOVE(&dn_cfg.fsu, fs, dn_fsk, sch_chain);
+ fs->sched = s;
+ SLIST_INSERT_HEAD(&s->fsk_list, fs, sch_chain);
+ if (s->fp->new_fsk)
+ s->fp->new_fsk(fs);
+ /* XXX compute fsk_mask */
+ fs->fsk_mask = fs->fs.flow_mask;
+ if (fs->sched->sch.flags & DN_HAVE_MASK)
+ flow_id_or(&fs->sched->sch.sched_mask, &fs->fsk_mask);
+ if (fs->qht) {
+ /*
+ * we must drain qht according to the old
+ * type, and reinsert according to the new one.
+ * The requeue is complex -- in general we need to
+ * reclassify every single packet.
+ * For the time being, let's hope qht is never set
+ * when we reach this point.
+ */
+ D("XXX TODO requeue from fs %d to sch %d",
+ fs->fs.fs_nr, s->sch.sched_nr);
+ fs->qht = NULL; /* the old pointer is simply dropped here, nothing is released */
+ }
+ /* set the new type for qht */
+ if (nonzero_mask(&fs->fsk_mask))
+ fs->fs.flags |= DN_QHT_HASH;
+ else
+ fs->fs.flags &= ~DN_QHT_HASH;
+
+ /* XXX config_red() can fail... */
+ if (fs->fs.flags & DN_IS_RED)
+ config_red(fs);
}
+/* update all flowsets which may refer to this scheduler
+ *
+ * Walks the list of unlinked flowsets (dn_cfg.fsu) and attaches to s,
+ * through fsk_attach(), every flowset whose configured sched_nr
+ * matches s; the others are left on the list.
+ */
static void
-set_fs_parms(struct dn_flow_set *x, struct dn_flow_set *src)
-{
- x->flags_fs = src->flags_fs;
- x->qsize = src->qsize;
- x->plr = src->plr;
- x->flow_mask = src->flow_mask;
- if (x->flags_fs & DN_QSIZE_IS_BYTES) {
- if (x->qsize > pipe_byte_limit)
- x->qsize = 1024 * 1024;
- } else {
- if (x->qsize == 0)
- x->qsize = 50;
- if (x->qsize > pipe_slot_limit)
- x->qsize = 50;
+update_fs(struct dn_schk *s)
+{
+ struct dn_fsk *fs, *tmp;
+
+ SLIST_FOREACH_SAFE(fs, &dn_cfg.fsu, sch_chain, tmp) { /* _SAFE variant: fsk_attach() unlinks fs from this list */
+ if (s->sch.sched_nr != fs->fs.sched_nr) {
+ D("fs %d for sch %d not %d still unlinked",
+ fs->fs.fs_nr, fs->fs.sched_nr,
+ s->sch.sched_nr);
+ continue;
+ }
+ fsk_attach(fs, s);
}
- /* Configuring RED. */
- if (x->flags_fs & DN_IS_RED)
- config_red(src, x); /* XXX should check errors */
}
/*
- * Setup pipe or queue parameters.
+ * Configuration -- to preserve backward compatibility we use
+ * the following scheme (N is 65536)
+ * NUMBER SCHED LINK FLOWSET
+ * 1 .. N-1 (1)WFQ (2)WFQ (3)queue
+ * N+1 .. 2N-1 (4)FIFO (5)FIFO (6)FIFO for sched 1..N-1
+ * 2N+1 .. 3N-1 -- -- (7)FIFO for sched N+1..2N-1
+ *
+ * "pipe i config" configures #1, #2 and #3
+ * "sched i config" configures #1 and possibly #6
+ * "queue i config" configures #3
+ * #1 is configured with 'pipe i config' or 'sched i config'
+ * #2 is configured with 'pipe i config', and created if not
+ * existing with 'sched i config'
+ * #3 is configured with 'queue i config'
+ * #4 is automatically configured after #1, can only be FIFO
+ * #5 is automatically configured after #2
+ * #6 is automatically created when #1 is !MULTIQUEUE,
+ * and can be updated.
+ * #7 is automatically configured after #2
+ */
+
+/*
+ * configure a link (and its FIFO instance)
+ *
+ * p is the request: its oid.len must equal sizeof(*p) and link_nr
+ * must be in 1..DN_MAX_ID-1, otherwise EINVAL is returned.
+ * p->delay and p->burst are rescaled in place. arg is not used.
+ * The parameters are then stored, under DN_BH_WLOCK, in scheduler
+ * link_nr and in scheduler link_nr + DN_MAX_ID; both must already
+ * exist. Any profile attached to them is released.
+ * Returns 0 on success, EINVAL if either scheduler is missing.
+ * NOTE(review): when only the second lookup fails, the first
+ * scheduler has already been updated.
*/
static int
-config_pipe(struct dn_pipe *p)
+config_link(struct dn_link *p, struct dn_id *arg)
{
- struct dn_flow_set *pfs = &(p->fs);
- struct dn_flow_queue *q;
- int i, error;
+ int i;
+ if (p->oid.len != sizeof(*p)) {
+ D("invalid pipe len %d", p->oid.len);
+ return EINVAL;
+ }
+ i = p->link_nr;
+ if (i <= 0 || i >= DN_MAX_ID)
+ return EINVAL;
/*
* The config program passes parameters as follows:
* bw = bits/second (0 means no limits),
* delay = ms, must be translated into ticks.
* qsize = slots/bytes
+ * burst ???
*/
p->delay = (p->delay * hz) / 1000;
/* Scale burst size: bytes -> bits * hz */
p->burst *= 8 * hz;
- /* We need either a pipe number or a flow_set number. */
- if (p->pipe_nr == 0 && pfs->fs_nr == 0)
- return (EINVAL);
- if (p->pipe_nr != 0 && pfs->fs_nr != 0)
- return (EINVAL);
- if (p->pipe_nr != 0) { /* this is a pipe */
- struct dn_pipe *pipe;
-
- DUMMYNET_LOCK();
- pipe = locate_pipe(p->pipe_nr); /* locate pipe */
-
- if (pipe == NULL) { /* new pipe */
- pipe = malloc(sizeof(struct dn_pipe), M_DUMMYNET,
- M_NOWAIT | M_ZERO);
- if (pipe == NULL) {
- DUMMYNET_UNLOCK();
- printf("dummynet: no memory for new pipe\n");
- return (ENOMEM);
- }
- pipe->pipe_nr = p->pipe_nr;
- pipe->fs.pipe = pipe;
- /*
- * idle_heap is the only one from which
- * we extract from the middle.
- */
- pipe->idle_heap.size = pipe->idle_heap.elements = 0;
- pipe->idle_heap.offset =
- offsetof(struct dn_flow_queue, heap_pos);
- } else
- /* Flush accumulated credit for all queues. */
- for (i = 0; i <= pipe->fs.rq_size; i++)
- for (q = pipe->fs.rq[i]; q; q = q->next) {
- q->numbytes = p->burst +
- (io_fast ? p->bandwidth : 0);
- }
-
- pipe->bandwidth = p->bandwidth;
- pipe->burst = p->burst;
- pipe->numbytes = pipe->burst + (io_fast ? pipe->bandwidth : 0);
- bcopy(p->if_name, pipe->if_name, sizeof(p->if_name));
- pipe->ifp = NULL; /* reset interface ptr */
- pipe->delay = p->delay;
- set_fs_parms(&(pipe->fs), pfs);
-
- /* Handle changes in the delay profile. */
- if (p->samples_no > 0) {
- if (pipe->samples_no != p->samples_no) {
- if (pipe->samples != NULL)
- free(pipe->samples, M_DUMMYNET);
- pipe->samples =
- malloc(p->samples_no*sizeof(dn_key),
- M_DUMMYNET, M_NOWAIT | M_ZERO);
- if (pipe->samples == NULL) {
- DUMMYNET_UNLOCK();
- printf("dummynet: no memory "
- "for new samples\n");
- return (ENOMEM);
- }
- pipe->samples_no = p->samples_no;
- }
- strncpy(pipe->name,p->name,sizeof(pipe->name));
- pipe->loss_level = p->loss_level;
- for (i = 0; i<pipe->samples_no; ++i)
- pipe->samples[i] = p->samples[i];
- } else if (pipe->samples != NULL) {
- free(pipe->samples, M_DUMMYNET);
- pipe->samples = NULL;
- pipe->samples_no = 0;
- }
+ DN_BH_WLOCK();
+ /* do it twice, base link and FIFO link */
+ for (; i < 2*DN_MAX_ID; i += DN_MAX_ID) {
+ struct dn_schk *s = locate_scheduler(i);
+ if (s == NULL) {
+ DN_BH_WUNLOCK();
+ D("sched %d not found", i);
+ return EINVAL;
+ }
+ /* remove profile if exists */
+ if (s->profile) {
+ free(s->profile, M_DUMMYNET);
+ s->profile = NULL;
+ }
+ /* copy all parameters */
+ s->link.oid = p->oid;
+ s->link.link_nr = i;
+ s->link.delay = p->delay;
+ if (s->link.bandwidth != p->bandwidth) {
+ /* XXX bandwidth changes, need to update red params */
+ s->link.bandwidth = p->bandwidth;
+ update_red(s);
+ }
+ s->link.burst = p->burst;
+ schk_reset_credit(s);
+ }
+ dn_cfg.id++;
+ DN_BH_WUNLOCK();
+ return 0;
+}
- if (pipe->fs.rq == NULL) { /* a new pipe */
- error = alloc_hash(&(pipe->fs), pfs);
- if (error) {
- DUMMYNET_UNLOCK();
- free_pipe(pipe);
- return (error);
- }
- SLIST_INSERT_HEAD(&pipehash[HASH(pipe->pipe_nr)],
- pipe, next);
- }
- DUMMYNET_UNLOCK();
- } else { /* config queue */
- struct dn_flow_set *fs;
+/*
+ * configure a flowset. Can be called from inside with locked=1,
+ * in which case DN_BH_WLOCK is neither taken nor released here.
+ *
+ * nfs is the request: its oid.len must equal sizeof(*nfs) and fs_nr
+ * must be in 1..3*DN_MAX_ID-1. Its qsize and buckets are bounded in
+ * place, and a sched_nr of 0 or a par[] entry of -1 is replaced by
+ * the value of the existing flowset. arg is not used.
+ * The flowset is created only if nfs->sched_nr is non-zero.
+ * Returns the struct dn_fsk, also when nothing had to change, or
+ * NULL if the request is invalid or the lookup yields no flowset.
+ */
+static struct dn_fsk *
+config_fs(struct dn_fs *nfs, struct dn_id *arg, int locked)
+{
+ int i;
+ struct dn_fsk *fs;
- DUMMYNET_LOCK();
- fs = locate_flowset(pfs->fs_nr); /* locate flow_set */
+ if (nfs->oid.len != sizeof(*nfs)) {
+ D("invalid flowset len %d", nfs->oid.len);
+ return NULL;
+ }
+ i = nfs->fs_nr;
+ if (i <= 0 || i >= 3*DN_MAX_ID)
+ return NULL;
+ ND("flowset %d", i);
+ /* XXX other sanity checks */
+ if (nfs->flags & DN_QSIZE_BYTES) {
+ ipdn_bound_var(&nfs->qsize, 16384,
+ 1500, dn_cfg.byte_limit, NULL); // "queue byte size");
+ } else {
+ ipdn_bound_var(&nfs->qsize, 50,
+ 1, dn_cfg.slot_limit, NULL); // "queue slot size");
+ }
+ if (nfs->flags & DN_HAVE_MASK) {
+ /* make sure we have some buckets */
+ ipdn_bound_var(&nfs->buckets, dn_cfg.hash_size,
+ 1, dn_cfg.max_hash_size, "flowset buckets");
+ } else {
+ nfs->buckets = 1; /* we only need 1 */
+ }
+ if (!locked)
+ DN_BH_WLOCK();
+ do { /* exit with break when done */
+ struct dn_schk *s;
+ int flags = nfs->sched_nr ? DNHT_INSERT : 0; /* only create when a scheduler is named */
+ int j;
+ int oldc = dn_cfg.fsk_count; /* compared below to detect that a new entry was created */
+ fs = dn_ht_find(dn_cfg.fshash, i, flags, NULL);
+ if (fs == NULL) {
+ D("missing sched for flowset %d", i);
+ break;
+ }
+ /* grab some defaults from the existing one */
+ if (nfs->sched_nr == 0) /* reuse */
+ nfs->sched_nr = fs->fs.sched_nr;
+ for (j = 0; j < sizeof(nfs->par)/sizeof(nfs->par[0]); j++) {
+ if (nfs->par[j] == -1) /* reuse */
+ nfs->par[j] = fs->fs.par[j];
+ }
+ if (bcmp(&fs->fs, nfs, sizeof(*nfs)) == 0) {
+ ND("flowset %d unchanged", i);
+ break; /* no change, nothing to do */
+ }
+ if (oldc != dn_cfg.fsk_count) /* new item */
+ dn_cfg.id++;
+ s = locate_scheduler(nfs->sched_nr);
+ /* detach from old scheduler if needed, preserving
+ * queues if we need to reattach. Then update the
+ * configuration, and possibly attach to the new sched.
+ */
+ DX(2, "fs %d changed sched %d@%p to %d@%p",
+ fs->fs.fs_nr,
+ fs->fs.sched_nr, fs->sched, nfs->sched_nr, s);
+ if (fs->sched) {
+ int flags = s ? DN_DETACH : (DN_DETACH | DN_DESTROY); /* NOTE: shadows the flags declared above */
+ flags |= DN_DESTROY; /* XXX temporary */
+ fsk_detach(fs, flags);
+ }
+ fs->fs = *nfs; /* copy configuration */
+ if (s != NULL)
+ fsk_attach(fs, s);
+ } while (0);
+ if (!locked)
+ DN_BH_WUNLOCK();
+ return fs;
+}
- if (fs == NULL) { /* new */
- if (pfs->parent_nr == 0) { /* need link to a pipe */
- DUMMYNET_UNLOCK();
- return (EINVAL);
+/*
+ * config/reconfig a scheduler and its FIFO variant.
+ * For !MULTIQUEUE schedulers, also set up the flowset.
+ *
+ * On reconfigurations (detected because s->fp is set),
+ * detach existing flowsets preserving traffic, preserve link,
+ * and delete the old scheduler creating a new one.
+ * (For the time being the code passes DN_DESTROY to
+ * schk_delete_cb(), see the XXX in the body.)
+ *
+ * _nsch is the request: its oid.len must equal sizeof(*_nsch) and
+ * sched_nr must be in 1..DN_MAX_ID-1. It is modified in place while
+ * the function runs; arg is stored in s->cfg.
+ * The body runs, under DN_BH_WLOCK, for scheduler sched_nr and then,
+ * through the 'again' label, for scheduler sched_nr + DN_MAX_ID.
+ * Returns 0 on success, EINVAL for a malformed request or an unknown
+ * scheduler type, and ENOMEM (the initial value of err) on the
+ * remaining failure paths.
+ */
+static int
+config_sched(struct dn_sch *_nsch, struct dn_id *arg)
+{
+ struct dn_schk *s;
+ struct schk_new_arg a; /* argument for schk_new */
+ int i;
+ struct dn_link p; /* copy of oldlink */
+ struct dn_profile *pf = NULL; /* copy of old link profile */
+ /* Used to preserve mask parameter */
+ struct ipfw_flow_id new_mask;
+ int new_buckets = 0;
+ int new_flags = 0;
+ int pipe_cmd;
+ int err = ENOMEM;
+
+ a.sch = _nsch;
+ if (a.sch->oid.len != sizeof(*a.sch)) {
+ D("bad sched len %d", a.sch->oid.len);
+ return EINVAL;
+ }
+ i = a.sch->sched_nr;
+ if (i <= 0 || i >= DN_MAX_ID)
+ return EINVAL;
+ /* make sure we have some buckets */
+ if (a.sch->flags & DN_HAVE_MASK)
+ ipdn_bound_var(&a.sch->buckets, dn_cfg.hash_size,
+ 1, dn_cfg.max_hash_size, "sched buckets");
+ /* XXX other sanity checks */
+ bzero(&p, sizeof(p));
+
+ pipe_cmd = a.sch->flags & DN_PIPE_CMD;
+ a.sch->flags &= ~DN_PIPE_CMD; //XXX do it even if is not set?
+ if (pipe_cmd) {
+ /* Copy mask parameter */
+ new_mask = a.sch->sched_mask;
+ new_buckets = a.sch->buckets;
+ new_flags = a.sch->flags;
+ }
+ DN_BH_WLOCK();
+again: /* run twice, for wfq and fifo */
+ /*
+ * lookup the type. If not supplied, use the previous one
+ * or default to WF2Q+. Otherwise, return an error.
+ */
+ dn_cfg.id++;
+ a.fp = find_sched_type(a.sch->oid.subtype, a.sch->name);
+ if (a.fp != NULL) {
+ /* found. Lookup or create entry */
+ s = dn_ht_find(dn_cfg.schedhash, i, DNHT_INSERT, &a);
+ } else if (a.sch->oid.subtype == 0 && !a.sch->name[0]) {
+ /* No type. search existing s* or retry with WF2Q+ */
+ s = dn_ht_find(dn_cfg.schedhash, i, 0, &a);
+ if (s != NULL) {
+ a.fp = s->fp;
+ /* Scheduler exists, skip to FIFO scheduler
+ * if command was pipe config...
+ */
+ if (pipe_cmd)
+ goto next;
+ } else {
+ /* New scheduler, create a wf2q+ with no mask
+ * if command was pipe config...
+ */
+ if (pipe_cmd) {
+ /* clear mask parameter */
+ bzero(&a.sch->sched_mask, sizeof(new_mask));
+ a.sch->buckets = 0;
+ a.sch->flags &= ~DN_HAVE_MASK;
}
- fs = malloc(sizeof(struct dn_flow_set), M_DUMMYNET,
- M_NOWAIT | M_ZERO);
- if (fs == NULL) {
- DUMMYNET_UNLOCK();
- printf(
- "dummynet: no memory for new flow_set\n");
- return (ENOMEM);
+ a.sch->oid.subtype = DN_SCHED_WF2QP;
+ goto again;
+ }
+ } else {
+ D("invalid scheduler type %d %s",
+ a.sch->oid.subtype, a.sch->name);
+ err = EINVAL;
+ goto error;
+ }
+ /* normalize name and subtype */
+ a.sch->oid.subtype = a.fp->type;
+ bzero(a.sch->name, sizeof(a.sch->name));
+ strlcpy(a.sch->name, a.fp->name, sizeof(a.sch->name));
+ if (s == NULL) {
+ D("cannot allocate scheduler %d", i);
+ goto error;
+ }
+ /* restore existing link if any */
+ if (p.link_nr) { /* non-zero only when a previous pass saved the link before deleting the scheduler */
+ s->link = p;
+ if (!pf || pf->link_nr != p.link_nr) { /* no saved value */
+ s->profile = NULL; /* XXX maybe not needed */
+ } else {
+ s->profile = malloc(sizeof(struct dn_profile),
+ M_DUMMYNET, M_NOWAIT | M_ZERO);
+ if (s->profile == NULL) {
+ D("cannot allocate profile");
+ goto error; //XXX
}
- fs->fs_nr = pfs->fs_nr;
- fs->parent_nr = pfs->parent_nr;
- fs->weight = pfs->weight;
- if (fs->weight == 0)
- fs->weight = 1;
- else if (fs->weight > 100)
- fs->weight = 100;
+ bcopy(pf, s->profile, sizeof(*pf));
+ }
+ }
+ p.link_nr = 0;
+ if (s->fp == NULL) {
+ DX(2, "sched %d new type %s", i, a.fp->name);
+ } else if (s->fp != a.fp ||
+ bcmp(a.sch, &s->sch, sizeof(*a.sch)) ) {
+ /* already existing. */
+ DX(2, "sched %d type changed from %s to %s",
+ i, s->fp->name, a.fp->name);
+ DX(4, " type/sub %d/%d -> %d/%d",
+ s->sch.oid.type, s->sch.oid.subtype,
+ a.sch->oid.type, a.sch->oid.subtype);
+ if (s->link.link_nr == 0)
+ D("XXX WARNING link 0 for sched %d", i);
+ p = s->link; /* preserve link */
+ if (s->profile) {/* preserve profile */
+ if (!pf)
+ pf = malloc(sizeof(*pf),
+ M_DUMMYNET, M_NOWAIT | M_ZERO);
+ if (pf) /* XXX should issue a warning otherwise */
+ bcopy(s->profile, pf, sizeof(*pf));
+ }
+ /* remove from the hash */
+ dn_ht_find(dn_cfg.schedhash, i, DNHT_REMOVE, NULL);
+ /* Detach flowsets, preserve queues. */
+ // schk_delete_cb(s, NULL);
+ // XXX temporarily, kill queues
+ schk_delete_cb(s, (void *)DN_DESTROY);
+ goto again; /* recreate the scheduler from scratch with the new parameters */
+ } else {
+ DX(4, "sched %d unchanged type %s", i, a.fp->name);
+ }
+ /* complete initialization */
+ s->sch = *a.sch;
+ s->fp = a.fp;
+ s->cfg = arg;
+ // XXX schk_reset_credit(s);
+ /* create the internal flowset if needed,
+ * trying to reuse existing ones if available
+ */
+ if (!(s->fp->flags & DN_MULTIQUEUE) && !s->fs) {
+ s->fs = dn_ht_find(dn_cfg.fshash, i, 0, NULL);
+ if (!s->fs) {
+ struct dn_fs fs;
+ bzero(&fs, sizeof(fs));
+ set_oid(&fs.oid, DN_FS, sizeof(fs));
+ fs.fs_nr = i + DN_MAX_ID;
+ fs.sched_nr = i;
+ s->fs = config_fs(&fs, NULL, 1 /* locked */);
+ }
+ if (!s->fs) {
+ schk_delete_cb(s, (void *)DN_DESTROY);
+ D("error creating internal fs for %d", i);
+ goto error;
+ }
+ }
+ /* call init function after the flowset is created */
+ if (s->fp->config)
+ s->fp->config(s);
+ update_fs(s);
+next:
+ if (i < DN_MAX_ID) { /* now configure the FIFO instance */
+ i += DN_MAX_ID;
+ if (pipe_cmd) {
+ /* Restore mask parameter for FIFO */
+ a.sch->sched_mask = new_mask;
+ a.sch->buckets = new_buckets;
+ a.sch->flags = new_flags;
} else {
- /*
- * Change parent pipe not allowed;
- * must delete and recreate.
- */
- if (pfs->parent_nr != 0 &&
- fs->parent_nr != pfs->parent_nr) {
- DUMMYNET_UNLOCK();
- return (EINVAL);
+ /* sched config shouldn't modify the FIFO scheduler */
+ if (dn_ht_find(dn_cfg.schedhash, i, 0, &a) != NULL) {
+ /* FIFO already exist, don't touch it */
+ err = 0; /* and this is not an error */
+ goto error;
}
}
+ a.sch->sched_nr = i;
+ a.sch->oid.subtype = DN_SCHED_FIFO;
+ bzero(a.sch->name, sizeof(a.sch->name));
+ goto again;
+ }
+ err = 0;
+error: /* common exit, reached on success as well */
+ DN_BH_WUNLOCK();
+ if (pf)
+ free(pf, M_DUMMYNET);
+ return err;
+}
- set_fs_parms(fs, pfs);
+/*
+ * attach a profile to a link
+ *
+ * pf is the request: its oid.len must be at least sizeof(*pf) and
+ * link_nr must be in 1..DN_MAX_ID-1, otherwise EINVAL is returned.
+ * arg is not used. Under DN_BH_WLOCK the profile is applied to
+ * scheduler link_nr and to scheduler link_nr + DN_MAX_ID.
+ * A request with samples_no == 0 removes the profile.
+ * Returns 0 on success, EINVAL if a scheduler is missing, ENOMEM if
+ * the copy of the profile cannot be allocated; in both failure
+ * cases the loop stops at the scheduler that failed.
+ */
+static int
+config_profile(struct dn_profile *pf, struct dn_id *arg)
+{
+ struct dn_schk *s;
+ int i, olen, err = 0;
- if (fs->rq == NULL) { /* a new flow_set */
- error = alloc_hash(fs, pfs);
- if (error) {
- DUMMYNET_UNLOCK();
- free(fs, M_DUMMYNET);
- return (error);
- }
- SLIST_INSERT_HEAD(&flowsethash[HASH(fs->fs_nr)],
- fs, next);
+ if (pf->oid.len < sizeof(*pf)) {
+ D("short profile len %d", pf->oid.len);
+ return EINVAL;
+ }
+ i = pf->link_nr;
+ if (i <= 0 || i >= DN_MAX_ID)
+ return EINVAL;
+ /* XXX other sanity checks */
+ DN_BH_WLOCK();
+ for (; i < 2*DN_MAX_ID; i += DN_MAX_ID) { /* two passes: i and i + DN_MAX_ID */
+ s = locate_scheduler(i);
+
+ if (s == NULL) {
+ err = EINVAL;
+ break;
+ }
+ dn_cfg.id++;
+ /*
+ * If we had a profile and the new one does not fit,
+ * or it is deleted, then we need to free memory.
+ */
+ if (s->profile && (pf->samples_no == 0 ||
+ s->profile->oid.len < pf->oid.len)) {
+ free(s->profile, M_DUMMYNET);
+ s->profile = NULL;
+ }
+ if (pf->samples_no == 0)
+ continue;
+ /*
+ * new profile, possibly allocate memory
+ * and copy data.
+ */
+ if (s->profile == NULL)
+ s->profile = malloc(pf->oid.len,
+ M_DUMMYNET, M_NOWAIT | M_ZERO);
+ if (s->profile == NULL) {
+ D("no memory for profile %d", i);
+ err = ENOMEM;
+ break;
}
- DUMMYNET_UNLOCK();
+ /* preserve larger length XXX double check */
+ olen = s->profile->oid.len; /* 0 for a freshly zeroed allocation */
+ if (olen < pf->oid.len)
+ olen = pf->oid.len;
+ bcopy(pf, s->profile, pf->oid.len);
+ s->profile->oid.len = olen; /* bcopy just overwrote it with pf->oid.len */
}
- return (0);
+ DN_BH_WUNLOCK();
+ return err;
}
/*
- * Helper function to remove from a heap queues which are linked to
- * a flow_set about to be deleted.
+ * Delete all objects:
*/
static void
-fs_remove_from_heap(struct dn_heap *h, struct dn_flow_set *fs)
-{
- int i = 0, found = 0 ;
- for (; i < h->elements ;)
- if ( ((struct dn_flow_queue *)h->p[i].object)->fs == fs) {
- h->elements-- ;
- h->p[i] = h->p[h->elements] ;
- found++ ;
- } else
- i++ ;
- if (found)
- heapify(h);
+dummynet_flush(void)
+{
+ /*
+ * Drop every configured object and reset the event heap.
+ * No lock is taken here: both callers visible in this file
+ * (the DN_CMD_FLUSH case of do_config() and ip_dn_destroy())
+ * wrap the call in DN_BH_WLOCK()/DN_BH_WUNLOCK().
+ */
+
+ /* delete all schedulers and related links/queues/flowsets */
+ /* schk_delete_cb runs on each schedhash entry, with DN_DELETE_FS as its argument */
+ dn_ht_scan(dn_cfg.schedhash, schk_delete_cb,
+ (void *)(uintptr_t)DN_DELETE_FS);
+ /* delete all remaining (unlinked) flowsets */
+ DX(4, "still %d unlinked fs", dn_cfg.fsk_count);
+ dn_ht_free(dn_cfg.fshash, DNHT_REMOVE);
+ fsk_detach_list(&dn_cfg.fsu, DN_DELETE_FS);
+ /* Reinitialize system heap... */
+ /* same arguments as the heap_init() call made in ip_dn_init() */
+ heap_init(&dn_cfg.evheap, 16, offsetof(struct dn_id, id));
}
/*
- * helper function to remove a pipe from a heap (can be there at most once)
+ * Main handler for configuration. We are guaranteed to be called
+ * with an oid which is at least a dn_id.
+ * - the first object is the command (config, delete, flush, ...)
+ * - config_link must be issued after the corresponding config_sched
+ * - parameters (DN_TXT) for an object must precede the object
+ * processed on a config_sched.
*/
-static void
-pipe_remove_from_heap(struct dn_heap *h, struct dn_pipe *p)
-{
- if (h->elements > 0) {
- int i = 0 ;
- for (i=0; i < h->elements ; i++ ) {
- if (h->p[i].object == p) { /* found it */
- h->elements-- ;
- h->p[i] = h->p[h->elements] ;
- heapify(h);
- break ;
- }
+int
+do_config(void *p, int l)
+{
+ /*
+ * p points to a buffer of l bytes holding a sequence of objects,
+ * each starting with a struct dn_id whose len field gives the size
+ * of that object. Returns 0, or the first non-zero error produced
+ * while walking the sequence (processing stops at that object).
+ */
+ struct dn_id *next, *o;
+ int err = 0, err2 = 0;
+ struct dn_id *arg = NULL;
+ uintptr_t *a;
+
+ o = p;
+ /* the id field of the first object carries the API version */
+ if (o->id != DN_API_VERSION) {
+ D("invalid api version got %d need %d",
+ o->id, DN_API_VERSION);
+ return EINVAL;
}
- }
+ for (; l >= sizeof(*o); o = next) {
+ /*
+ * prev remembers whether an argument was pending when this
+ * object started; if so it is dropped after the object is
+ * handled, so a DN_TEXT block applies to one object only.
+ */
+ struct dn_id *prev = arg;
+ /* an object must hold at least a dn_id and fit in what is left */
+ if (o->len < sizeof(*o) || l < o->len) {
+ D("bad len o->len %d len %d", o->len, l);
+ err = EINVAL;
+ break;
+ }
+ l -= o->len;
+ next = (struct dn_id *)((char *)o + o->len);
+ err = 0;
+ switch (o->type) {
+ default:
+ /* unknown types are logged and skipped; err stays 0 */
+ D("cmd %d not implemented", o->type);
+ break;
+#ifdef EMULATE_SYSCTL
+ /* sysctl emulation.
+ * if we recognize the command, jump to the correct
+ * handler and return
+ */
+ case DN_SYSCTL_SET:
+ /* NOTE(review): l has already been reduced by o->len here -- confirm the handler expects that */
+ err = kesysctl_emu_set(p, l);
+ return err;
+#endif
+ case DN_CMD_CONFIG: /* simply a header */
+ break;
+
+ case DN_CMD_DELETE:
+ /* the argument is in the first uintptr_t after o */
+ a = (uintptr_t *)(o+1);
+ if (o->len < sizeof(*o) + sizeof(*a)) {
+ err = EINVAL;
+ break;
+ }
+ switch (o->subtype) {
+ case DN_LINK:
+ /* delete base and derived schedulers */
+ DN_BH_WLOCK();
+ err = delete_schk(*a);
+ err2 = delete_schk(*a + DN_MAX_ID);
+ DN_BH_WUNLOCK();
+ /* report the first failure of the two deletions */
+ if (!err)
+ err = err2;
+ break;
+
+ default:
+ D("invalid delete type %d",
+ o->subtype);
+ err = EINVAL;
+ break;
+
+ case DN_FS:
+ /* only flowset numbers in 1 .. DN_MAX_ID-1 are accepted */
+ err = (*a <1 || *a >= DN_MAX_ID) ?
+ EINVAL : delete_fs(*a, 0) ;
+ break;
+ }
+ break;
+
+ case DN_CMD_FLUSH:
+ DN_BH_WLOCK();
+ dummynet_flush();
+ DN_BH_WUNLOCK();
+ break;
+ case DN_TEXT: /* store the argument for the next block */
+ /* clearing prev keeps arg alive past the end of this iteration */
+ prev = NULL;
+ arg = o;
+ break;
+ case DN_LINK:
+ err = config_link((struct dn_link *)o, arg);
+ break;
+ case DN_PROFILE:
+ err = config_profile((struct dn_profile *)o, arg);
+ break;
+ case DN_SCH:
+ err = config_sched((struct dn_sch *)o, arg);
+ break;
+ case DN_FS:
+ /*
+ * NOTE(review): a NULL return from config_fs() yields
+ * err == 1 rather than a named errno value -- confirm
+ * whether EINVAL or similar was intended.
+ */
+ err = (NULL==config_fs((struct dn_fs *)o, arg, 0));
+ break;
+ }
+ if (prev)
+ arg = NULL;
+ if (err != 0)
+ break;
+ }
+ return err;
+}
+
+static int
+compute_space(struct dn_id *cmd, struct copy_args *a)
+{
+ int flags = 0, total = 0;
+ /* sizeof(struct dn_profile) less the room for ED_MAX_SAMPLES_NO ints */
+ int profile_size = sizeof(struct dn_profile) -
+ ED_MAX_SAMPLES_NO*sizeof(int);
+
+ /*
+ * Estimate the bytes needed to export the objects selected by
+ * cmd->subtype, and store the matching DN_C_* selection in a->flags.
+ * Returns -1 (leaving a->flags untouched) for an unknown subtype.
+ *
+ * Budget, with NP = dn_cfg.schk_count, NSI = dn_cfg.si_count,
+ * NF = dn_cfg.fsk_count, NQ = dn_cfg.queue_count:
+ * - ipfw pipe show / ipfw sched show
+ * (NP/2)*(dn_link + dn_sch + dn_id + dn_fs) only half of the
+ * schedulers: link, scheduler template,
+ * flowset integrated in the scheduler and
+ * header for the flowset list
+ * (NSI * dn_flow) all scheduler instances (each includes
+ * the queue instance)
+ * (NF * sizeof(uint32_t)) flowset list linked to the scheduler
+ * (NQ * dn_queue) all queues [XXX not listed for now]
+ * - ipfw queue show
+ * (NF * dn_fs) all flowsets
+ * (NQ * dn_queue) all queues
+ */
+ switch (cmd->subtype) {
+ /* XXX where do LINK and SCH differ ? */
+ /* 'ipfw sched show' could list all queues associated to
+ * a scheduler. This feature for now is disabled
+ */
+ case DN_LINK: /* pipe show */
+ case DN_SCH: /* sched show */
+ flags = DN_C_LINK | DN_C_SCH | DN_C_FLOW;
+ total += dn_cfg.schk_count *
+ (sizeof(struct dn_fs) + profile_size) / 2;
+ total += dn_cfg.fsk_count * sizeof(uint32_t);
+ break;
+ case DN_FS: /* queue show */
+ flags = DN_C_FS | DN_C_QUEUE;
+ break;
+ case DN_GET_COMPAT: /* compatibility mode */
+ total = dn_compat_calc_size(dn_cfg);
+ break;
+ default:
+ return -1;
+ }
+ a->flags = flags;
+
+ /* per-class contributions, driven by the selection made above */
+ if (flags & DN_C_SCH) {
+ total += dn_cfg.schk_count * sizeof(struct dn_sch) / 2;
+ /* NOTE also, each fs might be attached to a sched */
+ total += dn_cfg.schk_count * sizeof(struct dn_id) / 2;
+ }
+ if (flags & DN_C_FS)
+ total += dn_cfg.fsk_count * sizeof(struct dn_fs);
+ if (flags & DN_C_LINK)
+ total += dn_cfg.schk_count * sizeof(struct dn_link) / 2;
+ /*
+ * A queue is exported to userland as a struct dn_flow only,
+ * which is its sole visible part.
+ */
+ if (flags & DN_C_QUEUE)
+ total += dn_cfg.queue_count * sizeof(struct dn_flow);
+ if (flags & DN_C_FLOW)
+ total += dn_cfg.si_count * (sizeof(struct dn_flow));
+ return total;
}
/*
- * drain all queues. Called in case of severe mbuf shortage.
+ * If compat != NULL dummynet_get is called in compatibility mode.
+ * *compat will be the pointer to the buffer to pass to ipfw
*/
-void
-dummynet_drain(void)
-{
- struct dn_flow_set *fs;
- struct dn_pipe *pipe;
- struct mbuf *m, *mnext;
- int i;
-
- DUMMYNET_LOCK_ASSERT();
-
- heap_free(&ready_heap);
- heap_free(&wfq_ready_heap);
- heap_free(&extract_heap);
- /* remove all references to this pipe from flow_sets */
- for (i = 0; i < HASHSIZE; i++)
- SLIST_FOREACH(fs, &flowsethash[i], next)
- purge_flow_set(fs, 0);
-
- for (i = 0; i < HASHSIZE; i++) {
- SLIST_FOREACH(pipe, &pipehash[i], next) {
- purge_flow_set(&(pipe->fs), 0);
-
- mnext = pipe->head;
- while ((m = mnext) != NULL) {
- mnext = m->m_nextpkt;
- DN_FREE_PKT(m);
+int
+dummynet_get(struct sockopt *sopt, void **compat)
+{
+ /*
+ * Export the configuration selected by the request in sopt.
+ * With compat == NULL the result is copied out through sopt;
+ * otherwise the buffer is handed back in *compat (NULL when no
+ * buffer could be produced) and its length in sopt->sopt_valsize.
+ * Returns 0 or an error code.
+ */
+ int have, i, need, error;
+ int locked = 0; /* set when the sizing loop exits holding the lock */
+ char *start = NULL, *buf;
+ size_t sopt_valsize;
+ struct dn_id *cmd;
+ struct copy_args a;
+ struct copy_range r;
+ int l = sizeof(struct dn_id);
+
+ bzero(&a, sizeof(a));
+ bzero(&r, sizeof(r));
+
+ /* save and restore original sopt_valsize around copyin */
+ sopt_valsize = sopt->sopt_valsize;
+
+ cmd = &r.o;
+
+ if (!compat) {
+ /* copy at least an oid, and possibly a full object */
+ error = sooptcopyin(sopt, cmd, sizeof(r), sizeof(*cmd));
+ sopt->sopt_valsize = sopt_valsize;
+ if (error)
+ goto done;
+ l = cmd->len;
+#ifdef EMULATE_SYSCTL
+ /* sysctl emulation. */
+ if (cmd->type == DN_SYSCTL_GET)
+ return kesysctl_emu_get(sopt);
+#endif
+ if (l > sizeof(r)) {
+ /* request larger than default, allocate buffer */
+ /* M_WAITOK is the malloc(9) flag; M_WAIT is the mbuf alias */
+ cmd = malloc(l, M_DUMMYNET, M_WAITOK);
+ if (cmd == NULL)
+ return ENOMEM; //XXX
+ error = sooptcopyin(sopt, cmd, l, l);
+ sopt->sopt_valsize = sopt_valsize;
+ if (error)
+ goto done;
}
- pipe->head = pipe->tail = NULL;
+ } else { /* compatibility */
+ error = 0;
+ cmd->type = DN_CMD_GET;
+ cmd->len = sizeof(struct dn_id);
+ cmd->subtype = DN_GET_COMPAT;
+ // cmd->id = sopt_valsize;
+ D("compatibility mode");
}
- }
+ a.extra = (struct copy_range *)cmd;
+ if (cmd->len == sizeof(*cmd)) { /* no range, create a default */
+ uint32_t *rp = (uint32_t *)(cmd + 1);
+ cmd->len += 2* sizeof(uint32_t);
+ rp[0] = 1;
+ rp[1] = DN_MAX_ID - 1;
+ if (cmd->subtype == DN_LINK) {
+ rp[0] += DN_MAX_ID;
+ rp[1] += DN_MAX_ID;
+ }
+ }
+ /* Count space (under lock) and allocate (outside lock).
+ * Exit with lock held if we manage to get enough buffer.
+ * Try a few times then give up.
+ */
+ for (have = 0, i = 0; i < 10; i++) {
+ DN_BH_WLOCK();
+ need = compute_space(cmd, &a);
+
+ /* if there is a range, ignore value from compute_space() */
+ if (l > sizeof(*cmd))
+ need = sopt_valsize - sizeof(*cmd);
+
+ if (need < 0) {
+ DN_BH_WUNLOCK();
+ error = EINVAL;
+ goto done;
+ }
+ need += sizeof(*cmd);
+ cmd->id = need;
+ if (have >= need) {
+ /* buffer is large enough: leave the loop with the lock held */
+ locked = 1;
+ break;
+ }
+
+ DN_BH_WUNLOCK();
+ if (start)
+ free(start, M_DUMMYNET);
+ start = NULL;
+ if (need > sopt_valsize)
+ break;
+
+ have = need;
+ start = malloc(have, M_DUMMYNET, M_WAITOK | M_ZERO);
+ if (start == NULL) {
+ error = ENOMEM;
+ goto done;
+ }
+ }
+
+ if (!locked && start != NULL) {
+ /*
+ * All retries were used up: the last buffer was never
+ * checked against the current configuration and the lock
+ * is not held, so the tables must not be scanned and the
+ * lock must not be released. Give up instead.
+ */
+ free(start, M_DUMMYNET);
+ start = NULL;
+ if (compat)
+ *compat = NULL;
+ error = ENOBUFS;
+ goto done;
+ }
+
+ if (start == NULL) {
+ /* the caller's buffer is too small: report only the header */
+ if (compat) {
+ *compat = NULL;
+ error = 1; // XXX
+ } else {
+ error = sooptcopyout(sopt, cmd, sizeof(*cmd));
+ }
+ goto done;
+ }
+ ND("have %d:%d sched %d, %d:%d links %d, %d:%d flowsets %d, "
+ "%d:%d si %d, %d:%d queues %d",
+ dn_cfg.schk_count, sizeof(struct dn_sch), DN_SCH,
+ dn_cfg.schk_count, sizeof(struct dn_link), DN_LINK,
+ dn_cfg.fsk_count, sizeof(struct dn_fs), DN_FS,
+ dn_cfg.si_count, sizeof(struct dn_flow), DN_SCH_I,
+ dn_cfg.queue_count, sizeof(struct dn_queue), DN_QUEUE);
+ sopt->sopt_valsize = sopt_valsize;
+ a.type = cmd->subtype;
+
+ if (compat == NULL) {
+ /* the reply starts with a copy of the request header */
+ bcopy(cmd, start, sizeof(*cmd));
+ ((struct dn_id*)(start))->len = sizeof(struct dn_id);
+ buf = start + sizeof(*cmd);
+ } else
+ buf = start;
+ a.start = &buf;
+ a.end = start + have;
+ /* start copying other objects */
+ if (compat) {
+ a.type = DN_COMPAT_PIPE;
+ dn_ht_scan(dn_cfg.schedhash, copy_data_helper_compat, &a);
+ a.type = DN_COMPAT_QUEUE;
+ dn_ht_scan(dn_cfg.fshash, copy_data_helper_compat, &a);
+ } else if (a.type == DN_FS) {
+ dn_ht_scan(dn_cfg.fshash, copy_data_helper, &a);
+ } else {
+ dn_ht_scan(dn_cfg.schedhash, copy_data_helper, &a);
+ }
+ DN_BH_WUNLOCK();
+
+ if (compat) {
+ *compat = start;
+ sopt->sopt_valsize = buf - start;
+ /* free() is done by ip_dummynet_compat() */
+ start = NULL; //XXX hack
+ } else {
+ error = sooptcopyout(sopt, start, buf - start);
+ }
+done:
+ /* cmd is heap memory only when the request did not fit in r */
+ if (cmd && cmd != &r.o)
+ free(cmd, M_DUMMYNET);
+ if (start)
+ free(start, M_DUMMYNET);
+ return error;
}
-/*
- * Fully delete a pipe or a queue, cleaning up associated info.
- */
+/* Callback called on scheduler instance to delete it if idle */
static int
-delete_pipe(struct dn_pipe *p)
+drain_scheduler_cb(void *_si, void *arg)
{
+ struct dn_sch_inst *si = _si;
- if (p->pipe_nr == 0 && p->fs.fs_nr == 0)
- return EINVAL ;
- if (p->pipe_nr != 0 && p->fs.fs_nr != 0)
- return EINVAL ;
- if (p->pipe_nr != 0) { /* this is an old-style pipe */
- struct dn_pipe *pipe;
- struct dn_flow_set *fs;
- int i;
-
- DUMMYNET_LOCK();
- pipe = locate_pipe(p->pipe_nr); /* locate pipe */
+ if ((si->kflags & DN_ACTIVE) || si->dline.mq.head != NULL)
+ return 0;
- if (pipe == NULL) {
- DUMMYNET_UNLOCK();
- return (ENOENT); /* not found */
+ if (si->sched->fp->flags & DN_MULTIQUEUE) {
+ if (si->q_count == 0)
+ return si_destroy(si, NULL);
+ else
+ return 0;
+ } else { /* !DN_MULTIQUEUE */
+ if ((si+1)->ni.length == 0)
+ return si_destroy(si, NULL);
+ else
+ return 0;
}
+ return 0; /* unreachable */
+}
- /* Unlink from list of pipes. */
- SLIST_REMOVE(&pipehash[HASH(pipe->pipe_nr)], pipe, dn_pipe, next);
+/* Callback called on scheduler to check if it has instances */
+static int
+drain_scheduler_sch_cb(void *_s, void *arg)
+{
+ struct dn_schk *s = _s;
- /* Remove all references to this pipe from flow_sets. */
- for (i = 0; i < HASHSIZE; i++)
- SLIST_FOREACH(fs, &flowsethash[i], next)
- if (fs->pipe == pipe) {
- printf("dummynet: ++ ref to pipe %d from fs %d\n",
- p->pipe_nr, fs->fs_nr);
- fs->pipe = NULL ;
- purge_flow_set(fs, 0);
+ if (s->sch.flags & DN_HAVE_MASK) {
+ dn_ht_scan_bucket(s->siht, &s->drain_bucket,
+ drain_scheduler_cb, NULL);
+ s->drain_bucket++;
+ } else {
+ if (s->siht) {
+ if (drain_scheduler_cb(s->siht, NULL) == DNHT_SCAN_DEL)
+ s->siht = NULL;
}
- fs_remove_from_heap(&ready_heap, &(pipe->fs));
- purge_pipe(pipe); /* remove all data associated to this pipe */
- /* remove reference to here from extract_heap and wfq_ready_heap */
- pipe_remove_from_heap(&extract_heap, pipe);
- pipe_remove_from_heap(&wfq_ready_heap, pipe);
- DUMMYNET_UNLOCK();
-
- free_pipe(pipe);
- } else { /* this is a WF2Q queue (dn_flow_set) */
- struct dn_flow_set *fs;
-
- DUMMYNET_LOCK();
- fs = locate_flowset(p->fs.fs_nr); /* locate set */
-
- if (fs == NULL) {
- DUMMYNET_UNLOCK();
- return (ENOENT); /* not found */
- }
-
- /* Unlink from list of flowsets. */
- SLIST_REMOVE( &flowsethash[HASH(fs->fs_nr)], fs, dn_flow_set, next);
-
- if (fs->pipe != NULL) {
- /* Update total weight on parent pipe and cleanup parent heaps. */
- fs->pipe->sum -= fs->weight * fs->backlogged ;
- fs_remove_from_heap(&(fs->pipe->not_eligible_heap), fs);
- fs_remove_from_heap(&(fs->pipe->scheduler_heap), fs);
-#if 1 /* XXX should i remove from idle_heap as well ? */
- fs_remove_from_heap(&(fs->pipe->idle_heap), fs);
-#endif
}
- purge_flow_set(fs, 1);
- DUMMYNET_UNLOCK();
- }
- return 0 ;
+ return 0;
}
-/*
- * helper function used to copy data from kernel in DUMMYNET_GET
- */
-static char *
-dn_copy_set(struct dn_flow_set *set, char *bp)
-{
- int i, copied = 0 ;
- struct dn_flow_queue *q, *qp = (struct dn_flow_queue *)bp;
-
- DUMMYNET_LOCK_ASSERT();
-
- for (i = 0 ; i <= set->rq_size ; i++)
- for (q = set->rq[i] ; q ; q = q->next, qp++ ) {
- if (q->hash_slot != i)
- printf("dummynet: ++ at %d: wrong slot (have %d, "
- "should be %d)\n", copied, q->hash_slot, i);
- if (q->fs != set)
- printf("dummynet: ++ at %d: wrong fs ptr (have %p, should be %p)\n",
- i, q->fs, set);
- copied++ ;
- bcopy(q, qp, sizeof( *q ) );
- /* cleanup pointers */
- qp->next = NULL ;
- qp->head = qp->tail = NULL ;
- qp->fs = NULL ;
- }
- if (copied != set->rq_elements)
- printf("dummynet: ++ wrong count, have %d should be %d\n",
- copied, set->rq_elements);
- return (char *)qp ;
-}
-
-static size_t
-dn_calc_size(void)
-{
- struct dn_flow_set *fs;
- struct dn_pipe *pipe;
- size_t size = 0;
- int i;
-
- DUMMYNET_LOCK_ASSERT();
- /*
- * Compute size of data structures: list of pipes and flow_sets.
- */
- for (i = 0; i < HASHSIZE; i++) {
- SLIST_FOREACH(pipe, &pipehash[i], next)
- size += sizeof(*pipe) +
- pipe->fs.rq_elements * sizeof(struct dn_flow_queue);
- SLIST_FOREACH(fs, &flowsethash[i], next)
- size += sizeof (*fs) +
- fs->rq_elements * sizeof(struct dn_flow_queue);
- }
- return size;
+/*
+ * Called every tick (per the original note): run drain_scheduler_sch_cb
+ * over one bucket of the scheduler hash table, selected through
+ * dn_cfg.drain_sch, then advance that index for the next call.
+ */
+void
+dn_drain_scheduler(void)
+{
+ dn_ht_scan_bucket(dn_cfg.schedhash, &dn_cfg.drain_sch,
+ drain_scheduler_sch_cb, NULL);
+ dn_cfg.drain_sch++;
}
+/* Callback called on queue to delete if it is idle */
static int
-dummynet_get(struct sockopt *sopt)
-{
- char *buf, *bp ; /* bp is the "copy-pointer" */
- size_t size ;
- struct dn_flow_set *fs;
- struct dn_pipe *pipe;
- int error=0, i ;
-
- /* XXX lock held too long */
- DUMMYNET_LOCK();
- /*
- * XXX: Ugly, but we need to allocate memory with M_WAITOK flag and we
- * cannot use this flag while holding a mutex.
- */
- for (i = 0; i < 10; i++) {
- size = dn_calc_size();
- DUMMYNET_UNLOCK();
- buf = malloc(size, M_TEMP, M_WAITOK);
- DUMMYNET_LOCK();
- if (size == dn_calc_size())
- break;
- free(buf, M_TEMP);
- buf = NULL;
- }
- if (buf == NULL) {
- DUMMYNET_UNLOCK();
- return ENOBUFS ;
- }
- bp = buf;
- for (i = 0; i < HASHSIZE; i++)
- SLIST_FOREACH(pipe, &pipehash[i], next) {
- struct dn_pipe *pipe_bp = (struct dn_pipe *)bp;
-
- /*
- * Copy pipe descriptor into *bp, convert delay back to ms,
- * then copy the flow_set descriptor(s) one at a time.
- * After each flow_set, copy the queue descriptor it owns.
- */
- bcopy(pipe, bp, sizeof(*pipe));
- pipe_bp->delay = (pipe_bp->delay * 1000) / hz;
- pipe_bp->burst /= 8 * hz;
- /*
- * XXX the following is a hack based on ->next being the
- * first field in dn_pipe and dn_flow_set. The correct
- * solution would be to move the dn_flow_set to the beginning
- * of struct dn_pipe.
- */
- pipe_bp->next.sle_next = (struct dn_pipe *)DN_IS_PIPE;
- /* Clean pointers. */
- pipe_bp->head = pipe_bp->tail = NULL;
- pipe_bp->fs.next.sle_next = NULL;
- pipe_bp->fs.pipe = NULL;
- pipe_bp->fs.rq = NULL;
- pipe_bp->samples = NULL;
+drain_queue_cb(void *_q, void *arg)
+{
+ struct dn_queue *q = _q;
- bp += sizeof(*pipe) ;
- bp = dn_copy_set(&(pipe->fs), bp);
+ if (q->ni.length == 0) {
+ dn_delete_queue(q, DN_DESTROY);
+ return DNHT_SCAN_DEL; /* queue is deleted */
}
- for (i = 0; i < HASHSIZE; i++)
- SLIST_FOREACH(fs, &flowsethash[i], next) {
- struct dn_flow_set *fs_bp = (struct dn_flow_set *)bp;
+ return 0; /* queue isn't deleted */
+}
- bcopy(fs, bp, sizeof(*fs));
- /* XXX same hack as above */
- fs_bp->next.sle_next = (struct dn_flow_set *)DN_IS_QUEUE;
- fs_bp->pipe = NULL;
- fs_bp->rq = NULL;
- bp += sizeof(*fs);
- bp = dn_copy_set(fs, bp);
- }
+/* Callback called on flowset used to check if it has queues */
+static int
+drain_queue_fs_cb(void *_fs, void *arg)
+{
+ struct dn_fsk *fs = _fs;
- DUMMYNET_UNLOCK();
+ if (fs->fs.flags & DN_QHT_HASH) {
+ /* Flowset has a hash table for queues */
+ dn_ht_scan_bucket(fs->qht, &fs->drain_bucket,
+ drain_queue_cb, NULL);
+ fs->drain_bucket++;
+ } else {
+ /* No hash table for this flowset, null the pointer
+ * if the queue is deleted
+ */
+ if (fs->qht) {
+ if (drain_queue_cb(fs->qht, NULL) == DNHT_SCAN_DEL)
+ fs->qht = NULL;
+ }
+ }
+ return 0;
+}
- error = sooptcopyout(sopt, buf, size);
- free(buf, M_TEMP);
- return error ;
+/*
+ * Called every tick (per the original note): run drain_queue_fs_cb over
+ * one bucket of the flowset hash table, selected through dn_cfg.drain_fs,
+ * then advance that index for the next call.
+ */
+void
+dn_drain_queue(void)
+{
+ /* scan a bucket of flowset */
+ dn_ht_scan_bucket(dn_cfg.fshash, &dn_cfg.drain_fs,
+ drain_queue_fs_cb, NULL);
+ dn_cfg.drain_fs++;
}
/*
- * Handler for the various dummynet socket options (get, flush, config, del)
+ * Handler for the various dummynet socket options
*/
static int
ip_dn_ctl(struct sockopt *sopt)
{
- int error;
- struct dn_pipe *p = NULL;
+ void *p = NULL;
+ int error, l;
- error = priv_check(sopt->sopt_td, PRIV_NETINET_DUMMYNET);
- if (error)
- return (error);
-
- /* Disallow sets in really-really secure mode. */
- if (sopt->sopt_dir == SOPT_SET) {
-#if __FreeBSD_version >= 500034
- error = securelevel_ge(sopt->sopt_td->td_ucred, 3);
+ error = priv_check(sopt->sopt_td, PRIV_NETINET_DUMMYNET);
if (error)
- return (error);
-#else
- if (securelevel >= 3)
- return (EPERM);
-#endif
- }
-
- switch (sopt->sopt_name) {
- default :
- printf("dummynet: -- unknown option %d", sopt->sopt_name);
- error = EINVAL ;
- break;
+ return (error);
- case IP_DUMMYNET_GET :
- error = dummynet_get(sopt);
- break ;
+ /* Disallow sets in really-really secure mode. */
+ if (sopt->sopt_dir == SOPT_SET) {
+ error = securelevel_ge(sopt->sopt_td->td_ucred, 3);
+ if (error)
+ return (error);
+ }
- case IP_DUMMYNET_FLUSH :
- dummynet_flush() ;
- break ;
+ switch (sopt->sopt_name) {
+ default :
+ D("dummynet: unknown option %d", sopt->sopt_name);
+ error = EINVAL;
+ break;
- case IP_DUMMYNET_CONFIGURE :
- p = malloc(sizeof(struct dn_pipe_max), M_TEMP, M_WAITOK);
- error = sooptcopyin(sopt, p, sizeof(struct dn_pipe_max), sizeof *p);
- if (error)
- break ;
- if (p->samples_no > 0)
- p->samples = &(((struct dn_pipe_max *)p)->samples[0]);
+ case IP_DUMMYNET_FLUSH:
+ case IP_DUMMYNET_CONFIGURE:
+ case IP_DUMMYNET_DEL: /* remove a pipe or queue */
+ case IP_DUMMYNET_GET:
+ D("dummynet: compat option %d", sopt->sopt_name);
+ error = ip_dummynet_compat(sopt);
+ break;
- error = config_pipe(p);
- break ;
+ case IP_DUMMYNET3 :
+ /* new-style API: GET is served directly, anything else is a config */
+ if (sopt->sopt_dir == SOPT_GET) {
+ error = dummynet_get(sopt, NULL);
+ break;
+ }
+ l = sopt->sopt_valsize;
+ /*
+ * The request must hold at least a dn_id and stay within
+ * the 12000-byte cap. The test is made on the unsigned
+ * sopt_valsize so a value that does not fit in l cannot
+ * slip through, and the failure is reported to the caller
+ * instead of being returned as success.
+ */
+ if (sopt->sopt_valsize < sizeof(struct dn_id) ||
+ sopt->sopt_valsize > 12000) {
+ D("argument len %d invalid", l);
+ error = EINVAL;
+ break;
+ }
+ p = malloc(l, M_TEMP, M_WAITOK); // XXX can it fail ?
+ error = sooptcopyin(sopt, p, l, l);
+ if (error)
+ break ;
+ error = do_config(p, l);
+ break;
+ }
- case IP_DUMMYNET_DEL : /* remove a pipe or queue */
- p = malloc(sizeof(struct dn_pipe), M_TEMP, M_WAITOK);
- error = sooptcopyin(sopt, p, sizeof(struct dn_pipe), sizeof *p);
- if (error)
- break ;
+ if (p != NULL)
+ free(p, M_TEMP);
- error = delete_pipe(p);
- break ;
- }
- if (p != NULL)
- free(p, M_TEMP);
- return error ;
+ return error ;
}
+
static void
ip_dn_init(void)
{
- int i;
+ static int init_done = 0;
+ if (init_done)
+ return;
+ init_done = 1;
if (bootverbose)
- printf("DUMMYNET with IPv6 initialized (040826)\n");
-
- DUMMYNET_LOCK_INIT();
-
- for (i = 0; i < HASHSIZE; i++) {
- SLIST_INIT(&pipehash[i]);
- SLIST_INIT(&flowsethash[i]);
- }
- ready_heap.size = ready_heap.elements = 0;
- ready_heap.offset = 0;
-
- wfq_ready_heap.size = wfq_ready_heap.elements = 0;
- wfq_ready_heap.offset = 0;
-
- extract_heap.size = extract_heap.elements = 0;
- extract_heap.offset = 0;
+ printf("DUMMYNET with IPv6 initialized (100131)\n");
+ /* Set defaults here. MSVC does not accept initializers,
+ * and this is also useful for vimages
+ */
+ /* queue limits */
+ dn_cfg.slot_limit = 100; /* Foot shooting limit for queues. */
+ dn_cfg.byte_limit = 1024 * 1024;
+ dn_cfg.expire = 1;
+
+ /* RED parameters */
+ dn_cfg.red_lookup_depth = 256; /* default lookup table depth */
+ dn_cfg.red_avg_pkt_size = 512; /* default medium packet size */
+ dn_cfg.red_max_pkt_size = 1500; /* default max packet size */
+
+ /* hash tables */
+ dn_cfg.max_hash_size = 1024; /* max in the hash tables */
+ dn_cfg.hash_size = 64; /* default hash size */
+
+ /* create hash tables for schedulers and flowsets.
+ * In both we search by key and by pointer.
+ */
+ dn_cfg.schedhash = dn_ht_init(NULL, dn_cfg.hash_size,
+ offsetof(struct dn_schk, schk_next),
+ schk_hash, schk_match, schk_new);
+ dn_cfg.fshash = dn_ht_init(NULL, dn_cfg.hash_size,
+ offsetof(struct dn_fsk, fsk_next),
+ fsk_hash, fsk_match, fsk_new);
+
+ /* bucket index to drain object */
+ dn_cfg.drain_fs = 0;
+ dn_cfg.drain_sch = 0;
+
+ heap_init(&dn_cfg.evheap, 16, offsetof(struct dn_id, id));
+ SLIST_INIT(&dn_cfg.fsu);
+ SLIST_INIT(&dn_cfg.schedlist);
+
+ DN_LOCK_INIT();
ip_dn_ctl_ptr = ip_dn_ctl;
ip_dn_io_ptr = dummynet_io;
@@ -2285,25 +2168,29 @@ ip_dn_init(void)
callout_reset(&dn_timeout, 1, dummynet, NULL);
/* Initialize curr_time adjustment mechanics. */
- getmicrouptime(&prev_t);
+ getmicrouptime(&dn_cfg.prev_t);
}
#ifdef KLD_MODULE
static void
ip_dn_destroy(void)
{
+ callout_drain(&dn_timeout);
+
+ DN_BH_WLOCK();
ip_dn_ctl_ptr = NULL;
ip_dn_io_ptr = NULL;
- DUMMYNET_LOCK();
- callout_stop(&dn_timeout);
- DUMMYNET_UNLOCK();
+ dummynet_flush();
+ DN_BH_WUNLOCK();
taskqueue_drain(dn_tq, &dn_task);
taskqueue_free(dn_tq);
- dummynet_flush();
+ dn_ht_free(dn_cfg.schedhash, 0);
+ dn_ht_free(dn_cfg.fshash, 0);
+ heap_free(&dn_cfg.evheap);
- DUMMYNET_LOCK_DESTROY();
+ DN_LOCK_DESTROY();
}
#endif /* KLD_MODULE */
@@ -2311,35 +2198,98 @@ static int
dummynet_modevent(module_t mod, int type, void *data)
{
- switch (type) {
- case MOD_LOAD:
+ if (type == MOD_LOAD) {
if (ip_dn_io_ptr) {
- printf("DUMMYNET already loaded\n");
- return EEXIST ;
+ printf("DUMMYNET already loaded\n");
+ return EEXIST ;
}
ip_dn_init();
- break;
-
- case MOD_UNLOAD:
+ return 0;
+ } else if (type == MOD_UNLOAD) {
#if !defined(KLD_MODULE)
printf("dummynet statically compiled, cannot unload\n");
return EINVAL ;
#else
ip_dn_destroy();
+ return 0;
#endif
- break ;
- default:
+ } else
return EOPNOTSUPP;
- break ;
+}
+
+/* modevent helpers for the modules */
+static int
+load_dn_sched(struct dn_alg *d)
+{
+ /*
+ * Register scheduler descriptor d on dn_cfg.schedlist.
+ * Returns 0 on success, EINVAL when d is NULL or lacks an
+ * enqueue/dequeue function, EEXIST when a scheduler with the
+ * same name is already on the list.
+ */
+ struct dn_alg *s;
+
+ if (d == NULL)
+ return EINVAL;
+ ip_dn_init(); /* just in case, we need the lock */
+
+ /* Check that mandatory funcs exists */
+ if (d->enqueue == NULL || d->dequeue == NULL) {
+ D("missing enqueue or dequeue for %s", d->name);
+ return EINVAL;
+ }
+
+ /* Search if scheduler already exists */
+ DN_BH_WLOCK();
+ SLIST_FOREACH(s, &dn_cfg.schedlist, next) {
+ if (strcmp(s->name, d->name) == 0) {
+ D("%s already loaded", d->name);
+ break; /* scheduler already exists */
+ }
+ }
+ /* s is NULL here only when the scan found no entry with this name */
+ if (s == NULL)
+ SLIST_INSERT_HEAD(&dn_cfg.schedlist, d, next);
+ DN_BH_WUNLOCK();
+ D("dn_sched %s %sloaded", d->name, s ? "not ":"");
+ return s ? EEXIST : 0;
+}
+
+static int
+unload_dn_sched(struct dn_alg *s)
+{
+ /*
+ * Take the scheduler named s->name off dn_cfg.schedlist.
+ * Returns 0 when it was removed, EBUSY when its ref_count is
+ * non-zero, EINVAL when no entry with that name is listed.
+ */
+ struct dn_alg *cur, *nxt;
+ int err = EINVAL;
+
+ D("called for %s", s->name);
+
+ DN_BH_WLOCK();
+ SLIST_FOREACH_SAFE(cur, &dn_cfg.schedlist, next, nxt) {
+ if (strcmp(s->name, cur->name) == 0) {
+ D("ref_count = %d", cur->ref_count);
+ if (cur->ref_count != 0) {
+ /* still referenced, leave it on the list */
+ err = EBUSY;
+ } else {
+ err = 0;
+ SLIST_REMOVE(&dn_cfg.schedlist, cur, dn_alg, next);
+ }
+ /* only the first entry with this name is considered */
+ break;
+ }
}
- return 0 ;
+ DN_BH_WUNLOCK();
+ D("dn_sched %s %sunloaded", s->name, err ? "not ":"");
+ return err;
+}
+
+int
+dn_sched_modevent(module_t mod, int cmd, void *arg)
+{
+ /*
+ * Module event handler shared by the scheduler modules; arg is
+ * the struct dn_alg describing the scheduler. Load and unload
+ * are forwarded to the helpers; any other event is answered with
+ * EOPNOTSUPP, as dummynet_modevent() does.
+ */
+ struct dn_alg *sch = arg;
+
+ switch (cmd) {
+ case MOD_LOAD:
+ return load_dn_sched(sch);
+ case MOD_UNLOAD:
+ return unload_dn_sched(sch);
+ default:
+ return EOPNOTSUPP;
+ }
}
static moduledata_t dummynet_mod = {
- "dummynet",
- dummynet_modevent,
- NULL
+ "dummynet", dummynet_modevent, NULL
};
-DECLARE_MODULE(dummynet, dummynet_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY);
+
+DECLARE_MODULE(dummynet, dummynet_mod,
+ SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY-1);
MODULE_DEPEND(dummynet, ipfw, 2, 2, 2);
MODULE_VERSION(dummynet, 1);
+/* end of file */
diff --git a/sys/netinet/ipfw/ip_fw2.c b/sys/netinet/ipfw/ip_fw2.c
index 1ccbf2b..959ad8e 100644
--- a/sys/netinet/ipfw/ip_fw2.c
+++ b/sys/netinet/ipfw/ip_fw2.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa
+ * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -26,11 +26,8 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
-#define DEB(x)
-#define DDB(x) x
-
/*
- * Implement IP packet firewall (new version)
+ * The FreeBSD IP packet firewall, main file
*/
#if !defined(KLD_MODULE)
@@ -65,13 +62,10 @@ __FBSDID("$FreeBSD$");
#include <sys/ucred.h>
#include <net/ethernet.h> /* for ETHERTYPE_IP */
#include <net/if.h>
-#include <net/radix.h>
#include <net/route.h>
#include <net/pf_mtag.h>
#include <net/vnet.h>
-#define IPFW_INTERNAL /* Access to protected data structures in ip_fw.h. */
-
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_pcb.h>
@@ -79,8 +73,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/ip_var.h>
#include <netinet/ip_icmp.h>
#include <netinet/ip_fw.h>
-#include <netinet/ip_divert.h>
-#include <netinet/ip_dummynet.h>
+#include <netinet/ipfw/ip_fw_private.h>
#include <netinet/ip_carp.h>
#include <netinet/pim.h>
#include <netinet/tcp_var.h>
@@ -88,8 +81,6 @@ __FBSDID("$FreeBSD$");
#include <netinet/udp_var.h>
#include <netinet/sctp.h>
-#include <netgraph/ng_ipfw.h>
-
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#ifdef INET6
@@ -103,73 +94,66 @@ __FBSDID("$FreeBSD$");
#include <security/mac/mac_framework.h>
#endif
-static VNET_DEFINE(int, ipfw_vnet_ready) = 0;
-#define V_ipfw_vnet_ready VNET(ipfw_vnet_ready)
/*
- * set_disable contains one bit per set value (0..31).
- * If the bit is set, all rules with the corresponding set
- * are disabled. Set RESVD_SET(31) is reserved for the default rule
- * and rules that are not deleted by the flush command,
- * and CANNOT be disabled.
- * Rules in set RESVD_SET can only be deleted explicitly.
+ * static variables followed by global ones.
+ * All ipfw global variables are here.
*/
-static VNET_DEFINE(u_int32_t, set_disable);
-static VNET_DEFINE(int, fw_verbose);
-static VNET_DEFINE(struct callout, ipfw_timeout);
-static VNET_DEFINE(int, verbose_limit);
-#define V_set_disable VNET(set_disable)
-#define V_fw_verbose VNET(fw_verbose)
-#define V_ipfw_timeout VNET(ipfw_timeout)
-#define V_verbose_limit VNET(verbose_limit)
+/* ipfw_vnet_ready controls when we are open for business */
+static VNET_DEFINE(int, ipfw_vnet_ready) = 0;
+#define V_ipfw_vnet_ready VNET(ipfw_vnet_ready)
+
+static VNET_DEFINE(int, fw_deny_unknown_exthdrs);
+#define V_fw_deny_unknown_exthdrs VNET(fw_deny_unknown_exthdrs)
#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT
static int default_to_accept = 1;
#else
static int default_to_accept;
#endif
-static uma_zone_t ipfw_dyn_rule_zone;
-struct ip_fw *ip_fw_default_rule;
+VNET_DEFINE(int, autoinc_step);
/*
- * list of rules for layer 3
+ * Each rule belongs to one of 32 different sets (0..31).
+ * The variable set_disable contains one bit per set.
+ * If the bit is set, all rules in the corresponding set
+ * are disabled. Set RESVD_SET(31) is reserved for the default rule
+ * and rules that are not deleted by the flush command,
+ * and CANNOT be disabled.
+ * Rules in set RESVD_SET can only be deleted individually.
*/
+VNET_DEFINE(u_int32_t, set_disable);
+#define V_set_disable VNET(set_disable)
+
+VNET_DEFINE(int, fw_verbose);
+/* counter for ipfw_log(NULL...) */
+VNET_DEFINE(u_int64_t, norule_counter);
+VNET_DEFINE(int, verbose_limit);
+
+/* layer3_chain contains the list of rules for layer 3 */
VNET_DEFINE(struct ip_fw_chain, layer3_chain);
-MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's");
-MALLOC_DEFINE(M_IPFW_TBL, "ipfw_tbl", "IpFw tables");
-#define IPFW_NAT_LOADED (ipfw_nat_ptr != NULL)
ipfw_nat_t *ipfw_nat_ptr = NULL;
+struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int);
ipfw_nat_cfg_t *ipfw_nat_cfg_ptr;
ipfw_nat_cfg_t *ipfw_nat_del_ptr;
ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr;
ipfw_nat_cfg_t *ipfw_nat_get_log_ptr;
-struct table_entry {
- struct radix_node rn[2];
- struct sockaddr_in addr, mask;
- u_int32_t value;
-};
-
-static VNET_DEFINE(int, autoinc_step);
-#define V_autoinc_step VNET(autoinc_step)
-static VNET_DEFINE(int, fw_deny_unknown_exthdrs);
-#define V_fw_deny_unknown_exthdrs VNET(fw_deny_unknown_exthdrs)
+#ifdef SYSCTL_NODE
+uint32_t dummy_def = IPFW_DEFAULT_RULE;
+uint32_t dummy_tables_max = IPFW_TABLES_MAX;
-extern int ipfw_chg_hook(SYSCTL_HANDLER_ARGS);
+SYSBEGIN(f3)
-#ifdef SYSCTL_NODE
SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
-SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, enable,
- CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_enable), 0,
- ipfw_chg_hook, "I", "Enable ipfw");
-SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step,
- CTLFLAG_RW, &VNET_NAME(autoinc_step), 0,
- "Rule number auto-increment step");
SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, one_pass,
CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_one_pass), 0,
"Only do a single pass through ipfw when using dummynet(4)");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step,
+ CTLFLAG_RW, &VNET_NAME(autoinc_step), 0,
+ "Rule number auto-increment step");
SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose,
CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_verbose), 0,
"Log matches to ipfw rules");
@@ -177,168 +161,34 @@ SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit,
CTLFLAG_RW, &VNET_NAME(verbose_limit), 0,
"Set upper limit of matches of ipfw rules logged");
SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, default_rule, CTLFLAG_RD,
- NULL, IPFW_DEFAULT_RULE,
+ &dummy_def, 0,
"The default/max possible rule number.");
SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, tables_max, CTLFLAG_RD,
- NULL, IPFW_TABLES_MAX,
+ &dummy_tables_max, 0,
"The maximum number of tables.");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, default_to_accept, CTLFLAG_RDTUN,
&default_to_accept, 0,
"Make the default rule accept all packets.");
TUNABLE_INT("net.inet.ip.fw.default_to_accept", &default_to_accept);
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, static_count,
+ CTLFLAG_RD, &VNET_NAME(layer3_chain.n_rules), 0,
+ "Number of static rules");
#ifdef INET6
SYSCTL_DECL(_net_inet6_ip6);
SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
-SYSCTL_VNET_PROC(_net_inet6_ip6_fw, OID_AUTO, enable,
- CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw6_enable), 0,
- ipfw_chg_hook, "I", "Enable ipfw+6");
SYSCTL_VNET_INT(_net_inet6_ip6_fw, OID_AUTO, deny_unknown_exthdrs,
CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(fw_deny_unknown_exthdrs), 0,
"Deny packets with unknown IPv6 Extension Headers");
#endif /* INET6 */
-#endif /* SYSCTL_NODE */
-
-/*
- * Description of dynamic rules.
- *
- * Dynamic rules are stored in lists accessed through a hash table
- * (ipfw_dyn_v) whose size is curr_dyn_buckets. This value can
- * be modified through the sysctl variable dyn_buckets which is
- * updated when the table becomes empty.
- *
- * XXX currently there is only one list, ipfw_dyn.
- *
- * When a packet is received, its address fields are first masked
- * with the mask defined for the rule, then hashed, then matched
- * against the entries in the corresponding list.
- * Dynamic rules can be used for different purposes:
- * + stateful rules;
- * + enforcing limits on the number of sessions;
- * + in-kernel NAT (not implemented yet)
- *
- * The lifetime of dynamic rules is regulated by dyn_*_lifetime,
- * measured in seconds and depending on the flags.
- *
- * The total number of dynamic rules is stored in dyn_count.
- * The max number of dynamic rules is dyn_max. When we reach
- * the maximum number of rules we do not create anymore. This is
- * done to avoid consuming too much memory, but also too much
- * time when searching on each packet (ideally, we should try instead
- * to put a limit on the length of the list on each bucket...).
- *
- * Each dynamic rule holds a pointer to the parent ipfw rule so
- * we know what action to perform. Dynamic rules are removed when
- * the parent rule is deleted. XXX we should make them survive.
- *
- * There are some limitations with dynamic rules -- we do not
- * obey the 'randomized match', and we do not do multiple
- * passes through the firewall. XXX check the latter!!!
- */
-static VNET_DEFINE(ipfw_dyn_rule **, ipfw_dyn_v);
-static VNET_DEFINE(u_int32_t, dyn_buckets);
-static VNET_DEFINE(u_int32_t, curr_dyn_buckets);
-
-#define V_ipfw_dyn_v VNET(ipfw_dyn_v)
-#define V_dyn_buckets VNET(dyn_buckets)
-#define V_curr_dyn_buckets VNET(curr_dyn_buckets)
-
-static struct mtx ipfw_dyn_mtx; /* mutex guarding dynamic rules */
-#define IPFW_DYN_LOCK_INIT() \
- mtx_init(&ipfw_dyn_mtx, "IPFW dynamic rules", NULL, MTX_DEF)
-#define IPFW_DYN_LOCK_DESTROY() mtx_destroy(&ipfw_dyn_mtx)
-#define IPFW_DYN_LOCK() mtx_lock(&ipfw_dyn_mtx)
-#define IPFW_DYN_UNLOCK() mtx_unlock(&ipfw_dyn_mtx)
-#define IPFW_DYN_LOCK_ASSERT() mtx_assert(&ipfw_dyn_mtx, MA_OWNED)
+SYSEND
-static struct mbuf *send_pkt(struct mbuf *, struct ipfw_flow_id *,
- u_int32_t, u_int32_t, int);
-
-
-/*
- * Timeouts for various events in handing dynamic rules.
- */
-static VNET_DEFINE(u_int32_t, dyn_ack_lifetime);
-static VNET_DEFINE(u_int32_t, dyn_syn_lifetime);
-static VNET_DEFINE(u_int32_t, dyn_fin_lifetime);
-static VNET_DEFINE(u_int32_t, dyn_rst_lifetime);
-static VNET_DEFINE(u_int32_t, dyn_udp_lifetime);
-static VNET_DEFINE(u_int32_t, dyn_short_lifetime);
-
-#define V_dyn_ack_lifetime VNET(dyn_ack_lifetime)
-#define V_dyn_syn_lifetime VNET(dyn_syn_lifetime)
-#define V_dyn_fin_lifetime VNET(dyn_fin_lifetime)
-#define V_dyn_rst_lifetime VNET(dyn_rst_lifetime)
-#define V_dyn_udp_lifetime VNET(dyn_udp_lifetime)
-#define V_dyn_short_lifetime VNET(dyn_short_lifetime)
-
-/*
- * Keepalives are sent if dyn_keepalive is set. They are sent every
- * dyn_keepalive_period seconds, in the last dyn_keepalive_interval
- * seconds of lifetime of a rule.
- * dyn_rst_lifetime and dyn_fin_lifetime should be strictly lower
- * than dyn_keepalive_period.
- */
-
-static VNET_DEFINE(u_int32_t, dyn_keepalive_interval);
-static VNET_DEFINE(u_int32_t, dyn_keepalive_period);
-static VNET_DEFINE(u_int32_t, dyn_keepalive);
-
-#define V_dyn_keepalive_interval VNET(dyn_keepalive_interval)
-#define V_dyn_keepalive_period VNET(dyn_keepalive_period)
-#define V_dyn_keepalive VNET(dyn_keepalive)
-
-static VNET_DEFINE(u_int32_t, static_count); /* # of static rules */
-static VNET_DEFINE(u_int32_t, static_len); /* bytes of static rules */
-static VNET_DEFINE(u_int32_t, dyn_count); /* # of dynamic rules */
-static VNET_DEFINE(u_int32_t, dyn_max); /* max # of dynamic rules */
-
-#define V_static_count VNET(static_count)
-#define V_static_len VNET(static_len)
-#define V_dyn_count VNET(dyn_count)
-#define V_dyn_max VNET(dyn_max)
-
-#ifdef SYSCTL_NODE
-SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_buckets,
- CTLFLAG_RW, &VNET_NAME(dyn_buckets), 0,
- "Number of dyn. buckets");
-SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets,
- CTLFLAG_RD, &VNET_NAME(curr_dyn_buckets), 0,
- "Current Number of dyn. buckets");
-SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_count,
- CTLFLAG_RD, &VNET_NAME(dyn_count), 0,
- "Number of dyn. rules");
-SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_max,
- CTLFLAG_RW, &VNET_NAME(dyn_max), 0,
- "Max number of dyn. rules");
-SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, static_count,
- CTLFLAG_RD, &VNET_NAME(static_count), 0,
- "Number of static rules");
-SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime,
- CTLFLAG_RW, &VNET_NAME(dyn_ack_lifetime), 0,
- "Lifetime of dyn. rules for acks");
-SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime,
- CTLFLAG_RW, &VNET_NAME(dyn_syn_lifetime), 0,
- "Lifetime of dyn. rules for syn");
-SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime,
- CTLFLAG_RW, &VNET_NAME(dyn_fin_lifetime), 0,
- "Lifetime of dyn. rules for fin");
-SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime,
- CTLFLAG_RW, &VNET_NAME(dyn_rst_lifetime), 0,
- "Lifetime of dyn. rules for rst");
-SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime,
- CTLFLAG_RW, &VNET_NAME(dyn_udp_lifetime), 0,
- "Lifetime of dyn. rules for UDP");
-SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime,
- CTLFLAG_RW, &VNET_NAME(dyn_short_lifetime), 0,
- "Lifetime of dyn. rules for other situations");
-SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive,
- CTLFLAG_RW, &VNET_NAME(dyn_keepalive), 0,
- "Enable keepalives for dyn. rules");
#endif /* SYSCTL_NODE */
+
/*
+ * Some macros used in the various matching options.
* L3HDR maps an ipv4 pointer into a layer3 header pointer of type T
* Other macros just cast void * into the appropriate type
*/
@@ -501,6 +351,7 @@ iface_match(struct ifnet *ifp, ipfw_insn_if *cmd)
return(1);
}
} else {
+#ifdef __FreeBSD__ /* and OSX too ? */
struct ifaddr *ia;
if_addr_rlock(ifp);
@@ -514,6 +365,7 @@ iface_match(struct ifnet *ifp, ipfw_insn_if *cmd)
}
}
if_addr_runlock(ifp);
+#endif /* __FreeBSD__ */
}
return(0); /* no match, fail ... */
}
@@ -524,23 +376,27 @@ iface_match(struct ifnet *ifp, ipfw_insn_if *cmd)
*
* The 'verrevpath' option checks that the interface that an IP packet
* arrives on is the same interface that traffic destined for the
- * packet's source address would be routed out of. The 'versrcreach'
- * option just checks that the source address is reachable via any route
- * (except default) in the routing table. These two are a measure to block
- * forged packets. This is also commonly known as "anti-spoofing" or Unicast
- * Reverse Path Forwarding (Unicast RFP) in Cisco-ese. The name of the knobs
+ * packet's source address would be routed out of.
+ * The 'versrcreach' option just checks that the source address is
+ * reachable via any route (except default) in the routing table.
+ * These two are a measure to block forged packets. This is also
+ * commonly known as "anti-spoofing" or Unicast Reverse Path
+ * Forwarding (Unicast RPF) in Cisco-ese. The name of the knobs
* is purposely reminiscent of the Cisco IOS command,
*
* ip verify unicast reverse-path
* ip verify unicast source reachable-via any
*
- * which implements the same functionality. But note that syntax is
- * misleading. The check may be performed on all IP packets whether unicast,
- * multicast, or broadcast.
+ * which implements the same functionality. But note that the syntax
+ * is misleading, and the check may be performed on all IP packets
+ * whether unicast, multicast, or broadcast.
*/
static int
verify_path(struct in_addr src, struct ifnet *ifp, u_int fib)
{
+#ifndef __FreeBSD__
+ return 0;
+#else
struct route ro;
struct sockaddr_in *dst;
@@ -583,6 +439,7 @@ verify_path(struct in_addr src, struct ifnet *ifp, u_int fib)
/* found valid route */
RTFREE(ro.ro_rt);
return 1;
+#endif /* __FreeBSD__ */
}
#ifdef INET6
@@ -681,17 +538,6 @@ verify_path6(struct in6_addr *src, struct ifnet *ifp)
return 1;
}
-static __inline int
-hash_packet6(struct ipfw_flow_id *id)
-{
- u_int32_t i;
- i = (id->dst_ip6.__u6_addr.__u6_addr32[2]) ^
- (id->dst_ip6.__u6_addr.__u6_addr32[3]) ^
- (id->src_ip6.__u6_addr.__u6_addr32[2]) ^
- (id->src_ip6.__u6_addr.__u6_addr32[3]) ^
- (id->dst_port) ^ (id->src_port);
- return i;
-}
static int
is_icmp6_query(int icmp6_type)
@@ -719,14 +565,14 @@ send_reject6(struct ip_fw_args *args, int code, u_int hlen, struct ip6_hdr *ip6)
if ((tcp->th_flags & TH_RST) == 0) {
struct mbuf *m0;
- m0 = send_pkt(args->m, &(args->f_id),
+ m0 = ipfw_send_pkt(args->m, &(args->f_id),
ntohl(tcp->th_seq), ntohl(tcp->th_ack),
tcp->th_flags | TH_RST);
if (m0 != NULL)
ip6_output(m0, NULL, NULL, 0, NULL, NULL,
NULL);
}
- m_freem(m);
+ FREE_PKT(m);
} else if (code != ICMP6_UNREACH_RST) { /* Send an ICMPv6 unreach. */
#if 0
/*
@@ -742,1085 +588,19 @@ send_reject6(struct ip_fw_args *args, int code, u_int hlen, struct ip6_hdr *ip6)
#endif
icmp6_error(m, ICMP6_DST_UNREACH, code, 0);
} else
- m_freem(m);
+ FREE_PKT(m);
args->m = NULL;
}
#endif /* INET6 */
-/* counter for ipfw_log(NULL...) */
-static VNET_DEFINE(u_int64_t, norule_counter);
-#define V_norule_counter VNET(norule_counter)
-
-#define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0
-#define SNP(buf) buf, sizeof(buf)
-
-/*
- * We enter here when we have a rule with O_LOG.
- * XXX this function alone takes about 2Kbytes of code!
- */
-static void
-ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args,
- struct mbuf *m, struct ifnet *oif, u_short offset, uint32_t tablearg,
- struct ip *ip)
-{
- struct ether_header *eh = args->eh;
- char *action;
- int limit_reached = 0;
- char action2[40], proto[128], fragment[32];
-
- fragment[0] = '\0';
- proto[0] = '\0';
-
- if (f == NULL) { /* bogus pkt */
- if (V_verbose_limit != 0 && V_norule_counter >= V_verbose_limit)
- return;
- V_norule_counter++;
- if (V_norule_counter == V_verbose_limit)
- limit_reached = V_verbose_limit;
- action = "Refuse";
- } else { /* O_LOG is the first action, find the real one */
- ipfw_insn *cmd = ACTION_PTR(f);
- ipfw_insn_log *l = (ipfw_insn_log *)cmd;
-
- if (l->max_log != 0 && l->log_left == 0)
- return;
- l->log_left--;
- if (l->log_left == 0)
- limit_reached = l->max_log;
- cmd += F_LEN(cmd); /* point to first action */
- if (cmd->opcode == O_ALTQ) {
- ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd;
-
- snprintf(SNPARGS(action2, 0), "Altq %d",
- altq->qid);
- cmd += F_LEN(cmd);
- }
- if (cmd->opcode == O_PROB)
- cmd += F_LEN(cmd);
-
- if (cmd->opcode == O_TAG)
- cmd += F_LEN(cmd);
-
- action = action2;
- switch (cmd->opcode) {
- case O_DENY:
- action = "Deny";
- break;
-
- case O_REJECT:
- if (cmd->arg1==ICMP_REJECT_RST)
- action = "Reset";
- else if (cmd->arg1==ICMP_UNREACH_HOST)
- action = "Reject";
- else
- snprintf(SNPARGS(action2, 0), "Unreach %d",
- cmd->arg1);
- break;
-
- case O_UNREACH6:
- if (cmd->arg1==ICMP6_UNREACH_RST)
- action = "Reset";
- else
- snprintf(SNPARGS(action2, 0), "Unreach %d",
- cmd->arg1);
- break;
-
- case O_ACCEPT:
- action = "Accept";
- break;
- case O_COUNT:
- action = "Count";
- break;
- case O_DIVERT:
- snprintf(SNPARGS(action2, 0), "Divert %d",
- cmd->arg1);
- break;
- case O_TEE:
- snprintf(SNPARGS(action2, 0), "Tee %d",
- cmd->arg1);
- break;
- case O_SETFIB:
- snprintf(SNPARGS(action2, 0), "SetFib %d",
- cmd->arg1);
- break;
- case O_SKIPTO:
- snprintf(SNPARGS(action2, 0), "SkipTo %d",
- cmd->arg1);
- break;
- case O_PIPE:
- snprintf(SNPARGS(action2, 0), "Pipe %d",
- cmd->arg1);
- break;
- case O_QUEUE:
- snprintf(SNPARGS(action2, 0), "Queue %d",
- cmd->arg1);
- break;
- case O_FORWARD_IP: {
- ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd;
- int len;
- struct in_addr dummyaddr;
- if (sa->sa.sin_addr.s_addr == INADDR_ANY)
- dummyaddr.s_addr = htonl(tablearg);
- else
- dummyaddr.s_addr = sa->sa.sin_addr.s_addr;
-
- len = snprintf(SNPARGS(action2, 0), "Forward to %s",
- inet_ntoa(dummyaddr));
-
- if (sa->sa.sin_port)
- snprintf(SNPARGS(action2, len), ":%d",
- sa->sa.sin_port);
- }
- break;
- case O_NETGRAPH:
- snprintf(SNPARGS(action2, 0), "Netgraph %d",
- cmd->arg1);
- break;
- case O_NGTEE:
- snprintf(SNPARGS(action2, 0), "Ngtee %d",
- cmd->arg1);
- break;
- case O_NAT:
- action = "Nat";
- break;
- case O_REASS:
- action = "Reass";
- break;
- default:
- action = "UNKNOWN";
- break;
- }
- }
-
- if (hlen == 0) { /* non-ip */
- snprintf(SNPARGS(proto, 0), "MAC");
-
- } else {
- int len;
-#ifdef INET6
- char src[INET6_ADDRSTRLEN + 2], dst[INET6_ADDRSTRLEN + 2];
-#else
- char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN];
-#endif
- struct icmphdr *icmp;
- struct tcphdr *tcp;
- struct udphdr *udp;
-#ifdef INET6
- struct ip6_hdr *ip6 = NULL;
- struct icmp6_hdr *icmp6;
-#endif
- src[0] = '\0';
- dst[0] = '\0';
-#ifdef INET6
- if (IS_IP6_FLOW_ID(&(args->f_id))) {
- char ip6buf[INET6_ADDRSTRLEN];
- snprintf(src, sizeof(src), "[%s]",
- ip6_sprintf(ip6buf, &args->f_id.src_ip6));
- snprintf(dst, sizeof(dst), "[%s]",
- ip6_sprintf(ip6buf, &args->f_id.dst_ip6));
-
- ip6 = (struct ip6_hdr *)ip;
- tcp = (struct tcphdr *)(((char *)ip) + hlen);
- udp = (struct udphdr *)(((char *)ip) + hlen);
- } else
-#endif
- {
- tcp = L3HDR(struct tcphdr, ip);
- udp = L3HDR(struct udphdr, ip);
-
- inet_ntoa_r(ip->ip_src, src);
- inet_ntoa_r(ip->ip_dst, dst);
- }
-
- switch (args->f_id.proto) {
- case IPPROTO_TCP:
- len = snprintf(SNPARGS(proto, 0), "TCP %s", src);
- if (offset == 0)
- snprintf(SNPARGS(proto, len), ":%d %s:%d",
- ntohs(tcp->th_sport),
- dst,
- ntohs(tcp->th_dport));
- else
- snprintf(SNPARGS(proto, len), " %s", dst);
- break;
-
- case IPPROTO_UDP:
- len = snprintf(SNPARGS(proto, 0), "UDP %s", src);
- if (offset == 0)
- snprintf(SNPARGS(proto, len), ":%d %s:%d",
- ntohs(udp->uh_sport),
- dst,
- ntohs(udp->uh_dport));
- else
- snprintf(SNPARGS(proto, len), " %s", dst);
- break;
-
- case IPPROTO_ICMP:
- icmp = L3HDR(struct icmphdr, ip);
- if (offset == 0)
- len = snprintf(SNPARGS(proto, 0),
- "ICMP:%u.%u ",
- icmp->icmp_type, icmp->icmp_code);
- else
- len = snprintf(SNPARGS(proto, 0), "ICMP ");
- len += snprintf(SNPARGS(proto, len), "%s", src);
- snprintf(SNPARGS(proto, len), " %s", dst);
- break;
-#ifdef INET6
- case IPPROTO_ICMPV6:
- icmp6 = (struct icmp6_hdr *)(((char *)ip) + hlen);
- if (offset == 0)
- len = snprintf(SNPARGS(proto, 0),
- "ICMPv6:%u.%u ",
- icmp6->icmp6_type, icmp6->icmp6_code);
- else
- len = snprintf(SNPARGS(proto, 0), "ICMPv6 ");
- len += snprintf(SNPARGS(proto, len), "%s", src);
- snprintf(SNPARGS(proto, len), " %s", dst);
- break;
-#endif
- default:
- len = snprintf(SNPARGS(proto, 0), "P:%d %s",
- args->f_id.proto, src);
- snprintf(SNPARGS(proto, len), " %s", dst);
- break;
- }
-
-#ifdef INET6
- if (IS_IP6_FLOW_ID(&(args->f_id))) {
- if (offset & (IP6F_OFF_MASK | IP6F_MORE_FRAG))
- snprintf(SNPARGS(fragment, 0),
- " (frag %08x:%d@%d%s)",
- args->f_id.frag_id6,
- ntohs(ip6->ip6_plen) - hlen,
- ntohs(offset & IP6F_OFF_MASK) << 3,
- (offset & IP6F_MORE_FRAG) ? "+" : "");
- } else
-#endif
- {
- int ip_off, ip_len;
- if (eh != NULL) { /* layer 2 packets are as on the wire */
- ip_off = ntohs(ip->ip_off);
- ip_len = ntohs(ip->ip_len);
- } else {
- ip_off = ip->ip_off;
- ip_len = ip->ip_len;
- }
- if (ip_off & (IP_MF | IP_OFFMASK))
- snprintf(SNPARGS(fragment, 0),
- " (frag %d:%d@%d%s)",
- ntohs(ip->ip_id), ip_len - (ip->ip_hl << 2),
- offset << 3,
- (ip_off & IP_MF) ? "+" : "");
- }
- }
- if (oif || m->m_pkthdr.rcvif)
- log(LOG_SECURITY | LOG_INFO,
- "ipfw: %d %s %s %s via %s%s\n",
- f ? f->rulenum : -1,
- action, proto, oif ? "out" : "in",
- oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname,
- fragment);
- else
- log(LOG_SECURITY | LOG_INFO,
- "ipfw: %d %s %s [no if info]%s\n",
- f ? f->rulenum : -1,
- action, proto, fragment);
- if (limit_reached)
- log(LOG_SECURITY | LOG_NOTICE,
- "ipfw: limit %d reached on entry %d\n",
- limit_reached, f ? f->rulenum : -1);
-}
-
-/*
- * IMPORTANT: the hash function for dynamic rules must be commutative
- * in source and destination (ip,port), because rules are bidirectional
- * and we want to find both in the same bucket.
- */
-static __inline int
-hash_packet(struct ipfw_flow_id *id)
-{
- u_int32_t i;
-
-#ifdef INET6
- if (IS_IP6_FLOW_ID(id))
- i = hash_packet6(id);
- else
-#endif /* INET6 */
- i = (id->dst_ip) ^ (id->src_ip) ^ (id->dst_port) ^ (id->src_port);
- i &= (V_curr_dyn_buckets - 1);
- return i;
-}
-
-static __inline void
-unlink_dyn_rule_print(struct ipfw_flow_id *id)
-{
- struct in_addr da;
-#ifdef INET6
- char src[INET6_ADDRSTRLEN], dst[INET6_ADDRSTRLEN];
-#else
- char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN];
-#endif
-
-#ifdef INET6
- if (IS_IP6_FLOW_ID(id)) {
- ip6_sprintf(src, &id->src_ip6);
- ip6_sprintf(dst, &id->dst_ip6);
- } else
-#endif
- {
- da.s_addr = htonl(id->src_ip);
- inet_ntoa_r(da, src);
- da.s_addr = htonl(id->dst_ip);
- inet_ntoa_r(da, dst);
- }
- printf("ipfw: unlink entry %s %d -> %s %d, %d left\n",
- src, id->src_port, dst, id->dst_port, V_dyn_count - 1);
-}
-
-/**
- * unlink a dynamic rule from a chain. prev is a pointer to
- * the previous one, q is a pointer to the rule to delete,
- * head is a pointer to the head of the queue.
- * Modifies q and potentially also head.
- */
-#define UNLINK_DYN_RULE(prev, head, q) { \
- ipfw_dyn_rule *old_q = q; \
- \
- /* remove a refcount to the parent */ \
- if (q->dyn_type == O_LIMIT) \
- q->parent->count--; \
- DEB(unlink_dyn_rule_print(&q->id);) \
- if (prev != NULL) \
- prev->next = q = q->next; \
- else \
- head = q = q->next; \
- V_dyn_count--; \
- uma_zfree(ipfw_dyn_rule_zone, old_q); }
-
-#define TIME_LEQ(a,b) ((int)((a)-(b)) <= 0)
-
-/**
- * Remove dynamic rules pointing to "rule", or all of them if rule == NULL.
- *
- * If keep_me == NULL, rules are deleted even if not expired,
- * otherwise only expired rules are removed.
- *
- * The value of the second parameter is also used to point to identify
- * a rule we absolutely do not want to remove (e.g. because we are
- * holding a reference to it -- this is the case with O_LIMIT_PARENT
- * rules). The pointer is only used for comparison, so any non-null
- * value will do.
- */
-static void
-remove_dyn_rule(struct ip_fw *rule, ipfw_dyn_rule *keep_me)
-{
- static u_int32_t last_remove = 0;
-
-#define FORCE (keep_me == NULL)
-
- ipfw_dyn_rule *prev, *q;
- int i, pass = 0, max_pass = 0;
-
- IPFW_DYN_LOCK_ASSERT();
-
- if (V_ipfw_dyn_v == NULL || V_dyn_count == 0)
- return;
- /* do not expire more than once per second, it is useless */
- if (!FORCE && last_remove == time_uptime)
- return;
- last_remove = time_uptime;
-
- /*
- * because O_LIMIT refer to parent rules, during the first pass only
- * remove child and mark any pending LIMIT_PARENT, and remove
- * them in a second pass.
- */
-next_pass:
- for (i = 0 ; i < V_curr_dyn_buckets ; i++) {
- for (prev=NULL, q = V_ipfw_dyn_v[i] ; q ; ) {
- /*
- * Logic can become complex here, so we split tests.
- */
- if (q == keep_me)
- goto next;
- if (rule != NULL && rule != q->rule)
- goto next; /* not the one we are looking for */
- if (q->dyn_type == O_LIMIT_PARENT) {
- /*
- * handle parent in the second pass,
- * record we need one.
- */
- max_pass = 1;
- if (pass == 0)
- goto next;
- if (FORCE && q->count != 0 ) {
- /* XXX should not happen! */
- printf("ipfw: OUCH! cannot remove rule,"
- " count %d\n", q->count);
- }
- } else {
- if (!FORCE &&
- !TIME_LEQ( q->expire, time_uptime ))
- goto next;
- }
- if (q->dyn_type != O_LIMIT_PARENT || !q->count) {
- UNLINK_DYN_RULE(prev, V_ipfw_dyn_v[i], q);
- continue;
- }
-next:
- prev=q;
- q=q->next;
- }
- }
- if (pass++ < max_pass)
- goto next_pass;
-}
-
-
-/**
- * lookup a dynamic rule.
- */
-static ipfw_dyn_rule *
-lookup_dyn_rule_locked(struct ipfw_flow_id *pkt, int *match_direction,
- struct tcphdr *tcp)
-{
- /*
- * stateful ipfw extensions.
- * Lookup into dynamic session queue
- */
-#define MATCH_REVERSE 0
-#define MATCH_FORWARD 1
-#define MATCH_NONE 2
-#define MATCH_UNKNOWN 3
- int i, dir = MATCH_NONE;
- ipfw_dyn_rule *prev, *q=NULL;
-
- IPFW_DYN_LOCK_ASSERT();
-
- if (V_ipfw_dyn_v == NULL)
- goto done; /* not found */
- i = hash_packet( pkt );
- for (prev=NULL, q = V_ipfw_dyn_v[i] ; q != NULL ; ) {
- if (q->dyn_type == O_LIMIT_PARENT && q->count)
- goto next;
- if (TIME_LEQ( q->expire, time_uptime)) { /* expire entry */
- UNLINK_DYN_RULE(prev, V_ipfw_dyn_v[i], q);
- continue;
- }
- if (pkt->proto == q->id.proto &&
- q->dyn_type != O_LIMIT_PARENT) {
- if (IS_IP6_FLOW_ID(pkt)) {
- if (IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6),
- &(q->id.src_ip6)) &&
- IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6),
- &(q->id.dst_ip6)) &&
- pkt->src_port == q->id.src_port &&
- pkt->dst_port == q->id.dst_port ) {
- dir = MATCH_FORWARD;
- break;
- }
- if (IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6),
- &(q->id.dst_ip6)) &&
- IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6),
- &(q->id.src_ip6)) &&
- pkt->src_port == q->id.dst_port &&
- pkt->dst_port == q->id.src_port ) {
- dir = MATCH_REVERSE;
- break;
- }
- } else {
- if (pkt->src_ip == q->id.src_ip &&
- pkt->dst_ip == q->id.dst_ip &&
- pkt->src_port == q->id.src_port &&
- pkt->dst_port == q->id.dst_port ) {
- dir = MATCH_FORWARD;
- break;
- }
- if (pkt->src_ip == q->id.dst_ip &&
- pkt->dst_ip == q->id.src_ip &&
- pkt->src_port == q->id.dst_port &&
- pkt->dst_port == q->id.src_port ) {
- dir = MATCH_REVERSE;
- break;
- }
- }
- }
-next:
- prev = q;
- q = q->next;
- }
- if (q == NULL)
- goto done; /* q = NULL, not found */
-
- if ( prev != NULL) { /* found and not in front */
- prev->next = q->next;
- q->next = V_ipfw_dyn_v[i];
- V_ipfw_dyn_v[i] = q;
- }
- if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */
- u_char flags = pkt->flags & (TH_FIN|TH_SYN|TH_RST);
-
-#define BOTH_SYN (TH_SYN | (TH_SYN << 8))
-#define BOTH_FIN (TH_FIN | (TH_FIN << 8))
- q->state |= (dir == MATCH_FORWARD ) ? flags : (flags << 8);
- switch (q->state) {
- case TH_SYN: /* opening */
- q->expire = time_uptime + V_dyn_syn_lifetime;
- break;
-
- case BOTH_SYN: /* move to established */
- case BOTH_SYN | TH_FIN : /* one side tries to close */
- case BOTH_SYN | (TH_FIN << 8) :
- if (tcp) {
-#define _SEQ_GE(a,b) ((int)(a) - (int)(b) >= 0)
- u_int32_t ack = ntohl(tcp->th_ack);
- if (dir == MATCH_FORWARD) {
- if (q->ack_fwd == 0 || _SEQ_GE(ack, q->ack_fwd))
- q->ack_fwd = ack;
- else { /* ignore out-of-sequence */
- break;
- }
- } else {
- if (q->ack_rev == 0 || _SEQ_GE(ack, q->ack_rev))
- q->ack_rev = ack;
- else { /* ignore out-of-sequence */
- break;
- }
- }
- }
- q->expire = time_uptime + V_dyn_ack_lifetime;
- break;
-
- case BOTH_SYN | BOTH_FIN: /* both sides closed */
- if (V_dyn_fin_lifetime >= V_dyn_keepalive_period)
- V_dyn_fin_lifetime = V_dyn_keepalive_period - 1;
- q->expire = time_uptime + V_dyn_fin_lifetime;
- break;
-
- default:
-#if 0
- /*
- * reset or some invalid combination, but can also
- * occur if we use keep-state the wrong way.
- */
- if ( (q->state & ((TH_RST << 8)|TH_RST)) == 0)
- printf("invalid state: 0x%x\n", q->state);
-#endif
- if (V_dyn_rst_lifetime >= V_dyn_keepalive_period)
- V_dyn_rst_lifetime = V_dyn_keepalive_period - 1;
- q->expire = time_uptime + V_dyn_rst_lifetime;
- break;
- }
- } else if (pkt->proto == IPPROTO_UDP) {
- q->expire = time_uptime + V_dyn_udp_lifetime;
- } else {
- /* other protocols */
- q->expire = time_uptime + V_dyn_short_lifetime;
- }
-done:
- if (match_direction)
- *match_direction = dir;
- return q;
-}
-
-static ipfw_dyn_rule *
-lookup_dyn_rule(struct ipfw_flow_id *pkt, int *match_direction,
- struct tcphdr *tcp)
-{
- ipfw_dyn_rule *q;
-
- IPFW_DYN_LOCK();
- q = lookup_dyn_rule_locked(pkt, match_direction, tcp);
- if (q == NULL)
- IPFW_DYN_UNLOCK();
- /* NB: return table locked when q is not NULL */
- return q;
-}
-
-static void
-realloc_dynamic_table(void)
-{
- IPFW_DYN_LOCK_ASSERT();
-
- /*
- * Try reallocation, make sure we have a power of 2 and do
- * not allow more than 64k entries. In case of overflow,
- * default to 1024.
- */
-
- if (V_dyn_buckets > 65536)
- V_dyn_buckets = 1024;
- if ((V_dyn_buckets & (V_dyn_buckets-1)) != 0) { /* not a power of 2 */
- V_dyn_buckets = V_curr_dyn_buckets; /* reset */
- return;
- }
- V_curr_dyn_buckets = V_dyn_buckets;
- if (V_ipfw_dyn_v != NULL)
- free(V_ipfw_dyn_v, M_IPFW);
- for (;;) {
- V_ipfw_dyn_v = malloc(V_curr_dyn_buckets * sizeof(ipfw_dyn_rule *),
- M_IPFW, M_NOWAIT | M_ZERO);
- if (V_ipfw_dyn_v != NULL || V_curr_dyn_buckets <= 2)
- break;
- V_curr_dyn_buckets /= 2;
- }
-}
-
-/**
- * Install state of type 'type' for a dynamic session.
- * The hash table contains two type of rules:
- * - regular rules (O_KEEP_STATE)
- * - rules for sessions with limited number of sess per user
- * (O_LIMIT). When they are created, the parent is
- * increased by 1, and decreased on delete. In this case,
- * the third parameter is the parent rule and not the chain.
- * - "parent" rules for the above (O_LIMIT_PARENT).
- */
-static ipfw_dyn_rule *
-add_dyn_rule(struct ipfw_flow_id *id, u_int8_t dyn_type, struct ip_fw *rule)
-{
- ipfw_dyn_rule *r;
- int i;
-
- IPFW_DYN_LOCK_ASSERT();
-
- if (V_ipfw_dyn_v == NULL ||
- (V_dyn_count == 0 && V_dyn_buckets != V_curr_dyn_buckets)) {
- realloc_dynamic_table();
- if (V_ipfw_dyn_v == NULL)
- return NULL; /* failed ! */
- }
- i = hash_packet(id);
-
- r = uma_zalloc(ipfw_dyn_rule_zone, M_NOWAIT | M_ZERO);
- if (r == NULL) {
- printf ("ipfw: sorry cannot allocate state\n");
- return NULL;
- }
-
- /* increase refcount on parent, and set pointer */
- if (dyn_type == O_LIMIT) {
- ipfw_dyn_rule *parent = (ipfw_dyn_rule *)rule;
- if ( parent->dyn_type != O_LIMIT_PARENT)
- panic("invalid parent");
- parent->count++;
- r->parent = parent;
- rule = parent->rule;
- }
-
- r->id = *id;
- r->expire = time_uptime + V_dyn_syn_lifetime;
- r->rule = rule;
- r->dyn_type = dyn_type;
- r->pcnt = r->bcnt = 0;
- r->count = 0;
-
- r->bucket = i;
- r->next = V_ipfw_dyn_v[i];
- V_ipfw_dyn_v[i] = r;
- V_dyn_count++;
- DEB({
- struct in_addr da;
-#ifdef INET6
- char src[INET6_ADDRSTRLEN];
- char dst[INET6_ADDRSTRLEN];
-#else
- char src[INET_ADDRSTRLEN];
- char dst[INET_ADDRSTRLEN];
-#endif
-
-#ifdef INET6
- if (IS_IP6_FLOW_ID(&(r->id))) {
- ip6_sprintf(src, &r->id.src_ip6);
- ip6_sprintf(dst, &r->id.dst_ip6);
- } else
-#endif
- {
- da.s_addr = htonl(r->id.src_ip);
- inet_ntoa_r(da, src);
- da.s_addr = htonl(r->id.dst_ip);
- inet_ntoa_r(da, dst);
- }
- printf("ipfw: add dyn entry ty %d %s %d -> %s %d, total %d\n",
- dyn_type, src, r->id.src_port, dst, r->id.dst_port,
- V_dyn_count);
- })
- return r;
-}
-
-/**
- * lookup dynamic parent rule using pkt and rule as search keys.
- * If the lookup fails, then install one.
- */
-static ipfw_dyn_rule *
-lookup_dyn_parent(struct ipfw_flow_id *pkt, struct ip_fw *rule)
-{
- ipfw_dyn_rule *q;
- int i;
-
- IPFW_DYN_LOCK_ASSERT();
-
- if (V_ipfw_dyn_v) {
- int is_v6 = IS_IP6_FLOW_ID(pkt);
- i = hash_packet( pkt );
- for (q = V_ipfw_dyn_v[i] ; q != NULL ; q=q->next)
- if (q->dyn_type == O_LIMIT_PARENT &&
- rule== q->rule &&
- pkt->proto == q->id.proto &&
- pkt->src_port == q->id.src_port &&
- pkt->dst_port == q->id.dst_port &&
- (
- (is_v6 &&
- IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6),
- &(q->id.src_ip6)) &&
- IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6),
- &(q->id.dst_ip6))) ||
- (!is_v6 &&
- pkt->src_ip == q->id.src_ip &&
- pkt->dst_ip == q->id.dst_ip)
- )
- ) {
- q->expire = time_uptime + V_dyn_short_lifetime;
- DEB(printf("ipfw: lookup_dyn_parent found 0x%p\n",q);)
- return q;
- }
- }
- return add_dyn_rule(pkt, O_LIMIT_PARENT, rule);
-}
-
-/**
- * Install dynamic state for rule type cmd->o.opcode
- *
- * Returns 1 (failure) if state is not installed because of errors or because
- * session limitations are enforced.
- */
-static int
-install_state(struct ip_fw *rule, ipfw_insn_limit *cmd,
- struct ip_fw_args *args, uint32_t tablearg)
-{
- static int last_log;
- ipfw_dyn_rule *q;
- struct in_addr da;
-#ifdef INET6
- char src[INET6_ADDRSTRLEN + 2], dst[INET6_ADDRSTRLEN + 2];
-#else
- char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN];
-#endif
-
- src[0] = '\0';
- dst[0] = '\0';
-
- IPFW_DYN_LOCK();
-
- DEB(
-#ifdef INET6
- if (IS_IP6_FLOW_ID(&(args->f_id))) {
- ip6_sprintf(src, &args->f_id.src_ip6);
- ip6_sprintf(dst, &args->f_id.dst_ip6);
- } else
-#endif
- {
- da.s_addr = htonl(args->f_id.src_ip);
- inet_ntoa_r(da, src);
- da.s_addr = htonl(args->f_id.dst_ip);
- inet_ntoa_r(da, dst);
- }
- printf("ipfw: %s: type %d %s %u -> %s %u\n",
- __func__, cmd->o.opcode, src, args->f_id.src_port,
- dst, args->f_id.dst_port);
- src[0] = '\0';
- dst[0] = '\0';
- )
-
- q = lookup_dyn_rule_locked(&args->f_id, NULL, NULL);
-
- if (q != NULL) { /* should never occur */
- if (last_log != time_uptime) {
- last_log = time_uptime;
- printf("ipfw: %s: entry already present, done\n",
- __func__);
- }
- IPFW_DYN_UNLOCK();
- return (0);
- }
-
- if (V_dyn_count >= V_dyn_max)
- /* Run out of slots, try to remove any expired rule. */
- remove_dyn_rule(NULL, (ipfw_dyn_rule *)1);
-
- if (V_dyn_count >= V_dyn_max) {
- if (last_log != time_uptime) {
- last_log = time_uptime;
- printf("ipfw: %s: Too many dynamic rules\n", __func__);
- }
- IPFW_DYN_UNLOCK();
- return (1); /* cannot install, notify caller */
- }
-
- switch (cmd->o.opcode) {
- case O_KEEP_STATE: /* bidir rule */
- add_dyn_rule(&args->f_id, O_KEEP_STATE, rule);
- break;
-
- case O_LIMIT: { /* limit number of sessions */
- struct ipfw_flow_id id;
- ipfw_dyn_rule *parent;
- uint32_t conn_limit;
- uint16_t limit_mask = cmd->limit_mask;
-
- conn_limit = (cmd->conn_limit == IP_FW_TABLEARG) ?
- tablearg : cmd->conn_limit;
-
- DEB(
- if (cmd->conn_limit == IP_FW_TABLEARG)
- printf("ipfw: %s: O_LIMIT rule, conn_limit: %u "
- "(tablearg)\n", __func__, conn_limit);
- else
- printf("ipfw: %s: O_LIMIT rule, conn_limit: %u\n",
- __func__, conn_limit);
- )
-
- id.dst_ip = id.src_ip = id.dst_port = id.src_port = 0;
- id.proto = args->f_id.proto;
- id.addr_type = args->f_id.addr_type;
- id.fib = M_GETFIB(args->m);
-
- if (IS_IP6_FLOW_ID (&(args->f_id))) {
- if (limit_mask & DYN_SRC_ADDR)
- id.src_ip6 = args->f_id.src_ip6;
- if (limit_mask & DYN_DST_ADDR)
- id.dst_ip6 = args->f_id.dst_ip6;
- } else {
- if (limit_mask & DYN_SRC_ADDR)
- id.src_ip = args->f_id.src_ip;
- if (limit_mask & DYN_DST_ADDR)
- id.dst_ip = args->f_id.dst_ip;
- }
- if (limit_mask & DYN_SRC_PORT)
- id.src_port = args->f_id.src_port;
- if (limit_mask & DYN_DST_PORT)
- id.dst_port = args->f_id.dst_port;
- if ((parent = lookup_dyn_parent(&id, rule)) == NULL) {
- printf("ipfw: %s: add parent failed\n", __func__);
- IPFW_DYN_UNLOCK();
- return (1);
- }
-
- if (parent->count >= conn_limit) {
- /* See if we can remove some expired rule. */
- remove_dyn_rule(rule, parent);
- if (parent->count >= conn_limit) {
- if (V_fw_verbose && last_log != time_uptime) {
- last_log = time_uptime;
-#ifdef INET6
- /*
- * XXX IPv6 flows are not
- * supported yet.
- */
- if (IS_IP6_FLOW_ID(&(args->f_id))) {
- char ip6buf[INET6_ADDRSTRLEN];
- snprintf(src, sizeof(src),
- "[%s]", ip6_sprintf(ip6buf,
- &args->f_id.src_ip6));
- snprintf(dst, sizeof(dst),
- "[%s]", ip6_sprintf(ip6buf,
- &args->f_id.dst_ip6));
- } else
-#endif
- {
- da.s_addr =
- htonl(args->f_id.src_ip);
- inet_ntoa_r(da, src);
- da.s_addr =
- htonl(args->f_id.dst_ip);
- inet_ntoa_r(da, dst);
- }
- log(LOG_SECURITY | LOG_DEBUG,
- "ipfw: %d %s %s:%u -> %s:%u, %s\n",
- parent->rule->rulenum,
- "drop session",
- src, (args->f_id.src_port),
- dst, (args->f_id.dst_port),
- "too many entries");
- }
- IPFW_DYN_UNLOCK();
- return (1);
- }
- }
- add_dyn_rule(&args->f_id, O_LIMIT, (struct ip_fw *)parent);
- break;
- }
- default:
- printf("ipfw: %s: unknown dynamic rule type %u\n",
- __func__, cmd->o.opcode);
- IPFW_DYN_UNLOCK();
- return (1);
- }
-
- /* XXX just set lifetime */
- lookup_dyn_rule_locked(&args->f_id, NULL, NULL);
-
- IPFW_DYN_UNLOCK();
- return (0);
-}
-
-/*
- * Generate a TCP packet, containing either a RST or a keepalive.
- * When flags & TH_RST, we are sending a RST packet, because of a
- * "reset" action matched the packet.
- * Otherwise we are sending a keepalive, and flags & TH_
- * The 'replyto' mbuf is the mbuf being replied to, if any, and is required
- * so that MAC can label the reply appropriately.
- */
-static struct mbuf *
-send_pkt(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t seq,
- u_int32_t ack, int flags)
-{
- struct mbuf *m;
- int len, dir;
- struct ip *h = NULL; /* stupid compiler */
-#ifdef INET6
- struct ip6_hdr *h6 = NULL;
-#endif
- struct tcphdr *th = NULL;
-
- MGETHDR(m, M_DONTWAIT, MT_DATA);
- if (m == NULL)
- return (NULL);
-
- M_SETFIB(m, id->fib);
-#ifdef MAC
- if (replyto != NULL)
- mac_netinet_firewall_reply(replyto, m);
- else
- mac_netinet_firewall_send(m);
-#else
- (void)replyto; /* don't warn about unused arg */
-#endif
-
- switch (id->addr_type) {
- case 4:
- len = sizeof(struct ip) + sizeof(struct tcphdr);
- break;
-#ifdef INET6
- case 6:
- len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
- break;
-#endif
- default:
- /* XXX: log me?!? */
- m_freem(m);
- return (NULL);
- }
- dir = ((flags & (TH_SYN | TH_RST)) == TH_SYN);
-
- m->m_data += max_linkhdr;
- m->m_flags |= M_SKIP_FIREWALL;
- m->m_pkthdr.len = m->m_len = len;
- m->m_pkthdr.rcvif = NULL;
- bzero(m->m_data, len);
-
- switch (id->addr_type) {
- case 4:
- h = mtod(m, struct ip *);
-
- /* prepare for checksum */
- h->ip_p = IPPROTO_TCP;
- h->ip_len = htons(sizeof(struct tcphdr));
- if (dir) {
- h->ip_src.s_addr = htonl(id->src_ip);
- h->ip_dst.s_addr = htonl(id->dst_ip);
- } else {
- h->ip_src.s_addr = htonl(id->dst_ip);
- h->ip_dst.s_addr = htonl(id->src_ip);
- }
-
- th = (struct tcphdr *)(h + 1);
- break;
-#ifdef INET6
- case 6:
- h6 = mtod(m, struct ip6_hdr *);
-
- /* prepare for checksum */
- h6->ip6_nxt = IPPROTO_TCP;
- h6->ip6_plen = htons(sizeof(struct tcphdr));
- if (dir) {
- h6->ip6_src = id->src_ip6;
- h6->ip6_dst = id->dst_ip6;
- } else {
- h6->ip6_src = id->dst_ip6;
- h6->ip6_dst = id->src_ip6;
- }
-
- th = (struct tcphdr *)(h6 + 1);
- break;
-#endif
- }
-
- if (dir) {
- th->th_sport = htons(id->src_port);
- th->th_dport = htons(id->dst_port);
- } else {
- th->th_sport = htons(id->dst_port);
- th->th_dport = htons(id->src_port);
- }
- th->th_off = sizeof(struct tcphdr) >> 2;
-
- if (flags & TH_RST) {
- if (flags & TH_ACK) {
- th->th_seq = htonl(ack);
- th->th_flags = TH_RST;
- } else {
- if (flags & TH_SYN)
- seq++;
- th->th_ack = htonl(seq);
- th->th_flags = TH_RST | TH_ACK;
- }
- } else {
- /*
- * Keepalive - use caller provided sequence numbers
- */
- th->th_seq = htonl(seq);
- th->th_ack = htonl(ack);
- th->th_flags = TH_ACK;
- }
-
- switch (id->addr_type) {
- case 4:
- th->th_sum = in_cksum(m, len);
-
- /* finish the ip header */
- h->ip_v = 4;
- h->ip_hl = sizeof(*h) >> 2;
- h->ip_tos = IPTOS_LOWDELAY;
- h->ip_off = 0;
- h->ip_len = len;
- h->ip_ttl = V_ip_defttl;
- h->ip_sum = 0;
- break;
-#ifdef INET6
- case 6:
- th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(*h6),
- sizeof(struct tcphdr));
-
- /* finish the ip6 header */
- h6->ip6_vfc |= IPV6_VERSION;
- h6->ip6_hlim = IPV6_DEFHLIM;
- break;
-#endif
- }
-
- return (m);
-}
/*
* sends a reject message, consuming the mbuf passed as an argument.
*/
static void
-send_reject(struct ip_fw_args *args, int code, int ip_len, struct ip *ip)
+send_reject(struct ip_fw_args *args, int code, int iplen, struct ip *ip)
{
#if 0
@@ -1835,269 +615,46 @@ send_reject(struct ip_fw_args *args, int code, int ip_len, struct ip *ip)
#endif
if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */
/* We need the IP header in host order for icmp_error(). */
- if (args->eh != NULL) {
- ip->ip_len = ntohs(ip->ip_len);
- ip->ip_off = ntohs(ip->ip_off);
- }
+ SET_HOST_IPLEN(ip);
icmp_error(args->m, ICMP_UNREACH, code, 0L, 0);
} else if (args->f_id.proto == IPPROTO_TCP) {
struct tcphdr *const tcp =
L3HDR(struct tcphdr, mtod(args->m, struct ip *));
if ( (tcp->th_flags & TH_RST) == 0) {
struct mbuf *m;
- m = send_pkt(args->m, &(args->f_id),
+ m = ipfw_send_pkt(args->m, &(args->f_id),
ntohl(tcp->th_seq), ntohl(tcp->th_ack),
tcp->th_flags | TH_RST);
if (m != NULL)
ip_output(m, NULL, NULL, 0, NULL, NULL);
}
- m_freem(args->m);
+ FREE_PKT(args->m);
} else
- m_freem(args->m);
+ FREE_PKT(args->m);
args->m = NULL;
}
-/**
- *
- * Given an ip_fw *, lookup_next_rule will return a pointer
- * to the next rule, which can be either the jump
- * target (for skipto instructions) or the next one in the list (in
- * all other cases including a missing jump target).
- * The result is also written in the "next_rule" field of the rule.
- * Backward jumps are not allowed, so start looking from the next
- * rule...
- *
- * This never returns NULL -- in case we do not have an exact match,
- * the next rule is returned. When the ruleset is changed,
- * pointers are flushed so we are always correct.
+/*
+ * Support for uid/gid/jail lookup. These tests are expensive
+ * (because we may need to look into the list of active sockets)
+ * so we cache the results. ugid_lookupp is 0 if we have not
+ * yet done a lookup, 1 if we succeeded, and -1 if we tried
+ * and failed. The function always returns the match value.
+ * We could actually spare the variable and use *uc, setting
+ * it to (void *)check_uidgid if we have no info, NULL if
+ * we tried and failed, or any other value if successful.
*/
-
-static struct ip_fw *
-lookup_next_rule(struct ip_fw *me, u_int32_t tablearg)
-{
- struct ip_fw *rule = NULL;
- ipfw_insn *cmd;
- u_int16_t rulenum;
-
- /* look for action, in case it is a skipto */
- cmd = ACTION_PTR(me);
- if (cmd->opcode == O_LOG)
- cmd += F_LEN(cmd);
- if (cmd->opcode == O_ALTQ)
- cmd += F_LEN(cmd);
- if (cmd->opcode == O_TAG)
- cmd += F_LEN(cmd);
- if (cmd->opcode == O_SKIPTO ) {
- if (tablearg != 0) {
- rulenum = (u_int16_t)tablearg;
- } else {
- rulenum = cmd->arg1;
- }
- for (rule = me->next; rule ; rule = rule->next) {
- if (rule->rulenum >= rulenum) {
- break;
- }
- }
- }
- if (rule == NULL) /* failure or not a skipto */
- rule = me->next;
- me->next_rule = rule;
- return rule;
-}
-
-static int
-add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
- uint8_t mlen, uint32_t value)
-{
- struct radix_node_head *rnh;
- struct table_entry *ent;
- struct radix_node *rn;
-
- if (tbl >= IPFW_TABLES_MAX)
- return (EINVAL);
- rnh = ch->tables[tbl];
- ent = malloc(sizeof(*ent), M_IPFW_TBL, M_NOWAIT | M_ZERO);
- if (ent == NULL)
- return (ENOMEM);
- ent->value = value;
- ent->addr.sin_len = ent->mask.sin_len = 8;
- ent->mask.sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0);
- ent->addr.sin_addr.s_addr = addr & ent->mask.sin_addr.s_addr;
- IPFW_WLOCK(ch);
- rn = rnh->rnh_addaddr(&ent->addr, &ent->mask, rnh, (void *)ent);
- if (rn == NULL) {
- IPFW_WUNLOCK(ch);
- free(ent, M_IPFW_TBL);
- return (EEXIST);
- }
- IPFW_WUNLOCK(ch);
- return (0);
-}
-
-static int
-del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
- uint8_t mlen)
-{
- struct radix_node_head *rnh;
- struct table_entry *ent;
- struct sockaddr_in sa, mask;
-
- if (tbl >= IPFW_TABLES_MAX)
- return (EINVAL);
- rnh = ch->tables[tbl];
- sa.sin_len = mask.sin_len = 8;
- mask.sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0);
- sa.sin_addr.s_addr = addr & mask.sin_addr.s_addr;
- IPFW_WLOCK(ch);
- ent = (struct table_entry *)rnh->rnh_deladdr(&sa, &mask, rnh);
- if (ent == NULL) {
- IPFW_WUNLOCK(ch);
- return (ESRCH);
- }
- IPFW_WUNLOCK(ch);
- free(ent, M_IPFW_TBL);
- return (0);
-}
-
-static int
-flush_table_entry(struct radix_node *rn, void *arg)
-{
- struct radix_node_head * const rnh = arg;
- struct table_entry *ent;
-
- ent = (struct table_entry *)
- rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, rnh);
- if (ent != NULL)
- free(ent, M_IPFW_TBL);
- return (0);
-}
-
-static int
-flush_table(struct ip_fw_chain *ch, uint16_t tbl)
-{
- struct radix_node_head *rnh;
-
- IPFW_WLOCK_ASSERT(ch);
-
- if (tbl >= IPFW_TABLES_MAX)
- return (EINVAL);
- rnh = ch->tables[tbl];
- KASSERT(rnh != NULL, ("NULL IPFW table"));
- rnh->rnh_walktree(rnh, flush_table_entry, rnh);
- return (0);
-}
-
-static void
-flush_tables(struct ip_fw_chain *ch)
-{
- uint16_t tbl;
-
- IPFW_WLOCK_ASSERT(ch);
-
- for (tbl = 0; tbl < IPFW_TABLES_MAX; tbl++)
- flush_table(ch, tbl);
-}
-
-static int
-init_tables(struct ip_fw_chain *ch)
-{
- int i;
- uint16_t j;
-
- for (i = 0; i < IPFW_TABLES_MAX; i++) {
- if (!rn_inithead((void **)&ch->tables[i], 32)) {
- for (j = 0; j < i; j++) {
- (void) flush_table(ch, j);
- }
- return (ENOMEM);
- }
- }
- return (0);
-}
-
-static int
-lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
- uint32_t *val)
-{
- struct radix_node_head *rnh;
- struct table_entry *ent;
- struct sockaddr_in sa;
-
- if (tbl >= IPFW_TABLES_MAX)
- return (0);
- rnh = ch->tables[tbl];
- sa.sin_len = 8;
- sa.sin_addr.s_addr = addr;
- ent = (struct table_entry *)(rnh->rnh_lookup(&sa, NULL, rnh));
- if (ent != NULL) {
- *val = ent->value;
- return (1);
- }
- return (0);
-}
-
-static int
-count_table_entry(struct radix_node *rn, void *arg)
-{
- u_int32_t * const cnt = arg;
-
- (*cnt)++;
- return (0);
-}
-
-static int
-count_table(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt)
-{
- struct radix_node_head *rnh;
-
- if (tbl >= IPFW_TABLES_MAX)
- return (EINVAL);
- rnh = ch->tables[tbl];
- *cnt = 0;
- rnh->rnh_walktree(rnh, count_table_entry, cnt);
- return (0);
-}
-
-static int
-dump_table_entry(struct radix_node *rn, void *arg)
-{
- struct table_entry * const n = (struct table_entry *)rn;
- ipfw_table * const tbl = arg;
- ipfw_table_entry *ent;
-
- if (tbl->cnt == tbl->size)
- return (1);
- ent = &tbl->ent[tbl->cnt];
- ent->tbl = tbl->tbl;
- if (in_nullhost(n->mask.sin_addr))
- ent->masklen = 0;
- else
- ent->masklen = 33 - ffs(ntohl(n->mask.sin_addr.s_addr));
- ent->addr = n->addr.sin_addr.s_addr;
- ent->value = n->value;
- tbl->cnt++;
- return (0);
-}
-
-static int
-dump_table(struct ip_fw_chain *ch, ipfw_table *tbl)
-{
- struct radix_node_head *rnh;
-
- if (tbl->tbl >= IPFW_TABLES_MAX)
- return (EINVAL);
- rnh = ch->tables[tbl->tbl];
- tbl->cnt = 0;
- rnh->rnh_walktree(rnh, dump_table_entry, tbl);
- return (0);
-}
-
static int
check_uidgid(ipfw_insn_u32 *insn, int proto, struct ifnet *oif,
struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip,
- u_int16_t src_port, struct ucred **uc, int *ugid_lookupp,
- struct inpcb *inp)
+ u_int16_t src_port, int *ugid_lookupp,
+ struct ucred **uc, struct inpcb *inp)
{
+#ifndef __FreeBSD__
+ return cred_check(insn, proto, oif,
+ dst_ip, dst_port, src_ip, src_port,
+ (struct bsd_ucred *)uc, ugid_lookupp, ((struct mbuf *)inp)->m_skb);
+#else /* FreeBSD */
struct inpcbinfo *pi;
int wildcard;
struct inpcb *pcb;
@@ -2150,10 +707,8 @@ check_uidgid(ipfw_insn_u32 *insn, int proto, struct ifnet *oif,
INP_INFO_RUNLOCK(pi);
if (*ugid_lookupp == 0) {
/*
- * If the lookup did not yield any results, there
- * is no sense in coming back and trying again. So
- * we can set lookup to -1 and ensure that we wont
- * bother the pcb system again.
+ * We tried and failed, set the variable to -1
+ * so we will not try again on this packet.
*/
*ugid_lookupp = -1;
return (0);
@@ -2166,6 +721,22 @@ check_uidgid(ipfw_insn_u32 *insn, int proto, struct ifnet *oif,
else if (insn->o.opcode == O_JAIL)
match = ((*uc)->cr_prison->pr_id == (int)insn->d[0]);
return match;
+#endif /* __FreeBSD__ */
+}
+
+/*
+ * Helper function to set args with info on the rule after the matching
+ * one. slot is precise, whereas we guess rule_id as they are
+ * assigned sequentially.
+ */
+static inline void
+set_match(struct ip_fw_args *args, int slot,
+ struct ip_fw_chain *chain)
+{
+ args->rule.chain_id = chain->id;
+ args->rule.slot = slot + 1; /* we use 0 as a marker */
+ args->rule.rule_id = 1 + chain->map[slot]->id;
+ args->rule.rulenum = chain->map[slot]->rulenum;
}
/*
@@ -2178,10 +749,10 @@ check_uidgid(ipfw_insn_u32 *insn, int proto, struct ifnet *oif,
*
* args->m (in/out) The packet; we set to NULL when/if we nuke it.
* Starts with the IP header.
- * args->eh (in) Mac header if present, or NULL for layer3 packet.
+ * args->eh (in) Mac header if present, NULL for layer3 packet.
* args->L3offset Number of bytes bypassed if we came from L2.
* e.g. often sizeof(eh) ** NOTYET **
- * args->oif Outgoing interface, or NULL if packet is incoming.
+ * args->oif Outgoing interface, NULL if packet is incoming.
* The incoming interface is in the mbuf. (in)
* args->divert_rule (in/out)
* Skip up to the first rule past this rule number;
@@ -2190,7 +761,7 @@ check_uidgid(ipfw_insn_u32 *insn, int proto, struct ifnet *oif,
* args->rule Pointer to the last matching rule (in/out)
* args->next_hop Socket we are forwarding to (out).
* args->f_id Addresses grabbed from the packet (out)
- * args->cookie a cookie depending on rule action
+ * args->rule.info a cookie depending on rule action
*
* Return value:
*
@@ -2200,6 +771,8 @@ check_uidgid(ipfw_insn_u32 *insn, int proto, struct ifnet *oif,
* IP_FW_TEE tee packet, port in m_tag
* IP_FW_DUMMYNET to dummynet, pipe in args->cookie
* IP_FW_NETGRAPH into netgraph, cookie args->cookie
+ * args->rule contains the matching rule,
+ * args->rule.info has additional information.
*
*/
int
@@ -2207,7 +780,7 @@ ipfw_chk(struct ip_fw_args *args)
{
/*
- * Local variables holding state during the processing of a packet:
+ * Local variables holding state while processing a packet:
*
* IMPORTANT NOTE: to speed up the processing of rules, there
* are some assumption on the values of the variables, which
@@ -2240,18 +813,14 @@ ipfw_chk(struct ip_fw_args *args)
* these types of constraints, as well as decrease contention
* on pcb related locks.
*/
+#ifndef __FreeBSD__
+ struct bsd_ucred ucred_cache;
+#else
struct ucred *ucred_cache = NULL;
+#endif
int ucred_lookup = 0;
/*
- * divinput_flags If non-zero, set to the IP_FW_DIVERT_*_FLAG
- * associated with a packet input on a divert socket. This
- * will allow to distinguish traffic and its direction when
- * it originates from a divert socket.
- */
- u_int divinput_flags = 0;
-
- /*
* oif | args->oif If NULL, ipfw_chk has been called on the
* inbound path (ether_input, ip_input).
* If non-NULL, ipfw_chk has been called on the outbound path
@@ -2259,7 +828,7 @@ ipfw_chk(struct ip_fw_args *args)
*/
struct ifnet *oif = args->oif;
- struct ip_fw *f = NULL; /* matching rule */
+ int f_pos = 0; /* index of current rule in the array */
int retval = 0;
/*
@@ -2294,12 +863,12 @@ ipfw_chk(struct ip_fw_args *args)
* src_ip, dst_ip ip addresses, in NETWORK format.
* Only valid for IPv4 packets.
*/
- u_int8_t proto;
- u_int16_t src_port = 0, dst_port = 0; /* NOTE: host format */
+ uint8_t proto;
+ uint16_t src_port = 0, dst_port = 0; /* NOTE: host format */
struct in_addr src_ip, dst_ip; /* NOTE: network format */
- u_int16_t ip_len=0;
+ uint16_t iplen=0;
int pktlen;
- u_int16_t etype = 0; /* Host order stored ether type */
+ uint16_t etype = 0; /* Host order stored ether type */
/*
* dyn_dir = MATCH_UNKNOWN when rules unchecked,
@@ -2309,7 +878,6 @@ ipfw_chk(struct ip_fw_args *args)
int dyn_dir = MATCH_UNKNOWN;
ipfw_dyn_rule *q = NULL;
struct ip_fw_chain *chain = &V_layer3_chain;
- struct m_tag *mtag;
/*
* We store in ulp a pointer to the upper layer protocol header.
@@ -2318,12 +886,17 @@ ipfw_chk(struct ip_fw_args *args)
* ulp is NULL if not found.
*/
void *ulp = NULL; /* upper layer protocol pointer. */
+
/* XXX ipv6 variables */
int is_ipv6 = 0;
- u_int16_t ext_hd = 0; /* bits vector for extension header filtering */
+ uint8_t icmp6_type = 0;
+ uint16_t ext_hd = 0; /* bits vector for extension header filtering */
/* end of ipv6 variables */
+
int is_ipv4 = 0;
+ int done = 0; /* flag to exit the outer loop */
+
if (m->m_flags & M_SKIP_FIREWALL || (! V_ipfw_vnet_ready))
return (IP_FW_PASS); /* accept */
@@ -2340,15 +913,15 @@ ipfw_chk(struct ip_fw_args *args)
* pointer might become stale after other pullups (but we never use it
* this way).
*/
-#define PULLUP_TO(_len, p, T) \
-do { \
- int x = (_len) + sizeof(T); \
- if ((m)->m_len < x) { \
- args->m = m = m_pullup(m, x); \
- if (m == NULL) \
- goto pullup_failed; \
- } \
- p = (mtod(m, char *) + (_len)); \
+#define PULLUP_TO(_len, p, T) \
+do { \
+ int x = (_len) + sizeof(T); \
+ if ((m)->m_len < x) { \
+ args->m = m = m_pullup(m, x); \
+ if (m == NULL) \
+ goto pullup_failed; \
+ } \
+ p = (mtod(m, char *) + (_len)); \
} while (0)
/*
@@ -2371,14 +944,15 @@ do { \
switch (proto) {
case IPPROTO_ICMPV6:
PULLUP_TO(hlen, ulp, struct icmp6_hdr);
- args->f_id.flags = ICMP6(ulp)->icmp6_type;
+ icmp6_type = ICMP6(ulp)->icmp6_type;
break;
case IPPROTO_TCP:
PULLUP_TO(hlen, ulp, struct tcphdr);
dst_port = TCP(ulp)->th_dport;
src_port = TCP(ulp)->th_sport;
- args->f_id.flags = TCP(ulp)->th_flags;
+ /* save flags for dynamic rules */
+ args->f_id._flags = TCP(ulp)->th_flags;
break;
case IPPROTO_SCTP:
@@ -2442,7 +1016,7 @@ do { \
return (IP_FW_DENY);
break;
}
- args->f_id.frag_id6 =
+ args->f_id.extra =
ntohl(((struct ip6_frag *)ulp)->ip6f_ident);
ulp = NULL;
break;
@@ -2535,14 +1109,9 @@ do { \
proto = ip->ip_p;
src_ip = ip->ip_src;
dst_ip = ip->ip_dst;
- if (args->eh != NULL) { /* layer 2 packets are as on the wire */
- offset = ntohs(ip->ip_off) & IP_OFFMASK;
- ip_len = ntohs(ip->ip_len);
- } else {
- offset = ip->ip_off & IP_OFFMASK;
- ip_len = ip->ip_len;
- }
- pktlen = ip_len < pktlen ? ip_len : pktlen;
+ offset = ntohs(ip->ip_off) & IP_OFFMASK;
+ iplen = ntohs(ip->ip_len);
+ pktlen = iplen < pktlen ? iplen : pktlen;
if (offset == 0) {
switch (proto) {
@@ -2550,7 +1119,8 @@ do { \
PULLUP_TO(hlen, ulp, struct tcphdr);
dst_port = TCP(ulp)->th_dport;
src_port = TCP(ulp)->th_sport;
- args->f_id.flags = TCP(ulp)->th_flags;
+ /* save flags for dynamic rules */
+ args->f_id._flags = TCP(ulp)->th_flags;
break;
case IPPROTO_UDP:
@@ -2561,7 +1131,7 @@ do { \
case IPPROTO_ICMP:
PULLUP_TO(hlen, ulp, struct icmphdr);
- args->f_id.flags = ICMP(ulp)->icmp_type;
+ //args->f_id.flags = ICMP(ulp)->icmp_type;
break;
default:
@@ -2585,64 +1155,47 @@ do { \
IPFW_RUNLOCK(chain);
return (IP_FW_PASS); /* accept */
}
- mtag = m_tag_find(m, PACKET_TAG_DIVERT, NULL);
- if (args->rule) {
+ if (args->rule.slot) {
/*
- * Packet has already been tagged. Look for the next rule
- * to restart processing. Make sure that args->rule still
- * exists and not changed.
+ * Packet has already been tagged as a result of a previous
+ * match on rule args->rule aka args->rule_id (PIPE, QUEUE,
+ * REASS, NETGRAPH, DIVERT/TEE...)
+ * Validate the slot and continue from the next one
+ * if still present, otherwise do a lookup.
*/
- if (chain->id != args->chain_id) {
- for (f = chain->rules; f != NULL; f = f->next)
- if (f == args->rule && f->id == args->rule_id)
- break;
-
- if (f != NULL)
- f = f->next_rule;
- else
- f = ip_fw_default_rule;
- } else
- f = args->rule->next_rule;
-
- if (f == NULL)
- f = lookup_next_rule(args->rule, 0);
+ f_pos = (args->rule.chain_id == chain->id) ?
+ args->rule.slot :
+ ipfw_find_rule(chain, args->rule.rulenum,
+ args->rule.rule_id);
} else {
- /*
- * Find the starting rule. It can be either the first
- * one, or the one after divert_rule if asked so.
- */
- int skipto = mtag ? divert_cookie(mtag) : 0;
-
- f = chain->rules;
- if (args->eh == NULL && skipto != 0) {
- if (skipto >= IPFW_DEFAULT_RULE) {
- IPFW_RUNLOCK(chain);
- return (IP_FW_DENY); /* invalid */
- }
- while (f && f->rulenum <= skipto)
- f = f->next;
- if (f == NULL) { /* drop packet */
- IPFW_RUNLOCK(chain);
- return (IP_FW_DENY);
- }
- }
- }
- /* reset divert rule to avoid confusion later */
- if (mtag) {
- divinput_flags = divert_info(mtag) &
- (IP_FW_DIVERT_OUTPUT_FLAG | IP_FW_DIVERT_LOOPBACK_FLAG);
- m_tag_delete(m, mtag);
+ f_pos = 0;
}
/*
* Now scan the rules, and parse microinstructions for each rule.
+ * We have two nested loops and an inner switch. Sometimes we
+ * need to break out of one or both loops, or re-enter one of
+ * the loops with updated variables. Loop variables are:
+ *
+ * f_pos (outer loop) points to the current rule.
+ * On output it points to the matching rule.
+ * done (outer loop) is used as a flag to break the loop.
+ * l (inner loop) residual length of current rule.
+ * cmd points to the current microinstruction.
+ *
+ * We break the inner loop by setting l=0 and possibly
+ * cmdlen=0 if we don't want to advance cmd.
+ * We break the outer loop by setting done=1
+ * We can restart the inner loop by setting l>0 and f_pos, f, cmd
+ * as needed.
*/
- for (; f; f = f->next) {
+ for (; f_pos < chain->n_rules; f_pos++) {
ipfw_insn *cmd;
uint32_t tablearg = 0;
int l, cmdlen, skip_or; /* skip rest of OR block */
+ struct ip_fw *f;
-again:
+ f = chain->map[f_pos];
if (V_set_disable & (1 << f->set) )
continue;
@@ -2657,7 +1210,7 @@ again:
* the target rule.
*/
-check_body:
+/* check_body: */
cmdlen = F_LEN(cmd);
/*
* An OR block (insn_1 || .. || insn_n) has the
@@ -2708,8 +1261,13 @@ check_body:
(ipfw_insn_u32 *)cmd,
proto, oif,
dst_ip, dst_port,
- src_ip, src_port, &ucred_cache,
- &ucred_lookup, args->inp);
+ src_ip, src_port, &ucred_lookup,
+#ifdef __FreeBSD__
+ &ucred_cache, args->inp);
+#else
+ (void *)&ucred_cache,
+ (struct inpcb *)args->m);
+#endif
break;
case O_RECV:
@@ -2767,10 +1325,15 @@ check_body:
break;
case O_DIVERTED:
- match = (cmd->arg1 & 1 && divinput_flags &
- IP_FW_DIVERT_LOOPBACK_FLAG) ||
- (cmd->arg1 & 2 && divinput_flags &
- IP_FW_DIVERT_OUTPUT_FLAG);
+ {
+ /* For diverted packets, args->rule.info
+ * contains the divert port (in host format)
+ * reason and direction.
+ */
+ uint32_t i = args->rule.info;
+ match = (i&IPFW_IS_MASK) == IPFW_IS_DIVERT &&
+ cmd->arg1 & ((i & IPFW_INFO_IN) ? 1 : 2);
+ }
break;
case O_PROTO:
@@ -2790,13 +1353,57 @@ check_body:
case O_IP_SRC_LOOKUP:
case O_IP_DST_LOOKUP:
if (is_ipv4) {
- uint32_t a =
+ uint32_t key =
(cmd->opcode == O_IP_DST_LOOKUP) ?
dst_ip.s_addr : src_ip.s_addr;
uint32_t v = 0;
- match = lookup_table(chain, cmd->arg1, a,
- &v);
+ if (cmdlen > F_INSN_SIZE(ipfw_insn_u32)) {
+ /* generic lookup. The key must be
+ * in 32bit big-endian format.
+ */
+ v = ((ipfw_insn_u32 *)cmd)->d[1];
+ if (v == 0)
+ key = dst_ip.s_addr;
+ else if (v == 1)
+ key = src_ip.s_addr;
+ else if (v == 6) /* dscp */
+ key = (ip->ip_tos >> 2) & 0x3f;
+ else if (offset != 0)
+ break;
+ else if (proto != IPPROTO_TCP &&
+ proto != IPPROTO_UDP)
+ break;
+ else if (v == 2)
+ key = htonl(dst_port);
+ else if (v == 3)
+ key = htonl(src_port);
+ else if (v == 4 || v == 5) {
+ check_uidgid(
+ (ipfw_insn_u32 *)cmd,
+ proto, oif,
+ dst_ip, dst_port,
+ src_ip, src_port, &ucred_lookup,
+#ifdef __FreeBSD__
+ &ucred_cache, args->inp);
+ if (v == 4 /* O_UID */)
+ key = ucred_cache->cr_uid;
+ else if (v == 5 /* O_JAIL */)
+ key = ucred_cache->cr_prison->pr_id;
+#else /* !__FreeBSD__ */
+ (void *)&ucred_cache,
+ (struct inpcb *)args->m);
+ if (v ==4 /* O_UID */)
+ key = ucred_cache.uid;
+ else if (v == 5 /* O_JAIL */)
+ key = ucred_cache.xid;
+#endif /* !__FreeBSD__ */
+ key = htonl(key);
+ } else
+ break;
+ }
+ match = ipfw_lookup_table(chain,
+ cmd->arg1, key, &v);
if (!match)
break;
if (cmdlen == F_INSN_SIZE(ipfw_insn_u32))
@@ -2827,7 +1434,13 @@ check_body:
INADDR_TO_IFP(src_ip, tif);
match = (tif != NULL);
+ break;
}
+#ifdef INET6
+ /* FALLTHROUGH */
+ case O_IP6_SRC_ME:
+ match= is_ipv6 && search_ip6_addr_net(&args->f_id.src_ip6);
+#endif
break;
case O_IP_DST_SET:
@@ -2860,9 +1473,16 @@ check_body:
INADDR_TO_IFP(dst_ip, tif);
match = (tif != NULL);
+ break;
}
+#ifdef INET6
+ /* FALLTHROUGH */
+ case O_IP6_DST_ME:
+ match= is_ipv6 && search_ip6_addr_net(&args->f_id.dst_ip6);
+#endif
break;
+
case O_IP_SRCPORT:
case O_IP_DSTPORT:
/*
@@ -2919,7 +1539,7 @@ check_body:
int i;
if (cmd->opcode == O_IPLEN)
- x = ip_len;
+ x = iplen;
else if (cmd->opcode == O_IPTTL)
x = ip->ip_ttl;
else /* must be IPID */
@@ -2954,7 +1574,7 @@ check_body:
int i;
tcp = TCP(ulp);
- x = ip_len -
+ x = iplen -
((ip->ip_hl + tcp->th_off) << 2);
if (cmdlen == 1) {
match = (cmd->arg1 == x);
@@ -3029,8 +1649,7 @@ check_body:
}
case O_LOG:
- if (V_fw_verbose)
- ipfw_log(f, hlen, args, m,
+ ipfw_log(f, hlen, args, m,
oif, offset, tablearg, ip);
match = 1;
break;
@@ -3129,14 +1748,6 @@ check_body:
}
break;
- case O_IP6_SRC_ME:
- match= is_ipv6 && search_ip6_addr_net(&args->f_id.src_ip6);
- break;
-
- case O_IP6_DST_ME:
- match= is_ipv6 && search_ip6_addr_net(&args->f_id.dst_ip6);
- break;
-
case O_FLOW6ID:
match = is_ipv6 &&
flow6id_match(args->f_id.flow_id6,
@@ -3158,6 +1769,7 @@ check_body:
break;
case O_TAG: {
+ struct m_tag *mtag;
uint32_t tag = (cmd->arg1 == IP_FW_TABLEARG) ?
tablearg : cmd->arg1;
@@ -3174,12 +1786,13 @@ check_body:
if (cmd->len & F_NOT) { /* `untag' action */
if (mtag != NULL)
m_tag_delete(m, mtag);
+ match = 0;
} else if (mtag == NULL) {
if ((mtag = m_tag_alloc(MTAG_IPFW,
tag, 0, M_NOWAIT)) != NULL)
m_tag_prepend(m, mtag);
+ match = 1;
}
- match = (cmd->len & F_NOT) ? 0: 1;
break;
}
@@ -3189,6 +1802,7 @@ check_body:
break;
case O_TAGGED: {
+ struct m_tag *mtag;
uint32_t tag = (cmd->arg1 == IP_FW_TABLEARG) ?
tablearg : cmd->arg1;
@@ -3228,14 +1842,13 @@ check_body:
*
* In general, here we set retval and terminate the
* outer loop (would be a 'break 3' in some language,
- * but we need to do a 'goto done').
+ * but we need to set l=0, done=1)
*
* Exceptions:
* O_COUNT and O_SKIPTO actions:
* instead of terminating, we jump to the next rule
- * ('goto next_rule', equivalent to a 'break 2'),
- * or to the SKIPTO target ('goto again' after
- * having set f, cmd and l), respectively.
+ * (setting l=0), or to the SKIPTO target (setting
+ * f/f_pos, cmd and l as needed), respectively.
*
* O_TAG, O_LOG and O_ALTQ action parameters:
* perform some action and set match = 1;
@@ -3246,25 +1859,28 @@ check_body:
* These opcodes try to install an entry in the
* state tables; if successful, we continue with
* the next opcode (match=1; break;), otherwise
- * the packet * must be dropped
- * ('goto done' after setting retval);
+ * the packet must be dropped (set retval,
+ * break loops with l=0, done=1)
*
* O_PROBE_STATE and O_CHECK_STATE: these opcodes
* cause a lookup of the state table, and a jump
* to the 'action' part of the parent rule
- * ('goto check_body') if an entry is found, or
+ * if an entry is found, or
* (CHECK_STATE only) a jump to the next rule if
- * the entry is not found ('goto next_rule').
- * The result of the lookup is cached to make
- * further instances of these opcodes are
- * effectively NOPs.
+ * the entry is not found.
+ * The result of the lookup is cached so that
+ * further instances of these opcodes become NOPs.
+ * The jump to the next rule is done by setting
+ * l=0, cmdlen=0.
*/
case O_LIMIT:
case O_KEEP_STATE:
- if (install_state(f,
+ if (ipfw_install_state(f,
(ipfw_insn_limit *)cmd, args, tablearg)) {
+ /* error or limit violation */
retval = IP_FW_DENY;
- goto done; /* error/limit violation */
+ l = 0; /* exit inner loop */
+ done = 1; /* exit outer loop */
}
match = 1;
break;
@@ -3281,22 +1897,32 @@ check_body:
* to be run first).
*/
if (dyn_dir == MATCH_UNKNOWN &&
- (q = lookup_dyn_rule(&args->f_id,
+ (q = ipfw_lookup_dyn_rule(&args->f_id,
&dyn_dir, proto == IPPROTO_TCP ?
TCP(ulp) : NULL))
!= NULL) {
/*
* Found dynamic entry, update stats
* and jump to the 'action' part of
- * the parent rule.
+ * the parent rule by setting
+ * f, cmd, l and clearing cmdlen.
*/
q->pcnt++;
q->bcnt += pktlen;
+ /* XXX we would like to have f_pos
+ * readily accessible in the dynamic
+ * rule, instead of having to
+ * lookup q->rule.
+ */
f = q->rule;
+ f_pos = ipfw_find_rule(chain,
+ f->rulenum, f->id);
cmd = ACTION_PTR(f);
l = f->cmd_len - f->act_ofs;
- IPFW_DYN_UNLOCK();
- goto check_body;
+ ipfw_dyn_unlock();
+ cmdlen = 0;
+ match = 1;
+ break;
}
/*
* Dynamic entry not found. If CHECK_STATE,
@@ -3304,70 +1930,96 @@ check_body:
* ignore and continue with next opcode.
*/
if (cmd->opcode == O_CHECK_STATE)
- goto next_rule;
+ l = 0; /* exit inner loop */
match = 1;
break;
case O_ACCEPT:
retval = 0; /* accept */
- goto done;
+ l = 0; /* exit inner loop */
+ done = 1; /* exit outer loop */
+ break;
case O_PIPE:
case O_QUEUE:
- args->rule = f; /* report matching rule */
- args->rule_id = f->id;
- args->chain_id = chain->id;
- if (cmd->arg1 == IP_FW_TABLEARG)
- args->cookie = tablearg;
- else
- args->cookie = cmd->arg1;
+ set_match(args, f_pos, chain);
+ args->rule.info = (cmd->arg1 == IP_FW_TABLEARG) ?
+ tablearg : cmd->arg1;
+ if (cmd->opcode == O_PIPE)
+ args->rule.info |= IPFW_IS_PIPE;
+ if (V_fw_one_pass)
+ args->rule.info |= IPFW_ONEPASS;
retval = IP_FW_DUMMYNET;
- goto done;
+ l = 0; /* exit inner loop */
+ done = 1; /* exit outer loop */
+ break;
case O_DIVERT:
- case O_TEE: {
- struct divert_tag *dt;
-
+ case O_TEE:
if (args->eh) /* not on layer 2 */
- break;
- mtag = m_tag_get(PACKET_TAG_DIVERT,
- sizeof(struct divert_tag),
- M_NOWAIT);
- if (mtag == NULL) {
- /* XXX statistic */
- /* drop packet */
- IPFW_RUNLOCK(chain);
- if (ucred_cache != NULL)
- crfree(ucred_cache);
- return (IP_FW_DENY);
- }
- dt = (struct divert_tag *)(mtag+1);
- dt->cookie = f->rulenum;
- if (cmd->arg1 == IP_FW_TABLEARG)
- dt->info = tablearg;
- else
- dt->info = cmd->arg1;
- m_tag_prepend(m, mtag);
+ break;
+ /* otherwise this is terminal */
+ l = 0; /* exit inner loop */
+ done = 1; /* exit outer loop */
retval = (cmd->opcode == O_DIVERT) ?
- IP_FW_DIVERT : IP_FW_TEE;
- goto done;
- }
+ IP_FW_DIVERT : IP_FW_TEE;
+ set_match(args, f_pos, chain);
+ args->rule.info = (cmd->arg1 == IP_FW_TABLEARG) ?
+ tablearg : cmd->arg1;
+ break;
+
case O_COUNT:
- case O_SKIPTO:
f->pcnt++; /* update stats */
f->bcnt += pktlen;
f->timestamp = time_uptime;
- if (cmd->opcode == O_COUNT)
- goto next_rule;
- /* handle skipto */
- if (cmd->arg1 == IP_FW_TABLEARG) {
- f = lookup_next_rule(f, tablearg);
- } else {
- if (f->next_rule == NULL)
- lookup_next_rule(f, 0);
- f = f->next_rule;
+ l = 0; /* exit inner loop */
+ break;
+
+ case O_SKIPTO:
+ f->pcnt++; /* update stats */
+ f->bcnt += pktlen;
+ f->timestamp = time_uptime;
+ /* If possible use cached f_pos (in f->next_rule),
+ * whose version is written in f->x_next
+ * (horrible hacks to avoid changing the ABI).
+ */
+ if (cmd->arg1 != IP_FW_TABLEARG &&
+ (uintptr_t)f->x_next == chain->id) {
+ f_pos = (uintptr_t)f->next_rule;
+ } else {
+ int i = (cmd->arg1 == IP_FW_TABLEARG) ?
+ tablearg : cmd->arg1;
+ /* make sure we do not jump backward */
+ if (i <= f->rulenum)
+ i = f->rulenum + 1;
+ f_pos = ipfw_find_rule(chain, i, 0);
+ /* update the cache */
+ if (cmd->arg1 != IP_FW_TABLEARG) {
+ f->next_rule =
+ (void *)(uintptr_t)f_pos;
+ f->x_next =
+ (void *)(uintptr_t)chain->id;
}
- goto again;
+ }
+ /*
+ * Skip disabled rules, and re-enter
+ * the inner loop with the correct
+ * f_pos, f, l and cmd.
+ * Also clear cmdlen and skip_or
+ */
+ for (; f_pos < chain->n_rules - 1 &&
+ (V_set_disable &
+ (1 << chain->map[f_pos]->set));
+ f_pos++)
+ ;
+ /* prepare to enter the inner loop */
+ f = chain->map[f_pos];
+ l = f->cmd_len;
+ cmd = f->cmd;
+ match = 1;
+ cmdlen = 0;
+ skip_or = 0;
+ break;
case O_REJECT:
/*
@@ -3380,7 +2032,7 @@ check_body:
is_icmp_query(ICMP(ulp))) &&
!(m->m_flags & (M_BCAST|M_MCAST)) &&
!IN_MULTICAST(ntohl(dst_ip.s_addr))) {
- send_reject(args, cmd->arg1, ip_len, ip);
+ send_reject(args, cmd->arg1, iplen, ip);
m = args->m;
}
/* FALLTHROUGH */
@@ -3389,7 +2041,7 @@ check_body:
if (hlen > 0 && is_ipv6 &&
((offset & IP6F_OFF_MASK) == 0) &&
(proto != IPPROTO_ICMPV6 ||
- (is_icmp6_query(args->f_id.flags) == 1)) &&
+ (is_icmp6_query(icmp6_type) == 1)) &&
!(m->m_flags & (M_BCAST|M_MCAST)) &&
!IN6_IS_ADDR_MULTICAST(&args->f_id.dst_ip6)) {
send_reject6(
@@ -3401,41 +2053,41 @@ check_body:
#endif
case O_DENY:
retval = IP_FW_DENY;
- goto done;
+ l = 0; /* exit inner loop */
+ done = 1; /* exit outer loop */
+ break;
- case O_FORWARD_IP: {
- struct sockaddr_in *sa;
- sa = &(((ipfw_insn_sa *)cmd)->sa);
+ case O_FORWARD_IP:
if (args->eh) /* not valid on layer2 pkts */
break;
if (!q || dyn_dir == MATCH_FORWARD) {
- if (sa->sin_addr.s_addr == INADDR_ANY) {
- bcopy(sa, &args->hopstore,
+ struct sockaddr_in *sa;
+ sa = &(((ipfw_insn_sa *)cmd)->sa);
+ if (sa->sin_addr.s_addr == INADDR_ANY) {
+ bcopy(sa, &args->hopstore,
sizeof(*sa));
- args->hopstore.sin_addr.s_addr =
+ args->hopstore.sin_addr.s_addr =
htonl(tablearg);
- args->next_hop =
- &args->hopstore;
- } else {
- args->next_hop = sa;
- }
+ args->next_hop = &args->hopstore;
+ } else {
+ args->next_hop = sa;
+ }
}
retval = IP_FW_PASS;
- }
- goto done;
+ l = 0; /* exit inner loop */
+ done = 1; /* exit outer loop */
+ break;
case O_NETGRAPH:
case O_NGTEE:
- args->rule = f; /* report matching rule */
- args->rule_id = f->id;
- args->chain_id = chain->id;
- if (cmd->arg1 == IP_FW_TABLEARG)
- args->cookie = tablearg;
- else
- args->cookie = cmd->arg1;
+ set_match(args, f_pos, chain);
+ args->rule.info = (cmd->arg1 == IP_FW_TABLEARG) ?
+ tablearg : cmd->arg1;
retval = (cmd->opcode == O_NETGRAPH) ?
IP_FW_NETGRAPH : IP_FW_NGTEE;
- goto done;
+ l = 0; /* exit inner loop */
+ done = 1; /* exit outer loop */
+ break;
case O_SETFIB:
f->pcnt++; /* update stats */
@@ -3443,88 +2095,86 @@ check_body:
f->timestamp = time_uptime;
M_SETFIB(m, cmd->arg1);
args->f_id.fib = cmd->arg1;
- goto next_rule;
+ l = 0; /* exit inner loop */
+ break;
+
+ case O_NAT:
+ if (!IPFW_NAT_LOADED) {
+ retval = IP_FW_DENY;
+ } else {
+ struct cfg_nat *t;
+ int nat_id;
- case O_NAT: {
- struct cfg_nat *t;
- int nat_id;
+ set_match(args, f_pos, chain);
+ t = ((ipfw_insn_nat *)cmd)->nat;
+ if (t == NULL) {
+ nat_id = (cmd->arg1 == IP_FW_TABLEARG) ?
+ tablearg : cmd->arg1;
+ t = (*lookup_nat_ptr)(&chain->nat, nat_id);
- if (IPFW_NAT_LOADED) {
- args->rule = f; /* Report matching rule. */
- args->rule_id = f->id;
- args->chain_id = chain->id;
- t = ((ipfw_insn_nat *)cmd)->nat;
if (t == NULL) {
- nat_id = (cmd->arg1 == IP_FW_TABLEARG) ?
- tablearg : cmd->arg1;
- LOOKUP_NAT(V_layer3_chain, nat_id, t);
- if (t == NULL) {
- retval = IP_FW_DENY;
- goto done;
- }
- if (cmd->arg1 != IP_FW_TABLEARG)
- ((ipfw_insn_nat *)cmd)->nat = t;
+ retval = IP_FW_DENY;
+ l = 0; /* exit inner loop */
+ done = 1; /* exit outer loop */
+ break;
}
- retval = ipfw_nat_ptr(args, t, m);
- } else
- retval = IP_FW_DENY;
- goto done;
- }
+ if (cmd->arg1 != IP_FW_TABLEARG)
+ ((ipfw_insn_nat *)cmd)->nat = t;
+ }
+ retval = ipfw_nat_ptr(args, t, m);
+ }
+ l = 0; /* exit inner loop */
+ done = 1; /* exit outer loop */
+ break;
case O_REASS: {
int ip_off;
f->pcnt++;
f->bcnt += pktlen;
- ip_off = (args->eh != NULL) ? ntohs(ip->ip_off) : ip->ip_off;
- if (ip_off & (IP_MF | IP_OFFMASK)) {
- /*
- * ip_reass() expects len & off in host
- * byte order: fix them in case we come
- * from layer2.
- */
- if (args->eh != NULL) {
- ip->ip_len = ntohs(ip->ip_len);
- ip->ip_off = ntohs(ip->ip_off);
- }
+ l = 0; /* in any case exit inner loop */
+ ip_off = ntohs(ip->ip_off);
- m = ip_reass(m);
- args->m = m;
-
- /*
- * IP header checksum fixup after
- * reassembly and leave header
- * in network byte order.
- */
- if (m != NULL) {
- int hlen;
-
- ip = mtod(m, struct ip *);
- hlen = ip->ip_hl << 2;
- /* revert len & off for layer2 pkts */
- if (args->eh != NULL)
- ip->ip_len = htons(ip->ip_len);
- ip->ip_sum = 0;
- if (hlen == sizeof(struct ip))
- ip->ip_sum = in_cksum_hdr(ip);
- else
- ip->ip_sum = in_cksum(m, hlen);
- retval = IP_FW_REASS;
- args->rule = f;
- args->rule_id = f->id;
- args->chain_id = chain->id;
- goto done;
- } else {
- retval = IP_FW_DENY;
- goto done;
- }
+ /* if not fragmented, go to next rule */
+ if ((ip_off & (IP_MF | IP_OFFMASK)) == 0)
+ break;
+ /*
+ * ip_reass() expects len & off in host
+ * byte order.
+ */
+ SET_HOST_IPLEN(ip);
+
+ args->m = m = ip_reass(m);
+
+ /*
+ * do IP header checksum fixup.
+ */
+ if (m == NULL) { /* fragment got swallowed */
+ retval = IP_FW_DENY;
+ } else { /* good, packet complete */
+ int hlen;
+
+ ip = mtod(m, struct ip *);
+ hlen = ip->ip_hl << 2;
+ SET_NET_IPLEN(ip);
+ ip->ip_sum = 0;
+ if (hlen == sizeof(struct ip))
+ ip->ip_sum = in_cksum_hdr(ip);
+ else
+ ip->ip_sum = in_cksum(m, hlen);
+ retval = IP_FW_REASS;
+ set_match(args, f_pos, chain);
}
- goto next_rule;
+ done = 1; /* exit outer loop */
+ break;
}
default:
panic("-- unknown opcode %d\n", cmd->opcode);
} /* end of switch() on opcodes */
+ /*
+ * if we get here with l=0, then match is irrelevant.
+ */
if (cmd->len & F_NOT)
match = !match;
@@ -3537,25 +2187,30 @@ check_body:
break; /* try next rule */
}
- } /* end of inner for, scan opcodes */
+ } /* end of inner loop, scan opcodes */
+
+ if (done)
+ break;
-next_rule:; /* try next rule */
+/* next_rule:; */ /* try next rule */
} /* end of outer for, scan rules */
- printf("ipfw: ouch!, skip past end of rules, denying packet\n");
- IPFW_RUNLOCK(chain);
- if (ucred_cache != NULL)
- crfree(ucred_cache);
- return (IP_FW_DENY);
-done:
- /* Update statistics */
- f->pcnt++;
- f->bcnt += pktlen;
- f->timestamp = time_uptime;
+ if (done) {
+ struct ip_fw *rule = chain->map[f_pos];
+ /* Update statistics */
+ rule->pcnt++;
+ rule->bcnt += pktlen;
+ rule->timestamp = time_uptime;
+ } else {
+ retval = IP_FW_DENY;
+ printf("ipfw: ouch!, skip past end of rules, denying packet\n");
+ }
IPFW_RUNLOCK(chain);
+#ifdef __FreeBSD__
if (ucred_cache != NULL)
crfree(ucred_cache);
+#endif
return (retval);
pullup_failed:
@@ -3565,1150 +2220,10 @@ pullup_failed:
}
/*
- * When a rule is added/deleted, clear the next_rule pointers in all rules.
- * These will be reconstructed on the fly as packets are matched.
- */
-static void
-flush_rule_ptrs(struct ip_fw_chain *chain)
-{
- struct ip_fw *rule;
-
- IPFW_WLOCK_ASSERT(chain);
-
- chain->id++;
-
- for (rule = chain->rules; rule; rule = rule->next)
- rule->next_rule = NULL;
-}
-
-/*
- * Add a new rule to the list. Copy the rule into a malloc'ed area, then
- * possibly create a rule number and add the rule to the list.
- * Update the rule_number in the input struct so the caller knows it as well.
- */
-static int
-add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
-{
- struct ip_fw *rule, *f, *prev;
- int l = RULESIZE(input_rule);
-
- if (chain->rules == NULL && input_rule->rulenum != IPFW_DEFAULT_RULE)
- return (EINVAL);
-
- rule = malloc(l, M_IPFW, M_NOWAIT | M_ZERO);
- if (rule == NULL)
- return (ENOSPC);
-
- bcopy(input_rule, rule, l);
-
- rule->next = NULL;
- rule->next_rule = NULL;
-
- rule->pcnt = 0;
- rule->bcnt = 0;
- rule->timestamp = 0;
-
- IPFW_WLOCK(chain);
-
- if (chain->rules == NULL) { /* default rule */
- chain->rules = rule;
- rule->id = ++chain->id;
- goto done;
- }
-
- /*
- * If rulenum is 0, find highest numbered rule before the
- * default rule, and add autoinc_step
- */
- if (V_autoinc_step < 1)
- V_autoinc_step = 1;
- else if (V_autoinc_step > 1000)
- V_autoinc_step = 1000;
- if (rule->rulenum == 0) {
- /*
- * locate the highest numbered rule before default
- */
- for (f = chain->rules; f; f = f->next) {
- if (f->rulenum == IPFW_DEFAULT_RULE)
- break;
- rule->rulenum = f->rulenum;
- }
- if (rule->rulenum < IPFW_DEFAULT_RULE - V_autoinc_step)
- rule->rulenum += V_autoinc_step;
- input_rule->rulenum = rule->rulenum;
- }
-
- /*
- * Now insert the new rule in the right place in the sorted list.
- */
- for (prev = NULL, f = chain->rules; f; prev = f, f = f->next) {
- if (f->rulenum > rule->rulenum) { /* found the location */
- if (prev) {
- rule->next = f;
- prev->next = rule;
- } else { /* head insert */
- rule->next = chain->rules;
- chain->rules = rule;
- }
- break;
- }
- }
- flush_rule_ptrs(chain);
- /* chain->id incremented inside flush_rule_ptrs() */
- rule->id = chain->id;
-done:
- V_static_count++;
- V_static_len += l;
- IPFW_WUNLOCK(chain);
- DEB(printf("ipfw: installed rule %d, static count now %d\n",
- rule->rulenum, V_static_count);)
- return (0);
-}
-
-/**
- * Remove a static rule (including derived * dynamic rules)
- * and place it on the ``reap list'' for later reclamation.
- * The caller is in charge of clearing rule pointers to avoid
- * dangling pointers.
- * @return a pointer to the next entry.
- * Arguments are not checked, so they better be correct.
- */
-static struct ip_fw *
-remove_rule(struct ip_fw_chain *chain, struct ip_fw *rule,
- struct ip_fw *prev)
-{
- struct ip_fw *n;
- int l = RULESIZE(rule);
-
- IPFW_WLOCK_ASSERT(chain);
-
- n = rule->next;
- IPFW_DYN_LOCK();
- remove_dyn_rule(rule, NULL /* force removal */);
- IPFW_DYN_UNLOCK();
- if (prev == NULL)
- chain->rules = n;
- else
- prev->next = n;
- V_static_count--;
- V_static_len -= l;
-
- rule->next = chain->reap;
- chain->reap = rule;
-
- return n;
-}
-
-/*
- * Reclaim storage associated with a list of rules. This is
- * typically the list created using remove_rule.
- * A NULL pointer on input is handled correctly.
- */
-static void
-reap_rules(struct ip_fw *head)
-{
- struct ip_fw *rule;
-
- while ((rule = head) != NULL) {
- head = head->next;
- free(rule, M_IPFW);
- }
-}
-
-/*
- * Remove all rules from a chain (except rules in set RESVD_SET
- * unless kill_default = 1). The caller is responsible for
- * reclaiming storage for the rules left in chain->reap.
- */
-static void
-free_chain(struct ip_fw_chain *chain, int kill_default)
-{
- struct ip_fw *prev, *rule;
-
- IPFW_WLOCK_ASSERT(chain);
-
- chain->reap = NULL;
- flush_rule_ptrs(chain); /* more efficient to do outside the loop */
- for (prev = NULL, rule = chain->rules; rule ; )
- if (kill_default || rule->set != RESVD_SET)
- rule = remove_rule(chain, rule, prev);
- else {
- prev = rule;
- rule = rule->next;
- }
-}
-
-/**
- * Remove all rules with given number, and also do set manipulation.
- * Assumes chain != NULL && *chain != NULL.
- *
- * The argument is an u_int32_t. The low 16 bit are the rule or set number,
- * the next 8 bits are the new set, the top 8 bits are the command:
- *
- * 0 delete rules with given number
- * 1 delete rules with given set number
- * 2 move rules with given number to new set
- * 3 move rules with given set number to new set
- * 4 swap sets with given numbers
- * 5 delete rules with given number and with given set number
- */
-static int
-del_entry(struct ip_fw_chain *chain, u_int32_t arg)
-{
- struct ip_fw *prev = NULL, *rule;
- u_int16_t rulenum; /* rule or old_set */
- u_int8_t cmd, new_set;
-
- rulenum = arg & 0xffff;
- cmd = (arg >> 24) & 0xff;
- new_set = (arg >> 16) & 0xff;
-
- if (cmd > 5 || new_set > RESVD_SET)
- return EINVAL;
- if (cmd == 0 || cmd == 2 || cmd == 5) {
- if (rulenum >= IPFW_DEFAULT_RULE)
- return EINVAL;
- } else {
- if (rulenum > RESVD_SET) /* old_set */
- return EINVAL;
- }
-
- IPFW_WLOCK(chain);
- rule = chain->rules; /* common starting point */
- chain->reap = NULL; /* prepare for deletions */
- switch (cmd) {
- case 0: /* delete rules with given number */
- /*
- * locate first rule to delete
- */
- for (; rule->rulenum < rulenum; prev = rule, rule = rule->next)
- ;
- if (rule->rulenum != rulenum) {
- IPFW_WUNLOCK(chain);
- return EINVAL;
- }
-
- /*
- * flush pointers outside the loop, then delete all matching
- * rules. prev remains the same throughout the cycle.
- */
- flush_rule_ptrs(chain);
- while (rule->rulenum == rulenum)
- rule = remove_rule(chain, rule, prev);
- break;
-
- case 1: /* delete all rules with given set number */
- flush_rule_ptrs(chain);
- while (rule->rulenum < IPFW_DEFAULT_RULE) {
- if (rule->set == rulenum)
- rule = remove_rule(chain, rule, prev);
- else {
- prev = rule;
- rule = rule->next;
- }
- }
- break;
-
- case 2: /* move rules with given number to new set */
- for (; rule->rulenum < IPFW_DEFAULT_RULE; rule = rule->next)
- if (rule->rulenum == rulenum)
- rule->set = new_set;
- break;
-
- case 3: /* move rules with given set number to new set */
- for (; rule->rulenum < IPFW_DEFAULT_RULE; rule = rule->next)
- if (rule->set == rulenum)
- rule->set = new_set;
- break;
-
- case 4: /* swap two sets */
- for (; rule->rulenum < IPFW_DEFAULT_RULE; rule = rule->next)
- if (rule->set == rulenum)
- rule->set = new_set;
- else if (rule->set == new_set)
- rule->set = rulenum;
- break;
-
- case 5: /* delete rules with given number and with given set number.
- * rulenum - given rule number;
- * new_set - given set number.
- */
- for (; rule->rulenum < rulenum; prev = rule, rule = rule->next)
- ;
- if (rule->rulenum != rulenum) {
- IPFW_WUNLOCK(chain);
- return (EINVAL);
- }
- flush_rule_ptrs(chain);
- while (rule->rulenum == rulenum) {
- if (rule->set == new_set)
- rule = remove_rule(chain, rule, prev);
- else {
- prev = rule;
- rule = rule->next;
- }
- }
- }
- /*
- * Look for rules to reclaim. We grab the list before
- * releasing the lock then reclaim them w/o the lock to
- * avoid a LOR with dummynet.
- */
- rule = chain->reap;
- IPFW_WUNLOCK(chain);
- reap_rules(rule);
- return 0;
-}
-
-/*
- * Clear counters for a specific rule.
- * The enclosing "table" is assumed locked.
- */
-static void
-clear_counters(struct ip_fw *rule, int log_only)
-{
- ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule);
-
- if (log_only == 0) {
- rule->bcnt = rule->pcnt = 0;
- rule->timestamp = 0;
- }
- if (l->o.opcode == O_LOG)
- l->log_left = l->max_log;
-}
-
-/**
- * Reset some or all counters on firewall rules.
- * The argument `arg' is an u_int32_t. The low 16 bit are the rule number,
- * the next 8 bits are the set number, the top 8 bits are the command:
- * 0 work with rules from all set's;
- * 1 work with rules only from specified set.
- * Specified rule number is zero if we want to clear all entries.
- * log_only is 1 if we only want to reset logs, zero otherwise.
- */
-static int
-zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int log_only)
-{
- struct ip_fw *rule;
- char *msg;
-
- uint16_t rulenum = arg & 0xffff;
- uint8_t set = (arg >> 16) & 0xff;
- uint8_t cmd = (arg >> 24) & 0xff;
-
- if (cmd > 1)
- return (EINVAL);
- if (cmd == 1 && set > RESVD_SET)
- return (EINVAL);
-
- IPFW_WLOCK(chain);
- if (rulenum == 0) {
- V_norule_counter = 0;
- for (rule = chain->rules; rule; rule = rule->next) {
- /* Skip rules from another set. */
- if (cmd == 1 && rule->set != set)
- continue;
- clear_counters(rule, log_only);
- }
- msg = log_only ? "All logging counts reset" :
- "Accounting cleared";
- } else {
- int cleared = 0;
- /*
- * We can have multiple rules with the same number, so we
- * need to clear them all.
- */
- for (rule = chain->rules; rule; rule = rule->next)
- if (rule->rulenum == rulenum) {
- while (rule && rule->rulenum == rulenum) {
- if (cmd == 0 || rule->set == set)
- clear_counters(rule, log_only);
- rule = rule->next;
- }
- cleared = 1;
- break;
- }
- if (!cleared) { /* we did not find any matching rules */
- IPFW_WUNLOCK(chain);
- return (EINVAL);
- }
- msg = log_only ? "logging count reset" : "cleared";
- }
- IPFW_WUNLOCK(chain);
-
- if (V_fw_verbose) {
- int lev = LOG_SECURITY | LOG_NOTICE;
-
- if (rulenum)
- log(lev, "ipfw: Entry %d %s.\n", rulenum, msg);
- else
- log(lev, "ipfw: %s.\n", msg);
- }
- return (0);
-}
-
-/*
- * Check validity of the structure before insert.
- * Fortunately rules are simple, so this mostly need to check rule sizes.
- */
-static int
-check_ipfw_struct(struct ip_fw *rule, int size)
-{
- int l, cmdlen = 0;
- int have_action=0;
- ipfw_insn *cmd;
-
- if (size < sizeof(*rule)) {
- printf("ipfw: rule too short\n");
- return (EINVAL);
- }
- /* first, check for valid size */
- l = RULESIZE(rule);
- if (l != size) {
- printf("ipfw: size mismatch (have %d want %d)\n", size, l);
- return (EINVAL);
- }
- if (rule->act_ofs >= rule->cmd_len) {
- printf("ipfw: bogus action offset (%u > %u)\n",
- rule->act_ofs, rule->cmd_len - 1);
- return (EINVAL);
- }
- /*
- * Now go for the individual checks. Very simple ones, basically only
- * instruction sizes.
- */
- for (l = rule->cmd_len, cmd = rule->cmd ;
- l > 0 ; l -= cmdlen, cmd += cmdlen) {
- cmdlen = F_LEN(cmd);
- if (cmdlen > l) {
- printf("ipfw: opcode %d size truncated\n",
- cmd->opcode);
- return EINVAL;
- }
- DEB(printf("ipfw: opcode %d\n", cmd->opcode);)
- switch (cmd->opcode) {
- case O_PROBE_STATE:
- case O_KEEP_STATE:
- case O_PROTO:
- case O_IP_SRC_ME:
- case O_IP_DST_ME:
- case O_LAYER2:
- case O_IN:
- case O_FRAG:
- case O_DIVERTED:
- case O_IPOPT:
- case O_IPTOS:
- case O_IPPRECEDENCE:
- case O_IPVER:
- case O_TCPWIN:
- case O_TCPFLAGS:
- case O_TCPOPTS:
- case O_ESTAB:
- case O_VERREVPATH:
- case O_VERSRCREACH:
- case O_ANTISPOOF:
- case O_IPSEC:
-#ifdef INET6
- case O_IP6_SRC_ME:
- case O_IP6_DST_ME:
- case O_EXT_HDR:
- case O_IP6:
-#endif
- case O_IP4:
- case O_TAG:
- if (cmdlen != F_INSN_SIZE(ipfw_insn))
- goto bad_size;
- break;
-
- case O_FIB:
- if (cmdlen != F_INSN_SIZE(ipfw_insn))
- goto bad_size;
- if (cmd->arg1 >= rt_numfibs) {
- printf("ipfw: invalid fib number %d\n",
- cmd->arg1);
- return EINVAL;
- }
- break;
-
- case O_SETFIB:
- if (cmdlen != F_INSN_SIZE(ipfw_insn))
- goto bad_size;
- if (cmd->arg1 >= rt_numfibs) {
- printf("ipfw: invalid fib number %d\n",
- cmd->arg1);
- return EINVAL;
- }
- goto check_action;
-
- case O_UID:
- case O_GID:
- case O_JAIL:
- case O_IP_SRC:
- case O_IP_DST:
- case O_TCPSEQ:
- case O_TCPACK:
- case O_PROB:
- case O_ICMPTYPE:
- if (cmdlen != F_INSN_SIZE(ipfw_insn_u32))
- goto bad_size;
- break;
-
- case O_LIMIT:
- if (cmdlen != F_INSN_SIZE(ipfw_insn_limit))
- goto bad_size;
- break;
-
- case O_LOG:
- if (cmdlen != F_INSN_SIZE(ipfw_insn_log))
- goto bad_size;
-
- ((ipfw_insn_log *)cmd)->log_left =
- ((ipfw_insn_log *)cmd)->max_log;
-
- break;
-
- case O_IP_SRC_MASK:
- case O_IP_DST_MASK:
- /* only odd command lengths */
- if ( !(cmdlen & 1) || cmdlen > 31)
- goto bad_size;
- break;
-
- case O_IP_SRC_SET:
- case O_IP_DST_SET:
- if (cmd->arg1 == 0 || cmd->arg1 > 256) {
- printf("ipfw: invalid set size %d\n",
- cmd->arg1);
- return EINVAL;
- }
- if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) +
- (cmd->arg1+31)/32 )
- goto bad_size;
- break;
-
- case O_IP_SRC_LOOKUP:
- case O_IP_DST_LOOKUP:
- if (cmd->arg1 >= IPFW_TABLES_MAX) {
- printf("ipfw: invalid table number %d\n",
- cmd->arg1);
- return (EINVAL);
- }
- if (cmdlen != F_INSN_SIZE(ipfw_insn) &&
- cmdlen != F_INSN_SIZE(ipfw_insn_u32))
- goto bad_size;
- break;
-
- case O_MACADDR2:
- if (cmdlen != F_INSN_SIZE(ipfw_insn_mac))
- goto bad_size;
- break;
-
- case O_NOP:
- case O_IPID:
- case O_IPTTL:
- case O_IPLEN:
- case O_TCPDATALEN:
- case O_TAGGED:
- if (cmdlen < 1 || cmdlen > 31)
- goto bad_size;
- break;
-
- case O_MAC_TYPE:
- case O_IP_SRCPORT:
- case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */
- if (cmdlen < 2 || cmdlen > 31)
- goto bad_size;
- break;
-
- case O_RECV:
- case O_XMIT:
- case O_VIA:
- if (cmdlen != F_INSN_SIZE(ipfw_insn_if))
- goto bad_size;
- break;
-
- case O_ALTQ:
- if (cmdlen != F_INSN_SIZE(ipfw_insn_altq))
- goto bad_size;
- break;
-
- case O_PIPE:
- case O_QUEUE:
- if (cmdlen != F_INSN_SIZE(ipfw_insn))
- goto bad_size;
- goto check_action;
-
- case O_FORWARD_IP:
-#ifdef IPFIREWALL_FORWARD
- if (cmdlen != F_INSN_SIZE(ipfw_insn_sa))
- goto bad_size;
- goto check_action;
-#else
- return EINVAL;
-#endif
-
- case O_DIVERT:
- case O_TEE:
- if (ip_divert_ptr == NULL)
- return EINVAL;
- else
- goto check_size;
- case O_NETGRAPH:
- case O_NGTEE:
- if (!NG_IPFW_LOADED)
- return EINVAL;
- else
- goto check_size;
- case O_NAT:
- if (!IPFW_NAT_LOADED)
- return EINVAL;
- if (cmdlen != F_INSN_SIZE(ipfw_insn_nat))
- goto bad_size;
- goto check_action;
- case O_FORWARD_MAC: /* XXX not implemented yet */
- case O_CHECK_STATE:
- case O_COUNT:
- case O_ACCEPT:
- case O_DENY:
- case O_REJECT:
-#ifdef INET6
- case O_UNREACH6:
-#endif
- case O_SKIPTO:
- case O_REASS:
-check_size:
- if (cmdlen != F_INSN_SIZE(ipfw_insn))
- goto bad_size;
-check_action:
- if (have_action) {
- printf("ipfw: opcode %d, multiple actions"
- " not allowed\n",
- cmd->opcode);
- return EINVAL;
- }
- have_action = 1;
- if (l != cmdlen) {
- printf("ipfw: opcode %d, action must be"
- " last opcode\n",
- cmd->opcode);
- return EINVAL;
- }
- break;
-#ifdef INET6
- case O_IP6_SRC:
- case O_IP6_DST:
- if (cmdlen != F_INSN_SIZE(struct in6_addr) +
- F_INSN_SIZE(ipfw_insn))
- goto bad_size;
- break;
-
- case O_FLOW6ID:
- if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) +
- ((ipfw_insn_u32 *)cmd)->o.arg1)
- goto bad_size;
- break;
-
- case O_IP6_SRC_MASK:
- case O_IP6_DST_MASK:
- if ( !(cmdlen & 1) || cmdlen > 127)
- goto bad_size;
- break;
- case O_ICMP6TYPE:
- if( cmdlen != F_INSN_SIZE( ipfw_insn_icmp6 ) )
- goto bad_size;
- break;
-#endif
-
- default:
- switch (cmd->opcode) {
-#ifndef INET6
- case O_IP6_SRC_ME:
- case O_IP6_DST_ME:
- case O_EXT_HDR:
- case O_IP6:
- case O_UNREACH6:
- case O_IP6_SRC:
- case O_IP6_DST:
- case O_FLOW6ID:
- case O_IP6_SRC_MASK:
- case O_IP6_DST_MASK:
- case O_ICMP6TYPE:
- printf("ipfw: no IPv6 support in kernel\n");
- return EPROTONOSUPPORT;
-#endif
- default:
- printf("ipfw: opcode %d, unknown opcode\n",
- cmd->opcode);
- return EINVAL;
- }
- }
- }
- if (have_action == 0) {
- printf("ipfw: missing action\n");
- return EINVAL;
- }
- return 0;
-
-bad_size:
- printf("ipfw: opcode %d size %d wrong\n",
- cmd->opcode, cmdlen);
- return EINVAL;
-}
-
-/*
- * Copy the static and dynamic rules to the supplied buffer
- * and return the amount of space actually used.
+ * Module and VNET glue
*/
-static size_t
-ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space)
-{
- char *bp = buf;
- char *ep = bp + space;
- struct ip_fw *rule;
- int i;
- time_t boot_seconds;
-
- boot_seconds = boottime.tv_sec;
- /* XXX this can take a long time and locking will block packet flow */
- IPFW_RLOCK(chain);
- for (rule = chain->rules; rule ; rule = rule->next) {
- /*
- * Verify the entry fits in the buffer in case the
- * rules changed between calculating buffer space and
- * now. This would be better done using a generation
- * number but should suffice for now.
- */
- i = RULESIZE(rule);
- if (bp + i <= ep) {
- bcopy(rule, bp, i);
- /*
- * XXX HACK. Store the disable mask in the "next"
- * pointer in a wild attempt to keep the ABI the same.
- * Why do we do this on EVERY rule?
- */
- bcopy(&V_set_disable,
- &(((struct ip_fw *)bp)->next_rule),
- sizeof(V_set_disable));
- if (((struct ip_fw *)bp)->timestamp)
- ((struct ip_fw *)bp)->timestamp += boot_seconds;
- bp += i;
- }
- }
- IPFW_RUNLOCK(chain);
- if (V_ipfw_dyn_v) {
- ipfw_dyn_rule *p, *last = NULL;
-
- IPFW_DYN_LOCK();
- for (i = 0 ; i < V_curr_dyn_buckets; i++)
- for (p = V_ipfw_dyn_v[i] ; p != NULL; p = p->next) {
- if (bp + sizeof *p <= ep) {
- ipfw_dyn_rule *dst =
- (ipfw_dyn_rule *)bp;
- bcopy(p, dst, sizeof *p);
- bcopy(&(p->rule->rulenum), &(dst->rule),
- sizeof(p->rule->rulenum));
- /*
- * store set number into high word of
- * dst->rule pointer.
- */
- bcopy(&(p->rule->set),
- (char *)&dst->rule +
- sizeof(p->rule->rulenum),
- sizeof(p->rule->set));
- /*
- * store a non-null value in "next".
- * The userland code will interpret a
- * NULL here as a marker
- * for the last dynamic rule.
- */
- bcopy(&dst, &dst->next, sizeof(dst));
- last = dst;
- dst->expire =
- TIME_LEQ(dst->expire, time_uptime) ?
- 0 : dst->expire - time_uptime ;
- bp += sizeof(ipfw_dyn_rule);
- }
- }
- IPFW_DYN_UNLOCK();
- if (last != NULL) /* mark last dynamic rule */
- bzero(&last->next, sizeof(last));
- }
- return (bp - (char *)buf);
-}
-
-
-/**
- * {set|get}sockopt parser.
- */
-static int
-ipfw_ctl(struct sockopt *sopt)
-{
-#define RULE_MAXSIZE (256*sizeof(u_int32_t))
- int error;
- size_t size;
- struct ip_fw *buf, *rule;
- u_int32_t rulenum[2];
-
- error = priv_check(sopt->sopt_td, PRIV_NETINET_IPFW);
- if (error)
- return (error);
-
- /*
- * Disallow modifications in really-really secure mode, but still allow
- * the logging counters to be reset.
- */
- if (sopt->sopt_name == IP_FW_ADD ||
- (sopt->sopt_dir == SOPT_SET && sopt->sopt_name != IP_FW_RESETLOG)) {
- error = securelevel_ge(sopt->sopt_td->td_ucred, 3);
- if (error)
- return (error);
- }
-
- error = 0;
-
- switch (sopt->sopt_name) {
- case IP_FW_GET:
- /*
- * pass up a copy of the current rules. Static rules
- * come first (the last of which has number IPFW_DEFAULT_RULE),
- * followed by a possibly empty list of dynamic rule.
- * The last dynamic rule has NULL in the "next" field.
- *
- * Note that the calculated size is used to bound the
- * amount of data returned to the user. The rule set may
- * change between calculating the size and returning the
- * data in which case we'll just return what fits.
- */
- size = V_static_len; /* size of static rules */
- if (V_ipfw_dyn_v) /* add size of dyn.rules */
- size += (V_dyn_count * sizeof(ipfw_dyn_rule));
-
- if (size >= sopt->sopt_valsize)
- break;
- /*
- * XXX todo: if the user passes a short length just to know
- * how much room is needed, do not bother filling up the
- * buffer, just jump to the sooptcopyout.
- */
- buf = malloc(size, M_TEMP, M_WAITOK);
- error = sooptcopyout(sopt, buf,
- ipfw_getrules(&V_layer3_chain, buf, size));
- free(buf, M_TEMP);
- break;
-
- case IP_FW_FLUSH:
- /*
- * Normally we cannot release the lock on each iteration.
- * We could do it here only because we start from the head all
- * the times so there is no risk of missing some entries.
- * On the other hand, the risk is that we end up with
- * a very inconsistent ruleset, so better keep the lock
- * around the whole cycle.
- *
- * XXX this code can be improved by resetting the head of
- * the list to point to the default rule, and then freeing
- * the old list without the need for a lock.
- */
-
- IPFW_WLOCK(&V_layer3_chain);
- free_chain(&V_layer3_chain, 0 /* keep default rule */);
- rule = V_layer3_chain.reap;
- IPFW_WUNLOCK(&V_layer3_chain);
- reap_rules(rule);
- break;
-
- case IP_FW_ADD:
- rule = malloc(RULE_MAXSIZE, M_TEMP, M_WAITOK);
- error = sooptcopyin(sopt, rule, RULE_MAXSIZE,
- sizeof(struct ip_fw) );
- if (error == 0)
- error = check_ipfw_struct(rule, sopt->sopt_valsize);
- if (error == 0) {
- error = add_rule(&V_layer3_chain, rule);
- size = RULESIZE(rule);
- if (!error && sopt->sopt_dir == SOPT_GET)
- error = sooptcopyout(sopt, rule, size);
- }
- free(rule, M_TEMP);
- break;
-
- case IP_FW_DEL:
- /*
- * IP_FW_DEL is used for deleting single rules or sets,
- * and (ab)used to atomically manipulate sets. Argument size
- * is used to distinguish between the two:
- * sizeof(u_int32_t)
- * delete single rule or set of rules,
- * or reassign rules (or sets) to a different set.
- * 2*sizeof(u_int32_t)
- * atomic disable/enable sets.
- * first u_int32_t contains sets to be disabled,
- * second u_int32_t contains sets to be enabled.
- */
- error = sooptcopyin(sopt, rulenum,
- 2*sizeof(u_int32_t), sizeof(u_int32_t));
- if (error)
- break;
- size = sopt->sopt_valsize;
- if (size == sizeof(u_int32_t)) /* delete or reassign */
- error = del_entry(&V_layer3_chain, rulenum[0]);
- else if (size == 2*sizeof(u_int32_t)) /* set enable/disable */
- V_set_disable =
- (V_set_disable | rulenum[0]) & ~rulenum[1] &
- ~(1<<RESVD_SET); /* set RESVD_SET always enabled */
- else
- error = EINVAL;
- break;
-
- case IP_FW_ZERO:
- case IP_FW_RESETLOG: /* argument is an u_int_32, the rule number */
- rulenum[0] = 0;
- if (sopt->sopt_val != 0) {
- error = sooptcopyin(sopt, rulenum,
- sizeof(u_int32_t), sizeof(u_int32_t));
- if (error)
- break;
- }
- error = zero_entry(&V_layer3_chain, rulenum[0],
- sopt->sopt_name == IP_FW_RESETLOG);
- break;
-
- case IP_FW_TABLE_ADD:
- {
- ipfw_table_entry ent;
-
- error = sooptcopyin(sopt, &ent,
- sizeof(ent), sizeof(ent));
- if (error)
- break;
- error = add_table_entry(&V_layer3_chain, ent.tbl,
- ent.addr, ent.masklen, ent.value);
- }
- break;
-
- case IP_FW_TABLE_DEL:
- {
- ipfw_table_entry ent;
-
- error = sooptcopyin(sopt, &ent,
- sizeof(ent), sizeof(ent));
- if (error)
- break;
- error = del_table_entry(&V_layer3_chain, ent.tbl,
- ent.addr, ent.masklen);
- }
- break;
-
- case IP_FW_TABLE_FLUSH:
- {
- u_int16_t tbl;
-
- error = sooptcopyin(sopt, &tbl,
- sizeof(tbl), sizeof(tbl));
- if (error)
- break;
- IPFW_WLOCK(&V_layer3_chain);
- error = flush_table(&V_layer3_chain, tbl);
- IPFW_WUNLOCK(&V_layer3_chain);
- }
- break;
-
- case IP_FW_TABLE_GETSIZE:
- {
- u_int32_t tbl, cnt;
-
- if ((error = sooptcopyin(sopt, &tbl, sizeof(tbl),
- sizeof(tbl))))
- break;
- IPFW_RLOCK(&V_layer3_chain);
- error = count_table(&V_layer3_chain, tbl, &cnt);
- IPFW_RUNLOCK(&V_layer3_chain);
- if (error)
- break;
- error = sooptcopyout(sopt, &cnt, sizeof(cnt));
- }
- break;
-
- case IP_FW_TABLE_LIST:
- {
- ipfw_table *tbl;
-
- if (sopt->sopt_valsize < sizeof(*tbl)) {
- error = EINVAL;
- break;
- }
- size = sopt->sopt_valsize;
- tbl = malloc(size, M_TEMP, M_WAITOK);
- error = sooptcopyin(sopt, tbl, size, sizeof(*tbl));
- if (error) {
- free(tbl, M_TEMP);
- break;
- }
- tbl->size = (size - sizeof(*tbl)) /
- sizeof(ipfw_table_entry);
- IPFW_RLOCK(&V_layer3_chain);
- error = dump_table(&V_layer3_chain, tbl);
- IPFW_RUNLOCK(&V_layer3_chain);
- if (error) {
- free(tbl, M_TEMP);
- break;
- }
- error = sooptcopyout(sopt, tbl, size);
- free(tbl, M_TEMP);
- }
- break;
-
- case IP_FW_NAT_CFG:
- if (IPFW_NAT_LOADED)
- error = ipfw_nat_cfg_ptr(sopt);
- else {
- printf("IP_FW_NAT_CFG: %s\n",
- "ipfw_nat not present, please load it");
- error = EINVAL;
- }
- break;
-
- case IP_FW_NAT_DEL:
- if (IPFW_NAT_LOADED)
- error = ipfw_nat_del_ptr(sopt);
- else {
- printf("IP_FW_NAT_DEL: %s\n",
- "ipfw_nat not present, please load it");
- error = EINVAL;
- }
- break;
-
- case IP_FW_NAT_GET_CONFIG:
- if (IPFW_NAT_LOADED)
- error = ipfw_nat_get_cfg_ptr(sopt);
- else {
- printf("IP_FW_NAT_GET_CFG: %s\n",
- "ipfw_nat not present, please load it");
- error = EINVAL;
- }
- break;
-
- case IP_FW_NAT_GET_LOG:
- if (IPFW_NAT_LOADED)
- error = ipfw_nat_get_log_ptr(sopt);
- else {
- printf("IP_FW_NAT_GET_LOG: %s\n",
- "ipfw_nat not present, please load it");
- error = EINVAL;
- }
- break;
-
- default:
- printf("ipfw: ipfw_ctl invalid option %d\n", sopt->sopt_name);
- error = EINVAL;
- }
-
- return (error);
-#undef RULE_MAXSIZE
-}
-
/*
- * This procedure is only used to handle keepalives. It is invoked
- * every dyn_keepalive_period
- */
-static void
-ipfw_tick(void * vnetx)
-{
- struct mbuf *m0, *m, *mnext, **mtailp;
-#ifdef INET6
- struct mbuf *m6, **m6_tailp;
-#endif
- int i;
- ipfw_dyn_rule *q;
-#ifdef VIMAGE
- struct vnet *vp = vnetx;
-#endif
-
- CURVNET_SET(vp);
- if (V_dyn_keepalive == 0 || V_ipfw_dyn_v == NULL || V_dyn_count == 0)
- goto done;
-
- /*
- * We make a chain of packets to go out here -- not deferring
- * until after we drop the IPFW dynamic rule lock would result
- * in a lock order reversal with the normal packet input -> ipfw
- * call stack.
- */
- m0 = NULL;
- mtailp = &m0;
-#ifdef INET6
- m6 = NULL;
- m6_tailp = &m6;
-#endif
- IPFW_DYN_LOCK();
- for (i = 0 ; i < V_curr_dyn_buckets ; i++) {
- for (q = V_ipfw_dyn_v[i] ; q ; q = q->next ) {
- if (q->dyn_type == O_LIMIT_PARENT)
- continue;
- if (q->id.proto != IPPROTO_TCP)
- continue;
- if ( (q->state & BOTH_SYN) != BOTH_SYN)
- continue;
- if (TIME_LEQ(time_uptime + V_dyn_keepalive_interval,
- q->expire))
- continue; /* too early */
- if (TIME_LEQ(q->expire, time_uptime))
- continue; /* too late, rule expired */
-
- m = send_pkt(NULL, &(q->id), q->ack_rev - 1,
- q->ack_fwd, TH_SYN);
- mnext = send_pkt(NULL, &(q->id), q->ack_fwd - 1,
- q->ack_rev, 0);
-
- switch (q->id.addr_type) {
- case 4:
- if (m != NULL) {
- *mtailp = m;
- mtailp = &(*mtailp)->m_nextpkt;
- }
- if (mnext != NULL) {
- *mtailp = mnext;
- mtailp = &(*mtailp)->m_nextpkt;
- }
- break;
-#ifdef INET6
- case 6:
- if (m != NULL) {
- *m6_tailp = m;
- m6_tailp = &(*m6_tailp)->m_nextpkt;
- }
- if (mnext != NULL) {
- *m6_tailp = mnext;
- m6_tailp = &(*m6_tailp)->m_nextpkt;
- }
- break;
-#endif
- }
-
- m = mnext = NULL;
- }
- }
- IPFW_DYN_UNLOCK();
- for (m = mnext = m0; m != NULL; m = mnext) {
- mnext = m->m_nextpkt;
- m->m_nextpkt = NULL;
- ip_output(m, NULL, NULL, 0, NULL, NULL);
- }
-#ifdef INET6
- for (m = mnext = m6; m != NULL; m = mnext) {
- mnext = m->m_nextpkt;
- m->m_nextpkt = NULL;
- ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL);
- }
-#endif
-done:
- callout_reset(&V_ipfw_timeout, V_dyn_keepalive_period * hz,
- ipfw_tick, vnetx);
- CURVNET_RESTORE();
-}
-
-/****************
* Stuff that must be initialised only on boot or module load
*/
static int
@@ -4716,11 +2231,7 @@ ipfw_init(void)
{
int error = 0;
- ipfw_dyn_rule_zone = uma_zcreate("IPFW dynamic rule",
- sizeof(ipfw_dyn_rule), NULL, NULL, NULL, NULL,
- UMA_ALIGN_PTR, 0);
-
- IPFW_DYN_LOCK_INIT();
+ ipfw_dyn_attach();
/*
* Only print out this stuff the first time around,
* when called from the sysinit code.
@@ -4764,22 +2275,23 @@ ipfw_init(void)
printf("limited to %d packets/entry by default\n",
V_verbose_limit);
+ ipfw_log_bpf(1); /* init */
return (error);
}
-/**********************
+/*
* Called for the removal of the last instance only on module unload.
*/
static void
ipfw_destroy(void)
{
- uma_zdestroy(ipfw_dyn_rule_zone);
- IPFW_DYN_LOCK_DESTROY();
+ ipfw_log_bpf(0); /* uninit */
+ ipfw_dyn_detach();
printf("IP firewall unloaded\n");
}
-/****************
+/*
* Stuff that must be initialized for every instance
* (including the first of course).
*/
@@ -4787,139 +2299,121 @@ static int
vnet_ipfw_init(const void *unused)
{
int error;
- struct ip_fw default_rule;
+ struct ip_fw *rule = NULL;
+ struct ip_fw_chain *chain;
+
+ chain = &V_layer3_chain;
/* First set up some values that are compile time options */
+ V_autoinc_step = 100; /* bounded to 1..1000 in add_rule() */
+ V_fw_deny_unknown_exthdrs = 1;
#ifdef IPFIREWALL_VERBOSE
V_fw_verbose = 1;
#endif
#ifdef IPFIREWALL_VERBOSE_LIMIT
V_verbose_limit = IPFIREWALL_VERBOSE_LIMIT;
#endif
-
- error = init_tables(&V_layer3_chain);
- if (error) {
- panic("init_tables"); /* XXX Marko fix this ! */
- }
#ifdef IPFIREWALL_NAT
- LIST_INIT(&V_layer3_chain.nat);
+ LIST_INIT(&chain->nat);
#endif
- V_autoinc_step = 100; /* bounded to 1..1000 in add_rule() */
-
- V_ipfw_dyn_v = NULL;
- V_dyn_buckets = 256; /* must be power of 2 */
- V_curr_dyn_buckets = 256; /* must be power of 2 */
-
- V_dyn_ack_lifetime = 300;
- V_dyn_syn_lifetime = 20;
- V_dyn_fin_lifetime = 1;
- V_dyn_rst_lifetime = 1;
- V_dyn_udp_lifetime = 10;
- V_dyn_short_lifetime = 5;
-
- V_dyn_keepalive_interval = 20;
- V_dyn_keepalive_period = 5;
- V_dyn_keepalive = 1; /* do send keepalives */
-
- V_dyn_max = 4096; /* max # of dynamic rules */
-
- V_fw_deny_unknown_exthdrs = 1;
-
- V_layer3_chain.rules = NULL;
- IPFW_LOCK_INIT(&V_layer3_chain);
- callout_init(&V_ipfw_timeout, CALLOUT_MPSAFE);
-
- bzero(&default_rule, sizeof default_rule);
- default_rule.act_ofs = 0;
- default_rule.rulenum = IPFW_DEFAULT_RULE;
- default_rule.cmd_len = 1;
- default_rule.set = RESVD_SET;
- default_rule.cmd[0].len = 1;
- default_rule.cmd[0].opcode = default_to_accept ? O_ACCEPT : O_DENY;
- error = add_rule(&V_layer3_chain, &default_rule);
-
- if (error != 0) {
- printf("ipfw2: error %u initializing default rule "
- "(support disabled)\n", error);
- IPFW_LOCK_DESTROY(&V_layer3_chain);
- printf("leaving ipfw_iattach (1) with error %d\n", error);
- return (error);
+ /* insert the default rule and create the initial map */
+ chain->n_rules = 1;
+ chain->static_len = sizeof(struct ip_fw);
+ chain->map = malloc(sizeof(struct ip_fw *), M_IPFW, M_NOWAIT | M_ZERO);
+ if (chain->map)
+ rule = malloc(chain->static_len, M_IPFW, M_NOWAIT | M_ZERO);
+ if (rule == NULL) {
+ if (chain->map)
+ free(chain->map, M_IPFW);
+ printf("ipfw2: ENOSPC initializing default rule "
+ "(support disabled)\n");
+ return (ENOSPC);
}
-
- ip_fw_default_rule = V_layer3_chain.rules;
-
+ error = ipfw_init_tables(chain);
if (error) {
- IPFW_LOCK_DESTROY(&V_layer3_chain);
- printf("leaving ipfw_iattach (2) with error %d\n", error);
- return (error);
+ panic("init_tables"); /* XXX Marko fix this ! */
}
-#ifdef VIMAGE /* want a better way to do this */
- callout_reset(&V_ipfw_timeout, hz, ipfw_tick, curvnet);
-#else
- callout_reset(&V_ipfw_timeout, hz, ipfw_tick, NULL);
-#endif
+
+ /* fill and insert the default rule */
+ rule->act_ofs = 0;
+ rule->rulenum = IPFW_DEFAULT_RULE;
+ rule->cmd_len = 1;
+ rule->set = RESVD_SET;
+ rule->cmd[0].len = 1;
+ rule->cmd[0].opcode = default_to_accept ? O_ACCEPT : O_DENY;
+ chain->rules = chain->default_rule = chain->map[0] = rule;
+ chain->id = rule->id = 1;
+
+ IPFW_LOCK_INIT(chain);
+ ipfw_dyn_init();
/* First set up some values that are compile time options */
V_ipfw_vnet_ready = 1; /* Open for business */
- /* Hook up the raw inputs */
- V_ip_fw_ctl_ptr = ipfw_ctl;
- V_ip_fw_chk_ptr = ipfw_chk;
-
/*
- * Hook us up to pfil.
+ * Hook the sockopt handler, and the layer2 (V_ip_fw_chk_ptr)
+ * and pfil hooks for ipv4 and ipv6. Even if the latter two fail
+ * we still keep the module alive because the sockopt and
+ * layer2 paths are still useful.
+ * ipfw[6]_hook return 0 on success, ENOENT on failure,
+ * so we can ignore the exact return value and just set a flag.
+ *
+ * Note that V_fw[6]_enable are manipulated by a SYSCTL_PROC so
+ * changes in the underlying (per-vnet) variables trigger
+ * immediate hook()/unhook() calls.
+ * In layer2 we have the same behaviour, except that V_ether_ipfw
+ * is checked on each packet because there are no pfil hooks.
*/
- if (V_fw_enable) {
- if ((error = ipfw_hook()) != 0) {
- printf("ipfw_hook() error\n");
- return (error);
- }
- }
-#ifdef INET6
- if (V_fw6_enable) {
- if ((error = ipfw6_hook()) != 0) {
- printf("ipfw6_hook() error\n");
- /* XXX should we unhook everything else? */
- return (error);
- }
- }
-#endif
- return (0);
+ V_ip_fw_ctl_ptr = ipfw_ctl;
+ V_ip_fw_chk_ptr = ipfw_chk;
+ error = ipfw_attach_hooks(1);
+ return (error);
}
-/***********************
+/*
* Called for the removal of each instance.
*/
static int
vnet_ipfw_uninit(const void *unused)
{
- struct ip_fw *reap;
+ struct ip_fw *reap, *rule;
+ struct ip_fw_chain *chain = &V_layer3_chain;
+ int i;
V_ipfw_vnet_ready = 0; /* tell new callers to go away */
- ipfw_unhook();
-#ifdef INET6
- ipfw6_unhook();
-#endif
- /* layer2 and other entrypoints still come in this way. */
+ /*
+ * disconnect from ipv4, ipv6, layer2 and sockopt.
+ * Then grab, release and grab again the WLOCK so we make
+ * sure the update is propagated and nobody will be in.
+ */
+ (void)ipfw_attach_hooks(0 /* detach */);
V_ip_fw_chk_ptr = NULL;
V_ip_fw_ctl_ptr = NULL;
- IPFW_WLOCK(&V_layer3_chain);
- /* We wait on the wlock here until the last user leaves */
- IPFW_WUNLOCK(&V_layer3_chain);
- IPFW_WLOCK(&V_layer3_chain);
- callout_drain(&V_ipfw_timeout);
- flush_tables(&V_layer3_chain);
- V_layer3_chain.reap = NULL;
- free_chain(&V_layer3_chain, 1 /* kill default rule */);
- reap = V_layer3_chain.reap;
- V_layer3_chain.reap = NULL;
- IPFW_WUNLOCK(&V_layer3_chain);
+ IPFW_UH_WLOCK(chain);
+ IPFW_UH_WUNLOCK(chain);
+ IPFW_UH_WLOCK(chain);
+
+ IPFW_WLOCK(chain);
+ IPFW_WUNLOCK(chain);
+ IPFW_WLOCK(chain);
+
+ ipfw_dyn_uninit(0); /* run the callout_drain */
+ ipfw_destroy_tables(chain);
+ reap = NULL;
+ for (i = 0; i < chain->n_rules; i++) {
+ rule = chain->map[i];
+ rule->x_next = reap;
+ reap = rule;
+ }
+ if (chain->map)
+ free(chain->map, M_IPFW);
+ IPFW_WUNLOCK(chain);
+ IPFW_UH_WUNLOCK(chain);
if (reap != NULL)
- reap_rules(reap);
- IPFW_LOCK_DESTROY(&V_layer3_chain);
- if (V_ipfw_dyn_v != NULL)
- free(V_ipfw_dyn_v, M_IPFW);
+ ipfw_reap_rules(reap);
+ IPFW_LOCK_DESTROY(chain);
+ ipfw_dyn_uninit(1); /* free the remaining parts */
return 0;
}
@@ -4993,4 +2487,4 @@ SYSUNINIT(ipfw_destroy, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER,
ipfw_destroy, NULL);
VNET_SYSUNINIT(vnet_ipfw_uninit, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER,
vnet_ipfw_uninit, NULL);
-
+/* end of file */
diff --git a/sys/netinet/ipfw/ip_fw_dynamic.c b/sys/netinet/ipfw/ip_fw_dynamic.c
new file mode 100644
index 0000000..6947582
--- /dev/null
+++ b/sys/netinet/ipfw/ip_fw_dynamic.c
@@ -0,0 +1,1244 @@
+/*-
+ * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#define DEB(x)
+#define DDB(x) x
+
+/*
+ * Dynamic rule support for ipfw
+ */
+
+#if !defined(KLD_MODULE)
+#include "opt_ipfw.h"
+#include "opt_ipdivert.h"
+#include "opt_ipdn.h"
+#include "opt_inet.h"
+#ifndef INET
+#error IPFIREWALL requires INET.
+#endif /* INET */
+#endif
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <net/ethernet.h> /* for ETHERTYPE_IP */
+#include <net/if.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h> /* ip_defttl */
+#include <netinet/ip_fw.h>
+#include <netinet/ipfw/ip_fw_private.h>
+#include <netinet/tcp_var.h>
+#include <netinet/udp.h>
+
+#include <netinet/ip6.h> /* IN6_ARE_ADDR_EQUAL */
+#ifdef INET6
+#include <netinet6/in6_var.h>
+#include <netinet6/ip6_var.h>
+#endif
+
+#include <machine/in_cksum.h> /* XXX for in_cksum */
+
+#ifdef MAC
+#include <security/mac/mac_framework.h>
+#endif
+
+/*
+ * Description of dynamic rules.
+ *
+ * Dynamic rules are stored in lists accessed through a hash table
+ * (ipfw_dyn_v) whose size is curr_dyn_buckets. This value can
+ * be modified through the sysctl variable dyn_buckets which is
+ * updated when the table becomes empty.
+ *
+ * XXX currently there is only one list, ipfw_dyn.
+ *
+ * When a packet is received, its address fields are first masked
+ * with the mask defined for the rule, then hashed, then matched
+ * against the entries in the corresponding list.
+ * Dynamic rules can be used for different purposes:
+ * + stateful rules;
+ * + enforcing limits on the number of sessions;
+ * + in-kernel NAT (not implemented yet)
+ *
+ * The lifetime of dynamic rules is regulated by dyn_*_lifetime,
+ * measured in seconds and depending on the flags.
+ *
+ * The total number of dynamic rules is stored in dyn_count.
+ * The max number of dynamic rules is dyn_max. When we reach
+ * the maximum number of rules we do not create anymore. This is
+ * done to avoid consuming too much memory, but also too much
+ * time when searching on each packet (ideally, we should try instead
+ * to put a limit on the length of the list on each bucket...).
+ *
+ * Each dynamic rule holds a pointer to the parent ipfw rule so
+ * we know what action to perform. Dynamic rules are removed when
+ * the parent rule is deleted. XXX we should make them survive.
+ *
+ * There are some limitations with dynamic rules -- we do not
+ * obey the 'randomized match', and we do not do multiple
+ * passes through the firewall. XXX check the latter!!!
+ */
+
+/*
+ * Static variables followed by global ones
+ */
+static VNET_DEFINE(ipfw_dyn_rule **, ipfw_dyn_v);
+static VNET_DEFINE(u_int32_t, dyn_buckets);
+static VNET_DEFINE(u_int32_t, curr_dyn_buckets);
+static VNET_DEFINE(struct callout, ipfw_timeout);
+#define V_ipfw_dyn_v VNET(ipfw_dyn_v)
+#define V_dyn_buckets VNET(dyn_buckets)
+#define V_curr_dyn_buckets VNET(curr_dyn_buckets)
+#define V_ipfw_timeout VNET(ipfw_timeout)
+
+static uma_zone_t ipfw_dyn_rule_zone;
+#ifndef __FreeBSD__
+DEFINE_SPINLOCK(ipfw_dyn_mtx);
+#else
+static struct mtx ipfw_dyn_mtx; /* mutex guarding dynamic rules */
+#endif
+
+#define IPFW_DYN_LOCK_INIT() \
+ mtx_init(&ipfw_dyn_mtx, "IPFW dynamic rules", NULL, MTX_DEF)
+#define IPFW_DYN_LOCK_DESTROY() mtx_destroy(&ipfw_dyn_mtx)
+#define IPFW_DYN_LOCK() mtx_lock(&ipfw_dyn_mtx)
+#define IPFW_DYN_UNLOCK() mtx_unlock(&ipfw_dyn_mtx)
+#define IPFW_DYN_LOCK_ASSERT() mtx_assert(&ipfw_dyn_mtx, MA_OWNED)
+
+/*
+ * Release the dynamic-rule mutex (ipfw_dyn_mtx).
+ * Exported wrapper around the file-local IPFW_DYN_UNLOCK() macro, for
+ * code such as callers of ipfw_lookup_dyn_rule(), which returns with
+ * the mutex still held when a rule is found.
+ */
+void
+ipfw_dyn_unlock(void)
+{
+ IPFW_DYN_UNLOCK();
+}
+
+/*
+ * Timeouts for various events in handling dynamic rules.
+ */
+static VNET_DEFINE(u_int32_t, dyn_ack_lifetime);
+static VNET_DEFINE(u_int32_t, dyn_syn_lifetime);
+static VNET_DEFINE(u_int32_t, dyn_fin_lifetime);
+static VNET_DEFINE(u_int32_t, dyn_rst_lifetime);
+static VNET_DEFINE(u_int32_t, dyn_udp_lifetime);
+static VNET_DEFINE(u_int32_t, dyn_short_lifetime);
+
+#define V_dyn_ack_lifetime VNET(dyn_ack_lifetime)
+#define V_dyn_syn_lifetime VNET(dyn_syn_lifetime)
+#define V_dyn_fin_lifetime VNET(dyn_fin_lifetime)
+#define V_dyn_rst_lifetime VNET(dyn_rst_lifetime)
+#define V_dyn_udp_lifetime VNET(dyn_udp_lifetime)
+#define V_dyn_short_lifetime VNET(dyn_short_lifetime)
+
+/*
+ * Keepalives are sent if dyn_keepalive is set. They are sent every
+ * dyn_keepalive_period seconds, in the last dyn_keepalive_interval
+ * seconds of lifetime of a rule.
+ * dyn_rst_lifetime and dyn_fin_lifetime should be strictly lower
+ * than dyn_keepalive_period.
+ */
+
+static VNET_DEFINE(u_int32_t, dyn_keepalive_interval);
+static VNET_DEFINE(u_int32_t, dyn_keepalive_period);
+static VNET_DEFINE(u_int32_t, dyn_keepalive);
+
+#define V_dyn_keepalive_interval VNET(dyn_keepalive_interval)
+#define V_dyn_keepalive_period VNET(dyn_keepalive_period)
+#define V_dyn_keepalive VNET(dyn_keepalive)
+
+static VNET_DEFINE(u_int32_t, dyn_count); /* # of dynamic rules */
+static VNET_DEFINE(u_int32_t, dyn_max); /* max # of dynamic rules */
+
+#define V_dyn_count VNET(dyn_count)
+#define V_dyn_max VNET(dyn_max)
+
+#ifdef SYSCTL_NODE
+
+SYSBEGIN(f2)
+
+SYSCTL_DECL(_net_inet_ip_fw);
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_buckets,
+ CTLFLAG_RW, &VNET_NAME(dyn_buckets), 0,
+ "Number of dyn. buckets");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets,
+ CTLFLAG_RD, &VNET_NAME(curr_dyn_buckets), 0,
+ "Current Number of dyn. buckets");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_count,
+ CTLFLAG_RD, &VNET_NAME(dyn_count), 0,
+ "Number of dyn. rules");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_max,
+ CTLFLAG_RW, &VNET_NAME(dyn_max), 0,
+ "Max number of dyn. rules");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime,
+ CTLFLAG_RW, &VNET_NAME(dyn_ack_lifetime), 0,
+ "Lifetime of dyn. rules for acks");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime,
+ CTLFLAG_RW, &VNET_NAME(dyn_syn_lifetime), 0,
+ "Lifetime of dyn. rules for syn");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime,
+ CTLFLAG_RW, &VNET_NAME(dyn_fin_lifetime), 0,
+ "Lifetime of dyn. rules for fin");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime,
+ CTLFLAG_RW, &VNET_NAME(dyn_rst_lifetime), 0,
+ "Lifetime of dyn. rules for rst");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime,
+ CTLFLAG_RW, &VNET_NAME(dyn_udp_lifetime), 0,
+ "Lifetime of dyn. rules for UDP");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime,
+ CTLFLAG_RW, &VNET_NAME(dyn_short_lifetime), 0,
+ "Lifetime of dyn. rules for other situations");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive,
+ CTLFLAG_RW, &VNET_NAME(dyn_keepalive), 0,
+ "Enable keepalives for dyn. rules");
+
+SYSEND
+
+#endif /* SYSCTL_NODE */
+
+
+/*
+ * Hash an IPv6 flow id: XOR the last two 32-bit words of both addresses
+ * with both ports.  XOR makes the value independent of which endpoint is
+ * the source.  The result is not yet reduced to a bucket index.
+ */
+static __inline int
+hash_packet6(struct ipfw_flow_id *id)
+{
+	u_int32_t addr_bits, port_bits;
+
+	addr_bits = id->dst_ip6.__u6_addr.__u6_addr32[2] ^
+	    id->dst_ip6.__u6_addr.__u6_addr32[3] ^
+	    id->src_ip6.__u6_addr.__u6_addr32[2] ^
+	    id->src_ip6.__u6_addr.__u6_addr32[3];
+	port_bits = id->dst_port ^ id->src_port;
+	return (addr_bits ^ port_bits);
+}
+
+/*
+ * Map a flow id to a bucket index of the dynamic-rule hash table.
+ *
+ * IMPORTANT: the hash must be commutative in source and destination
+ * (ip,port): dynamic rules are bidirectional, so packets travelling in
+ * either direction have to end up in the same bucket.
+ */
+static __inline int
+hash_packet(struct ipfw_flow_id *id)
+{
+	u_int32_t bucket;
+
+#ifdef INET6
+	if (IS_IP6_FLOW_ID(id)) {
+		bucket = hash_packet6(id);
+	} else
+#endif /* INET6 */
+	{
+		bucket = id->dst_ip ^ id->src_ip ^
+		    id->dst_port ^ id->src_port;
+	}
+	/* the table size is expected to be a power of 2, so mask it */
+	bucket &= V_curr_dyn_buckets - 1;
+	return (bucket);
+}
+
+/*
+ * Debugging aid: print the endpoints of a dynamic rule that is about to
+ * be unlinked, together with the number of rules that will be left
+ * (V_dyn_count - 1, i.e. it must be called before the counter is
+ * decremented).  It is only referenced through DEB() in
+ * UNLINK_DYN_RULE(), and DEB() expands to nothing in this file.
+ */
+static __inline void
+unlink_dyn_rule_print(struct ipfw_flow_id *id)
+{
+ struct in_addr da;
+ /* buffers are sized for the longest address family compiled in */
+#ifdef INET6
+ char src[INET6_ADDRSTRLEN], dst[INET6_ADDRSTRLEN];
+#else
+ char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN];
+#endif
+
+#ifdef INET6
+ if (IS_IP6_FLOW_ID(id)) {
+ ip6_sprintf(src, &id->src_ip6);
+ ip6_sprintf(dst, &id->dst_ip6);
+ } else
+#endif
+ {
+ /* the flow id is converted with htonl() before formatting */
+ da.s_addr = htonl(id->src_ip);
+ inet_ntoa_r(da, src);
+ da.s_addr = htonl(id->dst_ip);
+ inet_ntoa_r(da, dst);
+ }
+ printf("ipfw: unlink entry %s %d -> %s %d, %d left\n",
+ src, id->src_port, dst, id->dst_port, V_dyn_count - 1);
+}
+
+/**
+ * Unlink a dynamic rule from a bucket list and free it.
+ * prev is a pointer to the previous entry (NULL if q is the first one),
+ * q is a pointer to the rule to delete, head is the list head.
+ *
+ * On exit q points to the entry that followed the deleted one, so a
+ * scanning loop must NOT advance q again; head is updated when the
+ * first entry is removed.  For O_LIMIT entries the parent's session
+ * count is decremented.  V_dyn_count is decremented and the entry is
+ * returned to ipfw_dyn_rule_zone.
+ *
+ * The arguments are expanded several times and the body is a bare
+ * { } block, so pass plain lvalues and use it as a full statement.
+ */
+#define UNLINK_DYN_RULE(prev, head, q) { \
+ ipfw_dyn_rule *old_q = q; \
+ \
+ /* remove a refcount to the parent */ \
+ if (q->dyn_type == O_LIMIT) \
+ q->parent->count--; \
+ DEB(unlink_dyn_rule_print(&q->id);) \
+ if (prev != NULL) \
+ prev->next = q = q->next; \
+ else \
+ head = q = q->next; \
+ V_dyn_count--; \
+ uma_zfree(ipfw_dyn_rule_zone, old_q); }
+
+/*
+ * True when time a is not later than time b.  The test is made on the
+ * difference converted to int, not on the operands themselves.
+ */
+#define TIME_LEQ(a,b) ((int)((a)-(b)) <= 0)
+
+/**
+ * Remove dynamic rules pointing to "rule", or all of them if rule == NULL.
+ *
+ * If keep_me == NULL, rules are deleted even if not expired,
+ * otherwise only expired rules are removed.
+ *
+ * The second parameter is also used to identify a rule we absolutely
+ * do not want to remove (e.g. because we are holding a reference to
+ * it -- this is the case with O_LIMIT_PARENT rules). The pointer is
+ * only used for comparison, so any non-null value will do.
+ *
+ * Must be called with the dynamic-rule mutex held.
+ */
+static void
+remove_dyn_rule(struct ip_fw *rule, ipfw_dyn_rule *keep_me)
+{
+ /* time of the last scan; a single function-static shared by all callers */
+ static u_int32_t last_remove = 0;
+
+#define FORCE (keep_me == NULL)
+
+ ipfw_dyn_rule *prev, *q;
+ int i, pass = 0, max_pass = 0;
+
+ IPFW_DYN_LOCK_ASSERT();
+
+ if (V_ipfw_dyn_v == NULL || V_dyn_count == 0)
+ return;
+ /* do not expire more than once per second, it is useless */
+ if (!FORCE && last_remove == time_uptime)
+ return;
+ last_remove = time_uptime;
+
+ /*
+ * because O_LIMIT refer to parent rules, during the first pass only
+ * remove child and mark any pending LIMIT_PARENT, and remove
+ * them in a second pass.
+ */
+next_pass:
+ for (i = 0 ; i < V_curr_dyn_buckets ; i++) {
+ for (prev=NULL, q = V_ipfw_dyn_v[i] ; q ; ) {
+ /*
+ * Logic can become complex here, so we split tests.
+ */
+ if (q == keep_me)
+ goto next;
+ if (rule != NULL && rule != q->rule)
+ goto next; /* not the one we are looking for */
+ if (q->dyn_type == O_LIMIT_PARENT) {
+ /*
+ * handle parent in the second pass,
+ * record we need one.
+ */
+ max_pass = 1;
+ if (pass == 0)
+ goto next;
+ if (FORCE && q->count != 0 ) {
+ /* XXX should not happen! */
+ printf("ipfw: OUCH! cannot remove rule,"
+ " count %d\n", q->count);
+ }
+ } else {
+ if (!FORCE &&
+ !TIME_LEQ( q->expire, time_uptime ))
+ goto next;
+ }
+ /* a parent that still has children (count != 0) is never freed */
+ if (q->dyn_type != O_LIMIT_PARENT || !q->count) {
+ /* the macro already advances q, so skip the "next" step */
+ UNLINK_DYN_RULE(prev, V_ipfw_dyn_v[i], q);
+ continue;
+ }
+next:
+ prev=q;
+ q=q->next;
+ }
+ }
+ /* run the second pass only if some O_LIMIT_PARENT entry was seen */
+ if (pass++ < max_pass)
+ goto next_pass;
+}
+
+/*
+ * Remove every dynamic rule that points to the static rule "rule"
+ * (or all dynamic rules if rule == NULL), expired or not.
+ * Takes and releases the dynamic-rule mutex itself.
+ */
+void
+ipfw_remove_dyn_children(struct ip_fw *rule)
+{
+ IPFW_DYN_LOCK();
+ remove_dyn_rule(rule, NULL /* force removal */);
+ IPFW_DYN_UNLOCK();
+}
+
+/**
+ * Look up the dynamic rule matching a packet, locked version: the
+ * caller must hold the dynamic-rule mutex.
+ *
+ * pkt			flow id of the packet.
+ * match_direction	if not NULL, receives MATCH_FORWARD, MATCH_REVERSE
+ *			or MATCH_NONE.
+ * tcp			optional TCP header; when given, its ack number is
+ *			used to ignore out-of-sequence segments while
+ *			refreshing an established session.
+ *
+ * Side effects: expired entries met while scanning the bucket are
+ * unlinked and freed; a matching entry is moved to the front of its
+ * bucket and its state and expire time are refreshed.
+ *
+ * Returns the matching rule, or NULL if none is found.
+ */
+static ipfw_dyn_rule *
+lookup_dyn_rule_locked(struct ipfw_flow_id *pkt, int *match_direction,
+    struct tcphdr *tcp)
+{
+	/*
+	 * stateful ipfw extensions.
+	 * Lookup into dynamic session queue
+	 */
+#define MATCH_REVERSE	0
+#define MATCH_FORWARD	1
+#define MATCH_NONE	2
+#define MATCH_UNKNOWN	3
+	int i, dir = MATCH_NONE;
+	ipfw_dyn_rule *prev, *q = NULL;
+
+	IPFW_DYN_LOCK_ASSERT();
+
+	if (V_ipfw_dyn_v == NULL)
+		goto done;	/* not found */
+	i = hash_packet(pkt);
+	for (prev = NULL, q = V_ipfw_dyn_v[i]; q != NULL; ) {
+		/* parents with live children are neither expired nor matched */
+		if (q->dyn_type == O_LIMIT_PARENT && q->count)
+			goto next;
+		if (TIME_LEQ(q->expire, time_uptime)) {	/* expire entry */
+			/* the macro advances q, do not step again */
+			UNLINK_DYN_RULE(prev, V_ipfw_dyn_v[i], q);
+			continue;
+		}
+		if (pkt->proto == q->id.proto &&
+		    q->dyn_type != O_LIMIT_PARENT) {
+			if (IS_IP6_FLOW_ID(pkt)) {
+				if (IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6),
+				    &(q->id.src_ip6)) &&
+				    IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6),
+				    &(q->id.dst_ip6)) &&
+				    pkt->src_port == q->id.src_port &&
+				    pkt->dst_port == q->id.dst_port) {
+					dir = MATCH_FORWARD;
+					break;
+				}
+				if (IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6),
+				    &(q->id.dst_ip6)) &&
+				    IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6),
+				    &(q->id.src_ip6)) &&
+				    pkt->src_port == q->id.dst_port &&
+				    pkt->dst_port == q->id.src_port) {
+					dir = MATCH_REVERSE;
+					break;
+				}
+			} else {
+				if (pkt->src_ip == q->id.src_ip &&
+				    pkt->dst_ip == q->id.dst_ip &&
+				    pkt->src_port == q->id.src_port &&
+				    pkt->dst_port == q->id.dst_port) {
+					dir = MATCH_FORWARD;
+					break;
+				}
+				if (pkt->src_ip == q->id.dst_ip &&
+				    pkt->dst_ip == q->id.src_ip &&
+				    pkt->src_port == q->id.dst_port &&
+				    pkt->dst_port == q->id.src_port) {
+					dir = MATCH_REVERSE;
+					break;
+				}
+			}
+		}
+next:
+		prev = q;
+		q = q->next;
+	}
+	if (q == NULL)
+		goto done;	/* q = NULL, not found */
+
+	if (prev != NULL) {	/* found and not in front */
+		prev->next = q->next;
+		q->next = V_ipfw_dyn_v[i];
+		V_ipfw_dyn_v[i] = q;
+	}
+	if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */
+		u_char flags = pkt->_flags & (TH_FIN|TH_SYN|TH_RST);
+
+		/*
+		 * q->state keeps the flags seen in the forward direction in
+		 * the low byte and those seen in reverse in the next byte.
+		 */
+#define BOTH_SYN	(TH_SYN | (TH_SYN << 8))
+#define BOTH_FIN	(TH_FIN | (TH_FIN << 8))
+		q->state |= (dir == MATCH_FORWARD) ? flags : (flags << 8);
+		switch (q->state) {
+		case TH_SYN:				/* opening */
+			q->expire = time_uptime + V_dyn_syn_lifetime;
+			break;
+
+		case BOTH_SYN:			/* move to established */
+		case BOTH_SYN | TH_FIN:		/* one side tries to close */
+		case BOTH_SYN | (TH_FIN << 8):
+			if (tcp) {
+				/*
+				 * Subtract first, then convert: subtracting
+				 * two values already cast to int overflows
+				 * (undefined behaviour) when the sequence
+				 * numbers are far apart.  Same form as
+				 * TIME_LEQ().
+				 */
+#define _SEQ_GE(a,b)	((int)((a) - (b)) >= 0)
+				u_int32_t ack = ntohl(tcp->th_ack);
+
+				if (dir == MATCH_FORWARD) {
+					if (q->ack_fwd == 0 ||
+					    _SEQ_GE(ack, q->ack_fwd))
+						q->ack_fwd = ack;
+					else {	/* ignore out-of-sequence */
+						break;
+					}
+				} else {
+					if (q->ack_rev == 0 ||
+					    _SEQ_GE(ack, q->ack_rev))
+						q->ack_rev = ack;
+					else {	/* ignore out-of-sequence */
+						break;
+					}
+				}
+			}
+			q->expire = time_uptime + V_dyn_ack_lifetime;
+			break;
+
+		case BOTH_SYN | BOTH_FIN:	/* both sides closed */
+			/* clamp the lifetime below the keepalive period */
+			if (V_dyn_fin_lifetime >= V_dyn_keepalive_period)
+				V_dyn_fin_lifetime = V_dyn_keepalive_period - 1;
+			q->expire = time_uptime + V_dyn_fin_lifetime;
+			break;
+
+		default:
+#if 0
+			/*
+			 * reset or some invalid combination, but can also
+			 * occur if we use keep-state the wrong way.
+			 */
+			if ( (q->state & ((TH_RST << 8)|TH_RST)) == 0)
+				printf("invalid state: 0x%x\n", q->state);
+#endif
+			/* clamp the lifetime below the keepalive period */
+			if (V_dyn_rst_lifetime >= V_dyn_keepalive_period)
+				V_dyn_rst_lifetime = V_dyn_keepalive_period - 1;
+			q->expire = time_uptime + V_dyn_rst_lifetime;
+			break;
+		}
+	} else if (pkt->proto == IPPROTO_UDP) {
+		q->expire = time_uptime + V_dyn_udp_lifetime;
+	} else {
+		/* other protocols */
+		q->expire = time_uptime + V_dyn_short_lifetime;
+	}
+done:
+	if (match_direction)
+		*match_direction = dir;
+	return q;
+}
+
+/*
+ * Exported lookup of the dynamic rule matching a packet.
+ * Takes the dynamic-rule mutex and calls lookup_dyn_rule_locked().
+ *
+ * Locking contract: if a rule is found the function returns it WITH
+ * THE MUTEX STILL HELD, and the caller must release it (see
+ * ipfw_dyn_unlock()).  If nothing is found, the mutex is released here
+ * and NULL is returned.
+ */
+ipfw_dyn_rule *
+ipfw_lookup_dyn_rule(struct ipfw_flow_id *pkt, int *match_direction,
+ struct tcphdr *tcp)
+{
+ ipfw_dyn_rule *q;
+
+ IPFW_DYN_LOCK();
+ q = lookup_dyn_rule_locked(pkt, match_direction, tcp);
+ if (q == NULL)
+ IPFW_DYN_UNLOCK();
+ /* NB: return table locked when q is not NULL */
+ return q;
+}
+
+/*
+ * (Re)allocate the hash table of dynamic rules using the size requested
+ * through the dyn_buckets sysctl variable.
+ *
+ * Called with the dynamic-rule mutex held.  Any existing table is freed
+ * without looking at its contents, so the caller must make sure it holds
+ * no rules.  On return V_ipfw_dyn_v may be NULL if no memory could be
+ * obtained even for the smallest table.
+ */
+static void
+realloc_dynamic_table(void)
+{
+	IPFW_DYN_LOCK_ASSERT();
+
+	/*
+	 * Try reallocation, make sure we have a power of 2 and do
+	 * not allow more than 64k entries. In case of overflow,
+	 * default to 1024.
+	 */
+
+	if (V_dyn_buckets > 65536)
+		V_dyn_buckets = 1024;
+	/*
+	 * Zero passes the usual "x & (x - 1)" power-of-2 test, but would
+	 * give an empty table and an all-ones mask in hash_packet(), so
+	 * reject it together with the values that are not a power of 2.
+	 */
+	if (V_dyn_buckets == 0 ||
+	    (V_dyn_buckets & (V_dyn_buckets - 1)) != 0) {
+		V_dyn_buckets = V_curr_dyn_buckets; /* reset */
+		return;
+	}
+	V_curr_dyn_buckets = V_dyn_buckets;
+	if (V_ipfw_dyn_v != NULL)
+		free(V_ipfw_dyn_v, M_IPFW);
+	/* on failure retry with half the size, giving up at 2 buckets or less */
+	for (;;) {
+		V_ipfw_dyn_v = malloc(
+		    V_curr_dyn_buckets * sizeof(ipfw_dyn_rule *),
+		    M_IPFW, M_NOWAIT | M_ZERO);
+		if (V_ipfw_dyn_v != NULL || V_curr_dyn_buckets <= 2)
+			break;
+		V_curr_dyn_buckets /= 2;
+	}
+}
+
+/**
+ * Install state of type 'dyn_type' for a dynamic session.
+ * The hash table contains three types of rules:
+ * - regular rules (O_KEEP_STATE)
+ * - rules for sessions with limited number of sess per user
+ * (O_LIMIT). When they are created, the parent is
+ * increased by 1, and decreased on delete. In this case,
+ * the third parameter is the parent rule and not the chain.
+ * - "parent" rules for the above (O_LIMIT_PARENT).
+ *
+ * Must be called with the dynamic-rule mutex held.
+ * Returns the new entry, linked at the head of its bucket, or NULL if
+ * the hash table or the entry itself could not be allocated.
+ */
+static ipfw_dyn_rule *
+add_dyn_rule(struct ipfw_flow_id *id, u_int8_t dyn_type, struct ip_fw *rule)
+{
+ ipfw_dyn_rule *r;
+ int i;
+
+ IPFW_DYN_LOCK_ASSERT();
+
+ /*
+ * (Re)allocate the table if there is none yet, or if it is empty
+ * and a different size has been requested through dyn_buckets.
+ */
+ if (V_ipfw_dyn_v == NULL ||
+ (V_dyn_count == 0 && V_dyn_buckets != V_curr_dyn_buckets)) {
+ realloc_dynamic_table();
+ if (V_ipfw_dyn_v == NULL)
+ return NULL; /* failed ! */
+ }
+ /* hash only after the table (and thus its size) is final */
+ i = hash_packet(id);
+
+ r = uma_zalloc(ipfw_dyn_rule_zone, M_NOWAIT | M_ZERO);
+ if (r == NULL) {
+ printf ("ipfw: sorry cannot allocate state\n");
+ return NULL;
+ }
+
+ /* increase refcount on parent, and set pointer */
+ if (dyn_type == O_LIMIT) {
+ /* for O_LIMIT the caller passes the parent dynamic rule here */
+ ipfw_dyn_rule *parent = (ipfw_dyn_rule *)rule;
+ if ( parent->dyn_type != O_LIMIT_PARENT)
+ panic("invalid parent");
+ parent->count++;
+ r->parent = parent;
+ rule = parent->rule;
+ }
+
+ r->id = *id;
+ /* every new entry starts with the syn lifetime */
+ r->expire = time_uptime + V_dyn_syn_lifetime;
+ r->rule = rule;
+ r->dyn_type = dyn_type;
+ r->pcnt = r->bcnt = 0;
+ r->count = 0;
+
+ r->bucket = i;
+ r->next = V_ipfw_dyn_v[i];
+ V_ipfw_dyn_v[i] = r;
+ V_dyn_count++;
+ DEB({
+ struct in_addr da;
+#ifdef INET6
+ char src[INET6_ADDRSTRLEN];
+ char dst[INET6_ADDRSTRLEN];
+#else
+ char src[INET_ADDRSTRLEN];
+ char dst[INET_ADDRSTRLEN];
+#endif
+
+#ifdef INET6
+ if (IS_IP6_FLOW_ID(&(r->id))) {
+ ip6_sprintf(src, &r->id.src_ip6);
+ ip6_sprintf(dst, &r->id.dst_ip6);
+ } else
+#endif
+ {
+ da.s_addr = htonl(r->id.src_ip);
+ inet_ntoa_r(da, src);
+ da.s_addr = htonl(r->id.dst_ip);
+ inet_ntoa_r(da, dst);
+ }
+ printf("ipfw: add dyn entry ty %d %s %d -> %s %d, total %d\n",
+ dyn_type, src, r->id.src_port, dst, r->id.dst_port,
+ V_dyn_count);
+ })
+ return r;
+}
+
+/**
+ * Find the O_LIMIT_PARENT entry for the pair (pkt, rule); if there is
+ * none, install one through add_dyn_rule().
+ * A parent that is found has its expire time pushed forward by
+ * dyn_short_lifetime.  Must be called with the dynamic-rule mutex held.
+ * Returns NULL only when a new parent could not be created.
+ */
+static ipfw_dyn_rule *
+lookup_dyn_parent(struct ipfw_flow_id *pkt, struct ip_fw *rule)
+{
+	ipfw_dyn_rule *q;
+	int i, is_v6;
+
+	IPFW_DYN_LOCK_ASSERT();
+
+	/* no table yet: nothing to search, go straight to creation */
+	if (V_ipfw_dyn_v == NULL)
+		return add_dyn_rule(pkt, O_LIMIT_PARENT, rule);
+
+	is_v6 = IS_IP6_FLOW_ID(pkt);
+	i = hash_packet(pkt);
+	for (q = V_ipfw_dyn_v[i]; q != NULL; q = q->next) {
+		/* only parents belonging to this static rule qualify */
+		if (q->dyn_type != O_LIMIT_PARENT || rule != q->rule)
+			continue;
+		if (pkt->proto != q->id.proto ||
+		    pkt->src_port != q->id.src_port ||
+		    pkt->dst_port != q->id.dst_port)
+			continue;
+		/* parents match in one direction only, no reverse test */
+		if (is_v6) {
+			if (!IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6),
+			    &(q->id.src_ip6)) ||
+			    !IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6),
+			    &(q->id.dst_ip6)))
+				continue;
+		} else if (pkt->src_ip != q->id.src_ip ||
+		    pkt->dst_ip != q->id.dst_ip)
+			continue;
+
+		q->expire = time_uptime + V_dyn_short_lifetime;
+		DEB(printf("ipfw: lookup_dyn_parent found 0x%p\n",q);)
+		return q;
+	}
+	return add_dyn_rule(pkt, O_LIMIT_PARENT, rule);
+}
+
+/**
+ * Install dynamic state for rule type cmd->o.opcode
+ *
+ * rule		static rule that matched the packet.
+ * cmd		the O_KEEP_STATE or O_LIMIT instruction of that rule.
+ * args		packet being processed; args->f_id is its flow id.
+ * tablearg	connection limit to use when cmd->conn_limit is
+ *		IP_FW_TABLEARG.
+ *
+ * Takes and releases the dynamic-rule mutex.
+ *
+ * Returns 1 (failure) if state is not installed because of errors or because
+ * session limitations are enforced, 0 otherwise (including the case of
+ * an entry that is already present).
+ */
+int
+ipfw_install_state(struct ip_fw *rule, ipfw_insn_limit *cmd,
+    struct ip_fw_args *args, uint32_t tablearg)
+{
+	static int last_log;	/* rate-limits messages to one per second */
+	ipfw_dyn_rule *q;
+	struct in_addr da;
+#ifdef INET6
+	char src[INET6_ADDRSTRLEN + 2], dst[INET6_ADDRSTRLEN + 2];
+#else
+	char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN];
+#endif
+
+	src[0] = '\0';
+	dst[0] = '\0';
+
+	IPFW_DYN_LOCK();
+
+	DEB(
+#ifdef INET6
+	if (IS_IP6_FLOW_ID(&(args->f_id))) {
+		ip6_sprintf(src, &args->f_id.src_ip6);
+		ip6_sprintf(dst, &args->f_id.dst_ip6);
+	} else
+#endif
+	{
+		da.s_addr = htonl(args->f_id.src_ip);
+		inet_ntoa_r(da, src);
+		da.s_addr = htonl(args->f_id.dst_ip);
+		inet_ntoa_r(da, dst);
+	}
+	printf("ipfw: %s: type %d %s %u -> %s %u\n",
+	    __func__, cmd->o.opcode, src, args->f_id.src_port,
+	    dst, args->f_id.dst_port);
+	src[0] = '\0';
+	dst[0] = '\0';
+	)
+
+	q = lookup_dyn_rule_locked(&args->f_id, NULL, NULL);
+
+	if (q != NULL) {	/* should never occur */
+		if (last_log != time_uptime) {
+			last_log = time_uptime;
+			printf("ipfw: %s: entry already present, done\n",
+			    __func__);
+		}
+		IPFW_DYN_UNLOCK();
+		return (0);
+	}
+
+	if (V_dyn_count >= V_dyn_max)
+		/* Run out of slots, try to remove any expired rule. */
+		remove_dyn_rule(NULL, (ipfw_dyn_rule *)1);
+
+	if (V_dyn_count >= V_dyn_max) {
+		if (last_log != time_uptime) {
+			last_log = time_uptime;
+			printf("ipfw: %s: Too many dynamic rules\n", __func__);
+		}
+		IPFW_DYN_UNLOCK();
+		return (1);	/* cannot install, notify caller */
+	}
+
+	switch (cmd->o.opcode) {
+	case O_KEEP_STATE:	/* bidir rule */
+		add_dyn_rule(&args->f_id, O_KEEP_STATE, rule);
+		break;
+
+	case O_LIMIT: {		/* limit number of sessions */
+		struct ipfw_flow_id id;
+		ipfw_dyn_rule *parent;
+		uint32_t conn_limit;
+		uint16_t limit_mask = cmd->limit_mask;
+
+		conn_limit = (cmd->conn_limit == IP_FW_TABLEARG) ?
+		    tablearg : cmd->conn_limit;
+
+		DEB(
+		if (cmd->conn_limit == IP_FW_TABLEARG)
+			printf("ipfw: %s: O_LIMIT rule, conn_limit: %u "
+			    "(tablearg)\n", __func__, conn_limit);
+		else
+			printf("ipfw: %s: O_LIMIT rule, conn_limit: %u\n",
+			    __func__, conn_limit);
+		)
+
+		/*
+		 * Build the key of the parent entry: only the fields
+		 * selected by limit_mask are copied from the packet.
+		 * Clear the whole structure first, so that the fields
+		 * left out (in particular the IPv6 addresses) are zero
+		 * rather than stack garbage: the key is hashed, compared
+		 * and copied into the stored rule.
+		 */
+		bzero(&id, sizeof(id));
+		id.proto = args->f_id.proto;
+		id.addr_type = args->f_id.addr_type;
+		id.fib = M_GETFIB(args->m);
+
+		if (IS_IP6_FLOW_ID (&(args->f_id))) {
+			if (limit_mask & DYN_SRC_ADDR)
+				id.src_ip6 = args->f_id.src_ip6;
+			if (limit_mask & DYN_DST_ADDR)
+				id.dst_ip6 = args->f_id.dst_ip6;
+		} else {
+			if (limit_mask & DYN_SRC_ADDR)
+				id.src_ip = args->f_id.src_ip;
+			if (limit_mask & DYN_DST_ADDR)
+				id.dst_ip = args->f_id.dst_ip;
+		}
+		if (limit_mask & DYN_SRC_PORT)
+			id.src_port = args->f_id.src_port;
+		if (limit_mask & DYN_DST_PORT)
+			id.dst_port = args->f_id.dst_port;
+		if ((parent = lookup_dyn_parent(&id, rule)) == NULL) {
+			printf("ipfw: %s: add parent failed\n", __func__);
+			IPFW_DYN_UNLOCK();
+			return (1);
+		}
+
+		if (parent->count >= conn_limit) {
+			/*
+			 * See if we can remove some expired rule; the
+			 * parent is passed as keep_me so it survives.
+			 */
+			remove_dyn_rule(rule, parent);
+			if (parent->count >= conn_limit) {
+				if (V_fw_verbose && last_log != time_uptime) {
+					last_log = time_uptime;
+#ifdef INET6
+					/*
+					 * XXX IPv6 flows are not
+					 * supported yet.
+					 */
+					if (IS_IP6_FLOW_ID(&(args->f_id))) {
+						char ip6buf[INET6_ADDRSTRLEN];
+						snprintf(src, sizeof(src),
+						    "[%s]", ip6_sprintf(ip6buf,
+						    &args->f_id.src_ip6));
+						snprintf(dst, sizeof(dst),
+						    "[%s]", ip6_sprintf(ip6buf,
+						    &args->f_id.dst_ip6));
+					} else
+#endif
+					{
+						da.s_addr =
+						    htonl(args->f_id.src_ip);
+						inet_ntoa_r(da, src);
+						da.s_addr =
+						    htonl(args->f_id.dst_ip);
+						inet_ntoa_r(da, dst);
+					}
+					log(LOG_SECURITY | LOG_DEBUG,
+					    "ipfw: %d %s %s:%u -> %s:%u, %s\n",
+					    parent->rule->rulenum,
+					    "drop session",
+					    src, (args->f_id.src_port),
+					    dst, (args->f_id.dst_port),
+					    "too many entries");
+				}
+				IPFW_DYN_UNLOCK();
+				return (1);
+			}
+		}
+		/* for O_LIMIT the third argument carries the parent entry */
+		add_dyn_rule(&args->f_id, O_LIMIT, (struct ip_fw *)parent);
+		break;
+	}
+	default:
+		printf("ipfw: %s: unknown dynamic rule type %u\n",
+		    __func__, cmd->o.opcode);
+		IPFW_DYN_UNLOCK();
+		return (1);
+	}
+
+	/* XXX just set lifetime */
+	lookup_dyn_rule_locked(&args->f_id, NULL, NULL);
+
+	IPFW_DYN_UNLOCK();
+	return (0);
+}
+
+/*
+ * Generate a TCP packet, containing either a RST or a keepalive.
+ * When flags & TH_RST, we are sending a RST packet, because of a
+ * "reset" action matched the packet.
+ * Otherwise we are sending a keepalive, and flags & TH_SYN selects the
+ * direction: when set, the packet goes from the source to the
+ * destination recorded in 'id', otherwise the two endpoints are swapped.
+ * The 'replyto' mbuf is the mbuf being replied to, if any, and is required
+ * so that MAC can label the reply appropriately.
+ *
+ * The packet is only built, not transmitted: the mbuf is returned to
+ * the caller.  NULL is returned if no mbuf is available, if
+ * id->addr_type is not a supported address family, or when not
+ * compiled for FreeBSD.
+ */
+struct mbuf *
+ipfw_send_pkt(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t seq,
+ u_int32_t ack, int flags)
+{
+#ifndef __FreeBSD__
+ return NULL;
+#else
+ struct mbuf *m;
+ int len, dir;
+ struct ip *h = NULL; /* stupid compiler */
+#ifdef INET6
+ struct ip6_hdr *h6 = NULL;
+#endif
+ struct tcphdr *th = NULL;
+
+ MGETHDR(m, M_DONTWAIT, MT_DATA);
+ if (m == NULL)
+ return (NULL);
+
+ M_SETFIB(m, id->fib);
+#ifdef MAC
+ if (replyto != NULL)
+ mac_netinet_firewall_reply(replyto, m);
+ else
+ mac_netinet_firewall_send(m);
+#else
+ (void)replyto; /* don't warn about unused arg */
+#endif
+
+ /* total length: network header plus a TCP header without options */
+ switch (id->addr_type) {
+ case 4:
+ len = sizeof(struct ip) + sizeof(struct tcphdr);
+ break;
+#ifdef INET6
+ case 6:
+ len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
+ break;
+#endif
+ default:
+ /* XXX: log me?!? */
+ FREE_PKT(m);
+ return (NULL);
+ }
+ /* dir != 0 only for TH_SYN without TH_RST: send from src to dst */
+ dir = ((flags & (TH_SYN | TH_RST)) == TH_SYN);
+
+ m->m_data += max_linkhdr;
+ m->m_flags |= M_SKIP_FIREWALL;
+ m->m_pkthdr.len = m->m_len = len;
+ m->m_pkthdr.rcvif = NULL;
+ bzero(m->m_data, len);
+
+ switch (id->addr_type) {
+ case 4:
+ h = mtod(m, struct ip *);
+
+ /* prepare for checksum */
+ h->ip_p = IPPROTO_TCP;
+ h->ip_len = htons(sizeof(struct tcphdr));
+ if (dir) {
+ h->ip_src.s_addr = htonl(id->src_ip);
+ h->ip_dst.s_addr = htonl(id->dst_ip);
+ } else {
+ h->ip_src.s_addr = htonl(id->dst_ip);
+ h->ip_dst.s_addr = htonl(id->src_ip);
+ }
+
+ th = (struct tcphdr *)(h + 1);
+ break;
+#ifdef INET6
+ case 6:
+ h6 = mtod(m, struct ip6_hdr *);
+
+ /* prepare for checksum */
+ h6->ip6_nxt = IPPROTO_TCP;
+ h6->ip6_plen = htons(sizeof(struct tcphdr));
+ if (dir) {
+ h6->ip6_src = id->src_ip6;
+ h6->ip6_dst = id->dst_ip6;
+ } else {
+ h6->ip6_src = id->dst_ip6;
+ h6->ip6_dst = id->src_ip6;
+ }
+
+ th = (struct tcphdr *)(h6 + 1);
+ break;
+#endif
+ }
+
+ /* ports follow the same direction as the addresses */
+ if (dir) {
+ th->th_sport = htons(id->src_port);
+ th->th_dport = htons(id->dst_port);
+ } else {
+ th->th_sport = htons(id->dst_port);
+ th->th_dport = htons(id->src_port);
+ }
+ th->th_off = sizeof(struct tcphdr) >> 2;
+
+ if (flags & TH_RST) {
+ if (flags & TH_ACK) {
+ /* plain RST whose sequence number is the 'ack' argument */
+ th->th_seq = htonl(ack);
+ th->th_flags = TH_RST;
+ } else {
+ /* RST|ACK acknowledging 'seq', plus one if TH_SYN is set */
+ if (flags & TH_SYN)
+ seq++;
+ th->th_ack = htonl(seq);
+ th->th_flags = TH_RST | TH_ACK;
+ }
+ } else {
+ /*
+ * Keepalive - use caller provided sequence numbers
+ */
+ th->th_seq = htonl(seq);
+ th->th_ack = htonl(ack);
+ th->th_flags = TH_ACK;
+ }
+
+ switch (id->addr_type) {
+ case 4:
+ /* the header fields set so far act as the pseudo-header */
+ th->th_sum = in_cksum(m, len);
+
+ /* finish the ip header */
+ h->ip_v = 4;
+ h->ip_hl = sizeof(*h) >> 2;
+ h->ip_tos = IPTOS_LOWDELAY;
+ h->ip_off = 0;
+ /* ip_len must be in host format for ip_output */
+ h->ip_len = len;
+ h->ip_ttl = V_ip_defttl;
+ h->ip_sum = 0;
+ break;
+#ifdef INET6
+ case 6:
+ th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(*h6),
+ sizeof(struct tcphdr));
+
+ /* finish the ip6 header */
+ h6->ip6_vfc |= IPV6_VERSION;
+ h6->ip6_hlim = IPV6_DEFHLIM;
+ break;
+#endif
+ }
+
+ return (m);
+#endif /* __FreeBSD__ */
+}
+
+/*
+ * This procedure is only used to handle keepalives. It is invoked
+ * every dyn_keepalive_period seconds: before returning it re-arms its
+ * own callout (V_ipfw_timeout) for V_dyn_keepalive_period * hz ticks.
+ *
+ * Nothing is sent when keepalives are disabled (V_dyn_keepalive == 0)
+ * or there are no dynamic rules.  Otherwise, for every TCP dynamic rule
+ * (O_LIMIT_PARENT entries excluded) that has BOTH_SYN set in its state,
+ * has not expired yet and expires in less than V_dyn_keepalive_interval
+ * from now, two keepalive packets are built, one for each direction.
+ * Packets are only queued while the dynamic rule lock is held and are
+ * handed to ip_output()/ip6_output() after the lock is released.
+ *
+ * 'vnetx' is the argument registered with the callout; it is used as a
+ * struct vnet pointer when VIMAGE is defined and is passed unchanged to
+ * the next callout_reset().
+ */
+static void
+ipfw_tick(void * vnetx)
+{
+ struct mbuf *m0, *m, *mnext, **mtailp;
+#ifdef INET6
+ struct mbuf *m6, **m6_tailp;
+#endif
+ int i;
+ ipfw_dyn_rule *q;
+#ifdef VIMAGE
+ struct vnet *vp = vnetx;
+#endif
+
+ CURVNET_SET(vp); /* NOTE(review): 'vp' exists only with VIMAGE; presumably the macro ignores it otherwise */
+ if (V_dyn_keepalive == 0 || V_ipfw_dyn_v == NULL || V_dyn_count == 0)
+ goto done;
+
+ /*
+ * We make a chain of packets to go out here -- not deferring
+ * until after we drop the IPFW dynamic rule lock would result
+ * in a lock order reversal with the normal packet input -> ipfw
+ * call stack.
+ */
+ m0 = NULL;
+ mtailp = &m0;
+#ifdef INET6
+ m6 = NULL;
+ m6_tailp = &m6;
+#endif
+ IPFW_DYN_LOCK();
+ for (i = 0 ; i < V_curr_dyn_buckets ; i++) {
+ for (q = V_ipfw_dyn_v[i] ; q ; q = q->next ) {
+ if (q->dyn_type == O_LIMIT_PARENT)
+ continue;
+ if (q->id.proto != IPPROTO_TCP)
+ continue;
+ if ( (q->state & BOTH_SYN) != BOTH_SYN)
+ continue;
+ if (TIME_LEQ(time_uptime + V_dyn_keepalive_interval,
+ q->expire))
+ continue; /* too early */
+ if (TIME_LEQ(q->expire, time_uptime))
+ continue; /* too late, rule expired */
+
+ m = ipfw_send_pkt(NULL, &(q->id), q->ack_rev - 1,
+ q->ack_fwd, TH_SYN); /* TH_SYN: from q->id source to destination */
+ mnext = ipfw_send_pkt(NULL, &(q->id), q->ack_fwd - 1,
+ q->ack_rev, 0); /* no TH_SYN: addresses and ports swapped */
+
+ switch (q->id.addr_type) {
+ case 4:
+ if (m != NULL) {
+ *mtailp = m;
+ mtailp = &(*mtailp)->m_nextpkt;
+ }
+ if (mnext != NULL) {
+ *mtailp = mnext;
+ mtailp = &(*mtailp)->m_nextpkt;
+ }
+ break;
+#ifdef INET6
+ case 6:
+ if (m != NULL) {
+ *m6_tailp = m;
+ m6_tailp = &(*m6_tailp)->m_nextpkt;
+ }
+ if (mnext != NULL) {
+ *m6_tailp = mnext;
+ m6_tailp = &(*m6_tailp)->m_nextpkt;
+ }
+ break;
+#endif
+ }
+
+ m = mnext = NULL;
+ }
+ }
+ IPFW_DYN_UNLOCK();
+ for (m = mnext = m0; m != NULL; m = mnext) { /* send the IPv4 chain, one packet at a time */
+ mnext = m->m_nextpkt;
+ m->m_nextpkt = NULL;
+ ip_output(m, NULL, NULL, 0, NULL, NULL);
+ }
+#ifdef INET6
+ for (m = mnext = m6; m != NULL; m = mnext) { /* same for the IPv6 chain */
+ mnext = m->m_nextpkt;
+ m->m_nextpkt = NULL;
+ ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL);
+ }
+#endif
+done:
+ callout_reset(&V_ipfw_timeout, V_dyn_keepalive_period * hz,
+ ipfw_tick, vnetx);
+ CURVNET_RESTORE();
+}
+
+/*
+ * Create the UMA zone sized for ipfw_dyn_rule entries and initialise
+ * the dynamic rule lock.  Undone by ipfw_dyn_detach().
+ */
+void
+ipfw_dyn_attach(void)
+{
+	ipfw_dyn_rule_zone = uma_zcreate("IPFW dynamic rule",
+	    sizeof(ipfw_dyn_rule),
+	    NULL, NULL, NULL, NULL,
+	    UMA_ALIGN_PTR, 0);
+	IPFW_DYN_LOCK_INIT();
+}
+
+/*
+ * Release what ipfw_dyn_attach() set up: destroy the dynamic rule
+ * zone first, then the dynamic rule lock.
+ */
+void
+ipfw_dyn_detach(void)
+{
+	uma_zdestroy(ipfw_dyn_rule_zone);
+
+	IPFW_DYN_LOCK_DESTROY();
+}
+
+/*
+ * Load the default values of the dynamic rule tunables for the current
+ * vnet and start the keepalive callout.  The hash table itself is not
+ * allocated here (V_ipfw_dyn_v is left NULL).
+ */
+void
+ipfw_dyn_init(void)
+{
+	/* Hash table: no buckets allocated yet. */
+	V_ipfw_dyn_v = NULL;
+	V_dyn_buckets = 256;		/* must be power of 2 */
+	V_curr_dyn_buckets = 256;	/* must be power of 2 */
+
+	/* Rule lifetimes (presumably in seconds -- confirm with users). */
+	V_dyn_ack_lifetime = 300;
+	V_dyn_syn_lifetime = 20;
+	V_dyn_fin_lifetime = 1;
+	V_dyn_rst_lifetime = 1;
+	V_dyn_udp_lifetime = 10;
+	V_dyn_short_lifetime = 5;
+
+	/* Keepalive handling, see ipfw_tick(). */
+	V_dyn_keepalive_interval = 20;
+	V_dyn_keepalive_period = 5;
+	V_dyn_keepalive = 1;		/* do send keepalives */
+
+	V_dyn_max = 4096;		/* max # of dynamic rules */
+
+	/* First run of ipfw_tick() is scheduled hz ticks from now. */
+	callout_init(&V_ipfw_timeout, CALLOUT_MPSAFE);
+	callout_reset(&V_ipfw_timeout, hz, ipfw_tick, curvnet);
+}
+
+/*
+ * Two-step teardown selected by 'pass':
+ *   pass == 0	drain the keepalive callout;
+ *   otherwise	free the dynamic rule hash table, if one was allocated.
+ */
+void
+ipfw_dyn_uninit(int pass)
+{
+	if (pass == 0) {
+		callout_drain(&V_ipfw_timeout);
+		return;
+	}
+
+	if (V_ipfw_dyn_v != NULL)
+		free(V_ipfw_dyn_v, M_IPFW);
+}
+
+/*
+ * Return the number of bytes taken by V_dyn_count entries of type
+ * ipfw_dyn_rule, or 0 when the hash table has not been allocated.
+ */
+int
+ipfw_dyn_len(void)
+{
+	if (V_ipfw_dyn_v == NULL)
+		return (0);
+	return (V_dyn_count * sizeof(ipfw_dyn_rule));
+}
+
+/*
+ * Copy the dynamic rules into the buffer starting at *pbp and ending
+ * at ep, and advance *pbp past the last byte written.  Rules that do
+ * not fit in the remaining space are skipped.
+ *
+ * Each copy is adjusted before being handed out:
+ *  - the 'rule' pointer field is overwritten with the parent rule
+ *    number, followed by the parent rule set number;
+ *  - 'next' holds a non-NULL value, except in the last copied entry
+ *    where it is zeroed as an end-of-list marker;
+ *  - 'expire' becomes the time left (0 if already expired) instead of
+ *    an absolute time_uptime value.
+ */
+void
+ipfw_get_dynamic(char **pbp, const char *ep)
+{
+	ipfw_dyn_rule *cur, *out, *tail;
+	char *wp;
+	int bucket;
+
+	if (V_ipfw_dyn_v == NULL)
+		return;
+
+	tail = NULL;
+	wp = *pbp;
+
+	IPFW_DYN_LOCK();
+	for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) {
+		for (cur = V_ipfw_dyn_v[bucket]; cur != NULL;
+		    cur = cur->next) {
+			if (wp + sizeof *cur > ep)
+				continue;	/* no room for this one */
+
+			out = (ipfw_dyn_rule *)wp;
+			bcopy(cur, out, sizeof *cur);
+
+			/* Rule number goes in the low part of out->rule. */
+			bcopy(&(cur->rule->rulenum), &(out->rule),
+			    sizeof(cur->rule->rulenum));
+			/*
+			 * store set number into high word of
+			 * out->rule pointer.
+			 */
+			bcopy(&(cur->rule->set),
+			    (char *)&out->rule + sizeof(cur->rule->rulenum),
+			    sizeof(cur->rule->set));
+
+			/*
+			 * store a non-null value in "next".
+			 * The userland code will interpret a
+			 * NULL here as a marker
+			 * for the last dynamic rule.
+			 */
+			bcopy(&out, &out->next, sizeof(out));
+			tail = out;
+
+			/* Convert the absolute expire time to time left. */
+			if (TIME_LEQ(out->expire, time_uptime))
+				out->expire = 0;
+			else
+				out->expire = out->expire - time_uptime;
+
+			wp += sizeof(ipfw_dyn_rule);
+		}
+	}
+	IPFW_DYN_UNLOCK();
+
+	if (tail != NULL)	/* mark last dynamic rule */
+		bzero(&tail->next, sizeof(tail));
+	*pbp = wp;
+}
+/* end of file */
diff --git a/sys/netinet/ipfw/ip_fw_log.c b/sys/netinet/ipfw/ip_fw_log.c
new file mode 100644
index 0000000..93bd19b
--- /dev/null
+++ b/sys/netinet/ipfw/ip_fw_log.c
@@ -0,0 +1,435 @@
+/*-
+ * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ * Logging support for ipfw
+ */
+
+#if !defined(KLD_MODULE)
+#include "opt_ipfw.h"
+#include "opt_ipdivert.h"
+#include "opt_ipdn.h"
+#include "opt_inet.h"
+#ifndef INET
+#error IPFIREWALL requires INET.
+#endif /* INET */
+#endif
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <net/ethernet.h> /* for ETHERTYPE_IP */
+#include <net/if.h>
+#include <net/vnet.h>
+#include <net/if_types.h> /* for IFT_ETHER */
+#include <net/bpf.h> /* for BPF */
+
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip_fw.h>
+#include <netinet/ipfw/ip_fw_private.h>
+#include <netinet/tcp_var.h>
+#include <netinet/udp.h>
+
+#include <netinet/ip6.h>
+#include <netinet/icmp6.h>
+#ifdef INET6
+#include <netinet6/in6_var.h> /* ip6_sprintf() */
+#endif
+
+#ifdef MAC
+#include <security/mac/mac_framework.h>
+#endif
+
+/*
+ * L3HDR maps an ipv4 header pointer into a pointer of type T to the
+ * header that follows it, i.e. ip_hl 32-bit words past the start of
+ * the ipv4 header.
+ * Other macros just cast void * into the appropriate type
+ */
+#define L3HDR(T, ip) ((T *)((u_int32_t *)(ip) + (ip)->ip_hl))
+#define TCP(p) ((struct tcphdr *)(p))
+#define SCTP(p) ((struct sctphdr *)(p))
+#define UDP(p) ((struct udphdr *)(p))
+#define ICMP(p) ((struct icmphdr *)(p))
+#define ICMP6(p) ((struct icmp6_hdr *)(p))
+
+#define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0
+#define SNP(buf) buf, sizeof(buf)
+
+#ifdef WITHOUT_BPF
+/*
+ * Stub used when the kernel is built WITHOUT_BPF: there is no
+ * pseudo-interface to create or destroy, so the request is ignored.
+ */
+void
+ipfw_log_bpf(int onoff)
+{
+	(void)onoff;	/* don't warn about unused arg */
+}
+#else /* !WITHOUT_BPF */
+static struct ifnet *log_if; /* hook to attach to bpf */
+
+/*
+ * Placeholder installed in the ifnet method slots of the logging
+ * pseudo-interface.  It ignores its arguments and always fails with
+ * EINVAL.
+ */
+static int
+log_dummy(struct ifnet *ifp, u_long cmd, caddr_t addr)
+{
+	return (EINVAL);
+}
+
+/*
+ * Create (onoff != 0) or destroy (onoff == 0) the pseudo-interface
+ * that ipfw_log() taps packets to.
+ *
+ * On creation an IFT_ETHER ifnet named "ipfw", unit 0, is allocated,
+ * all of its callbacks are pointed at log_dummy(), and it is attached
+ * to bpf as DLT_EN10MB with a 14 byte header.  The request is silently
+ * ignored if the interface already exists or cannot be allocated.
+ * On destruction the interface, if any, is detached and freed.
+ */
+void
+ipfw_log_bpf(int onoff)
+{
+	struct ifnet *ifp;
+
+	if (!onoff) {
+		/* Tear down. */
+		if (log_if != NULL) {
+			ether_ifdetach(log_if);
+			if_free(log_if);
+		}
+		log_if = NULL;
+		return;
+	}
+
+	/* Set up, unless already done. */
+	if (log_if != NULL)
+		return;
+	ifp = if_alloc(IFT_ETHER);
+	if (ifp == NULL)
+		return;
+
+	if_initname(ifp, "ipfw", 0);
+	ifp->if_mtu = 65536;
+	ifp->if_flags = IFF_UP | IFF_SIMPLEX | IFF_MULTICAST;
+
+	/* None of the ifnet methods does anything useful. */
+	ifp->if_init = (void *)log_dummy;
+	ifp->if_ioctl = log_dummy;
+	ifp->if_start = (void *)log_dummy;
+	ifp->if_output = (void *)log_dummy;
+
+	ifp->if_addrlen = 6;
+	ifp->if_hdrlen = 14;
+	if_attach(ifp);
+	ifp->if_baudrate = IF_Mbps(10);
+	bpfattach(ifp, DLT_EN10MB, 14);
+
+	log_if = ifp;
+}
+#endif /* !WITHOUT_BPF */
+
+/*
+ * We enter here when we have a rule with O_LOG.
+ * XXX this function alone takes about 2Kbytes of code!
+ *
+ * Two modes, selected by V_fw_verbose:
+ *
+ * V_fw_verbose == 0: nothing is sent to syslog.  If built with BPF
+ * support and the log_if pseudo-interface exists with a non-NULL
+ * if_bpf, the packet is passed to BPF_MTAP() preceded by an
+ * ETHER_HDR_LEN byte header: the original one (args->eh) for layer2
+ * packets, a fake one otherwise.
+ *
+ * V_fw_verbose != 0: a line of the form
+ *	ipfw: <rulenum> <action> <proto and addresses> in|out via <if><frag>
+ * is sent to log() with LOG_SECURITY | LOG_INFO.  Logging is limited
+ * by the rule's own O_LOG counters (max_log / log_left) or, when
+ * f == NULL, by V_verbose_limit; when the limit is hit one more
+ * LOG_NOTICE line reports it.
+ *
+ * f		matching rule, NULL for a bogus packet (logged as "Refuse"
+ *		with rule number -1);
+ * hlen		0 for non-ip packets (logged as "MAC"), otherwise used as
+ *		the offset of the upper layer header for IPv6;
+ * args		supplies the flow id (args->f_id) and layer2 header (args->eh);
+ * m		the packet; also the source of the receive interface name;
+ * oif		output interface, NULL means the packet is logged as "in";
+ * offset	fragment offset information, printed in the "(frag ...)" part;
+ * tablearg	used as the "Forward to" address when the rule's own
+ *		address is INADDR_ANY;
+ * ip		start of the IPv4 or IPv6 header.
+ */
+void
+ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args,
+ struct mbuf *m, struct ifnet *oif, u_short offset, uint32_t tablearg,
+ struct ip *ip)
+{
+ char *action;
+ int limit_reached = 0; /* when != 0, the limit value to report at the end */
+ char action2[40], proto[128], fragment[32];
+
+ if (V_fw_verbose == 0) {
+#ifndef WITHOUT_BPF
+ struct m_hdr mh;
+
+ if (log_if == NULL || log_if->if_bpf == NULL)
+ return;
+ /* BPF treats the "mbuf" as read-only */
+ mh.mh_next = m;
+ mh.mh_len = ETHER_HDR_LEN;
+ if (args->eh) { /* layer2, use orig hdr */
+ mh.mh_data = (char *)args->eh;
+ } else {
+ /* add fake header. Later we will store
+ * more info in the header
+ */
+ mh.mh_data = "DDDDDDSSSSSS\x08\x00";
+ }
+ BPF_MTAP(log_if, (struct mbuf *)&mh);
+#endif /* !WITHOUT_BPF */
+ return;
+ }
+ /* the old 'log' function */
+ fragment[0] = '\0';
+ proto[0] = '\0';
+
+ if (f == NULL) { /* bogus pkt */
+ if (V_verbose_limit != 0 && V_norule_counter >= V_verbose_limit)
+ return;
+ V_norule_counter++;
+ if (V_norule_counter == V_verbose_limit)
+ limit_reached = V_verbose_limit;
+ action = "Refuse";
+ } else { /* O_LOG is the first action, find the real one */
+ ipfw_insn *cmd = ACTION_PTR(f);
+ ipfw_insn_log *l = (ipfw_insn_log *)cmd;
+
+ if (l->max_log != 0 && l->log_left == 0)
+ return;
+ l->log_left--; /* NOTE(review): also reached with max_log == 0 and log_left == 0 */
+ if (l->log_left == 0)
+ limit_reached = l->max_log;
+ cmd += F_LEN(cmd); /* point to first action */
+ if (cmd->opcode == O_ALTQ) {
+ ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd;
+
+ snprintf(SNPARGS(action2, 0), "Altq %d",
+ altq->qid);
+ cmd += F_LEN(cmd);
+ }
+ if (cmd->opcode == O_PROB)
+ cmd += F_LEN(cmd);
+
+ if (cmd->opcode == O_TAG)
+ cmd += F_LEN(cmd);
+
+ action = action2; /* default: text formatted into action2 by the cases below */
+ switch (cmd->opcode) {
+ case O_DENY:
+ action = "Deny";
+ break;
+
+ case O_REJECT:
+ if (cmd->arg1==ICMP_REJECT_RST)
+ action = "Reset";
+ else if (cmd->arg1==ICMP_UNREACH_HOST)
+ action = "Reject";
+ else
+ snprintf(SNPARGS(action2, 0), "Unreach %d",
+ cmd->arg1);
+ break;
+
+ case O_UNREACH6:
+ if (cmd->arg1==ICMP6_UNREACH_RST)
+ action = "Reset";
+ else
+ snprintf(SNPARGS(action2, 0), "Unreach %d",
+ cmd->arg1);
+ break;
+
+ case O_ACCEPT:
+ action = "Accept";
+ break;
+ case O_COUNT:
+ action = "Count";
+ break;
+ case O_DIVERT:
+ snprintf(SNPARGS(action2, 0), "Divert %d",
+ cmd->arg1);
+ break;
+ case O_TEE:
+ snprintf(SNPARGS(action2, 0), "Tee %d",
+ cmd->arg1);
+ break;
+ case O_SETFIB:
+ snprintf(SNPARGS(action2, 0), "SetFib %d",
+ cmd->arg1);
+ break;
+ case O_SKIPTO:
+ snprintf(SNPARGS(action2, 0), "SkipTo %d",
+ cmd->arg1);
+ break;
+ case O_PIPE:
+ snprintf(SNPARGS(action2, 0), "Pipe %d",
+ cmd->arg1);
+ break;
+ case O_QUEUE:
+ snprintf(SNPARGS(action2, 0), "Queue %d",
+ cmd->arg1);
+ break;
+ case O_FORWARD_IP: {
+ ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd;
+ int len;
+ struct in_addr dummyaddr;
+ if (sa->sa.sin_addr.s_addr == INADDR_ANY)
+ dummyaddr.s_addr = htonl(tablearg);
+ else
+ dummyaddr.s_addr = sa->sa.sin_addr.s_addr;
+
+ len = snprintf(SNPARGS(action2, 0), "Forward to %s",
+ inet_ntoa(dummyaddr));
+
+ if (sa->sa.sin_port)
+ snprintf(SNPARGS(action2, len), ":%d",
+ sa->sa.sin_port); /* NOTE(review): printed as stored, no byte swap -- confirm */
+ }
+ break;
+ case O_NETGRAPH:
+ snprintf(SNPARGS(action2, 0), "Netgraph %d",
+ cmd->arg1);
+ break;
+ case O_NGTEE:
+ snprintf(SNPARGS(action2, 0), "Ngtee %d",
+ cmd->arg1);
+ break;
+ case O_NAT:
+ action = "Nat";
+ break;
+ case O_REASS:
+ action = "Reass";
+ break;
+ default:
+ action = "UNKNOWN";
+ break;
+ }
+ }
+
+ if (hlen == 0) { /* non-ip */
+ snprintf(SNPARGS(proto, 0), "MAC");
+
+ } else {
+ int len;
+#ifdef INET6
+ char src[INET6_ADDRSTRLEN + 2], dst[INET6_ADDRSTRLEN + 2];
+#else
+ char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN];
+#endif
+ struct icmphdr *icmp;
+ struct tcphdr *tcp;
+ struct udphdr *udp;
+#ifdef INET6
+ struct ip6_hdr *ip6 = NULL;
+ struct icmp6_hdr *icmp6;
+#endif
+ src[0] = '\0';
+ dst[0] = '\0';
+#ifdef INET6
+ if (IS_IP6_FLOW_ID(&(args->f_id))) {
+ char ip6buf[INET6_ADDRSTRLEN];
+ snprintf(src, sizeof(src), "[%s]",
+ ip6_sprintf(ip6buf, &args->f_id.src_ip6));
+ snprintf(dst, sizeof(dst), "[%s]",
+ ip6_sprintf(ip6buf, &args->f_id.dst_ip6));
+
+ ip6 = (struct ip6_hdr *)ip;
+ tcp = (struct tcphdr *)(((char *)ip) + hlen);
+ udp = (struct udphdr *)(((char *)ip) + hlen);
+ } else
+#endif
+ {
+ tcp = L3HDR(struct tcphdr, ip);
+ udp = L3HDR(struct udphdr, ip);
+
+ inet_ntoa_r(ip->ip_src, src);
+ inet_ntoa_r(ip->ip_dst, dst);
+ }
+
+ switch (args->f_id.proto) {
+ case IPPROTO_TCP:
+ len = snprintf(SNPARGS(proto, 0), "TCP %s", src);
+ if (offset == 0) /* ports are printed only when offset is 0 */
+ snprintf(SNPARGS(proto, len), ":%d %s:%d",
+ ntohs(tcp->th_sport),
+ dst,
+ ntohs(tcp->th_dport));
+ else
+ snprintf(SNPARGS(proto, len), " %s", dst);
+ break;
+
+ case IPPROTO_UDP:
+ len = snprintf(SNPARGS(proto, 0), "UDP %s", src);
+ if (offset == 0)
+ snprintf(SNPARGS(proto, len), ":%d %s:%d",
+ ntohs(udp->uh_sport),
+ dst,
+ ntohs(udp->uh_dport));
+ else
+ snprintf(SNPARGS(proto, len), " %s", dst);
+ break;
+
+ case IPPROTO_ICMP:
+ icmp = L3HDR(struct icmphdr, ip);
+ if (offset == 0)
+ len = snprintf(SNPARGS(proto, 0),
+ "ICMP:%u.%u ",
+ icmp->icmp_type, icmp->icmp_code);
+ else
+ len = snprintf(SNPARGS(proto, 0), "ICMP ");
+ len += snprintf(SNPARGS(proto, len), "%s", src);
+ snprintf(SNPARGS(proto, len), " %s", dst);
+ break;
+#ifdef INET6
+ case IPPROTO_ICMPV6:
+ icmp6 = (struct icmp6_hdr *)(((char *)ip) + hlen);
+ if (offset == 0)
+ len = snprintf(SNPARGS(proto, 0),
+ "ICMPv6:%u.%u ",
+ icmp6->icmp6_type, icmp6->icmp6_code);
+ else
+ len = snprintf(SNPARGS(proto, 0), "ICMPv6 ");
+ len += snprintf(SNPARGS(proto, len), "%s", src);
+ snprintf(SNPARGS(proto, len), " %s", dst);
+ break;
+#endif
+ default:
+ len = snprintf(SNPARGS(proto, 0), "P:%d %s",
+ args->f_id.proto, src);
+ snprintf(SNPARGS(proto, len), " %s", dst);
+ break;
+ }
+
+#ifdef INET6
+ if (IS_IP6_FLOW_ID(&(args->f_id))) {
+ if (offset & (IP6F_OFF_MASK | IP6F_MORE_FRAG))
+ snprintf(SNPARGS(fragment, 0),
+ " (frag %08x:%d@%d%s)",
+ args->f_id.extra,
+ ntohs(ip6->ip6_plen) - hlen,
+ ntohs(offset & IP6F_OFF_MASK) << 3,
+ (offset & IP6F_MORE_FRAG) ? "+" : "");
+ } else
+#endif
+ {
+ int ipoff, iplen;
+ ipoff = ntohs(ip->ip_off);
+ iplen = ntohs(ip->ip_len);
+ if (ipoff & (IP_MF | IP_OFFMASK))
+ snprintf(SNPARGS(fragment, 0),
+ " (frag %d:%d@%d%s)",
+ ntohs(ip->ip_id), iplen - (ip->ip_hl << 2),
+ offset << 3,
+ (ipoff & IP_MF) ? "+" : "");
+ }
+ }
+#ifdef __FreeBSD__
+ if (oif || m->m_pkthdr.rcvif)
+ log(LOG_SECURITY | LOG_INFO,
+ "ipfw: %d %s %s %s via %s%s\n",
+ f ? f->rulenum : -1,
+ action, proto, oif ? "out" : "in",
+ oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname,
+ fragment);
+ else
+#endif
+ log(LOG_SECURITY | LOG_INFO,
+ "ipfw: %d %s %s [no if info]%s\n",
+ f ? f->rulenum : -1,
+ action, proto, fragment);
+ if (limit_reached)
+ log(LOG_SECURITY | LOG_NOTICE,
+ "ipfw: limit %d reached on entry %d\n",
+ limit_reached, f ? f->rulenum : -1);
+}
+/* end of file */
diff --git a/sys/netinet/ipfw/ip_fw_nat.c b/sys/netinet/ipfw/ip_fw_nat.c
index cd6a1cf..f30b754 100644
--- a/sys/netinet/ipfw/ip_fw_nat.c
+++ b/sys/netinet/ipfw/ip_fw_nat.c
@@ -29,114 +29,80 @@ __FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
-#include <sys/condvar.h>
#include <sys/eventhandler.h>
#include <sys/malloc.h>
-#include <sys/mbuf.h>
#include <sys/kernel.h>
#include <sys/lock.h>
-#include <sys/jail.h>
#include <sys/module.h>
-#include <sys/priv.h>
-#include <sys/proc.h>
#include <sys/rwlock.h>
-#include <sys/socket.h>
-#include <sys/socketvar.h>
-#include <sys/sysctl.h>
-#include <sys/syslog.h>
-#include <sys/ucred.h>
+
+#define IPFW_INTERNAL /* Access to protected data structures in ip_fw.h. */
#include <netinet/libalias/alias.h>
#include <netinet/libalias/alias_local.h>
-#define IPFW_INTERNAL /* Access to protected data structures in ip_fw.h. */
-
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
-#include <netinet/ip_icmp.h>
#include <netinet/ip_fw.h>
+#include <netinet/ipfw/ip_fw_private.h>
#include <netinet/tcp.h>
-#include <netinet/tcp_timer.h>
-#include <netinet/tcp_var.h>
-#include <netinet/tcpip.h>
#include <netinet/udp.h>
-#include <netinet/udp_var.h>
#include <machine/in_cksum.h> /* XXX for in_cksum */
-MALLOC_DECLARE(M_IPFW);
-
static VNET_DEFINE(eventhandler_tag, ifaddr_event_tag);
#define V_ifaddr_event_tag VNET(ifaddr_event_tag)
-extern ipfw_nat_t *ipfw_nat_ptr;
-extern ipfw_nat_cfg_t *ipfw_nat_cfg_ptr;
-extern ipfw_nat_cfg_t *ipfw_nat_del_ptr;
-extern ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr;
-extern ipfw_nat_cfg_t *ipfw_nat_get_log_ptr;
-
-static void
+static void
ifaddr_change(void *arg __unused, struct ifnet *ifp)
{
struct cfg_nat *ptr;
struct ifaddr *ifa;
+ struct ip_fw_chain *chain;
- IPFW_WLOCK(&V_layer3_chain);
+ chain = &V_layer3_chain;
+ IPFW_WLOCK(chain);
/* Check every nat entry... */
- LIST_FOREACH(ptr, &V_layer3_chain.nat, _next) {
+ LIST_FOREACH(ptr, &chain->nat, _next) {
/* ...using nic 'ifp->if_xname' as dynamic alias address. */
- if (strncmp(ptr->if_name, ifp->if_xname, IF_NAMESIZE) == 0) {
- if_addr_rlock(ifp);
- TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
- if (ifa->ifa_addr == NULL)
- continue;
- if (ifa->ifa_addr->sa_family != AF_INET)
- continue;
- ptr->ip = ((struct sockaddr_in *)
- (ifa->ifa_addr))->sin_addr;
- LibAliasSetAddress(ptr->lib, ptr->ip);
- }
- if_addr_runlock(ifp);
+ if (strncmp(ptr->if_name, ifp->if_xname, IF_NAMESIZE) != 0)
+ continue;
+ if_addr_rlock(ifp);
+ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+ if (ifa->ifa_addr == NULL)
+ continue;
+ if (ifa->ifa_addr->sa_family != AF_INET)
+ continue;
+ ptr->ip = ((struct sockaddr_in *)
+ (ifa->ifa_addr))->sin_addr;
+ LibAliasSetAddress(ptr->lib, ptr->ip);
}
+ if_addr_runlock(ifp);
}
- IPFW_WUNLOCK(&V_layer3_chain);
+ IPFW_WUNLOCK(chain);
}
+/*
+ * delete the pointers for nat entry ix, or all of them if ix < 0
+ */
static void
-flush_nat_ptrs(const int i)
+flush_nat_ptrs(struct ip_fw_chain *chain, const int ix)
{
- struct ip_fw *rule;
-
- IPFW_WLOCK_ASSERT(&V_layer3_chain);
- for (rule = V_layer3_chain.rules; rule; rule = rule->next) {
- ipfw_insn_nat *cmd = (ipfw_insn_nat *)ACTION_PTR(rule);
- if (cmd->o.opcode != O_NAT)
- continue;
- if (cmd->nat != NULL && cmd->nat->id == i)
+ int i;
+ ipfw_insn_nat *cmd;
+
+ IPFW_WLOCK_ASSERT(chain);
+ for (i = 0; i < chain->n_rules; i++) {
+ cmd = (ipfw_insn_nat *)ACTION_PTR(chain->map[i]);
+ /* XXX skip log and the like ? */
+ if (cmd->o.opcode == O_NAT && cmd->nat != NULL &&
+ (ix < 0 || cmd->nat->id == ix))
cmd->nat = NULL;
}
}
-#define HOOK_NAT(b, p) do { \
- IPFW_WLOCK_ASSERT(&V_layer3_chain); \
- LIST_INSERT_HEAD(b, p, _next); \
- } while (0)
-
-#define UNHOOK_NAT(p) do { \
- IPFW_WLOCK_ASSERT(&V_layer3_chain); \
- LIST_REMOVE(p, _next); \
- } while (0)
-
-#define HOOK_REDIR(b, p) do { \
- LIST_INSERT_HEAD(b, p, _next); \
- } while (0)
-
-#define HOOK_SPOOL(b, p) do { \
- LIST_INSERT_HEAD(b, p, _next); \
- } while (0)
-
static void
del_redir_spool_cfg(struct cfg_nat *n, struct redir_chain *head)
{
@@ -165,9 +131,9 @@ del_redir_spool_cfg(struct cfg_nat *n, struct redir_chain *head)
free(r, M_IPFW);
break;
default:
- printf("unknown redirect mode: %u\n", r->mode);
+ printf("unknown redirect mode: %u\n", r->mode);
/* XXX - panic?!?!? */
- break;
+ break;
}
}
}
@@ -178,7 +144,6 @@ add_redir_spool_cfg(char *buf, struct cfg_nat *ptr)
struct cfg_redir *r, *ser_r;
struct cfg_spool *s, *ser_s;
int cnt, off, i;
- char *panic_err;
for (cnt = 0, off = 0; cnt < ptr->redir_cnt; cnt++) {
ser_r = (struct cfg_redir *)&buf[off];
@@ -201,7 +166,7 @@ add_redir_spool_cfg(char *buf, struct cfg_nat *ptr)
remotePortCopy = 0;
r->alink[i] = LibAliasRedirectPort(ptr->lib,
r->laddr, htons(r->lport + i), r->raddr,
- htons(remotePortCopy), r->paddr,
+ htons(remotePortCopy), r->paddr,
htons(r->pport + i), r->proto);
if (r->alink[i] == NULL) {
r->alink[0] = NULL;
@@ -215,30 +180,26 @@ add_redir_spool_cfg(char *buf, struct cfg_nat *ptr)
break;
default:
printf("unknown redirect mode: %u\n", r->mode);
- break;
+ break;
+ }
+ /* XXX perhaps return an error instead of panic ? */
+ if (r->alink[0] == NULL)
+ panic("LibAliasRedirect* returned NULL");
+ /* LSNAT handling. */
+ for (i = 0; i < r->spool_cnt; i++) {
+ ser_s = (struct cfg_spool *)&buf[off];
+ s = malloc(SOF_REDIR, M_IPFW, M_WAITOK | M_ZERO);
+ memcpy(s, ser_s, SOF_SPOOL);
+ LibAliasAddServer(ptr->lib, r->alink[0],
+ s->addr, htons(s->port));
+ off += SOF_SPOOL;
+ /* Hook spool entry. */
+ LIST_INSERT_HEAD(&r->spool_chain, s, _next);
}
- if (r->alink[0] == NULL) {
- panic_err = "LibAliasRedirect* returned NULL";
- goto bad;
- } else /* LSNAT handling. */
- for (i = 0; i < r->spool_cnt; i++) {
- ser_s = (struct cfg_spool *)&buf[off];
- s = malloc(SOF_REDIR, M_IPFW,
- M_WAITOK | M_ZERO);
- memcpy(s, ser_s, SOF_SPOOL);
- LibAliasAddServer(ptr->lib, r->alink[0],
- s->addr, htons(s->port));
- off += SOF_SPOOL;
- /* Hook spool entry. */
- HOOK_SPOOL(&r->spool_chain, s);
- }
/* And finally hook this redir entry. */
- HOOK_REDIR(&ptr->redir_chain, r);
+ LIST_INSERT_HEAD(&ptr->redir_chain, r, _next);
}
return (1);
-bad:
- /* something really bad happened: panic! */
- panic("%s\n", panic_err);
}
static int
@@ -252,101 +213,80 @@ ipfw_nat(struct ip_fw_args *args, struct cfg_nat *t, struct mbuf *m)
ldt = 0;
retval = 0;
- if ((mcl = m_megapullup(m, m->m_pkthdr.len)) ==
- NULL)
- goto badnat;
- ip = mtod(mcl, struct ip *);
- if (args->eh == NULL) {
- ip->ip_len = htons(ip->ip_len);
- ip->ip_off = htons(ip->ip_off);
+ mcl = m_megapullup(m, m->m_pkthdr.len);
+ if (mcl == NULL) {
+ args->m = NULL;
+ return (IP_FW_DENY);
}
+ ip = mtod(mcl, struct ip *);
- /*
+ /*
* XXX - Libalias checksum offload 'duct tape':
- *
- * locally generated packets have only
- * pseudo-header checksum calculated
- * and libalias will screw it[1], so
- * mark them for later fix. Moreover
- * there are cases when libalias
- * modify tcp packet data[2], mark it
- * for later fix too.
*
- * [1] libalias was never meant to run
- * in kernel, so it doesn't have any
- * knowledge about checksum
- * offloading, and it expects a packet
- * with a full internet
- * checksum. Unfortunately, packets
- * generated locally will have just the
- * pseudo header calculated, and when
- * libalias tries to adjust the
- * checksum it will actually screw it.
+ * locally generated packets have only pseudo-header checksum
+ * calculated and libalias will break it[1], so mark them for
+ * later fix. Moreover there are cases when libalias modifies
+ * tcp packet data[2], mark them for later fix too.
+ *
+ * [1] libalias was never meant to run in kernel, so it does
+ * not have any knowledge about checksum offloading, and
+ * expects a packet with a full internet checksum.
+ * Unfortunately, packets generated locally will have just the
+ * pseudo header calculated, and when libalias tries to adjust
+ * the checksum it will actually compute a wrong value.
*
- * [2] when libalias modify tcp's data
- * content, full TCP checksum has to
- * be recomputed: the problem is that
- * libalias doesn't have any idea
- * about checksum offloading To
- * workaround this, we do not do
- * checksumming in LibAlias, but only
- * mark the packets in th_x2 field. If
- * we receive a marked packet, we
- * calculate correct checksum for it
- * aware of offloading. Why such a
- * terrible hack instead of
- * recalculating checksum for each
- * packet? Because the previous
- * checksum was not checked!
- * Recalculating checksums for EVERY
- * packet will hide ALL transmission
- * errors. Yes, marked packets still
- * suffer from this problem. But,
- * sigh, natd(8) has this problem,
- * too.
+ * [2] when libalias modifies tcp's data content, full TCP
+ * checksum has to be recomputed: the problem is that
+ * libalias does not have any idea about checksum offloading.
+ * To work around this, we do not do checksumming in LibAlias,
+ * but only mark the packets in th_x2 field. If we receive a
+ * marked packet, we calculate correct checksum for it
+ * aware of offloading. Why such a terrible hack instead of
+ * recalculating checksum for each packet?
+ * Because the previous checksum was not checked!
+ * Recalculating checksums for EVERY packet will hide ALL
+ * transmission errors. Yes, marked packets still suffer from
+ * this problem. But, sigh, natd(8) has this problem, too.
*
* TODO: -make libalias mbuf aware (so
* it can handle delayed checksum and tso)
*/
- if (mcl->m_pkthdr.rcvif == NULL &&
- mcl->m_pkthdr.csum_flags &
- CSUM_DELAY_DATA)
+ if (mcl->m_pkthdr.rcvif == NULL &&
+ mcl->m_pkthdr.csum_flags & CSUM_DELAY_DATA)
ldt = 1;
c = mtod(mcl, char *);
if (args->oif == NULL)
- retval = LibAliasIn(t->lib, c,
+ retval = LibAliasIn(t->lib, c,
mcl->m_len + M_TRAILINGSPACE(mcl));
else
- retval = LibAliasOut(t->lib, c,
+ retval = LibAliasOut(t->lib, c,
mcl->m_len + M_TRAILINGSPACE(mcl));
if (retval == PKT_ALIAS_RESPOND) {
- m->m_flags |= M_SKIP_FIREWALL;
- retval = PKT_ALIAS_OK;
+ m->m_flags |= M_SKIP_FIREWALL;
+ retval = PKT_ALIAS_OK;
}
if (retval != PKT_ALIAS_OK &&
retval != PKT_ALIAS_FOUND_HEADER_FRAGMENT) {
/* XXX - should i add some logging? */
m_free(mcl);
- badnat:
args->m = NULL;
return (IP_FW_DENY);
}
- mcl->m_pkthdr.len = mcl->m_len =
- ntohs(ip->ip_len);
+ mcl->m_pkthdr.len = mcl->m_len = ntohs(ip->ip_len);
- /*
- * XXX - libalias checksum offload
- * 'duct tape' (see above)
+ /*
+ * XXX - libalias checksum offload
+ * 'duct tape' (see above)
*/
- if ((ip->ip_off & htons(IP_OFFMASK)) == 0 &&
+ if ((ip->ip_off & htons(IP_OFFMASK)) == 0 &&
ip->ip_p == IPPROTO_TCP) {
- struct tcphdr *th;
+ struct tcphdr *th;
th = (struct tcphdr *)(ip + 1);
- if (th->th_x2)
+ if (th->th_x2)
ldt = 1;
}
@@ -355,82 +295,83 @@ ipfw_nat(struct ip_fw_args *args, struct cfg_nat *t, struct mbuf *m)
struct udphdr *uh;
u_short cksum;
- ip->ip_len = ntohs(ip->ip_len);
+ /* XXX check if ip_len can stay in net format */
cksum = in_pseudo(
ip->ip_src.s_addr,
- ip->ip_dst.s_addr,
- htons(ip->ip_p + ip->ip_len - (ip->ip_hl << 2))
+ ip->ip_dst.s_addr,
+ htons(ip->ip_p + ntohs(ip->ip_len) - (ip->ip_hl << 2))
);
-
+
switch (ip->ip_p) {
case IPPROTO_TCP:
th = (struct tcphdr *)(ip + 1);
- /*
- * Maybe it was set in
- * libalias...
+ /*
+ * Maybe it was set in
+ * libalias...
*/
th->th_x2 = 0;
th->th_sum = cksum;
- mcl->m_pkthdr.csum_data =
+ mcl->m_pkthdr.csum_data =
offsetof(struct tcphdr, th_sum);
break;
case IPPROTO_UDP:
uh = (struct udphdr *)(ip + 1);
uh->uh_sum = cksum;
- mcl->m_pkthdr.csum_data =
+ mcl->m_pkthdr.csum_data =
offsetof(struct udphdr, uh_sum);
- break;
+ break;
}
- /*
- * No hw checksum offloading: do it
- * by ourself.
- */
- if ((mcl->m_pkthdr.csum_flags &
- CSUM_DELAY_DATA) == 0) {
+ /* No hw checksum offloading: do it ourselves */
+ if ((mcl->m_pkthdr.csum_flags & CSUM_DELAY_DATA) == 0) {
in_delayed_cksum(mcl);
- mcl->m_pkthdr.csum_flags &=
- ~CSUM_DELAY_DATA;
+ mcl->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
}
- ip->ip_len = htons(ip->ip_len);
- }
-
- if (args->eh == NULL) {
- ip->ip_len = ntohs(ip->ip_len);
- ip->ip_off = ntohs(ip->ip_off);
}
-
args->m = mcl;
return (IP_FW_NAT);
}
-static int
+static struct cfg_nat *
+lookup_nat(struct nat_list *l, int nat_id)
+{
+ struct cfg_nat *res;
+
+ LIST_FOREACH(res, l, _next) {
+ if (res->id == nat_id)
+ break;
+ }
+ return res;
+}
+
+static int
ipfw_nat_cfg(struct sockopt *sopt)
{
struct cfg_nat *ptr, *ser_n;
char *buf;
+ struct ip_fw_chain *chain = &V_layer3_chain;
buf = malloc(NAT_BUF_LEN, M_IPFW, M_WAITOK | M_ZERO);
- sooptcopyin(sopt, buf, NAT_BUF_LEN,
- sizeof(struct cfg_nat));
+ sooptcopyin(sopt, buf, NAT_BUF_LEN, sizeof(struct cfg_nat));
ser_n = (struct cfg_nat *)buf;
- /*
+ /* check valid parameter ser_n->id > 0 ? */
+ /*
* Find/create nat rule.
*/
- IPFW_WLOCK(&V_layer3_chain);
- LOOKUP_NAT(V_layer3_chain, ser_n->id, ptr);
+ IPFW_WLOCK(chain);
+ ptr = lookup_nat(&chain->nat, ser_n->id);
if (ptr == NULL) {
/* New rule: allocate and init new instance. */
- ptr = malloc(sizeof(struct cfg_nat),
+ ptr = malloc(sizeof(struct cfg_nat),
M_IPFW, M_NOWAIT | M_ZERO);
if (ptr == NULL) {
- IPFW_WUNLOCK(&V_layer3_chain);
+ IPFW_WUNLOCK(chain);
free(buf, M_IPFW);
return (ENOSPC);
}
ptr->lib = LibAliasInit(NULL);
if (ptr->lib == NULL) {
- IPFW_WUNLOCK(&V_layer3_chain);
+ IPFW_WUNLOCK(chain);
free(ptr, M_IPFW);
free(buf, M_IPFW);
return (EINVAL);
@@ -438,18 +379,18 @@ ipfw_nat_cfg(struct sockopt *sopt)
LIST_INIT(&ptr->redir_chain);
} else {
/* Entry already present: temporarly unhook it. */
- UNHOOK_NAT(ptr);
- flush_nat_ptrs(ser_n->id);
+ LIST_REMOVE(ptr, _next);
+ flush_nat_ptrs(chain, ser_n->id);
}
- IPFW_WUNLOCK(&V_layer3_chain);
+ IPFW_WUNLOCK(chain);
- /*
+ /*
* Basic nat configuration.
*/
ptr->id = ser_n->id;
- /*
- * XXX - what if this rule doesn't nat any ip and just
- * redirect?
+ /*
+ * XXX - what if this rule doesn't nat any ip and just
+ * redirect?
* do we set aliasaddress to 0.0.0.0?
*/
ptr->ip = ser_n->ip;
@@ -459,7 +400,7 @@ ipfw_nat_cfg(struct sockopt *sopt)
LibAliasSetAddress(ptr->lib, ptr->ip);
memcpy(ptr->if_name, ser_n->if_name, IF_NAMESIZE);
- /*
+ /*
* Redir and LSNAT configuration.
*/
/* Delete old cfgs. */
@@ -467,9 +408,9 @@ ipfw_nat_cfg(struct sockopt *sopt)
/* Add new entries. */
add_redir_spool_cfg(&buf[(sizeof(struct cfg_nat))], ptr);
free(buf, M_IPFW);
- IPFW_WLOCK(&V_layer3_chain);
- HOOK_NAT(&V_layer3_chain.nat, ptr);
- IPFW_WUNLOCK(&V_layer3_chain);
+ IPFW_WLOCK(chain);
+ LIST_INSERT_HEAD(&chain->nat, ptr, _next);
+ IPFW_WUNLOCK(chain);
return (0);
}
@@ -477,18 +418,20 @@ static int
ipfw_nat_del(struct sockopt *sopt)
{
struct cfg_nat *ptr;
+ struct ip_fw_chain *chain = &V_layer3_chain;
int i;
-
+
sooptcopyin(sopt, &i, sizeof i, sizeof i);
- IPFW_WLOCK(&V_layer3_chain);
- LOOKUP_NAT(V_layer3_chain, i, ptr);
+ /* XXX validate i */
+ IPFW_WLOCK(chain);
+ ptr = lookup_nat(&chain->nat, i);
if (ptr == NULL) {
- IPFW_WUNLOCK(&V_layer3_chain);
+ IPFW_WUNLOCK(chain);
return (EINVAL);
}
- UNHOOK_NAT(ptr);
- flush_nat_ptrs(i);
- IPFW_WUNLOCK(&V_layer3_chain);
+ LIST_REMOVE(ptr, _next);
+ flush_nat_ptrs(chain, i);
+ IPFW_WUNLOCK(chain);
del_redir_spool_cfg(ptr, &ptr->redir_chain);
LibAliasUninit(ptr->lib);
free(ptr, M_IPFW);
@@ -497,56 +440,53 @@ ipfw_nat_del(struct sockopt *sopt)
static int
ipfw_nat_get_cfg(struct sockopt *sopt)
-{
+{
uint8_t *data;
struct cfg_nat *n;
struct cfg_redir *r;
struct cfg_spool *s;
int nat_cnt, off;
-
+ struct ip_fw_chain *chain;
+ int err = ENOSPC;
+
+ chain = &V_layer3_chain;
nat_cnt = 0;
off = sizeof(nat_cnt);
data = malloc(NAT_BUF_LEN, M_IPFW, M_WAITOK | M_ZERO);
- IPFW_RLOCK(&V_layer3_chain);
+ IPFW_RLOCK(chain);
/* Serialize all the data. */
- LIST_FOREACH(n, &V_layer3_chain.nat, _next) {
+ LIST_FOREACH(n, &chain->nat, _next) {
nat_cnt++;
- if (off + SOF_NAT < NAT_BUF_LEN) {
- bcopy(n, &data[off], SOF_NAT);
- off += SOF_NAT;
- LIST_FOREACH(r, &n->redir_chain, _next) {
- if (off + SOF_REDIR < NAT_BUF_LEN) {
- bcopy(r, &data[off],
- SOF_REDIR);
- off += SOF_REDIR;
- LIST_FOREACH(s, &r->spool_chain,
- _next) {
- if (off + SOF_SPOOL <
- NAT_BUF_LEN) {
- bcopy(s, &data[off],
- SOF_SPOOL);
- off += SOF_SPOOL;
- } else
- goto nospace;
- }
- } else
+ if (off + SOF_NAT >= NAT_BUF_LEN)
+ goto nospace;
+ bcopy(n, &data[off], SOF_NAT);
+ off += SOF_NAT;
+ LIST_FOREACH(r, &n->redir_chain, _next) {
+ if (off + SOF_REDIR >= NAT_BUF_LEN)
+ goto nospace;
+ bcopy(r, &data[off], SOF_REDIR);
+ off += SOF_REDIR;
+ LIST_FOREACH(s, &r->spool_chain, _next) {
+ if (off + SOF_SPOOL >= NAT_BUF_LEN)
goto nospace;
+ bcopy(s, &data[off], SOF_SPOOL);
+ off += SOF_SPOOL;
}
- } else
- goto nospace;
+ }
}
- bcopy(&nat_cnt, data, sizeof(nat_cnt));
- IPFW_RUNLOCK(&V_layer3_chain);
- sooptcopyout(sopt, data, NAT_BUF_LEN);
- free(data, M_IPFW);
- return (0);
+ err = 0; /* all good */
nospace:
- IPFW_RUNLOCK(&V_layer3_chain);
- printf("serialized data buffer not big enough:"
- "please increase NAT_BUF_LEN\n");
+ IPFW_RUNLOCK(chain);
+ if (err == 0) {
+ bcopy(&nat_cnt, data, sizeof(nat_cnt));
+ sooptcopyout(sopt, data, NAT_BUF_LEN);
+ } else {
+ printf("serialized data buffer not big enough:"
+ "please increase NAT_BUF_LEN\n");
+ }
free(data, M_IPFW);
- return (ENOSPC);
+ return (err);
}
static int
@@ -554,30 +494,35 @@ ipfw_nat_get_log(struct sockopt *sopt)
{
uint8_t *data;
struct cfg_nat *ptr;
- int i, size, cnt, sof;
+ int i, size;
+ struct ip_fw_chain *chain;
- data = NULL;
- sof = LIBALIAS_BUF_SIZE;
- cnt = 0;
+ chain = &V_layer3_chain;
- IPFW_RLOCK(&V_layer3_chain);
- size = i = 0;
- LIST_FOREACH(ptr, &V_layer3_chain.nat, _next) {
- if (ptr->lib->logDesc == NULL)
+ IPFW_RLOCK(chain);
+ /* one pass to count, one to copy the data */
+ i = 0;
+ LIST_FOREACH(ptr, &chain->nat, _next) {
+ if (ptr->lib->logDesc == NULL)
+ continue;
+ i++;
+ }
+ size = i * (LIBALIAS_BUF_SIZE + sizeof(int));
+ data = malloc(size, M_IPFW, M_NOWAIT | M_ZERO);
+ if (data == NULL) {
+ IPFW_RUNLOCK(chain);
+ return (ENOSPC);
+ }
+ i = 0;
+ LIST_FOREACH(ptr, &chain->nat, _next) {
+ if (ptr->lib->logDesc == NULL)
continue;
- cnt++;
- size = cnt * (sof + sizeof(int));
- data = realloc(data, size, M_IPFW, M_NOWAIT | M_ZERO);
- if (data == NULL) {
- IPFW_RUNLOCK(&V_layer3_chain);
- return (ENOSPC);
- }
bcopy(&ptr->id, &data[i], sizeof(int));
i += sizeof(int);
- bcopy(ptr->lib->logDesc, &data[i], sof);
- i += sof;
+ bcopy(ptr->lib->logDesc, &data[i], LIBALIAS_BUF_SIZE);
+ i += LIBALIAS_BUF_SIZE;
}
- IPFW_RUNLOCK(&V_layer3_chain);
+ IPFW_RUNLOCK(chain);
sooptcopyout(sopt, data, size);
free(data, M_IPFW);
return(0);
@@ -590,38 +535,41 @@ ipfw_nat_init(void)
IPFW_WLOCK(&V_layer3_chain);
/* init ipfw hooks */
ipfw_nat_ptr = ipfw_nat;
+ lookup_nat_ptr = lookup_nat;
ipfw_nat_cfg_ptr = ipfw_nat_cfg;
ipfw_nat_del_ptr = ipfw_nat_del;
ipfw_nat_get_cfg_ptr = ipfw_nat_get_cfg;
ipfw_nat_get_log_ptr = ipfw_nat_get_log;
IPFW_WUNLOCK(&V_layer3_chain);
- V_ifaddr_event_tag = EVENTHANDLER_REGISTER(ifaddr_event, ifaddr_change,
+ V_ifaddr_event_tag = EVENTHANDLER_REGISTER(
+ ifaddr_event, ifaddr_change,
NULL, EVENTHANDLER_PRI_ANY);
}
static void
ipfw_nat_destroy(void)
{
- struct ip_fw *rule;
struct cfg_nat *ptr, *ptr_temp;
-
- IPFW_WLOCK(&V_layer3_chain);
- LIST_FOREACH_SAFE(ptr, &V_layer3_chain.nat, _next, ptr_temp) {
+ struct ip_fw_chain *chain;
+
+ chain = &V_layer3_chain;
+ IPFW_WLOCK(chain);
+ LIST_FOREACH_SAFE(ptr, &chain->nat, _next, ptr_temp) {
LIST_REMOVE(ptr, _next);
del_redir_spool_cfg(ptr, &ptr->redir_chain);
LibAliasUninit(ptr->lib);
free(ptr, M_IPFW);
}
EVENTHANDLER_DEREGISTER(ifaddr_event, V_ifaddr_event_tag);
- /* flush all nat ptrs */
- for (rule = V_layer3_chain.rules; rule; rule = rule->next) {
- ipfw_insn_nat *cmd = (ipfw_insn_nat *)ACTION_PTR(rule);
- if (cmd->o.opcode == O_NAT)
- cmd->nat = NULL;
- }
+ flush_nat_ptrs(chain, -1 /* flush all */);
/* deregister ipfw_nat */
ipfw_nat_ptr = NULL;
- IPFW_WUNLOCK(&V_layer3_chain);
+ lookup_nat_ptr = NULL;
+ ipfw_nat_cfg_ptr = NULL;
+ ipfw_nat_del_ptr = NULL;
+ ipfw_nat_get_cfg_ptr = NULL;
+ ipfw_nat_get_log_ptr = NULL;
+ IPFW_WUNLOCK(chain);
}
static int
@@ -655,3 +603,4 @@ DECLARE_MODULE(ipfw_nat, ipfw_nat_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY
MODULE_DEPEND(ipfw_nat, libalias, 1, 1, 1);
MODULE_DEPEND(ipfw_nat, ipfw, 2, 2, 2);
MODULE_VERSION(ipfw_nat, 1);
+/* end of file */
diff --git a/sys/netinet/ipfw/ip_fw_pfil.c b/sys/netinet/ipfw/ip_fw_pfil.c
index db73084..b4e31d4 100644
--- a/sys/netinet/ipfw/ip_fw_pfil.c
+++ b/sys/netinet/ipfw/ip_fw_pfil.c
@@ -46,9 +46,7 @@ __FBSDID("$FreeBSD$");
#include <sys/lock.h>
#include <sys/rwlock.h>
#include <sys/socket.h>
-#include <sys/socketvar.h>
#include <sys/sysctl.h>
-#include <sys/ucred.h>
#include <net/if.h>
#include <net/route.h>
@@ -60,457 +58,309 @@ __FBSDID("$FreeBSD$");
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/ip_fw.h>
-#include <netinet/ip_divert.h>
-#include <netinet/ip_dummynet.h>
-
+#include <netinet/ipfw/ip_fw_private.h>
#include <netgraph/ng_ipfw.h>
#include <machine/in_cksum.h>
-VNET_DEFINE(int, fw_enable) = 1;
+static VNET_DEFINE(int, fw_enable) = 1;
+#define V_fw_enable VNET(fw_enable)
+
#ifdef INET6
-VNET_DEFINE(int, fw6_enable) = 1;
+static VNET_DEFINE(int, fw6_enable) = 1;
+#define V_fw6_enable VNET(fw6_enable)
#endif
int ipfw_chg_hook(SYSCTL_HANDLER_ARGS);
-/* Divert hooks. */
-ip_divert_packet_t *ip_divert_ptr = NULL;
-
-/* ng_ipfw hooks. */
-ng_ipfw_input_t *ng_ipfw_input_p = NULL;
-
/* Forward declarations. */
-static int ipfw_divert(struct mbuf **, int, int);
-#define DIV_DIR_IN 1
-#define DIV_DIR_OUT 0
-
-int
-ipfw_check_in(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir,
- struct inpcb *inp)
-{
- struct ip_fw_args args;
- struct ng_ipfw_tag *ng_tag;
- struct m_tag *dn_tag;
- int ipfw = 0;
- int divert;
- int tee;
-#ifdef IPFIREWALL_FORWARD
- struct m_tag *fwd_tag;
-#endif
-
- KASSERT(dir == PFIL_IN, ("ipfw_check_in wrong direction!"));
-
- bzero(&args, sizeof(args));
-
- ng_tag = (struct ng_ipfw_tag *)m_tag_locate(*m0, NGM_IPFW_COOKIE, 0,
- NULL);
- if (ng_tag != NULL) {
- KASSERT(ng_tag->dir == NG_IPFW_IN,
- ("ng_ipfw tag with wrong direction"));
- args.rule = ng_tag->rule;
- args.rule_id = ng_tag->rule_id;
- args.chain_id = ng_tag->chain_id;
- m_tag_delete(*m0, (struct m_tag *)ng_tag);
- }
-
-again:
- dn_tag = m_tag_find(*m0, PACKET_TAG_DUMMYNET, NULL);
- if (dn_tag != NULL){
- struct dn_pkt_tag *dt;
-
- dt = (struct dn_pkt_tag *)(dn_tag+1);
- args.rule = dt->rule;
- args.rule_id = dt->rule_id;
- args.chain_id = dt->chain_id;
-
- m_tag_delete(*m0, dn_tag);
- }
+static int ipfw_divert(struct mbuf **, int, struct ipfw_rule_ref *, int);
- args.m = *m0;
- args.inp = inp;
- tee = 0;
-
- if (V_fw_one_pass == 0 || args.rule == NULL) {
- ipfw = ipfw_chk(&args);
- *m0 = args.m;
- } else
- ipfw = IP_FW_PASS;
-
- KASSERT(*m0 != NULL || ipfw == IP_FW_DENY, ("%s: m0 is NULL",
- __func__));
+#ifdef SYSCTL_NODE
- switch (ipfw) {
- case IP_FW_PASS:
- if (args.next_hop == NULL)
- goto pass;
+SYSBEGIN(f1)
-#ifdef IPFIREWALL_FORWARD
- fwd_tag = m_tag_get(PACKET_TAG_IPFORWARD,
- sizeof(struct sockaddr_in), M_NOWAIT);
- if (fwd_tag == NULL)
- goto drop;
- bcopy(args.next_hop, (fwd_tag+1), sizeof(struct sockaddr_in));
- m_tag_prepend(*m0, fwd_tag);
-
- if (in_localip(args.next_hop->sin_addr))
- (*m0)->m_flags |= M_FASTFWD_OURS;
- goto pass;
-#endif
- break; /* not reached */
-
- case IP_FW_DENY:
- goto drop;
- break; /* not reached */
-
- case IP_FW_DUMMYNET:
- if (ip_dn_io_ptr == NULL)
- goto drop;
- if (mtod(*m0, struct ip *)->ip_v == 4)
- ip_dn_io_ptr(m0, DN_TO_IP_IN, &args);
- else if (mtod(*m0, struct ip *)->ip_v == 6)
- ip_dn_io_ptr(m0, DN_TO_IP6_IN, &args);
- if (*m0 != NULL)
- goto again;
- return 0; /* packet consumed */
-
- case IP_FW_TEE:
- tee = 1;
- /* fall through */
-
- case IP_FW_DIVERT:
- divert = ipfw_divert(m0, DIV_DIR_IN, tee);
- if (divert) {
- *m0 = NULL;
- return 0; /* packet consumed */
- } else {
- args.rule = NULL;
- goto again; /* continue with packet */
- }
-
- case IP_FW_NGTEE:
- if (!NG_IPFW_LOADED)
- goto drop;
- (void)ng_ipfw_input_p(m0, NG_IPFW_IN, &args, 1);
- goto again; /* continue with packet */
-
- case IP_FW_NETGRAPH:
- if (!NG_IPFW_LOADED)
- goto drop;
- return ng_ipfw_input_p(m0, NG_IPFW_IN, &args, 0);
-
- case IP_FW_NAT:
- goto again; /* continue with packet */
-
- case IP_FW_REASS:
- goto again;
+SYSCTL_DECL(_net_inet_ip_fw);
+SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, enable,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_enable), 0,
+ ipfw_chg_hook, "I", "Enable ipfw");
+#ifdef INET6
+SYSCTL_DECL(_net_inet6_ip6_fw);
+SYSCTL_VNET_PROC(_net_inet6_ip6_fw, OID_AUTO, enable,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw6_enable), 0,
+ ipfw_chg_hook, "I", "Enable ipfw+6");
+#endif /* INET6 */
- default:
- KASSERT(0, ("%s: unknown retval", __func__));
- }
+SYSEND
-drop:
- if (*m0)
- m_freem(*m0);
- *m0 = NULL;
- return (EACCES);
-pass:
- return 0; /* not filtered */
-}
+#endif /* SYSCTL_NODE */
+/*
+ * The pfilter hook to pass packets to ipfw_chk and then to
+ * dummynet, divert, netgraph or other modules.
+ * The packet may be consumed.
+ *
+ * dir is the pfil direction: PFIL_IN is mapped to DIR_IN, anything
+ * else to DIR_OUT. ifp is handed to ipfw_chk as the output interface
+ * only for outgoing packets.
+ * Returns 0 if the packet passes or was consumed by a sub-hook (in
+ * which case *m0 may be NULL). On a non-zero return the packet, if
+ * still present, has been freed and *m0 set to NULL.
+ * IPv4 packets are processed with ip_len in net format and converted
+ * back on every exit path; packets of other versions are never converted.
+ */
int
-ipfw_check_out(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir,
+ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir,
struct inpcb *inp)
{
struct ip_fw_args args;
- struct ng_ipfw_tag *ng_tag;
- struct m_tag *dn_tag;
- int ipfw = 0;
- int divert;
- int tee;
-#ifdef IPFIREWALL_FORWARD
- struct m_tag *fwd_tag;
-#endif
+ struct m_tag *tag;
+ int ipfw;
+ int ret;
- KASSERT(dir == PFIL_OUT, ("ipfw_check_out wrong direction!"));
+ /* all the processing now uses ip_len in net format */
+ if (mtod(*m0, struct ip *)->ip_v == 4)
+ SET_NET_IPLEN(mtod(*m0, struct ip *));
+ /* convert dir to IPFW values */
+ dir = (dir == PFIL_IN) ? DIR_IN : DIR_OUT;
bzero(&args, sizeof(args));
- ng_tag = (struct ng_ipfw_tag *)m_tag_locate(*m0, NGM_IPFW_COOKIE, 0,
- NULL);
- if (ng_tag != NULL) {
- KASSERT(ng_tag->dir == NG_IPFW_OUT,
- ("ng_ipfw tag with wrong direction"));
- args.rule = ng_tag->rule;
- args.rule_id = ng_tag->rule_id;
- args.chain_id = ng_tag->chain_id;
- m_tag_delete(*m0, (struct m_tag *)ng_tag);
- }
-
again:
- dn_tag = m_tag_find(*m0, PACKET_TAG_DUMMYNET, NULL);
- if (dn_tag != NULL) {
- struct dn_pkt_tag *dt;
-
- dt = (struct dn_pkt_tag *)(dn_tag+1);
- args.rule = dt->rule;
- args.rule_id = dt->rule_id;
- args.chain_id = dt->chain_id;
-
- m_tag_delete(*m0, dn_tag);
+ /*
+ * extract and remove the tag if present. If we are left
+ * with onepass, optimize the outgoing path.
+ */
+ tag = m_tag_locate(*m0, MTAG_IPFW_RULE, 0, NULL);
+ if (tag != NULL) {
+ args.rule = *((struct ipfw_rule_ref *)(tag+1));
+ m_tag_delete(*m0, tag);
+ if (args.rule.info & IPFW_ONEPASS) {
+ /*
+ * Undo the conversion done on entry. Only IPv4
+ * headers were converted, so only those may be
+ * converted back (same guard as the normal exit).
+ */
+ if (mtod(*m0, struct ip *)->ip_v == 4)
+ SET_HOST_IPLEN(mtod(*m0, struct ip *));
+ return 0;
+ }
}
args.m = *m0;
- args.oif = ifp;
+ args.oif = dir == DIR_OUT ? ifp : NULL;
args.inp = inp;
- tee = 0;
- if (V_fw_one_pass == 0 || args.rule == NULL) {
- ipfw = ipfw_chk(&args);
- *m0 = args.m;
- } else
- ipfw = IP_FW_PASS;
+ ipfw = ipfw_chk(&args);
+ *m0 = args.m;
KASSERT(*m0 != NULL || ipfw == IP_FW_DENY, ("%s: m0 is NULL",
__func__));
+ /* breaking out of the switch with ret != 0 means drop */
+ ret = 0; /* default return value for pass */
switch (ipfw) {
case IP_FW_PASS:
+ /* next_hop may be set by ipfw_chk */
if (args.next_hop == NULL)
- goto pass;
-#ifdef IPFIREWALL_FORWARD
- /* Overwrite existing tag. */
- fwd_tag = m_tag_find(*m0, PACKET_TAG_IPFORWARD, NULL);
- if (fwd_tag == NULL) {
+ break; /* pass */
+#ifndef IPFIREWALL_FORWARD
+ ret = EACCES;
+#else
+ {
+ struct m_tag *fwd_tag;
+
+ /* Incoming packets should not be tagged so we do not
+ * m_tag_find. Outgoing packets may be tagged, so we
+ * reuse the tag if present.
+ */
+ fwd_tag = (dir == DIR_IN) ? NULL :
+ m_tag_find(*m0, PACKET_TAG_IPFORWARD, NULL);
+ if (fwd_tag != NULL) {
+ m_tag_unlink(*m0, fwd_tag);
+ } else {
fwd_tag = m_tag_get(PACKET_TAG_IPFORWARD,
sizeof(struct sockaddr_in), M_NOWAIT);
- if (fwd_tag == NULL)
- goto drop;
- } else
- m_tag_unlink(*m0, fwd_tag);
+ if (fwd_tag == NULL) {
+ ret = EACCES;
+ break; /* i.e. drop */
+ }
+ }
bcopy(args.next_hop, (fwd_tag+1), sizeof(struct sockaddr_in));
m_tag_prepend(*m0, fwd_tag);
if (in_localip(args.next_hop->sin_addr))
(*m0)->m_flags |= M_FASTFWD_OURS;
- goto pass;
+ }
#endif
- break; /* not reached */
+ break;
case IP_FW_DENY:
- goto drop;
- break; /* not reached */
+ ret = EACCES;
+ break; /* i.e. drop */
case IP_FW_DUMMYNET:
+ ret = EACCES;
if (ip_dn_io_ptr == NULL)
- break;
+ break; /* i.e. drop */
if (mtod(*m0, struct ip *)->ip_v == 4)
- ip_dn_io_ptr(m0, DN_TO_IP_OUT, &args);
+ ret = ip_dn_io_ptr(m0, dir, &args);
else if (mtod(*m0, struct ip *)->ip_v == 6)
- ip_dn_io_ptr(m0, DN_TO_IP6_OUT, &args);
+ ret = ip_dn_io_ptr(m0, dir | PROTO_IPV6, &args);
+ else
+ break; /* drop it */
+ /*
+ * XXX should read the return value.
+ * dummynet normally eats the packet and sets *m0=NULL
+ * unless the packet can be sent immediately. In this
+ * case args is updated and we should re-run the
+ * check without clearing args.
+ */
if (*m0 != NULL)
goto again;
- return 0; /* packet consumed */
-
break;
case IP_FW_TEE:
- tee = 1;
- /* fall through */
-
case IP_FW_DIVERT:
- divert = ipfw_divert(m0, DIV_DIR_OUT, tee);
- if (divert) {
- *m0 = NULL;
- return 0; /* packet consumed */
- } else {
- args.rule = NULL;
- goto again; /* continue with packet */
+ if (ip_divert_ptr == NULL) {
+ ret = EACCES;
+ break; /* i.e. drop */
}
+ ret = ipfw_divert(m0, dir, &args.rule,
+ (ipfw == IP_FW_TEE) ? 1 : 0);
+ /* continue processing for the original packet (tee). */
+ if (*m0)
+ goto again;
+ break;
case IP_FW_NGTEE:
- if (!NG_IPFW_LOADED)
- goto drop;
- (void)ng_ipfw_input_p(m0, NG_IPFW_OUT, &args, 1);
- goto again; /* continue with packet */
-
case IP_FW_NETGRAPH:
- if (!NG_IPFW_LOADED)
- goto drop;
- return ng_ipfw_input_p(m0, NG_IPFW_OUT, &args, 0);
+ if (ng_ipfw_input_p == NULL) {
+ ret = EACCES;
+ break; /* i.e. drop */
+ }
+ ret = ng_ipfw_input_p(m0, dir, &args,
+ (ipfw == IP_FW_NGTEE) ? 1 : 0);
+ if (ipfw == IP_FW_NGTEE) /* ignore errors for NGTEE */
+ goto again; /* continue with packet */
+ break;
case IP_FW_NAT:
- goto again; /* continue with packet */
-
case IP_FW_REASS:
- goto again;
+ goto again; /* continue with packet */
default:
KASSERT(0, ("%s: unknown retval", __func__));
}
-drop:
- if (*m0)
- m_freem(*m0);
- *m0 = NULL;
- return (EACCES);
-pass:
- return 0; /* not filtered */
+ if (ret != 0) {
+ if (*m0)
+ FREE_PKT(*m0);
+ *m0 = NULL;
+ }
+ if (*m0 && mtod(*m0, struct ip *)->ip_v == 4)
+ SET_HOST_IPLEN(mtod(*m0, struct ip *));
+ return ret;
}
+/* do the divert, return 1 on error 0 on success */
static int
-ipfw_divert(struct mbuf **m, int incoming, int tee)
+ipfw_divert(struct mbuf **m0, int incoming, struct ipfw_rule_ref *rule,
+ int tee)
{
/*
* ipfw_chk() has already tagged the packet with the divert tag.
* If tee is set, copy packet and return original.
* If not tee, consume packet and send it to divert socket.
*/
- struct mbuf *clone, *reass;
+ struct mbuf *clone;
struct ip *ip;
- int hlen;
-
- reass = NULL;
-
- /* Is divert module loaded? */
- if (ip_divert_ptr == NULL)
- goto nodivert;
+ struct m_tag *tag;
/* Cloning needed for tee? */
- if (tee)
- clone = m_dup(*m, M_DONTWAIT);
- else
- clone = *m;
-
- /* In case m_dup was unable to allocate mbufs. */
- if (clone == NULL)
- goto teeout;
+ if (tee == 0) {
+ clone = *m0; /* use the original mbuf */
+ *m0 = NULL;
+ } else {
+ clone = m_dup(*m0, M_DONTWAIT);
+ /* If we cannot duplicate the mbuf, we sacrifice the divert
+ * chain and continue with the tee-ed packet.
+ */
+ if (clone == NULL)
+ return 1;
+ }
/*
- * Divert listeners can only handle non-fragmented packets.
- * However when tee is set we will *not* de-fragment the packets;
- * Doing do would put the reassembly into double-jeopardy. On top
- * of that someone doing a tee will probably want to get the packet
- * in its original form.
+ * Divert listeners can normally handle non-fragmented packets,
+ * but we can only reass in the non-tee case.
+ * This means that listeners on a tee rule may get fragments,
+ * and have to live with that.
+ * Note that we now have the 'reass' ipfw option so if we care
+ * we can do it before a 'tee'.
*/
ip = mtod(clone, struct ip *);
- if (!tee && ip->ip_off & (IP_MF | IP_OFFMASK)) {
-
- /* Reassemble packet. */
- reass = ip_reass(clone);
-
+ if (!tee && ntohs(ip->ip_off) & (IP_MF | IP_OFFMASK)) {
+ int hlen;
+ struct mbuf *reass;
+
+ SET_HOST_IPLEN(ip); /* ip_reass wants host order */
+ reass = ip_reass(clone); /* Reassemble packet. */
+ if (reass == NULL)
+ return 0; /* not an error */
+ /* if reass = NULL then it was consumed by ip_reass */
/*
* IP header checksum fixup after reassembly and leave header
* in network byte order.
*/
- if (reass != NULL) {
- ip = mtod(reass, struct ip *);
- hlen = ip->ip_hl << 2;
- ip->ip_len = htons(ip->ip_len);
- ip->ip_off = htons(ip->ip_off);
- ip->ip_sum = 0;
- if (hlen == sizeof(struct ip))
- ip->ip_sum = in_cksum_hdr(ip);
- else
- ip->ip_sum = in_cksum(reass, hlen);
- clone = reass;
- } else
- clone = NULL;
- } else {
- /* Convert header to network byte order. */
- ip->ip_len = htons(ip->ip_len);
- ip->ip_off = htons(ip->ip_off);
+ ip = mtod(reass, struct ip *);
+ hlen = ip->ip_hl << 2;
+ SET_NET_IPLEN(ip);
+ ip->ip_sum = 0;
+ if (hlen == sizeof(struct ip))
+ ip->ip_sum = in_cksum_hdr(ip);
+ else
+ ip->ip_sum = in_cksum(reass, hlen);
+ clone = reass;
+ }
+ /* attach a tag to the packet with the reinject info */
+ tag = m_tag_alloc(MTAG_IPFW_RULE, 0,
+ sizeof(struct ipfw_rule_ref), M_NOWAIT);
+ if (tag == NULL) {
+ FREE_PKT(clone);
+ return 1;
}
+ *((struct ipfw_rule_ref *)(tag+1)) = *rule;
+ m_tag_prepend(clone, tag);
/* Do the dirty job... */
- if (clone && ip_divert_ptr != NULL)
- ip_divert_ptr(clone, incoming);
-
-teeout:
- /*
- * For tee we leave the divert tag attached to original packet.
- * It will then continue rule evaluation after the tee rule.
- */
- if (tee)
- return 0;
-
- /* Packet diverted and consumed */
- return 1;
-
-nodivert:
- m_freem(*m);
- return 1;
-}
-
-int
-ipfw_hook(void)
-{
- struct pfil_head *pfh_inet;
-
- pfh_inet = pfil_head_get(PFIL_TYPE_AF, AF_INET);
- if (pfh_inet == NULL)
- return ENOENT;
-
- (void)pfil_add_hook(ipfw_check_in, NULL, PFIL_IN | PFIL_WAITOK,
- pfh_inet);
- (void)pfil_add_hook(ipfw_check_out, NULL, PFIL_OUT | PFIL_WAITOK,
- pfh_inet);
-
+ ip_divert_ptr(clone, incoming);
return 0;
}
-int
-ipfw_unhook(void)
-{
- struct pfil_head *pfh_inet;
-
- pfh_inet = pfil_head_get(PFIL_TYPE_AF, AF_INET);
- if (pfh_inet == NULL)
- return ENOENT;
-
- (void)pfil_remove_hook(ipfw_check_in, NULL, PFIL_IN | PFIL_WAITOK,
- pfh_inet);
- (void)pfil_remove_hook(ipfw_check_out, NULL, PFIL_OUT | PFIL_WAITOK,
- pfh_inet);
-
- return 0;
-}
-
-#ifdef INET6
-int
-ipfw6_hook(void)
+/*
+ * Attach (onoff != 0) or detach (onoff == 0) ipfw_check_hook, for both
+ * directions, on the pfil head of protocol family pf.
+ * Returns ENOENT if pfil_head_get() finds no head for pf, 0 otherwise;
+ * the result of pfil_add_hook()/pfil_remove_hook() is discarded.
+ */
+static int
+ipfw_hook(int onoff, int pf)
{
- struct pfil_head *pfh_inet6;
+ struct pfil_head *pfh;
- pfh_inet6 = pfil_head_get(PFIL_TYPE_AF, AF_INET6);
- if (pfh_inet6 == NULL)
+ pfh = pfil_head_get(PFIL_TYPE_AF, pf);
+ if (pfh == NULL)
return ENOENT;
- (void)pfil_add_hook(ipfw_check_in, NULL, PFIL_IN | PFIL_WAITOK,
- pfh_inet6);
- (void)pfil_add_hook(ipfw_check_out, NULL, PFIL_OUT | PFIL_WAITOK,
- pfh_inet6);
+ (void) (onoff ? pfil_add_hook : pfil_remove_hook)
+ (ipfw_check_hook, NULL, PFIL_IN | PFIL_OUT | PFIL_WAITOK, pfh);
return 0;
}
int
-ipfw6_unhook(void)
+ipfw_attach_hooks(int arg)
{
- struct pfil_head *pfh_inet6;
-
- pfh_inet6 = pfil_head_get(PFIL_TYPE_AF, AF_INET6);
- if (pfh_inet6 == NULL)
- return ENOENT;
-
- (void)pfil_remove_hook(ipfw_check_in, NULL, PFIL_IN | PFIL_WAITOK,
- pfh_inet6);
- (void)pfil_remove_hook(ipfw_check_out, NULL, PFIL_OUT | PFIL_WAITOK,
- pfh_inet6);
-
- return 0;
+ int error = 0; /* stays 0 on detach; ENOENT if an enabled family fails to attach */
+
+ if (arg == 0) /* detach; the result of ipfw_hook() is ignored */
+ ipfw_hook(0, AF_INET);
+ else if (V_fw_enable && ipfw_hook(1, AF_INET) != 0) {
+ error = ENOENT; /* see ip_fw_pfil.c::ipfw_hook() */
+ printf("ipfw_hook() error\n");
+ }
+#ifdef INET6
+ if (arg == 0) /* detach; the result of ipfw_hook() is ignored */
+ ipfw_hook(0, AF_INET6);
+ else if (V_fw6_enable && ipfw_hook(1, AF_INET6) != 0) {
+ error = ENOENT; /* an AF_INET failure above is reported the same way */
+ printf("ipfw6_hook() error\n");
+ }
+#endif
+ return error;
}
-#endif /* INET6 */
int
ipfw_chg_hook(SYSCTL_HANDLER_ARGS)
@@ -518,13 +368,16 @@ ipfw_chg_hook(SYSCTL_HANDLER_ARGS)
int enable;
int oldenable;
int error;
+ int af;
if (arg1 == &VNET_NAME(fw_enable)) {
enable = V_fw_enable;
+ af = AF_INET;
}
#ifdef INET6
else if (arg1 == &VNET_NAME(fw6_enable)) {
enable = V_fw6_enable;
+ af = AF_INET6;
}
#endif
else
@@ -542,27 +395,16 @@ ipfw_chg_hook(SYSCTL_HANDLER_ARGS)
if (enable == oldenable)
return (0);
- if (arg1 == &VNET_NAME(fw_enable)) {
- if (enable)
- error = ipfw_hook();
- else
- error = ipfw_unhook();
- if (error)
- return (error);
+ error = ipfw_hook(enable, af);
+ if (error)
+ return (error);
+ if (af == AF_INET)
V_fw_enable = enable;
- }
#ifdef INET6
- else if (arg1 == &VNET_NAME(fw6_enable)) {
- if (enable)
- error = ipfw6_hook();
- else
- error = ipfw6_unhook();
- if (error)
- return (error);
+ else if (af == AF_INET6)
V_fw6_enable = enable;
- }
#endif
return (0);
}
-
+/* end of file */
diff --git a/sys/netinet/ipfw/ip_fw_private.h b/sys/netinet/ipfw/ip_fw_private.h
new file mode 100644
index 0000000..c29ae0a
--- /dev/null
+++ b/sys/netinet/ipfw/ip_fw_private.h
@@ -0,0 +1,301 @@
+/*-
+ * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IPFW2_PRIVATE_H
+#define _IPFW2_PRIVATE_H
+
+/*
+ * Internal constants and data structures used by ipfw components
+ * and not meant to be exported outside the kernel.
+ */
+
+#ifdef _KERNEL
+
+/*
+ * For platforms that do not have SYSCTL support, we wrap the
+ * SYSCTL_* into a function (one per file) to collect the values
+ * into an array at module initialization. The wrapping macros,
+ * SYSBEGIN() and SYSEND, are empty in the default case.
+ */
+#ifndef SYSBEGIN
+#define SYSBEGIN(x)
+#endif
+#ifndef SYSEND
+#define SYSEND
+#endif
+
+/* Return values from ipfw_chk() */
+enum {
+ IP_FW_PASS = 0,
+ IP_FW_DENY,
+ IP_FW_DIVERT,
+ IP_FW_TEE,
+ IP_FW_DUMMYNET,
+ IP_FW_NETGRAPH,
+ IP_FW_NGTEE,
+ IP_FW_NAT,
+ IP_FW_REASS,
+};
+
+/*
+ * Structure for collecting parameters to dummynet for ip6_output forwarding
+ */
+struct _ip6dn_args {
+ struct ip6_pktopts *opt_or;
+ struct route_in6 ro_or;
+ int flags_or;
+ struct ip6_moptions *im6o_or;
+ struct ifnet *origifp_or;
+ struct ifnet *ifp_or;
+ struct sockaddr_in6 dst_or;
+ u_long mtu_or;
+ struct route_in6 ro_pmtu_or;
+};
+
+
+/*
+ * Arguments for calling ipfw_chk() and dummynet_io(). We put them
+ * all into a structure because this way it is easier and more
+ * efficient to pass variables around and extend the interface.
+ */
+struct ip_fw_args {
+ struct mbuf *m; /* the mbuf chain */
+ struct ifnet *oif; /* output interface; ipfw_check_hook() leaves it NULL for inbound packets */
+ struct sockaddr_in *next_hop; /* forward address */
+
+ /*
+ * On return, it points to the matching rule.
+ * On entry, rule.slot > 0 means the info is valid and
+ * contains the starting rule for an ipfw search.
+ * If chain_id == chain->id && slot > 0 then jump to that slot.
+ * Otherwise, we locate the first rule >= rulenum:rule_id
+ */
+ struct ipfw_rule_ref rule; /* match/restart info */
+
+ struct ether_header *eh; /* for bridged packets */
+
+ struct ipfw_flow_id f_id; /* grabbed from IP header */
+ //uint32_t cookie; /* a cookie depending on rule action */
+ struct inpcb *inp; /* ipfw_check_hook() stores the pfil hook's inp argument here */
+
+ struct _ip6dn_args dummypar; /* dummynet->ip6_output */
+ struct sockaddr_in hopstore; /* store here if cannot use a pointer */
+};
+
+MALLOC_DECLARE(M_IPFW);
+
+/*
+ * Hooks sometimes need to know the direction of the packet
+ * (divert, dummynet, netgraph, ...)
+ * We use a generic definition here, with bits 0-1 indicating the
+ * direction, bit 2 indicating layer 2 (set) or layer 3 (clear), and
+ * bits 3-4 indicating the specific protocol (if necessary).
+ * Note that PROTO_IFB is PROTO_LAYER2 | 0x08, i.e. it overlaps PROTO_IPV4.
+ */
+enum {
+ DIR_MASK = 0x3, /* bits 0-1: direction */
+ DIR_OUT = 0,
+ DIR_IN = 1,
+ DIR_FWD = 2,
+ DIR_DROP = 3,
+ PROTO_LAYER2 = 0x4, /* set for layer 2 */
+ /* PROTO_DEFAULT = 0, */
+ PROTO_IPV4 = 0x08,
+ PROTO_IPV6 = 0x10, /* or-ed into dir by ipfw_check_hook() when handing IPv6 to dummynet */
+ PROTO_IFB = 0x0c, /* layer2 + ifbridge */
+ /* PROTO_OLDBDG = 0x14, unused, old bridge */
+};
+
+/* wrapper for freeing a packet, in case we need to do more work */
+#ifndef FREE_PKT
+#if defined(__linux__) || defined(_WIN32)
+#define FREE_PKT(m) netisr_dispatch(-1, m)
+#else
+#define FREE_PKT(m) m_freem(m)
+#endif
+#endif /* !FREE_PKT */
+
+/*
+ * Function definitions.
+ */
+
+/* attach (arg = 1) or detach (arg = 0) hooks */
+int ipfw_attach_hooks(int);
+#ifdef NOTYET
+void ipfw_nat_destroy(void);
+#endif
+
+/* In ip_fw_log.c */
+struct ip;
+void ipfw_log_bpf(int);
+void ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args,
+ struct mbuf *m, struct ifnet *oif, u_short offset, uint32_t tablearg,
+ struct ip *ip);
+VNET_DECLARE(u_int64_t, norule_counter);
+#define V_norule_counter VNET(norule_counter)
+VNET_DECLARE(int, verbose_limit);
+#define V_verbose_limit VNET(verbose_limit)
+
+/* In ip_fw_dynamic.c */
+
+enum { /* result for matching dynamic rules */
+ MATCH_REVERSE = 0,
+ MATCH_FORWARD,
+ MATCH_NONE,
+ MATCH_UNKNOWN,
+};
+
+/*
+ * The lock for dynamic rules is only used once outside the file,
+ * and only to release the result of lookup_dyn_rule().
+ * Eventually we may implement it with a callback on the function.
+ */
+void ipfw_dyn_unlock(void);
+
+struct tcphdr;
+struct mbuf *ipfw_send_pkt(struct mbuf *, struct ipfw_flow_id *,
+ u_int32_t, u_int32_t, int);
+int ipfw_install_state(struct ip_fw *rule, ipfw_insn_limit *cmd,
+ struct ip_fw_args *args, uint32_t tablearg);
+ipfw_dyn_rule *ipfw_lookup_dyn_rule(struct ipfw_flow_id *pkt,
+ int *match_direction, struct tcphdr *tcp);
+void ipfw_remove_dyn_children(struct ip_fw *rule);
+void ipfw_get_dynamic(char **bp, const char *ep);
+
+void ipfw_dyn_attach(void); /* uma_zcreate .... */
+void ipfw_dyn_detach(void); /* uma_zdestroy ... */
+void ipfw_dyn_init(void); /* per-vnet initialization */
+void ipfw_dyn_uninit(int); /* per-vnet deinitialization */
+int ipfw_dyn_len(void);
+
+/* common variables */
+VNET_DECLARE(int, fw_one_pass);
+#define V_fw_one_pass VNET(fw_one_pass)
+
+VNET_DECLARE(int, fw_verbose);
+#define V_fw_verbose VNET(fw_verbose)
+
+VNET_DECLARE(struct ip_fw_chain, layer3_chain);
+#define V_layer3_chain VNET(layer3_chain)
+
+VNET_DECLARE(u_int32_t, set_disable);
+#define V_set_disable VNET(set_disable)
+
+VNET_DECLARE(int, autoinc_step);
+#define V_autoinc_step VNET(autoinc_step)
+
+struct ip_fw_chain {
+ struct ip_fw *rules; /* list of rules */
+ struct ip_fw *reap; /* list of rules to reap */
+ struct ip_fw *default_rule;
+ int n_rules; /* number of static rules */
+ int static_len; /* total len of static rules */
+ struct ip_fw **map; /* array of rule ptrs to ease lookup */
+ LIST_HEAD(nat_list, cfg_nat) nat; /* list of nat entries */
+ struct radix_node_head *tables[IPFW_TABLES_MAX];
+#if defined( __linux__ ) || defined( _WIN32 )
+ spinlock_t rwmtx;
+ spinlock_t uh_lock;
+#else
+ struct rwlock rwmtx;
+ struct rwlock uh_lock; /* lock for upper half */
+#endif
+ uint32_t id; /* ruleset id */
+};
+
+struct sockopt; /* used by tcp_var.h */
+
+/*
+ * The lock is heavily used by ip_fw2.c (the main file) and ip_fw_nat.c
+ * so the variable and the macros must be here.
+ */
+
+#define IPFW_LOCK_INIT(_chain) do { \
+ rw_init(&(_chain)->rwmtx, "IPFW static rules"); \
+ rw_init(&(_chain)->uh_lock, "IPFW UH lock"); \
+ } while (0)
+
+#define IPFW_LOCK_DESTROY(_chain) do { \
+ rw_destroy(&(_chain)->rwmtx); \
+ rw_destroy(&(_chain)->uh_lock); \
+ } while (0)
+
+#define IPFW_WLOCK_ASSERT(_chain) rw_assert(&(_chain)->rwmtx, RA_WLOCKED)
+
+#define IPFW_RLOCK(p) rw_rlock(&(p)->rwmtx)
+#define IPFW_RUNLOCK(p) rw_runlock(&(p)->rwmtx)
+#define IPFW_WLOCK(p) rw_wlock(&(p)->rwmtx)
+#define IPFW_WUNLOCK(p) rw_wunlock(&(p)->rwmtx)
+
+#define IPFW_UH_RLOCK(p) rw_rlock(&(p)->uh_lock)
+#define IPFW_UH_RUNLOCK(p) rw_runlock(&(p)->uh_lock)
+#define IPFW_UH_WLOCK(p) rw_wlock(&(p)->uh_lock)
+#define IPFW_UH_WUNLOCK(p) rw_wunlock(&(p)->uh_lock)
+
+/* In ip_fw_sockopt.c */
+int ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id);
+int ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule);
+int ipfw_ctl(struct sockopt *sopt);
+int ipfw_chk(struct ip_fw_args *args);
+void ipfw_reap_rules(struct ip_fw *head);
+
+/* In ip_fw_pfil */
+int ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir,
+ struct inpcb *inp);
+
+/* In ip_fw_table.c */
+struct radix_node;
+int ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+ uint32_t *val);
+int ipfw_init_tables(struct ip_fw_chain *ch);
+void ipfw_destroy_tables(struct ip_fw_chain *ch);
+int ipfw_flush_table(struct ip_fw_chain *ch, uint16_t tbl);
+int ipfw_add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+ uint8_t mlen, uint32_t value);
+int ipfw_dump_table_entry(struct radix_node *rn, void *arg);
+int ipfw_del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+ uint8_t mlen);
+int ipfw_count_table(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt);
+int ipfw_dump_table(struct ip_fw_chain *ch, ipfw_table *tbl);
+
+/* In ip_fw_nat.c -- XXX to be moved to ip_var.h */
+
+extern struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int);
+
+typedef int ipfw_nat_t(struct ip_fw_args *, struct cfg_nat *, struct mbuf *);
+typedef int ipfw_nat_cfg_t(struct sockopt *);
+
+extern ipfw_nat_t *ipfw_nat_ptr;
+#define IPFW_NAT_LOADED (ipfw_nat_ptr != NULL)
+
+extern ipfw_nat_cfg_t *ipfw_nat_cfg_ptr;
+extern ipfw_nat_cfg_t *ipfw_nat_del_ptr;
+extern ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr;
+extern ipfw_nat_cfg_t *ipfw_nat_get_log_ptr;
+
+#endif /* _KERNEL */
+#endif /* _IPFW2_PRIVATE_H */
diff --git a/sys/netinet/ipfw/ip_fw_sockopt.c b/sys/netinet/ipfw/ip_fw_sockopt.c
new file mode 100644
index 0000000..e25b960
--- /dev/null
+++ b/sys/netinet/ipfw/ip_fw_sockopt.c
@@ -0,0 +1,1287 @@
+/*-
+ * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
+ *
+ * Supported by: Valeria Paoli
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ * Sockopt support for ipfw. The routines here implement
+ * the upper half of the ipfw code.
+ */
+
+#if !defined(KLD_MODULE)
+#include "opt_ipfw.h"
+#include "opt_ipdivert.h"
+#include "opt_ipdn.h"
+#include "opt_inet.h"
+#ifndef INET
+#error IPFIREWALL requires INET.
+#endif /* INET */
+#endif
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h> /* struct m_tag used by nested headers */
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <net/if.h>
+#include <net/route.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/ip_var.h> /* hooks */
+#include <netinet/ip_fw.h>
+#include <netinet/ipfw/ip_fw_private.h>
+
+#ifdef MAC
+#include <security/mac/mac_framework.h>
+#endif
+
+MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's"); /* malloc type used below for rule copies and rule maps */
+
+/*
+ * static variables followed by global ones (none in this file)
+ */
+
+/*
+ * Binary search over chain->map for the first slot whose (rulenum, id)
+ * pair is not smaller than (key, id), and return that slot's index.
+ * The search never goes past the last slot: if every probed rule
+ * compares smaller, n_rules - 1 is returned.
+ */
+int
+ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id)
+{
+	int low = 0;
+	int high = chain->n_rules - 1;
+
+	while (low < high) {
+		int mid = (low + high) / 2;
+		struct ip_fw *cur = chain->map[mid];
+
+		if (cur->rulenum < key ||
+		    (cur->rulenum == key && cur->id < id))
+			low = mid + 1;	/* answer lies strictly after mid */
+		else
+			high = mid;	/* mid itself is still a candidate */
+	}
+	return (high);
+}
+
+/*
+ * Allocate a map with room for chain->n_rules + extra rule pointers
+ * (extra may be negative when rules are being deleted).
+ * 'locked' tells whether the caller already holds IPFW_UH_WLOCK:
+ * if so the allocation must not sleep (M_NOWAIT); otherwise we may
+ * sleep and take the lock ourselves once the memory is available.
+ * On success the chain is therefore write-locked in both cases.
+ * Returns NULL if the allocation fails.
+ */
+static struct ip_fw **
+get_map(struct ip_fw_chain *chain, int extra, int locked)
+{
+	struct ip_fw **new_map;
+	int slots;
+
+	for (;;) {
+		slots = chain->n_rules + extra;
+		new_map = malloc(slots * sizeof(struct ip_fw *), M_IPFW,
+		    locked ? M_NOWAIT : M_WAITOK);
+		if (new_map == NULL) {
+			printf("%s: cannot allocate map\n", __FUNCTION__);
+			return (NULL);
+		}
+		if (!locked)
+			IPFW_UH_WLOCK(chain);
+		/* n_rules may have changed while we were unlocked */
+		if (slots >= chain->n_rules + extra)
+			break;
+		/* too small now: drop the lock, discard and try again */
+		if (!locked)
+			IPFW_UH_WUNLOCK(chain);
+		free(new_map, M_IPFW);
+	}
+	return (new_map);
+}
+
+/*
+ * Install new_map (holding new_len rules) as the active map of the
+ * chain and bump the chain id, all under IPFW_WLOCK.  The map that
+ * was active before is returned to the caller.
+ * It is supposed to be called with IPFW_UH_WLOCK held.
+ */
+static struct ip_fw **
+swap_map(struct ip_fw_chain *chain, struct ip_fw **new_map, int new_len)
+{
+	struct ip_fw **prev;
+
+	IPFW_WLOCK(chain);
+	prev = chain->map;
+	chain->map = new_map;
+	chain->n_rules = new_len;
+	chain->id++;
+	IPFW_WUNLOCK(chain);
+	return (prev);
+}
+
+/*
+ * Insert a copy of input_rule into the chain.
+ * The rule is duplicated into M_IPFW memory, a new map one slot larger
+ * is built around it and swapped in.  If input_rule->rulenum is 0 a
+ * number is derived from the preceding rule plus V_autoinc_step and
+ * written back to input_rule so the caller knows it as well.
+ * Returns 0 on success, EINVAL for an unusable chain or rule number,
+ * ENOSPC if memory cannot be obtained.
+ * XXX DO NOT USE FOR THE DEFAULT RULE.
+ * Must be called without IPFW_UH held.
+ */
+int
+ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
+{
+	struct ip_fw *nr;	/* kernel copy of the rule */
+	struct ip_fw **map;	/* new map first, old map after the swap */
+	int len, pos, search_key;
+
+	if (chain->rules == NULL || input_rule->rulenum > IPFW_DEFAULT_RULE-1)
+		return (EINVAL);
+
+	len = RULESIZE(input_rule);
+	nr = malloc(len, M_IPFW, M_WAITOK | M_ZERO);
+	if (nr == NULL)
+		return (ENOSPC);
+	/* on success get_map() leaves us holding IPFW_UH_WLOCK */
+	map = get_map(chain, 1, 0 /* not locked */);
+	if (map == NULL) {
+		free(nr, M_IPFW);
+		return (ENOSPC);
+	}
+
+	bcopy(input_rule, nr, len);
+	/* links, counters and timestamp are not settable from userland */
+	nr->timestamp = 0;
+	nr->bcnt = 0;
+	nr->pcnt = 0;
+	nr->next_rule = NULL;
+	nr->x_next = NULL;
+
+	/* keep the auto-increment step within [1, 1000] */
+	if (V_autoinc_step < 1)
+		V_autoinc_step = 1;
+	else if (V_autoinc_step > 1000)
+		V_autoinc_step = 1000;
+
+	/*
+	 * Locate the slot we insert before: after all rules carrying the
+	 * same number, or right before the default rule when the number
+	 * still has to be assigned.
+	 */
+	if (nr->rulenum != 0)
+		search_key = nr->rulenum + 1;
+	else
+		search_key = IPFW_DEFAULT_RULE;
+	pos = ipfw_find_rule(chain, search_key, 0);
+
+	/* new map = head + new rule + tail (we always have the default rule) */
+	if (pos > 0)
+		bcopy(chain->map, map, pos * sizeof(struct ip_fw *));
+	map[pos] = nr;
+	bcopy(chain->map + pos, map + pos + 1,
+	    sizeof(struct ip_fw *) * (chain->n_rules - pos));
+
+	if (nr->rulenum == 0) {
+		/* number it after its predecessor and write the number back */
+		if (pos > 0)
+			nr->rulenum = map[pos - 1]->rulenum;
+		if (nr->rulenum < IPFW_DEFAULT_RULE - V_autoinc_step)
+			nr->rulenum += V_autoinc_step;
+		input_rule->rulenum = nr->rulenum;
+	}
+
+	nr->id = chain->id + 1;
+	map = swap_map(chain, map, chain->n_rules + 1);
+	chain->static_len += len;
+	IPFW_UH_WUNLOCK(chain);
+	/* 'map' now points to the retired map */
+	if (map != NULL)
+		free(map, M_IPFW);
+	return (0);
+}
+
+/*
+ * Free every rule on a list linked through x_next, such as the
+ * chain->reap list built while deleting rules.
+ * An empty list (NULL head) is fine and frees nothing.
+ */
+void
+ipfw_reap_rules(struct ip_fw *head)
+{
+	struct ip_fw *next;
+
+	for (; head != NULL; head = next) {
+		next = head->x_next;	/* fetch the link before freeing */
+		free(head, M_IPFW);
+	}
+}
+
+/*
+ * Deletion predicate shared by the counting, copying and reaping
+ * passes of del_entry() for commands 0, 1 and 5, so that the three
+ * passes can never disagree on which rules go away.
+ * Rules in RESVD_SET are never deleted; for commands other than 0
+ * the rule must also belong to 'set'.  Matching on the rule number,
+ * where relevant, is done by the caller.
+ */
+static int
+del_entry_match(const struct ip_fw *rule, uint8_t cmd, uint8_t set)
+{
+
+	if (rule->set == RESVD_SET)
+		return (0);
+	return (cmd == 0 || rule->set == set);
+}
+
+/**
+ * Remove all rules with given number, and also do set manipulation.
+ * Assumes chain != NULL && *chain != NULL.
+ *
+ * The argument is an u_int32_t. The low 16 bit are the rule or set number,
+ * the next 8 bits are the new set, the top 8 bits are the command:
+ *
+ *	0	delete rules with given number
+ *	1	delete rules with given set number
+ *	2	move rules with given number to new set
+ *	3	move rules with given set number to new set
+ *	4	swap sets with given numbers
+ *	5	delete rules with given number and with given set number
+ *
+ * Takes and releases IPFW_UH_WLOCK internally.
+ * Returns 0 on success, EINVAL on a malformed argument, when nothing
+ * matches a delete request, or when the new map cannot be allocated.
+ * Rules in RESVD_SET are never removed by the delete commands, so a
+ * request to delete set RESVD_SET matches nothing and yields EINVAL.
+ */
+static int
+del_entry(struct ip_fw_chain *chain, u_int32_t arg)
+{
+	struct ip_fw *rule;
+	uint32_t rulenum;	/* rule or old_set */
+	uint8_t cmd, new_set;
+	int start, end = 0, i, ofs, n;
+	struct ip_fw **map = NULL;
+	int error = 0;
+
+	rulenum = arg & 0xffff;
+	cmd = (arg >> 24) & 0xff;
+	new_set = (arg >> 16) & 0xff;
+
+	if (cmd > 5 || new_set > RESVD_SET)
+		return EINVAL;
+	if (cmd == 0 || cmd == 2 || cmd == 5) {
+		if (rulenum >= IPFW_DEFAULT_RULE)
+			return EINVAL;
+	} else {
+		if (rulenum > RESVD_SET)	/* old_set */
+			return EINVAL;
+	}
+
+	IPFW_UH_WLOCK(chain); /* prevent conflicts among the writers */
+	chain->reap = NULL;	/* prepare for deletions */
+
+	switch (cmd) {
+	case 0:	/* delete rules with given number (0 is special means all) */
+	case 1:	/* delete all rules with given set number, rule->set == rulenum */
+	case 5: /* delete rules with given number and with given set number.
+		 * rulenum - given rule number;
+		 * new_set - given set number.
+		 */
+		/* locate first rule to delete (start), the one after the
+		 * last one (end), and count how many rules to delete (n)
+		 */
+		n = 0;
+		if (cmd == 1) {	/* look for a specific set, must scan all */
+			/*
+			 * For this command the set to delete travels in
+			 * the low 16 bits.  Store it in new_set so the
+			 * copy and reap passes below use the very same
+			 * criterion as this counting pass.
+			 */
+			new_set = rulenum;
+			for (start = -1, i = 0; i < chain->n_rules; i++) {
+				if (!del_entry_match(chain->map[i], cmd,
+				    new_set))
+					continue;
+				if (start < 0)
+					start = i;
+				end = i;
+				n++;
+			}
+			end++;	/* first non-matching */
+		} else {
+			start = ipfw_find_rule(chain, rulenum, 0);
+			for (end = start; end < chain->n_rules; end++) {
+				rule = chain->map[end];
+				if (rulenum > 0 && rule->rulenum != rulenum)
+					break;
+				if (del_entry_match(rule, cmd, new_set))
+					n++;
+			}
+		}
+		if (n == 0 && arg == 0)
+			break; /* special case, flush on empty ruleset */
+		/* allocate the map, if needed */
+		if (n > 0)
+			map = get_map(chain, -n, 1 /* locked */);
+		if (n == 0 || map == NULL) {
+			error = EINVAL;
+			break;
+		}
+		/* copy the initial part of the map */
+		if (start > 0)
+			bcopy(chain->map, map, start * sizeof(struct ip_fw *));
+		/* copy active rules between start and end */
+		for (i = ofs = start; i < end; i++) {
+			rule = chain->map[i];
+			if (!del_entry_match(rule, cmd, new_set))
+				map[ofs++] = chain->map[i];
+		}
+		/* finally the tail */
+		bcopy(chain->map + end, map + ofs,
+		    (chain->n_rules - end) * sizeof(struct ip_fw *));
+		/* after the swap 'map' is the old map, freed on exit */
+		map = swap_map(chain, map, chain->n_rules - n);
+		/* now remove the rules deleted */
+		for (i = start; i < end; i++) {
+			rule = map[i];
+			if (del_entry_match(rule, cmd, new_set)) {
+				int l = RULESIZE(rule);
+
+				chain->static_len -= l;
+				ipfw_remove_dyn_children(rule);
+				rule->x_next = chain->reap;
+				chain->reap = rule;
+			}
+		}
+		break;
+
+	case 2:	/* move rules with given number to new set */
+		for (i = 0; i < chain->n_rules; i++) {
+			rule = chain->map[i];
+			if (rule->rulenum == rulenum)
+				rule->set = new_set;
+		}
+		break;
+
+	case 3: /* move rules with given set number to new set */
+		for (i = 0; i < chain->n_rules; i++) {
+			rule = chain->map[i];
+			if (rule->set == rulenum)
+				rule->set = new_set;
+		}
+		break;
+
+	case 4: /* swap two sets */
+		for (i = 0; i < chain->n_rules; i++) {
+			rule = chain->map[i];
+			if (rule->set == rulenum)
+				rule->set = new_set;
+			else if (rule->set == new_set)
+				rule->set = rulenum;
+		}
+		break;
+	}
+	/* detach the reap list so it can be freed without the lock */
+	rule = chain->reap;
+	chain->reap = NULL;
+	IPFW_UH_WUNLOCK(chain);
+	ipfw_reap_rules(rule);
+	if (map)
+		free(map, M_IPFW);
+	return error;
+}
+
+/*
+ * Reset the accounting of one rule.
+ * Unless log_only is set, packet/byte counters and the timestamp are
+ * zeroed.  In all cases, if the instruction at ACTION_PTR(rule) is an
+ * O_LOG, its log_left is reloaded from max_log.
+ * Normally run under IPFW_UH_RLOCK, but these are idempotent ops
+ * so we only care that rules do not disappear.
+ */
+static void
+clear_counters(struct ip_fw *rule, int log_only)
+{
+	ipfw_insn_log *logcmd;
+
+	if (!log_only) {
+		rule->timestamp = 0;
+		rule->pcnt = 0;
+		rule->bcnt = 0;
+	}
+	logcmd = (ipfw_insn_log *)ACTION_PTR(rule);
+	if (logcmd->o.opcode != O_LOG)
+		return;
+	logcmd->log_left = logcmd->max_log;
+}
+
+/**
+ * Reset some or all counters on firewall rules.
+ * The argument `arg' is an u_int32_t. The low 16 bit are the rule number,
+ * the next 8 bits are the set number, the top 8 bits are the command:
+ *	0	work with rules from all set's;
+ *	1	work with rules only from specified set.
+ * Specified rule number is zero if we want to clear all entries.
+ * log_only is 1 if we only want to reset logs, zero otherwise.
+ *
+ * The rules are walked under IPFW_UH_RLOCK, which is released on every
+ * return path.  Returns 0 on success, EINVAL on a bad command or set,
+ * or when a non-zero rule number matches no rule.
+ */
+static int
+zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int log_only)
+{
+	struct ip_fw *rule;
+	char *msg;
+	int i;
+
+	uint16_t rulenum = arg & 0xffff;
+	uint8_t set = (arg >> 16) & 0xff;
+	uint8_t cmd = (arg >> 24) & 0xff;
+
+	if (cmd > 1)
+		return (EINVAL);
+	if (cmd == 1 && set > RESVD_SET)
+		return (EINVAL);
+
+	IPFW_UH_RLOCK(chain);
+	if (rulenum == 0) {
+		V_norule_counter = 0;
+		for (i = 0; i < chain->n_rules; i++) {
+			rule = chain->map[i];
+			/* Skip rules not in our set. */
+			if (cmd == 1 && rule->set != set)
+				continue;
+			clear_counters(rule, log_only);
+		}
+		msg = log_only ? "All logging counts reset" :
+		    "Accounting cleared";
+	} else {
+		int cleared = 0;
+		for (i = 0; i < chain->n_rules; i++) {
+			rule = chain->map[i];
+			if (rule->rulenum == rulenum) {
+				if (cmd == 0 || rule->set == set)
+					clear_counters(rule, log_only);
+				/* set even when the set filter skipped it */
+				cleared = 1;
+			}
+			/* the scan stops at the first larger rule number */
+			if (rule->rulenum > rulenum)
+				break;
+		}
+		if (!cleared) {	/* we did not find any matching rules */
+			/* release the lock we actually hold (UH read lock) */
+			IPFW_UH_RUNLOCK(chain);
+			return (EINVAL);
+		}
+		msg = log_only ? "logging count reset" : "cleared";
+	}
+	IPFW_UH_RUNLOCK(chain);
+
+	if (V_fw_verbose) {
+		int lev = LOG_SECURITY | LOG_NOTICE;
+
+		if (rulenum)
+			log(lev, "ipfw: Entry %d %s.\n", rulenum, msg);
+		else
+			log(lev, "ipfw: %s.\n", msg);
+	}
+	return (0);
+}
+
+/*
+ * Check validity of the structure before insert.
+ * Rules are simple, so this mostly need to check rule sizes.
+ *
+ * 'rule' is the candidate rule and 'size' the number of bytes the
+ * caller received for it.  The checks performed are:
+ *  - size is at least sizeof(*rule) and equals RULESIZE(rule);
+ *  - act_ofs lies inside the instruction array;
+ *  - every instruction fits in the words that remain and has the
+ *    length required for its opcode (plus a range check on arg1 for
+ *    the fib, address-set and table-lookup opcodes);
+ *  - opcodes that depend on optional components (divert, netgraph,
+ *    nat, forward) are accepted only if the component is available;
+ *  - exactly one action is present and it is the last instruction.
+ * Side effect: for O_LOG instructions log_left is set to max_log.
+ * Returns 0 if the rule is acceptable, EPROTONOSUPPORT for IPv6
+ * opcodes on a kernel built without INET6, EINVAL otherwise.
+ */
+static int
+check_ipfw_struct(struct ip_fw *rule, int size)
+{
+ int l, cmdlen = 0;
+ int have_action=0;
+ ipfw_insn *cmd;
+
+ if (size < sizeof(*rule)) {
+ printf("ipfw: rule too short\n");
+ return (EINVAL);
+ }
+ /* first, check for valid size */
+ l = RULESIZE(rule);
+ if (l != size) {
+ printf("ipfw: size mismatch (have %d want %d)\n", size, l);
+ return (EINVAL);
+ }
+ if (rule->act_ofs >= rule->cmd_len) {
+ printf("ipfw: bogus action offset (%u > %u)\n",
+ rule->act_ofs, rule->cmd_len - 1);
+ return (EINVAL);
+ }
+ /*
+ * Now go for the individual checks. Very simple ones, basically only
+ * instruction sizes.
+ */
+ for (l = rule->cmd_len, cmd = rule->cmd ;
+ l > 0 ; l -= cmdlen, cmd += cmdlen) { /* l = instruction words still to check */
+ cmdlen = F_LEN(cmd);
+ if (cmdlen > l) {
+ printf("ipfw: opcode %d size truncated\n",
+ cmd->opcode);
+ return EINVAL;
+ }
+ switch (cmd->opcode) {
+ case O_PROBE_STATE:
+ case O_KEEP_STATE:
+ case O_PROTO:
+ case O_IP_SRC_ME:
+ case O_IP_DST_ME:
+ case O_LAYER2:
+ case O_IN:
+ case O_FRAG:
+ case O_DIVERTED:
+ case O_IPOPT:
+ case O_IPTOS:
+ case O_IPPRECEDENCE:
+ case O_IPVER:
+ case O_TCPWIN:
+ case O_TCPFLAGS:
+ case O_TCPOPTS:
+ case O_ESTAB:
+ case O_VERREVPATH:
+ case O_VERSRCREACH:
+ case O_ANTISPOOF:
+ case O_IPSEC:
+#ifdef INET6
+ case O_IP6_SRC_ME:
+ case O_IP6_DST_ME:
+ case O_EXT_HDR:
+ case O_IP6:
+#endif
+ case O_IP4:
+ case O_TAG:
+ if (cmdlen != F_INSN_SIZE(ipfw_insn))
+ goto bad_size;
+ break;
+
+ case O_FIB:
+ if (cmdlen != F_INSN_SIZE(ipfw_insn))
+ goto bad_size;
+ if (cmd->arg1 >= rt_numfibs) {
+ printf("ipfw: invalid fib number %d\n",
+ cmd->arg1);
+ return EINVAL;
+ }
+ break;
+
+ case O_SETFIB:
+ if (cmdlen != F_INSN_SIZE(ipfw_insn))
+ goto bad_size;
+ if (cmd->arg1 >= rt_numfibs) {
+ printf("ipfw: invalid fib number %d\n",
+ cmd->arg1);
+ return EINVAL;
+ }
+ goto check_action;
+
+ case O_UID:
+ case O_GID:
+ case O_JAIL:
+ case O_IP_SRC:
+ case O_IP_DST:
+ case O_TCPSEQ:
+ case O_TCPACK:
+ case O_PROB:
+ case O_ICMPTYPE:
+ if (cmdlen != F_INSN_SIZE(ipfw_insn_u32))
+ goto bad_size;
+ break;
+
+ case O_LIMIT:
+ if (cmdlen != F_INSN_SIZE(ipfw_insn_limit))
+ goto bad_size;
+ break;
+
+ case O_LOG:
+ if (cmdlen != F_INSN_SIZE(ipfw_insn_log))
+ goto bad_size;
+
+ ((ipfw_insn_log *)cmd)->log_left =
+ ((ipfw_insn_log *)cmd)->max_log; /* userland value of log_left is overwritten */
+
+ break;
+
+ case O_IP_SRC_MASK:
+ case O_IP_DST_MASK:
+ /* only odd command lengths */
+ if ( !(cmdlen & 1) || cmdlen > 31)
+ goto bad_size;
+ break;
+
+ case O_IP_SRC_SET:
+ case O_IP_DST_SET:
+ if (cmd->arg1 == 0 || cmd->arg1 > 256) {
+ printf("ipfw: invalid set size %d\n",
+ cmd->arg1);
+ return EINVAL;
+ }
+ if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) +
+ (cmd->arg1+31)/32 ) /* one extra 32-bit word per 32 units of arg1 */
+ goto bad_size;
+ break;
+
+ case O_IP_SRC_LOOKUP:
+ case O_IP_DST_LOOKUP:
+ if (cmd->arg1 >= IPFW_TABLES_MAX) {
+ printf("ipfw: invalid table number %d\n",
+ cmd->arg1);
+ return (EINVAL);
+ }
+ if (cmdlen != F_INSN_SIZE(ipfw_insn) &&
+ cmdlen != F_INSN_SIZE(ipfw_insn_u32) + 1 &&
+ cmdlen != F_INSN_SIZE(ipfw_insn_u32))
+ goto bad_size;
+ break;
+
+ case O_MACADDR2:
+ if (cmdlen != F_INSN_SIZE(ipfw_insn_mac))
+ goto bad_size;
+ break;
+
+ case O_NOP:
+ case O_IPID:
+ case O_IPTTL:
+ case O_IPLEN:
+ case O_TCPDATALEN:
+ case O_TAGGED:
+ if (cmdlen < 1 || cmdlen > 31)
+ goto bad_size;
+ break;
+
+ case O_MAC_TYPE:
+ case O_IP_SRCPORT:
+ case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */
+ if (cmdlen < 2 || cmdlen > 31)
+ goto bad_size;
+ break;
+
+ case O_RECV:
+ case O_XMIT:
+ case O_VIA:
+ if (cmdlen != F_INSN_SIZE(ipfw_insn_if))
+ goto bad_size;
+ break;
+
+ case O_ALTQ:
+ if (cmdlen != F_INSN_SIZE(ipfw_insn_altq))
+ goto bad_size;
+ break;
+
+ case O_PIPE:
+ case O_QUEUE:
+ if (cmdlen != F_INSN_SIZE(ipfw_insn))
+ goto bad_size;
+ goto check_action;
+
+ case O_FORWARD_IP:
+#ifdef IPFIREWALL_FORWARD
+ if (cmdlen != F_INSN_SIZE(ipfw_insn_sa))
+ goto bad_size;
+ goto check_action;
+#else
+ return EINVAL;
+#endif
+
+ case O_DIVERT:
+ case O_TEE:
+ if (ip_divert_ptr == NULL)
+ return EINVAL;
+ else
+ goto check_size;
+ case O_NETGRAPH:
+ case O_NGTEE:
+ if (ng_ipfw_input_p == NULL)
+ return EINVAL;
+ else
+ goto check_size;
+ case O_NAT:
+ if (!IPFW_NAT_LOADED)
+ return EINVAL;
+ if (cmdlen != F_INSN_SIZE(ipfw_insn_nat))
+ goto bad_size;
+ goto check_action;
+ case O_FORWARD_MAC: /* XXX not implemented yet */
+ case O_CHECK_STATE:
+ case O_COUNT:
+ case O_ACCEPT:
+ case O_DENY:
+ case O_REJECT:
+#ifdef INET6
+ case O_UNREACH6:
+#endif
+ case O_SKIPTO:
+ case O_REASS:
+check_size:
+ if (cmdlen != F_INSN_SIZE(ipfw_insn))
+ goto bad_size;
+check_action:
+ if (have_action) {
+ printf("ipfw: opcode %d, multiple actions"
+ " not allowed\n",
+ cmd->opcode);
+ return EINVAL;
+ }
+ have_action = 1;
+ if (l != cmdlen) { /* something follows the action */
+ printf("ipfw: opcode %d, action must be"
+ " last opcode\n",
+ cmd->opcode);
+ return EINVAL;
+ }
+ break;
+#ifdef INET6
+ case O_IP6_SRC:
+ case O_IP6_DST:
+ if (cmdlen != F_INSN_SIZE(struct in6_addr) +
+ F_INSN_SIZE(ipfw_insn))
+ goto bad_size;
+ break;
+
+ case O_FLOW6ID:
+ if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) +
+ ((ipfw_insn_u32 *)cmd)->o.arg1)
+ goto bad_size;
+ break;
+
+ case O_IP6_SRC_MASK:
+ case O_IP6_DST_MASK:
+ if ( !(cmdlen & 1) || cmdlen > 127)
+ goto bad_size;
+ break;
+ case O_ICMP6TYPE:
+ if( cmdlen != F_INSN_SIZE( ipfw_insn_icmp6 ) )
+ goto bad_size;
+ break;
+#endif
+
+ default:
+ switch (cmd->opcode) { /* second look: tell "no IPv6" apart from "unknown" */
+#ifndef INET6
+ case O_IP6_SRC_ME:
+ case O_IP6_DST_ME:
+ case O_EXT_HDR:
+ case O_IP6:
+ case O_UNREACH6:
+ case O_IP6_SRC:
+ case O_IP6_DST:
+ case O_FLOW6ID:
+ case O_IP6_SRC_MASK:
+ case O_IP6_DST_MASK:
+ case O_ICMP6TYPE:
+ printf("ipfw: no IPv6 support in kernel\n");
+ return EPROTONOSUPPORT;
+#endif
+ default:
+ printf("ipfw: opcode %d, unknown opcode\n",
+ cmd->opcode);
+ return EINVAL;
+ }
+ }
+ }
+ if (have_action == 0) {
+ printf("ipfw: missing action\n");
+ return EINVAL;
+ }
+ return 0;
+
+bad_size:
+ printf("ipfw: opcode %d size %d wrong\n",
+ cmd->opcode, cmdlen);
+ return EINVAL;
+}
+
+
+/*
+ * Translation of requests for compatibility with FreeBSD 7.2/8.
+ * A static variable tells us if we have an old client from userland,
+ * and if necessary we translate requests and responses between the
+ * two formats.
+ *
+ * is7 is set or cleared by every IP_FW_ADD request in ipfw_ctl(),
+ * depending on whether the request size equals RULESIZE7() of the
+ * rule, and is consulted when rules are copied out to userland.
+ * struct ip_fw7 is the old rule layout: as the commented-out field
+ * below shows, it lacks the 'id' member.
+ * RULESIZE7() is the size in bytes of a rule in the old layout,
+ * computed from its cmd_len (a count of 32-bit words).
+ */
+static int is7 = 0;
+
+struct ip_fw7 {
+ struct ip_fw7 *next; /* linked list of rules */
+ struct ip_fw7 *next_rule; /* ptr to next [skipto] rule */
+ /* 'next_rule' is used to pass up 'set_disable' status */
+
+ uint16_t act_ofs; /* offset of action in 32-bit units */
+ uint16_t cmd_len; /* # of 32-bit words in cmd */
+ uint16_t rulenum; /* rule number */
+ uint8_t set; /* rule set (0..31) */
+ // #define RESVD_SET 31 /* set for default and persistent rules */
+ uint8_t _pad; /* padding */
+ // uint32_t id; /* rule id, only in v.8 */
+ /* These fields are present in all rules. */
+ uint64_t pcnt; /* Packet counter */
+ uint64_t bcnt; /* Byte counter */
+ uint32_t timestamp; /* tv_sec of last match */
+
+ ipfw_insn cmd[1]; /* storage for commands */
+};
+
+ int convert_rule_to_7(struct ip_fw *rule); /* new layout -> 7.2 layout, in place */
+int convert_rule_to_8(struct ip_fw *rule); /* 7.2 layout -> new layout, in place */
+
+#ifndef RULESIZE7
+#define RULESIZE7(rule) (sizeof(struct ip_fw7) + \
+ ((struct ip_fw7 *)(rule))->cmd_len * 4 - 4)
+#endif
+
+
+/*
+ * Copy the static and dynamic rules to the supplied buffer
+ * and return the amount of space actually used.
+ * Must be run under IPFW_UH_RLOCK.
+ *
+ * 'buf' is the destination and 'space' its size in bytes.
+ * When is7 is set every static rule is converted to the FreeBSD 7.2
+ * layout after being copied; rules that do not fit are skipped and
+ * 0 is returned if a conversion fails.  In the native format the copy
+ * stops at the first rule that does not fit.  In both formats the
+ * V_set_disable mask is stored over the next_rule pointer and a
+ * non-zero timestamp is offset by the boot time.
+ */
+static size_t
+ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space)
+{
+	char *bp = buf;
+	char *ep = bp + space;
+	struct ip_fw *rule, *dst;
+	int l, i;
+	time_t	boot_seconds;
+
+	boot_seconds = boottime.tv_sec;
+	for (i = 0; i < chain->n_rules; i++) {
+		rule = chain->map[i];
+
+		if (is7) {
+			/* Convert rule to FreeBSD 7.2 format */
+			int l8 = RULESIZE(rule);  /* size in the native layout */
+
+			l = RULESIZE7(rule);	  /* size once converted */
+			/*
+			 * The conversion is done in place, so the buffer
+			 * must hold the complete native rule first.  Copy
+			 * exactly RULESIZE(rule) bytes rather than guessing
+			 * how much larger the native layout is, otherwise
+			 * the last instruction could be truncated.
+			 */
+			if (bp + l8 <= ep) {
+				int error;
+
+				bcopy(rule, bp, l8);
+				error = convert_rule_to_7((struct ip_fw *) bp);
+				if (error)
+					return 0; /*XXX correct? */
+				/*
+				 * XXX HACK. Store the disable mask in the "next"
+				 * pointer in a wild attempt to keep the ABI the same.
+				 * Why do we do this on EVERY rule?
+				 */
+				bcopy(&V_set_disable,
+				    &(((struct ip_fw7 *)bp)->next_rule),
+				    sizeof(V_set_disable));
+				if (((struct ip_fw7 *)bp)->timestamp)
+					((struct ip_fw7 *)bp)->timestamp +=
+					    boot_seconds;
+				bp += l;
+			}
+			continue; /* go to next rule */
+		}
+
+		/* normal mode, don't touch rules */
+		l = RULESIZE(rule);
+		if (bp + l > ep) { /* should not happen */
+			printf("overflow dumping static rules\n");
+			break;
+		}
+		dst = (struct ip_fw *)bp;
+		bcopy(rule, dst, l);
+		/*
+		 * XXX HACK. Store the disable mask in the "next"
+		 * pointer in a wild attempt to keep the ABI the same.
+		 * Why do we do this on EVERY rule?
+		 */
+		bcopy(&V_set_disable, &dst->next_rule, sizeof(V_set_disable));
+		if (dst->timestamp)
+			dst->timestamp += boot_seconds;
+		bp += l;
+	}
+	ipfw_get_dynamic(&bp, ep); /* protected by the dynamic lock */
+	return (bp - (char *)buf);
+}
+
+
+/**
+ * {set|get}sockopt parser.
+ *
+ * Dispatches on sopt->sopt_name to the rule, table and NAT handlers
+ * and returns 0 or an errno value.  The caller needs the
+ * PRIV_NETINET_IPFW privilege; IP_FW_ADD and every SOPT_SET request
+ * other than IP_FW_RESETLOG are additionally refused when
+ * securelevel_ge(..., 3) reports an error.
+ * Every temporary buffer allocated here is released on all paths,
+ * including the error paths of the 7.2 <-> 8 rule conversion.
+ */
+int
+ipfw_ctl(struct sockopt *sopt)
+{
+#define	RULE_MAXSIZE	(256*sizeof(u_int32_t))
+	int error;
+	size_t size;
+	struct ip_fw *buf, *rule;
+	struct ip_fw_chain *chain;
+	u_int32_t rulenum[2];
+
+	error = priv_check(sopt->sopt_td, PRIV_NETINET_IPFW);
+	if (error)
+		return (error);
+
+	/*
+	 * Disallow modifications in really-really secure mode, but still allow
+	 * the logging counters to be reset.
+	 */
+	if (sopt->sopt_name == IP_FW_ADD ||
+	    (sopt->sopt_dir == SOPT_SET && sopt->sopt_name != IP_FW_RESETLOG)) {
+		error = securelevel_ge(sopt->sopt_td->td_ucred, 3);
+		if (error)
+			return (error);
+	}
+
+	chain = &V_layer3_chain;
+	error = 0;
+
+	switch (sopt->sopt_name) {
+	case IP_FW_GET:
+		/*
+		 * pass up a copy of the current rules. Static rules
+		 * come first (the last of which has number IPFW_DEFAULT_RULE),
+		 * followed by a possibly empty list of dynamic rule.
+		 * The last dynamic rule has NULL in the "next" field.
+		 *
+		 * Note that the calculated size is used to bound the
+		 * amount of data returned to the user.  The rule set may
+		 * change between calculating the size and returning the
+		 * data in which case we'll just return what fits.
+		 */
+		for (;;) {
+			int len = 0, want;
+
+			size = chain->static_len;
+			size += ipfw_dyn_len();
+			if (size >= sopt->sopt_valsize)
+				break;
+			buf = malloc(size, M_TEMP, M_WAITOK);
+			if (buf == NULL)
+				break;
+			IPFW_UH_RLOCK(chain);
+			/* check again how much space we need */
+			want = chain->static_len + ipfw_dyn_len();
+			if (size >= want)
+				len = ipfw_getrules(chain, buf, size);
+			IPFW_UH_RUNLOCK(chain);
+			if (size >= want)
+				error = sooptcopyout(sopt, buf, len);
+			free(buf, M_TEMP);
+			/* otherwise the ruleset grew meanwhile, try again */
+			if (size >= want)
+				break;
+		}
+		break;
+
+	case IP_FW_FLUSH:
+		/* locking is done within del_entry() */
+		error = del_entry(chain, 0); /* special case, rule=0, cmd=0 means all */
+		break;
+
+	case IP_FW_ADD:
+		rule = malloc(RULE_MAXSIZE, M_TEMP, M_WAITOK);
+		error = sooptcopyin(sopt, rule, RULE_MAXSIZE,
+			sizeof(struct ip_fw7) );
+		if (error) {
+			/* nothing valid was copied in, do not parse it */
+			free(rule, M_TEMP);
+			break;
+		}
+
+		/*
+		 * If the size of commands equals RULESIZE7 then we assume
+		 * a FreeBSD7.2 binary is talking to us (set is7=1).
+		 * is7 is persistent so the next 'ipfw list' command
+		 * will use this format.
+		 * NOTE: If wrong version is guessed (this can happen if
+		 *       the first ipfw command is 'ipfw [pipe] list')
+		 *       the ipfw binary may crash or loop infinitely...
+		 */
+		if (sopt->sopt_valsize == RULESIZE7(rule)) {
+			is7 = 1;
+			error = convert_rule_to_8(rule);
+			if (error == 0)
+				error = check_ipfw_struct(rule,
+				    RULESIZE(rule));
+		} else {
+			is7 = 0;
+			error = check_ipfw_struct(rule, sopt->sopt_valsize);
+		}
+		if (error == 0) {
+			/* locking is done within ipfw_add_rule() */
+			error = ipfw_add_rule(chain, rule);
+			size = RULESIZE(rule);
+			if (!error && sopt->sopt_dir == SOPT_GET) {
+				if (is7) {
+					error = convert_rule_to_7(rule);
+					size = RULESIZE7(rule);
+				}
+				if (error == 0)
+					error = sooptcopyout(sopt, rule, size);
+			}
+		}
+		/* single exit for this case, so 'rule' is never leaked */
+		free(rule, M_TEMP);
+		break;
+
+	case IP_FW_DEL:
+		/*
+		 * IP_FW_DEL is used for deleting single rules or sets,
+		 * and (ab)used to atomically manipulate sets. Argument size
+		 * is used to distinguish between the two:
+		 *    sizeof(u_int32_t)
+		 *	delete single rule or set of rules,
+		 *	or reassign rules (or sets) to a different set.
+		 *    2*sizeof(u_int32_t)
+		 *	atomic disable/enable sets.
+		 *	first u_int32_t contains sets to be disabled,
+		 *	second u_int32_t contains sets to be enabled.
+		 */
+		error = sooptcopyin(sopt, rulenum,
+			2*sizeof(u_int32_t), sizeof(u_int32_t));
+		if (error)
+			break;
+		size = sopt->sopt_valsize;
+		if (size == sizeof(u_int32_t) && rulenum[0] != 0) {
+			/* delete or reassign, locking done in del_entry() */
+			error = del_entry(chain, rulenum[0]);
+		} else if (size == 2*sizeof(u_int32_t)) { /* set enable/disable */
+			IPFW_UH_WLOCK(chain);
+			/* unsigned constant: bit RESVD_SET is the top bit */
+			V_set_disable =
+			    (V_set_disable | rulenum[0]) & ~rulenum[1] &
+			    ~(1U<<RESVD_SET); /* set RESVD_SET always enabled */
+			IPFW_UH_WUNLOCK(chain);
+		} else
+			error = EINVAL;
+		break;
+
+	case IP_FW_ZERO:
+	case IP_FW_RESETLOG: /* argument is an u_int_32, the rule number */
+		rulenum[0] = 0;
+		if (sopt->sopt_val != 0) {
+		    error = sooptcopyin(sopt, rulenum,
+			    sizeof(u_int32_t), sizeof(u_int32_t));
+		    if (error)
+			break;
+		}
+		error = zero_entry(chain, rulenum[0],
+			sopt->sopt_name == IP_FW_RESETLOG);
+		break;
+
+	/*--- TABLE manipulations are protected by the IPFW_LOCK ---*/
+	case IP_FW_TABLE_ADD:
+		{
+			ipfw_table_entry ent;
+
+			error = sooptcopyin(sopt, &ent,
+			    sizeof(ent), sizeof(ent));
+			if (error)
+				break;
+			error = ipfw_add_table_entry(chain, ent.tbl,
+			    ent.addr, ent.masklen, ent.value);
+		}
+		break;
+
+	case IP_FW_TABLE_DEL:
+		{
+			ipfw_table_entry ent;
+
+			error = sooptcopyin(sopt, &ent,
+			    sizeof(ent), sizeof(ent));
+			if (error)
+				break;
+			error = ipfw_del_table_entry(chain, ent.tbl,
+			    ent.addr, ent.masklen);
+		}
+		break;
+
+	case IP_FW_TABLE_FLUSH:
+		{
+			u_int16_t tbl;
+
+			error = sooptcopyin(sopt, &tbl,
+			    sizeof(tbl), sizeof(tbl));
+			if (error)
+				break;
+			IPFW_WLOCK(chain);
+			error = ipfw_flush_table(chain, tbl);
+			IPFW_WUNLOCK(chain);
+		}
+		break;
+
+	case IP_FW_TABLE_GETSIZE:
+		{
+			u_int32_t tbl, cnt;
+
+			if ((error = sooptcopyin(sopt, &tbl, sizeof(tbl),
+			    sizeof(tbl))))
+				break;
+			IPFW_RLOCK(chain);
+			error = ipfw_count_table(chain, tbl, &cnt);
+			IPFW_RUNLOCK(chain);
+			if (error)
+				break;
+			error = sooptcopyout(sopt, &cnt, sizeof(cnt));
+		}
+		break;
+
+	case IP_FW_TABLE_LIST:
+		{
+			ipfw_table *tbl;
+
+			if (sopt->sopt_valsize < sizeof(*tbl)) {
+				error = EINVAL;
+				break;
+			}
+			size = sopt->sopt_valsize;
+			tbl = malloc(size, M_TEMP, M_WAITOK);
+			error = sooptcopyin(sopt, tbl, size, sizeof(*tbl));
+			if (error) {
+				free(tbl, M_TEMP);
+				break;
+			}
+			/* number of entries that fit after the header */
+			tbl->size = (size - sizeof(*tbl)) /
+			    sizeof(ipfw_table_entry);
+			IPFW_RLOCK(chain);
+			error = ipfw_dump_table(chain, tbl);
+			IPFW_RUNLOCK(chain);
+			if (error) {
+				free(tbl, M_TEMP);
+				break;
+			}
+			error = sooptcopyout(sopt, tbl, size);
+			free(tbl, M_TEMP);
+		}
+		break;
+
+	/*--- NAT operations are protected by the IPFW_LOCK ---*/
+	case IP_FW_NAT_CFG:
+		if (IPFW_NAT_LOADED)
+			error = ipfw_nat_cfg_ptr(sopt);
+		else {
+			printf("IP_FW_NAT_CFG: %s\n",
+			    "ipfw_nat not present, please load it");
+			error = EINVAL;
+		}
+		break;
+
+	case IP_FW_NAT_DEL:
+		if (IPFW_NAT_LOADED)
+			error = ipfw_nat_del_ptr(sopt);
+		else {
+			printf("IP_FW_NAT_DEL: %s\n",
+			    "ipfw_nat not present, please load it");
+			error = EINVAL;
+		}
+		break;
+
+	case IP_FW_NAT_GET_CONFIG:
+		if (IPFW_NAT_LOADED)
+			error = ipfw_nat_get_cfg_ptr(sopt);
+		else {
+			printf("IP_FW_NAT_GET_CFG: %s\n",
+			    "ipfw_nat not present, please load it");
+			error = EINVAL;
+		}
+		break;
+
+	case IP_FW_NAT_GET_LOG:
+		if (IPFW_NAT_LOADED)
+			error = ipfw_nat_get_log_ptr(sopt);
+		else {
+			printf("IP_FW_NAT_GET_LOG: %s\n",
+			    "ipfw_nat not present, please load it");
+			error = EINVAL;
+		}
+		break;
+
+	default:
+		printf("ipfw: ipfw_ctl invalid option %d\n", sopt->sopt_name);
+		error = EINVAL;
+	}
+
+	return (error);
+#undef RULE_MAXSIZE
+}
+
+
+#define RULE_MAXSIZE (256*sizeof(u_int32_t))
+
+/* Functions to convert rules 7.2 <==> 8.0 */
+/*
+ * Convert, in place, a rule from the current layout (struct ip_fw) to
+ * the FreeBSD 7.2 layout (struct ip_fw7).
+ * A scratch copy of the rule is taken first, then header fields and
+ * instructions are written back in the old layout; opcodes above
+ * O_NAT are decremented because O_REASS does not exist in 7.2.
+ * Only RULESIZE(rule) bytes are read from 'rule'.
+ * Returns 0 on success, ENOMEM if the scratch copy cannot be
+ * allocated, EINVAL if the rule is larger than RULE_MAXSIZE or an
+ * instruction has zero length or overruns the rule.  On EINVAL from
+ * the instruction loop the rule is left partially converted.
+ */
+int
+convert_rule_to_7(struct ip_fw *rule)
+{
+	/* Used to modify original rule */
+	struct ip_fw7 *rule7 = (struct ip_fw7 *)rule;
+	/* copy of original rule, version 8 */
+	struct ip_fw *tmp;
+
+	/* Used to copy commands */
+	ipfw_insn *ccmd, *dst;
+	int ll = 0, ccmdlen = 0;
+	size_t len;
+
+	/* the scratch buffer is RULE_MAXSIZE bytes, the rule must fit */
+	len = RULESIZE(rule);
+	if (len > RULE_MAXSIZE)
+		return (EINVAL);
+
+	tmp = malloc(RULE_MAXSIZE, M_TEMP, M_NOWAIT | M_ZERO);
+	if (tmp == NULL)
+		return (ENOMEM);
+	/* copy only the rule itself, the source may end right after it */
+	bcopy(rule, tmp, len);
+
+	/* Copy fields */
+	rule7->_pad = tmp->_pad;
+	rule7->set = tmp->set;
+	rule7->rulenum = tmp->rulenum;
+	rule7->cmd_len = tmp->cmd_len;
+	rule7->act_ofs = tmp->act_ofs;
+	rule7->next_rule = (struct ip_fw7 *)tmp->next_rule;
+	rule7->next = (struct ip_fw7 *)tmp->x_next;
+	rule7->pcnt = tmp->pcnt;
+	rule7->bcnt = tmp->bcnt;
+	rule7->timestamp = tmp->timestamp;
+
+	/* Copy commands */
+	for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule7->cmd ;
+	    ll > 0 ; ll -= ccmdlen, ccmd += ccmdlen, dst += ccmdlen) {
+		ccmdlen = F_LEN(ccmd);
+
+		/*
+		 * Validate before copying: a zero length would loop
+		 * forever, a length past the end would overrun.
+		 */
+		if (ccmdlen == 0 || ccmdlen > ll) {
+			printf("ipfw: opcode %d size truncated\n",
+			    ccmd->opcode);
+			free(tmp, M_TEMP);
+			return (EINVAL);
+		}
+
+		bcopy(ccmd, dst, ccmdlen * sizeof(uint32_t));
+
+		if (dst->opcode > O_NAT)
+			/* O_REASS doesn't exists in 7.2 version, so
+			 * decrement opcode if it is after O_REASS
+			 */
+			dst->opcode--;
+	}
+	free(tmp, M_TEMP);
+
+	return (0);
+}
+
+/*
+ * Convert, in place, a rule from the FreeBSD 7.2 layout
+ * (struct ip_fw7) to the current layout (struct ip_fw).
+ * The buffer behind 'rule' is assumed to be RULE_MAXSIZE bytes, as
+ * allocated by ipfw_ctl(), since the converted rule is larger than
+ * the input.  Opcodes above O_NAT are incremented to make room for
+ * O_REASS, which does not exist in 7.2; the id field is set to 0.
+ * Returns 0 on success, ENOMEM if the scratch copy cannot be
+ * allocated, EINVAL if the rule would not fit in RULE_MAXSIZE once
+ * converted or an instruction has zero length or overruns the rule.
+ */
+int
+convert_rule_to_8(struct ip_fw *rule)
+{
+	/* Used to modify original rule */
+	struct ip_fw7 *rule7 = (struct ip_fw7 *) rule;
+	/* Copy of original rule */
+	struct ip_fw7 *tmp;
+
+	/* Used to copy commands */
+	ipfw_insn *ccmd, *dst;
+	int ll = 0, ccmdlen = 0;
+	size_t len7, len8;
+
+	/*
+	 * The input has not been validated yet: make sure both the
+	 * old and the converted rule fit in a RULE_MAXSIZE buffer
+	 * before touching anything.
+	 */
+	len7 = RULESIZE7(rule7);
+	len8 = sizeof(struct ip_fw) + rule7->cmd_len * 4 - 4;
+	if (len7 > RULE_MAXSIZE || len8 > RULE_MAXSIZE)
+		return (EINVAL);
+
+	tmp = malloc(RULE_MAXSIZE, M_TEMP, M_NOWAIT | M_ZERO);
+	if (tmp == NULL)
+		return (ENOMEM);
+
+	bcopy(rule7, tmp, len7);
+
+	for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule->cmd ;
+	    ll > 0 ; ll -= ccmdlen, ccmd += ccmdlen, dst += ccmdlen) {
+		ccmdlen = F_LEN(ccmd);
+
+		/*
+		 * Validate before copying: a zero length would loop
+		 * forever, a length past the end would overrun.
+		 */
+		if (ccmdlen == 0 || ccmdlen > ll) {
+			printf("ipfw: opcode %d size truncated\n",
+			    ccmd->opcode);
+			free(tmp, M_TEMP);
+			return (EINVAL);
+		}
+
+		bcopy(ccmd, dst, ccmdlen * sizeof(uint32_t));
+
+		if (dst->opcode > O_NAT)
+			/* O_REASS doesn't exists in 7.2 version, so
+			 * increment opcode if it is after O_REASS
+			 */
+			dst->opcode++;
+	}
+
+	rule->_pad = tmp->_pad;
+	rule->set = tmp->set;
+	rule->rulenum = tmp->rulenum;
+	rule->cmd_len = tmp->cmd_len;
+	rule->act_ofs = tmp->act_ofs;
+	rule->next_rule = (struct ip_fw *)tmp->next_rule;
+	rule->x_next = (struct ip_fw *)tmp->next;
+	rule->id = 0;  /* XXX see if is ok = 0 */
+	rule->pcnt = tmp->pcnt;
+	rule->bcnt = tmp->bcnt;
+	rule->timestamp = tmp->timestamp;
+
+	free (tmp, M_TEMP);
+	return (0);
+}
+
+/* end of file */
diff --git a/sys/netinet/ipfw/ip_fw_table.c b/sys/netinet/ipfw/ip_fw_table.c
new file mode 100644
index 0000000..517622f
--- /dev/null
+++ b/sys/netinet/ipfw/ip_fw_table.c
@@ -0,0 +1,286 @@
+/*-
+ * Copyright (c) 2004 Ruslan Ermilov and Vsevolod Lobko.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ * Lookup table support for ipfw
+ *
+ * Lookup tables are implemented (at the moment) using the radix
+ * tree used for routing tables. Tables store key-value entries, where
+ * keys are network prefixes (addr/masklen), and values are integers.
+ * As a degenerate case we can interpret keys as 32-bit integers
+ * (with a /32 mask).
+ *
+ * The table is protected by the IPFW lock even for manipulation coming
+ * from userland, because operations are typically fast.
+ */
+
+#if !defined(KLD_MODULE)
+#include "opt_ipfw.h"
+#include "opt_ipdivert.h"
+#include "opt_ipdn.h"
+#include "opt_inet.h"
+#ifndef INET
+#error IPFIREWALL requires INET.
+#endif /* INET */
+#endif
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <net/if.h> /* ip_fw.h requires IFNAMSIZ */
+#include <net/radix.h>
+#include <net/route.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/ip_var.h> /* struct ipfw_rule_ref */
+#include <netinet/ip_fw.h>
+#include <sys/queue.h> /* LIST_HEAD */
+#include <netinet/ipfw/ip_fw_private.h>
+
+#ifdef MAC
+#include <security/mac/mac_framework.h>
+#endif
+
+MALLOC_DEFINE(M_IPFW_TBL, "ipfw_tbl", "IpFw tables");
+
+struct table_entry {
+ struct radix_node rn[2];
+ struct sockaddr_in addr, mask;
+ u_int32_t value;
+};
+
+/*
+ * The radix code expects addr and mask to be array of bytes,
+ * with the first byte being the length of the array. rn_inithead
+ * is called with the offset in bits of the lookup key within the
+ * array. If we use a sockaddr_in as the underlying type,
+ * sin_len is conveniently located at offset 0, sin_addr is at
+ * offset 4 and normally aligned.
+ * But for portability, let's avoid assumption and make the code explicit
+ */
+#define KEY_LEN(v) *((uint8_t *)&(v))
+#define KEY_OFS (8*offsetof(struct sockaddr_in, sin_addr))
+
+/*
+ * Insert the prefix addr/mlen, with the associated 'value', into
+ * lookup table 'tbl'.  The key stored is 'addr' masked with a netmask
+ * built by htonl(), so 'addr' is expected in network byte order.
+ *
+ * Returns 0 on success, EINVAL for a table number or mask length out
+ * of range, ENOMEM if the entry cannot be allocated, EEXIST if the
+ * radix tree refuses the insertion (rnh_addaddr returns NULL).
+ */
+int
+ipfw_add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+    uint8_t mlen, uint32_t value)
+{
+	struct radix_node_head *rnh;
+	struct table_entry *ent;
+	struct radix_node *rn;
+
+	/* mlen > 32 would make the shift count below negative */
+	if (tbl >= IPFW_TABLES_MAX || mlen > 32)
+		return (EINVAL);
+	rnh = ch->tables[tbl];
+	ent = malloc(sizeof(*ent), M_IPFW_TBL, M_NOWAIT | M_ZERO);
+	if (ent == NULL)
+		return (ENOMEM);
+	ent->value = value;
+	KEY_LEN(ent->addr) = KEY_LEN(ent->mask) = 8;
+	/* unsigned constant: 1 << 31 overflows a signed int */
+	ent->mask.sin_addr.s_addr =
+	    htonl(mlen ? ~((1U << (32 - mlen)) - 1) : 0);
+	ent->addr.sin_addr.s_addr = addr & ent->mask.sin_addr.s_addr;
+	IPFW_WLOCK(ch);
+	rn = rnh->rnh_addaddr(&ent->addr, &ent->mask, rnh, (void *)ent);
+	IPFW_WUNLOCK(ch);
+	if (rn == NULL) {
+		/* not inserted, so the entry is still ours to release */
+		free(ent, M_IPFW_TBL);
+		return (EEXIST);
+	}
+	return (0);
+}
+
+/*
+ * Remove the prefix addr/mlen from lookup table 'tbl' and free the
+ * entry.  'addr' is masked with a netmask built by htonl(), so it is
+ * expected in network byte order.
+ *
+ * Returns 0 on success, EINVAL for a table number or mask length out
+ * of range, ESRCH if the radix tree has no such entry.
+ */
+int
+ipfw_del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+    uint8_t mlen)
+{
+	struct radix_node_head *rnh;
+	struct table_entry *ent;
+	struct sockaddr_in sa, mask;
+
+	/* mlen > 32 would make the shift count below negative */
+	if (tbl >= IPFW_TABLES_MAX || mlen > 32)
+		return (EINVAL);
+	rnh = ch->tables[tbl];
+	KEY_LEN(sa) = KEY_LEN(mask) = 8;
+	/* unsigned constant: 1 << 31 overflows a signed int */
+	mask.sin_addr.s_addr = htonl(mlen ? ~((1U << (32 - mlen)) - 1) : 0);
+	sa.sin_addr.s_addr = addr & mask.sin_addr.s_addr;
+	IPFW_WLOCK(ch);
+	ent = (struct table_entry *)rnh->rnh_deladdr(&sa, &mask, rnh);
+	IPFW_WUNLOCK(ch);
+	if (ent == NULL)
+		return (ESRCH);
+	/* the entry is unlinked, it can be released outside the lock */
+	free(ent, M_IPFW_TBL);
+	return (0);
+}
+
+/*
+ * Tree-walk callback used by ipfw_flush_table(): unlink the node from
+ * the radix head passed in 'arg' and release the table entry that
+ * rnh_deladdr hands back.  Always returns 0.
+ */
+static int
+flush_table_entry(struct radix_node *rn, void *arg)
+{
+	struct radix_node_head * const head = arg;
+	void *victim;
+
+	victim = head->rnh_deladdr(rn->rn_key, rn->rn_mask, head);
+	if (victim == NULL)
+		return (0);
+	free((struct table_entry *)victim, M_IPFW_TBL);
+	return (0);
+}
+
+/*
+ * Remove and free every entry of table 'tbl'.  The write lock on the
+ * chain is asserted on entry.  Returns EINVAL for a table number out
+ * of range, 0 otherwise.
+ */
+int
+ipfw_flush_table(struct ip_fw_chain *ch, uint16_t tbl)
+{
+	struct radix_node_head *head;
+
+	IPFW_WLOCK_ASSERT(ch);
+
+	if (tbl < IPFW_TABLES_MAX) {
+		head = ch->tables[tbl];
+		KASSERT(head != NULL, ("NULL IPFW table"));
+		head->rnh_walktree(head, flush_table_entry, head);
+		return (0);
+	}
+	return (EINVAL);
+}
+
+/*
+ * Empty every lookup table and release its radix head.  The write
+ * lock on the chain is asserted on entry.
+ */
+void
+ipfw_destroy_tables(struct ip_fw_chain *ch)
+{
+	uint16_t tbl;
+
+	IPFW_WLOCK_ASSERT(ch);
+
+	for (tbl = 0; tbl < IPFW_TABLES_MAX; tbl++) {
+		ipfw_flush_table(ch, tbl);
+		/*
+		 * Pass the address of the slot itself, not of a local
+		 * copy, so that whatever rn_detachhead() stores through
+		 * its argument lands in ch->tables[] and the chain is
+		 * not left holding a pointer to a released head.
+		 */
+		rn_detachhead((void **)&ch->tables[tbl]);
+	}
+}
+
+/*
+ * Create the radix head of every lookup table of the chain.
+ * Returns 0 on success.  If one head cannot be created, the heads
+ * created so far are released and ENOMEM is returned.
+ */
+int
+ipfw_init_tables(struct ip_fw_chain *ch)
+{
+	int i;
+
+	for (i = 0; i < IPFW_TABLES_MAX; i++) {
+		if (rn_inithead((void **)&ch->tables[i], KEY_OFS))
+			continue;
+		/*
+		 * Undo the partial setup.  The tables created so far are
+		 * still empty, so there is nothing to flush (and
+		 * ipfw_flush_table() asserts a write lock that is not
+		 * taken here); what must be released are the heads.
+		 */
+		while (i-- > 0)
+			rn_detachhead((void **)&ch->tables[i]);
+		return (ENOMEM);
+	}
+	return (0);
+}
+
+/*
+ * Look up 'addr' in table 'tbl'.  On a match the value of the entry is
+ * stored in *val and 1 is returned; 0 is returned when there is no
+ * match or the table number is out of range (*val is left untouched).
+ */
+int
+ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+    uint32_t *val)
+{
+	struct sockaddr_in key;
+	struct table_entry *hit;
+	struct radix_node_head *head;
+
+	if (tbl >= IPFW_TABLES_MAX)
+		return (0);
+	head = ch->tables[tbl];
+	KEY_LEN(key) = 8;
+	key.sin_addr.s_addr = addr;
+	hit = (struct table_entry *)(head->rnh_lookup(&key, NULL, head));
+	if (hit == NULL)
+		return (0);
+	*val = hit->value;
+	return (1);
+}
+
+/*
+ * Tree-walk callback used by ipfw_count_table(): bump the 32-bit
+ * counter that 'arg' points to.  The node itself is not examined.
+ */
+static int
+count_table_entry(struct radix_node *rn, void *arg)
+{
+	u_int32_t * const counter = arg;
+
+	*counter += 1;
+	return (0);
+}
+
+/*
+ * Store in *cnt the number of entries of table 'tbl'.
+ * Returns EINVAL (leaving *cnt untouched) for a table number out of
+ * range, 0 otherwise.
+ */
+int
+ipfw_count_table(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt)
+{
+	struct radix_node_head *head;
+
+	if (tbl < IPFW_TABLES_MAX) {
+		head = ch->tables[tbl];
+		*cnt = 0;
+		head->rnh_walktree(head, count_table_entry, cnt);
+		return (0);
+	}
+	return (EINVAL);
+}
+
+/*
+ * Tree-walk callback used by ipfw_dump_table(): append the entry behind
+ * 'rn' to the ipfw_table that 'arg' points to.
+ * Returns 1 without storing anything once tbl->cnt has reached
+ * tbl->size (presumably this stops the walk -- confirm against the
+ * radix code), 0 otherwise.
+ */
+static int
+dump_table_entry(struct radix_node *rn, void *arg)
+{
+	ipfw_table * const out = arg;
+	struct table_entry * const src = (struct table_entry *)rn;
+	ipfw_table_entry *slot;
+
+	if (out->cnt == out->size)
+		return (1);
+	slot = &out->ent[out->cnt];
+	slot->tbl = out->tbl;
+	/*
+	 * Mask length = 32 minus the number of trailing zero bits of
+	 * the host-order mask; ffs() is 1-based, hence the 33.
+	 */
+	slot->masklen = in_nullhost(src->mask.sin_addr) ? 0 :
+	    33 - ffs(ntohl(src->mask.sin_addr.s_addr));
+	slot->addr = src->addr.sin_addr.s_addr;
+	slot->value = src->value;
+	out->cnt++;
+	return (0);
+}
+
+/*
+ * Copy the entries of table tbl->tbl into tbl->ent[], counting them in
+ * tbl->cnt; dump_table_entry() stops storing once tbl->size entries
+ * have been written.
+ * Returns EINVAL for a table number out of range, 0 otherwise.
+ */
+int
+ipfw_dump_table(struct ip_fw_chain *ch, ipfw_table *tbl)
+{
+	struct radix_node_head *head;
+
+	if (tbl->tbl < IPFW_TABLES_MAX) {
+		head = ch->tables[tbl->tbl];
+		tbl->cnt = 0;
+		head->rnh_walktree(head, dump_table_entry, tbl);
+		return (0);
+	}
+	return (EINVAL);
+}
+/* end of file */
diff --git a/sys/netinet/ipfw/test/Makefile b/sys/netinet/ipfw/test/Makefile
new file mode 100644
index 0000000..bbeb942
--- /dev/null
+++ b/sys/netinet/ipfw/test/Makefile
@@ -0,0 +1,50 @@
+#
+# $FreeBSD$
+#
+# Makefile for building userland tests
+# this is written in a form compatible with gmake
+
+SCHED_SRCS = test_dn_sched.c
+SCHED_SRCS += dn_sched_fifo.c
+SCHED_SRCS += dn_sched_wf2q.c
+SCHED_SRCS += dn_sched_qfq.c
+SCHED_SRCS += dn_sched_rr.c
+SCHED_SRCS += dn_heap.c
+SCHED_SRCS += main.c
+
+SCHED_OBJS=$(SCHED_SRCS:.c=.o)
+
+HEAP_SRCS = dn_heap.c test_dn_heap.c
+HEAP_OBJS=$(HEAP_SRCS:.c=.o)
+
+VPATH= .:..
+
+CFLAGS = -I.. -I. -Wall -Werror -O3 -DIPFW
+TARGETS= test_sched # no test_heap by default
+
+all: $(TARGETS)
+
+test_heap : $(HEAP_OBJS)
+ $(CC) -o $@ $(HEAP_OBJS)
+
+test_sched : $(SCHED_OBJS)
+ $(CC) -o $@ $(SCHED_OBJS)
+
+$(SCHED_OBJS): dn_test.h
+main.o: mylist.h
+
+clean:
+ - rm *.o $(TARGETS) *.core
+
+ALLSRCS = $(SCHED_SRCS) dn_test.h mylist.h \
+ dn_sched.h dn_heap.h ip_dn_private.h Makefile
+TMPBASE = /tmp/testXYZ
+TMPDIR = $(TMPBASE)/test
+
+tgz:
+ -rm -rf $(TMPDIR)
+ mkdir -p $(TMPDIR)
+ -cp -p $(ALLSRCS) $(TMPDIR)
+ -(cd ..; cp -p $(ALLSRCS) $(TMPDIR))
+ ls -la $(TMPDIR)
+ (cd $(TMPBASE); tar cvzf /tmp/test.tgz test)
diff --git a/sys/netinet/ipfw/test/dn_test.h b/sys/netinet/ipfw/test/dn_test.h
new file mode 100644
index 0000000..4e079bc
--- /dev/null
+++ b/sys/netinet/ipfw/test/dn_test.h
@@ -0,0 +1,175 @@
+/*
+ * $FreeBSD$
+ *
+ * userspace compatibility code for dummynet schedulers
+ */
+
+#ifndef _DN_TEST_H
+#define _DN_TEST_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h> /* bzero, ffs, ... */
+#include <string.h> /* strcmp */
+#include <errno.h>
+#include <sys/queue.h>
+#include <sys/time.h>
+
+extern int debug;
+#define ND(fmt, args...) do {} while (0)
+#define D1(fmt, args...) do {} while (0)
+#define D(fmt, args...) fprintf(stderr, "%-8s " fmt "\n", \
+ __FUNCTION__, ## args)
+#define DX(lev, fmt, args...) do { \
+ if (debug > lev) D(fmt, ## args); } while (0)
+
+
+#ifndef offsetof
+#define offsetof(t,m) (int)((&((t *)0L)->m))
+#endif
+
+#include <mylist.h>
+
+/* prevent include of other system headers */
+#define _NETINET_IP_VAR_H_ /* ip_fw_args */
+#define _IPFW2_H
+#define _SYS_MBUF_H_
+
+enum {
+ DN_QUEUE,
+};
+
+enum {
+ DN_SCHED_FIFO,
+ DN_SCHED_WF2QP,
+};
+
+struct dn_id {
+ int type, subtype, len, id;
+};
+
+struct dn_fs {
+ int par[4]; /* flowset parameters */
+
+ /* simulation entries.
+ * 'index' is not strictly necessary
+ * y is used for the inverse mapping ,
+ */
+ int index;
+ int y; /* inverse mapping */
+ int base_y; /* inverse mapping */
+ int next_y; /* inverse mapping */
+ int n_flows;
+ int first_flow;
+ int next_flow; /* first_flow + n_flows */
+ /*
+ * when generating, let 'cur' go from 0 to n_flows-1,
+ * then point to flow first_flow + cur
+ */
+ int cur;
+};
+
+struct dn_sch {
+};
+
+struct dn_flow {
+ struct dn_id oid;
+ int length;
+ int len_bytes;
+ int drops;
+ uint64_t tot_bytes;
+ uint32_t flow_id;
+ struct list_head h; /* used by the generator */
+};
+
+struct dn_link {
+};
+
+struct ip_fw_args {
+};
+
+struct mbuf {
+ struct {
+ int len;
+ } m_pkthdr;
+ struct mbuf *m_nextpkt;
+ int flow_id; /* for testing, index of a flow */
+ //int flowset_id; /* for testing, index of a flowset */
+ void *cfg; /* config args */
+};
+
+#define MALLOC_DECLARE(x)
+/*
+ * Userland stand-in for the kernel KASSERT: when 'x' is false, print
+ * the parenthesized printf argument list 'y' and terminate with a
+ * failure status.  The braces matter: without them exit() would run
+ * on every KASSERT, whether or not the condition holds.
+ */
+#define KASSERT(x, y)	do { if (!(x)) { printf y ; exit(1); } } while (0)
+struct ipfw_flow_id {
+};
+
+typedef void * module_t;
+
+struct _md_t {
+ const char *name;
+ int (*f)(module_t, int, void *);
+ void *p;
+};
+
+typedef struct _md_t moduledata_t;
+
+#define DECLARE_MODULE(name, b, c, d) \
+ moduledata_t *_g_##name = & b
+#define MODULE_DEPEND(a, b, c, d, e)
+
+#ifdef IPFW
+#include <dn_heap.h>
+#include <ip_dn_private.h>
+#include <dn_sched.h>
+#else
+struct dn_queue {
+ struct dn_fsk *fs; /* parent flowset. */
+ struct dn_sch_inst *_si; /* parent sched instance. */
+};
+struct dn_schk {
+};
+struct dn_fsk {
+ struct dn_fs fs;
+ struct dn_schk *sched;
+};
+struct dn_sch_inst {
+ struct dn_schk *sched;
+};
+struct dn_alg {
+ int type;
+ const char *name;
+ void *enqueue, *dequeue;
+ int q_datalen, si_datalen, schk_datalen;
+ int (*config)(struct dn_schk *);
+ int (*new_sched)(struct dn_sch_inst *);
+ int (*new_fsk)(struct dn_fsk *);
+ int (*new_queue)(struct dn_queue *q);
+};
+
+#endif
+
+#ifndef __FreeBSD__
+int fls(int);
+#endif
+
+/*
+ * Append mbuf 'm' at the tail of queue 'q' and terminate the chain
+ * after it.
+ */
+static inline void
+mq_append(struct mq *q, struct mbuf *m)
+{
+	if (q->head != NULL)
+		q->tail->m_nextpkt = m;	/* link after the current tail */
+	else
+		q->head = m;		/* first packet of the queue */
+	q->tail = m;
+	m->m_nextpkt = NULL;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _DN_TEST_H */
diff --git a/sys/netinet/ipfw/test/main.c b/sys/netinet/ipfw/test/main.c
new file mode 100644
index 0000000..be9fdf5
--- /dev/null
+++ b/sys/netinet/ipfw/test/main.c
@@ -0,0 +1,636 @@
+/*
+ * $FreeBSD$
+ *
+ * Testing program for schedulers
+ *
+ * The framework include a simple controller which, at each
+ * iteration, decides whether we can enqueue and/or dequeue.
+ * Then the mainloop runs the required number of tests,
+ * keeping track of statistics.
+ */
+
+#include "dn_test.h"
+
+struct q_list {
+ struct list_head h;
+};
+
+struct cfg_s {
+ int ac;
+ char * const *av;
+
+ const char *name;
+ int loops;
+ struct timeval time;
+
+ /* running counters */
+ uint32_t _enqueue;
+ uint32_t drop;
+ uint32_t pending;
+ uint32_t dequeue;
+
+ /* generator parameters */
+ int th_min, th_max;
+ int maxburst;
+ int lmin, lmax; /* packet len */
+ int flows; /* number of flows */
+ int flowsets; /* number of flowsets */
+ int wsum; /* sum of weights of all flows */
+ int max_y; /* max random number in the generation */
+ int cur_y, cur_fs; /* used in generation, between 0 and max_y - 1 */
+ const char *fs_config; /* flowset config */
+ int can_dequeue;
+ int burst; /* count of packets sent in a burst */
+ struct mbuf *tosend; /* packet to send -- also flag to enqueue */
+
+ struct mbuf *freelist;
+
+ struct mbuf *head, *tail; /* a simple tailq */
+
+ /* scheduler hooks */
+ int (*enq)(struct dn_sch_inst *, struct dn_queue *,
+ struct mbuf *);
+ struct mbuf * (*deq)(struct dn_sch_inst *);
+ /* size of the three fields including sched-specific areas */
+ int schk_len;
+ int q_len; /* size of a queue including sched-fields */
+ int si_len; /* size of a sch_inst including sched-fields */
+ char *q; /* array of flow queues */
+ /* use a char* because size is variable */
+ struct dn_fsk *fs; /* array of flowsets */
+ struct dn_sch_inst *si;
+ struct dn_schk *sched;
+
+ /* generator state */
+ int state; /* 0 = going up, 1: going down */
+
+ /*
+ * We keep lists for each backlog level, and always serve
+ * the one with shortest backlog. llmask contains a bitmap
+ * of lists, and ll are the heads of the lists. The last
+ * entry (BACKLOG) contains all entries considered 'full'
+ * XXX to optimize things, entry i could contain queues with
+ * 2^{i-1}+1 .. 2^i entries.
+ */
+#define BACKLOG 30
+ uint32_t llmask;
+ struct list_head ll[BACKLOG + 10];
+};
+
+/* FI2Q and Q2FI convert between a flow_id and the corresponding dn_queue.
+ * We cannot use plain array indexing because the size of each element
+ * (q_len) is only known at run time.
+ */
+#define FI2Q(c, i) ((struct dn_queue *)((c)->q + (c)->q_len * (i)))
+#define Q2FI(c, q) (((char *)(q) - (c)->q)/(c)->q_len)
+
+int debug = 0;
+
+struct dn_parms dn_cfg;
+
+static void controller(struct cfg_s *c);
+
+/*
+ * Release a packet: move the queue it belongs to into the backlog
+ * bucket matching its current length, and push the mbuf on the
+ * freelist.  Always returns 0.
+ */
+int
+drop(struct cfg_s *c, struct mbuf *m)
+{
+	struct dn_queue *q = FI2Q(c, m->flow_id);
+	int backlog;
+
+	c->drop++;
+	backlog = q->ni.length;	// XXX or ffs...
+
+	ND("q %p id %d current length %d", q, m->flow_id, backlog);
+	if (backlog < BACKLOG) {
+		struct list_head *node = &q->ni.h;
+
+		/* bucket backlog+1 is marked empty, bucket backlog in use */
+		c->llmask = (c->llmask & ~(1<<(backlog+1))) | (1<<(backlog));
+		list_del(node);
+		list_add_tail(node, &c->ll[backlog]);
+	}
+	/* recycle the mbuf */
+	m->m_nextpkt = c->freelist;
+	c->freelist = m;
+	return 0;
+}
+
+/*
+ * Enqueue a packet.  With a scheduler configured the result of its
+ * enqueue hook is returned (mainloop treats non-zero as a drop);
+ * otherwise the packet is appended to the built-in tail queue and 0
+ * is returned.
+ */
+static int
+enqueue(struct cfg_s *c, void *_m)
+{
+	struct mbuf *pkt = _m;
+
+	if (c->enq != NULL)
+		return c->enq(c->si, FI2Q(c, pkt->flow_id), pkt);
+
+	if (c->head != NULL)
+		c->tail->m_nextpkt = pkt;
+	else
+		c->head = pkt;
+	c->tail = pkt;
+	return 0; /* default - success */
+}
+
+/*
+ * Dequeue a packet: returns non-NULL when one is available.
+ * With a scheduler configured its dequeue hook is used, otherwise the
+ * head of the built-in tail queue is detached and returned.
+ */
+static void *
+dequeue(struct cfg_s *c)
+{
+	struct mbuf *pkt;
+
+	if (c->deq != NULL)
+		return c->deq(c->si);
+
+	pkt = c->head;
+	if (pkt == NULL)
+		return NULL;
+	c->head = pkt->m_nextpkt;
+	pkt->m_nextpkt = NULL;
+	return pkt;
+}
+
+/*
+ * Run c->loops iterations.  In each one the controller decides whether
+ * a packet must be sent (c->tosend) and/or one may be received
+ * (c->can_dequeue); the counters in 'c' are updated accordingly.
+ * Always returns 0.
+ */
+static int
+mainloop(struct cfg_s *c)
+{
+	int iter;
+	struct mbuf *pkt;
+
+	for (iter = 0; iter < c->loops; iter++) {
+		/* the controller implements the hysteresis */
+		controller(c);
+		DX(3, "loop %d enq %d send %p rx %d",
+			iter, c->_enqueue, c->tosend, c->can_dequeue);
+
+		pkt = c->tosend;
+		if (pkt != NULL) {
+			c->_enqueue++;
+			if (enqueue(c, pkt) == 0) {
+				ND("enqueue ok");
+				c->pending++;
+			} else {
+				drop(c, pkt);
+				ND("loop %d enqueue fail", iter );
+			}
+		}
+
+		if (!c->can_dequeue)
+			continue;
+		c->dequeue++;
+		pkt = dequeue(c);
+		if (pkt != NULL) {
+			c->pending--;
+			/* drop() recycles the mbuf but also counts a drop */
+			drop(c, pkt);
+			c->drop--; /* compensate */
+		}
+	}
+	DX(1, "mainloop ends %d", iter);
+	return 0;
+}
+
+/*
+ * Print (at debug level above 1) the total byte count of every flow
+ * queue, followed by the number of loops run.  Always returns 0.
+ */
+int
+dump(struct cfg_s *c)
+{
+	int i;
+	struct dn_queue *q;
+
+	for (i=0; i < c->flows; i++) {
+		q = FI2Q(c, i);
+		/*
+		 * tot_bytes is a uint64_t, which is not 'long long' on
+		 * every platform: cast to match the %lld conversion.
+		 */
+		DX(1, "queue %4d tot %10lld", i, (long long)q->ni.tot_bytes);
+	}
+	DX(1, "done %d loops\n", c->loops);
+	return 0;
+}
+
+/*
+ * Interpret a number in human form.
+ *
+ * 's' is parsed with strtol() (base auto-detected).  A NULL or empty
+ * string, or a negative number, is invalid and yields 0.
+ * One character after the digits is examined:
+ *	n	negate the value (callers read it as "multiply by n")
+ *	K, M	multiply by 1000, 1000000
+ *	k, m	multiply by 1024, 1024*1024
+ *	w	accepted, no effect
+ * Any other character is reported and left unconsumed.
+ * If 'next' is not NULL it is set to the first unconsumed character
+ * when there is one, to NULL otherwise.  'key' is only used in the
+ * diagnostics.
+ */
+static long
+getnum(const char *s, char **next, const char *key)
+{
+	char *tail = NULL;
+	long val = -1;
+
+	if (next)	/* default */
+		*next = NULL;
+	if (s == NULL || *s == '\0') {
+		DX(3, "empty string");
+	} else {
+		DX(3, "token is <%s> %s", s, key ? key : "-");
+		val = strtol(s, &tail, 0);
+	}
+	if (val < 0) {
+		DX(2, "invalid %s for %s", s ? s : "NULL", (key ? key : "") );
+		return 0;	// invalid
+	}
+	if (tail == NULL || *tail == '\0')
+		return val;
+
+	switch (*tail) {
+	case 'n':
+		val = -val;	/* multiply by n */
+		break;
+	case 'K':
+		val = val*1000;
+		break;
+	case 'M':
+		val = val*1000000;
+		break;
+	case 'k':
+		val = val*1024;
+		break;
+	case 'm':
+		val = val*1024*1024;
+		break;
+	case 'w':
+		break;
+	default:	/* not recognized */
+		D("suffix %s for %s, next %p", tail, key, next);
+		tail--;	/* cancels the increment below */
+		break;
+	}
+	tail++;
+	DX(3, "suffix now %s for %s, next %p", tail, key, next);
+	if (next && *tail) {
+		DX(3, "setting next to %s for %s", tail, key);
+		*next = tail;
+	}
+	return val;
+}
+
+/*
+ * flowsets are a comma-separated list of
+ *	weight:maxlen:flows
+ * indicating how many flows are hooked to that fs.
+ * Both weight and maxlen can be given as min-max-steps.
+ *
+ * Pass 0 records 'fs' in c->fs_config and only counts flowsets and
+ * flows; any other pass re-parses c->fs_config, fills c->fs[] and links
+ * every queue to its flowset (c->fs and c->q must be allocated by
+ * then).  On a parse error the function returns without updating the
+ * totals in 'c'.
+ */
+static void
+parse_flowsets(struct cfg_s *c, const char *fs, int pass)
+{
+	char *s, *cur, *next;
+	int n_flows = 0, n_fs = 0, wsum = 0;
+	int i, j;
+	struct dn_fs *prev = NULL;
+
+	DX(3, "--- pass %d flows %d flowsets %d", pass, c->flows, c->flowsets);
+	if (pass == 0)
+		c->fs_config = fs;
+	/* strsep() modifies its input, so work on a private copy */
+	s = c->fs_config ? strdup(c->fs_config) : NULL;
+	if (s == NULL) {
+		if (pass == 0)
+			D("no fsconfig");
+		return;
+	}
+	for (next = s; (cur = strsep(&next, ","));) {
+		char *p = NULL;
+		int w, w_h, w_steps, wi;
+		int len, len_h, l_steps, li;
+		int flows;
+
+		w = getnum(strsep(&cur, ":"), &p, "weight");
+		if (w <= 0)
+			w = 1;
+		w_h = p ? getnum(p+1, &p, "weight_max") : w;
+		w_steps = p ? getnum(p+1, &p, "w_steps") : (w_h == w ?1:2);
+		len = getnum(strsep(&cur, ":"), &p, "len");
+		if (len <= 0)
+			len = 1000;
+		len_h = p ? getnum(p+1, &p, "len_max") : len;
+		l_steps = p ? getnum(p+1, &p, "l_steps") : (len_h == len ? 1 : 2);
+		flows = getnum(strsep(&cur, ":"), NULL, "flows");
+		if (flows == 0)
+			flows = 1;
+		DX(4, "weight %d..%d (%d) len %d..%d (%d) flows %d",
+		    w, w_h, w_steps, len, len_h, l_steps, flows);
+		if (w == 0 || w_h < w || len == 0 || len_h < len ||
+		    flows == 0) {
+			/*
+			 * Report the string actually being parsed; the
+			 * 'fs' argument is only meaningful in pass 0.
+			 */
+			DX(4,"wrong parameters %s", c->fs_config);
+			goto out;
+		}
+		n_flows += flows * w_steps * l_steps;
+		for (i = 0; i < w_steps; i++) {
+			wi = w + ((w_h - w)* i)/(w_steps == 1 ? 1 : (w_steps-1));
+			for (j = 0; j < l_steps; j++, n_fs++) {
+				struct dn_fs *dfs;
+				int x;
+
+				li = len + ((len_h - len)* j)/(l_steps == 1 ? 1 : (l_steps-1));
+				x = (wi*2048)/li;
+				DX(3, "----- fs %4d weight %4d lmax %4d X %4d flows %d",
+				    n_fs, wi, li, x, flows);
+				if (pass == 0)
+					continue;
+				if (c->fs == NULL || c->flowsets <= n_fs) {
+					D("error in number of flowsets");
+					goto out;
+				}
+				/* only now is c->fs[n_fs] known to exist */
+				dfs = &c->fs[n_fs].fs;
+				wsum += wi * flows;
+				dfs->par[0] = wi;
+				dfs->par[1] = li;
+				dfs->index = n_fs;
+				dfs->n_flows = flows;
+				dfs->cur = dfs->first_flow = prev==NULL ? 0 : prev->next_flow;
+				dfs->next_flow = dfs->first_flow + dfs->n_flows;
+				dfs->y = x * flows;
+				dfs->base_y = (prev == NULL) ? 0 : prev->next_y;
+				dfs->next_y = dfs->base_y + dfs->y;
+				prev = dfs;
+			}
+		}
+	}
+	c->max_y = prev ? prev->base_y + prev->y : 0;
+	c->flows = n_flows;
+	c->flowsets = n_fs;
+	c->wsum = wsum;
+	if (pass == 0)
+		goto out;
+
+	/* now link all flows to their parent flowsets */
+	DX(1,"%d flows on %d flowsets max_y %d", c->flows, c->flowsets, c->max_y);
+	for (i=0; i < c->flowsets; i++) {
+		struct dn_fs *dfs = &c->fs[i].fs;
+
+		DX(1, "fs %3d w %5d l %4d flow %5d .. %5d y %6d .. %6d",
+		    i, dfs->par[0], dfs->par[1],
+		    dfs->first_flow, dfs->next_flow,
+		    dfs->base_y, dfs->next_y);
+		for (j = dfs->first_flow; j < dfs->next_flow; j++) {
+			struct dn_queue *q = FI2Q(c, j);
+			q->fs = &c->fs[i];
+		}
+	}
+out:
+	/* release the private copy on every exit path */
+	free(s);
+}
+
+/*
+ * Set up the test from the command line stored in c->ac / c->av:
+ * parse the options (always "-name value" pairs), apply defaults and
+ * sanity limits, pick the scheduler, allocate queues, flowsets and the
+ * scheduler instance, and put every flow in the backlog-0 list.
+ * Exits the program if memory cannot be allocated; returns 0 otherwise.
+ */
+static int
+init(struct cfg_s *c)
+{
+	int i;
+	int ac = c->ac;
+	char * const *av = c->av;
+	moduledata_t *mod = NULL;
+	struct dn_alg *p = NULL;
+
+	c->si_len = sizeof(struct dn_sch_inst);
+	c->q_len = sizeof(struct dn_queue);
+
+	c->th_min = 0;
+	c->th_max = -20;/* 20 packets per flow */
+	c->lmin = c->lmax = 1280;	/* packet len */
+	c->flows = 1;
+	c->flowsets = 1;
+	c->name = "null";
+	ac--; av++;
+	while (ac > 1) {
+		if (!strcmp(*av, "-n")) {
+			c->loops = getnum(av[1], NULL, av[0]);
+		} else if (!strcmp(*av, "-d")) {
+			debug = atoi(av[1]);
+		} else if (!strcmp(*av, "-alg")) {
+			extern moduledata_t *_g_dn_fifo;
+			extern moduledata_t *_g_dn_wf2qp;
+			extern moduledata_t *_g_dn_rr;
+			extern moduledata_t *_g_dn_qfq;
+#ifdef WITH_KPS
+			extern moduledata_t *_g_dn_kps;
+#endif
+			if (!strcmp(av[1], "rr"))
+				mod = _g_dn_rr;
+			else if (!strcmp(av[1], "wf2qp"))
+				mod = _g_dn_wf2qp;
+			else if (!strcmp(av[1], "fifo"))
+				mod = _g_dn_fifo;
+			else if (!strcmp(av[1], "qfq"))
+				mod = _g_dn_qfq;
+#ifdef WITH_KPS
+			else if (!strcmp(av[1], "kps"))
+				mod = _g_dn_kps;
+#endif
+			else
+				mod = NULL;
+			c->name = mod ? mod->name : "NULL";
+			DX(3, "using scheduler %s", c->name);
+		} else if (!strcmp(*av, "-len")) {
+			c->lmin = getnum(av[1], NULL, av[0]);
+			c->lmax = c->lmin;
+			DX(3, "setting len to %d", c->lmin);
+		} else if (!strcmp(*av, "-burst")) {
+			c->maxburst = getnum(av[1], NULL, av[0]);
+			DX(3, "setting burst to %d", c->maxburst);
+		} else if (!strcmp(*av, "-qmax")) {
+			c->th_max = getnum(av[1], NULL, av[0]);
+			DX(3, "setting max to %d", c->th_max);
+		} else if (!strcmp(*av, "-qmin")) {
+			c->th_min = getnum(av[1], NULL, av[0]);
+			DX(3, "setting min to %d", c->th_min);
+		} else if (!strcmp(*av, "-flows")) {
+			c->flows = getnum(av[1], NULL, av[0]);
+			DX(3, "setting flows to %d", c->flows);
+		} else if (!strcmp(*av, "-flowsets")) {
+			parse_flowsets(c, av[1], 0);
+			DX(3, "setting flowsets to %d", c->flowsets);
+		} else {
+			D("option %s not recognised, ignore", *av);
+		}
+		ac -= 2; av += 2;
+	}
+	if (c->maxburst <= 0)
+		c->maxburst = 1;
+	if (c->loops <= 0)
+		c->loops = 1;
+	if (c->flows <= 0)
+		c->flows = 1;
+	if (c->flowsets <= 0)
+		c->flowsets = 1;
+	if (c->lmin <= 0)
+		c->lmin = 1;
+	if (c->lmax <= 0)
+		c->lmax = 1;
+	/* negative thresholds are per-flow: multiply by N */
+	if (c->th_min < 0)
+		c->th_min = c->flows * -c->th_min;
+	if (c->th_max < 0)
+		c->th_max = c->flows * -c->th_max;
+	if (c->th_max <= c->th_min)
+		c->th_max = c->th_min + 1;
+	if (mod) {
+		p = mod->p;
+		DX(3, "using module %s f %p p %p", mod->name, mod->f, mod->p);
+		DX(3, "modname %s ty %d", p->name, p->type);
+		c->enq = p->enqueue;
+		c->deq = p->dequeue;
+		c->si_len += p->si_datalen;
+		c->q_len += p->q_datalen;
+		c->schk_len += p->schk_datalen;
+	}
+	/* allocate queues, flowsets and one scheduler */
+	c->q = calloc(c->flows, c->q_len);
+	c->fs = calloc(c->flowsets, sizeof(struct dn_fsk));
+	c->si = calloc(1, c->si_len);
+	c->sched = calloc(c->flows, c->schk_len);
+	/*
+	 * c->si is dereferenced right below, so it must be checked too.
+	 * c->sched is not: schk_len is 0 without a scheduler, and
+	 * calloc() may then legitimately return NULL.
+	 */
+	if (c->q == NULL || c->fs == NULL || c->si == NULL) {
+		D("error allocating memory for flows");
+		exit(1);
+	}
+	c->si->sched = c->sched;
+	if (p) {
+		if (p->config)
+			p->config(c->sched);
+		if (p->new_sched)
+			p->new_sched(c->si);
+	}
+	/*
+	 * parse_flowsets links queues to their flowsets.  The second
+	 * pass works on the string saved in c->fs_config; 'av' has been
+	 * advanced by the option loop, so av[1] must not be read here.
+	 */
+	parse_flowsets(c, c->fs_config, 1);
+	/* complete the work calling new_fsk */
+	for (i = 0; i < c->flowsets; i++) {
+		if (c->fs[i].fs.par[1] == 0)
+			c->fs[i].fs.par[1] = 1000;	/* default pkt len */
+		c->fs[i].sched = c->sched;
+		if (p && p->new_fsk)
+			p->new_fsk(&c->fs[i]);
+	}
+
+	/* initialize the lists for the generator, and put
+	 * all flows in the list for backlog = 0
+	 */
+	for (i=0; i <= BACKLOG+5; i++)
+		INIT_LIST_HEAD(&c->ll[i]);
+
+	for (i = 0; i < c->flows; i++) {
+		struct dn_queue *q = FI2Q(c, i);
+		if (q->fs == NULL)
+			q->fs = &c->fs[0]; /* XXX */
+		q->_si = c->si;
+		if (p && p->new_queue)
+			p->new_queue(q);
+		INIT_LIST_HEAD(&q->ni.h);
+		list_add_tail(&q->ni.h, &c->ll[0]);
+	}
+	c->llmask = 1;
+	return 0;
+}
+
+
+/*
+ * Entry point: configure the test from the command line, run the main
+ * loop and report the elapsed time, the average time per enqueue (in
+ * nanoseconds) and the counters.
+ */
+int
+main(int ac, char *av[])
+{
+	struct cfg_s c;
+	struct timeval end;
+	double ll;
+	int i;
+	char msg[40];
+
+	bzero(&c, sizeof(c));
+	c.ac = ac;
+	c.av = av;
+	init(&c);
+	gettimeofday(&c.time, NULL);
+	mainloop(&c);
+	gettimeofday(&end, NULL);
+	/* elapsed = end - start, borrowing one second if needed */
+	end.tv_sec -= c.time.tv_sec;
+	end.tv_usec -= c.time.tv_usec;
+	if (end.tv_usec < 0) {
+		end.tv_usec += 1000000;
+		end.tv_sec--;
+	}
+	c.time = end;
+	/*
+	 * Convert to double before multiplying: done in integer
+	 * arithmetic, tv_sec * 1000000 overflows where time_t is 32 bits.
+	 */
+	ll = (double)end.tv_sec * 1000000 + end.tv_usec;
+	ll *= 1000;	/* convert to nanoseconds */
+	ll /= c._enqueue;
+	/* default description used when no -flowsets was given */
+	snprintf(msg, sizeof(msg), "1::%d", c.flows);
+	D("%-8s n %d %d time %d.%06d %8.3f qlen %d %d flows %s drops %d",
+		c.name, c._enqueue, c.loops,
+		(int)c.time.tv_sec, (int)c.time.tv_usec, ll,
+		c.th_min, c.th_max,
+		c.fs_config ? c.fs_config : msg, c.drop);
+	dump(&c);
+	DX(1, "done ac %d av %p", ac, av);
+	for (i=0; i < ac; i++)
+		DX(1, "arg %d %s", i, av[i]);
+	return 0;
+}
+
+/*
+ * The controller decides whether in this iteration we should send
+ * (the packet is in c->tosend) and/or receive (flag c->can_dequeue)
+ *
+ * c->state implements a hysteresis on c->pending: in state 0 a packet
+ * is generated, until pending reaches th_max; in state 1 nothing is
+ * generated and dequeueing is allowed, until pending falls to th_min.
+ * On return c->tosend is either NULL or a packet whose flow_id and
+ * length are set.
+ */
+static void
+controller(struct cfg_s *c)
+{
+ struct mbuf *m;
+ struct dn_fs *fs;
+ int flow_id;
+
+ /* hysteresis between max and min */
+ if (c->state == 0 && c->pending >= c->th_max)
+ c->state = 1;
+ else if (c->state == 1 && c->pending <= c->th_min)
+ c->state = 0;
+ /* NOTE(review): the ND() calls in this function pass a level as
+ * first argument, as DX() wants; ND() expands to nothing so this
+ * is harmless, but they need fixing before being enabled.
+ */
+ ND(1, "state %d pending %2d", c->state, c->pending);
+ c->can_dequeue = c->state;
+ c->tosend = NULL;
+ /* state 1: only dequeue in this iteration */
+ if (c->state)
+ return;
+
+ if (1) {
+ int i;
+ struct dn_queue *q;
+ struct list_head *h;
+
+ /* lowest bit set in llmask = shortest backlog list in use */
+ i = ffs(c->llmask) - 1;
+ if (i < 0) {
+ DX(2, "no candidate");
+ c->can_dequeue = 1;
+ return;
+ }
+ h = &c->ll[i];
+ ND(1, "backlog %d p %p prev %p next %p", i, h, h->prev, h->next);
+ /* take the first queue of that list ... */
+ q = list_first_entry(h, struct dn_queue, ni.h);
+ list_del(&q->ni.h);
+ flow_id = Q2FI(c, q);
+ DX(2, "extracted flow %p %d backlog %d", q, flow_id, i);
+ if (list_empty(h)) {
+ ND(2, "backlog %d empty", i);
+ c->llmask &= ~(1<<i);
+ }
+ ND(1, "before %d p %p prev %p next %p", i+1, h+1, h[1].prev, h[1].next);
+ /* ... and move it to the list for the next backlog level */
+ list_add_tail(&q->ni.h, h+1);
+ ND(1, " after %d p %p prev %p next %p", i+1, h+1, h[1].prev, h[1].next);
+ /* the bit for list i+1 is only set while i is below BACKLOG */
+ if (i < BACKLOG) {
+ ND(2, "backlog %d full", i+1);
+ c->llmask |= 1<<(1+i);
+ }
+ fs = &q->fs->fs;
+ c->cur_fs = q->fs - c->fs;
+ fs->cur = flow_id;
+ } else {
+ /* XXX this does not work ? */
+ /* now decide whom to send the packet, and the length */
+ /* lookup in the flow table */
+ if (c->cur_y >= c->max_y) { /* handle wraparound */
+ c->cur_y = 0;
+ c->cur_fs = 0;
+ }
+ fs = &c->fs[c->cur_fs].fs;
+ flow_id = fs->cur++;
+ if (fs->cur >= fs->next_flow)
+ fs->cur = fs->first_flow;
+ c->cur_y++;
+ if (c->cur_y >= fs->next_y)
+ c->cur_fs++;
+ }
+
+ /* construct a packet */
+ /* reuse an mbuf from the freelist when possible */
+ if (c->freelist) {
+ m = c->tosend = c->freelist;
+ c->freelist = c->freelist->m_nextpkt;
+ } else {
+ m = c->tosend = calloc(1, sizeof(struct mbuf));
+ }
+ /* allocation failed: c->tosend is NULL, nothing is sent */
+ if (m == NULL)
+ return;
+
+ m->cfg = c;
+ m->m_nextpkt = NULL;
+ m->m_pkthdr.len = fs->par[1]; // XXX maxlen
+ m->flow_id = flow_id;
+
+ ND(2,"y %6d flow %5d fs %3d weight %4d len %4d",
+ c->cur_y, m->flow_id, c->cur_fs,
+ fs->par[0], m->m_pkthdr.len);
+
+}
+
+/*
+Packet allocation:
+to achieve a distribution that matches weights, for each X=w/lmax class
+we should generate a number of packets proportional to Y = X times the number
+of flows in the class.
+So we construct an array with the cumulative distribution of Y's,
+and use it to identify the flow via inverse mapping (if the Y's are
+not too many we can use an array for the lookup). In practice,
+each flow will have X entries [virtually] pointing to it.
+
+*/
diff --git a/sys/netinet/ipfw/test/mylist.h b/sys/netinet/ipfw/test/mylist.h
new file mode 100644
index 0000000..6247f32
--- /dev/null
+++ b/sys/netinet/ipfw/test/mylist.h
@@ -0,0 +1,49 @@
+/*
+ * $FreeBSD$
+ *
+ * linux-like bidirectional lists
+ */
+
+#ifndef _MYLIST_H
+#define _MYLIST_H
+/* node of a circular doubly-linked list; a list head is a node too */
+struct list_head {
+	struct list_head *prev, *next;
+};
+
+/* make 'l' an empty list: both links point back to the head itself */
+#define INIT_LIST_HEAD(l) do {  (l)->prev = (l)->next = (l); } while (0)
+/*
+ * true when the head links to itself.  Both uses of the argument are
+ * parenthesized so that expressions such as 'h + 1' are accepted.
+ */
+#define list_empty(l)	( (l)->next == (l) )
+
+/* insert 'o' between the two adjacent nodes 'prev' and 'next' */
+static inline void
+__list_add(struct list_head *o, struct list_head *prev,
+	struct list_head *next)
+{
+	next->prev = o;
+	o->next = next;
+	o->prev = prev;
+	prev->next = o;
+}
+
+/* append 'o' at the end of the list, i.e. just before the head */
+static inline void
+list_add_tail(struct list_head *o, struct list_head *head)
+{
+	__list_add(o, head->prev, head);
+}
+
+/*
+ * Pointer to the object of type 'ty' that embeds, as field 'member',
+ * the first node of list 'pL'.  Not meaningful on an empty list, where
+ * the first node is the head itself.  The expansion is parenthesized
+ * as a whole so that the result can be followed by '->'.
+ */
+#define list_first_entry(pL, ty, member) \
+	((ty *)((char *)((pL)->next) - offsetof(ty, member)))
+
+/* join the two nodes 'prev' and 'next', dropping what was in between */
+static inline void
+__list_del(struct list_head *prev, struct list_head *next)
+{
+	next->prev = prev;
+	prev->next = next;
+}
+
+/*
+ * Unlink 'entry' from its list and clear its links.
+ * ND() is not defined in this header: it must be provided by the
+ * including file.
+ */
+static inline void
+list_del(struct list_head *entry)
+{
+	ND("called on %p", entry);
+	__list_del(entry->prev, entry->next);
+	entry->next = entry->prev = NULL;
+}
+
+#endif /* _MYLIST_H */
diff --git a/sys/netinet/ipfw/test/test_dn_heap.c b/sys/netinet/ipfw/test/test_dn_heap.c
new file mode 100644
index 0000000..d460cf2
--- /dev/null
+++ b/sys/netinet/ipfw/test/test_dn_heap.c
@@ -0,0 +1,162 @@
+/*-
+ * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Userland code for testing binary heaps and hash tables
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+
+#include <stdio.h>
+#include <strings.h>
+#include <stdlib.h>
+
+#include "dn_heap.h"
+/*
+ * log(x, fmt, ...): the first argument is discarded, the remaining ones
+ * are passed to fprintf(stderr, ...).
+ * panic(fmt, ...): print the message on stderr, then exit(1).
+ * The expansion of panic() is parenthesized so that the comma expression
+ * always behaves as a single unit wherever the macro is used.
+ */
+#define log(x, arg...)	fprintf(stderr, ## arg)
+#define panic(x...)	(fprintf(stderr, ## x), exit(1))
+
+#include <string.h>
+
+/*
+ * Test object stored in the hash table: a link field followed by a
+ * NUL-terminated string kept in the same allocation (see newfn()).
+ */
+struct x {
+	struct x *ht_link;	/* link field; not touched by the code in this file */
+	char buf[];		/* C99 flexible array member holding the string */
+};
+
+/*
+ * Hash callback: the hash value is simply the first character of the
+ * string. With DNHT_KEY_IS_OBJ set in flags, key is a struct x pointer
+ * and the string is its buf; otherwise key is the string itself.
+ * arg is not used.
+ */
+uint32_t hf(uintptr_t key, int flags, void *arg)
+{
+	const char *str;
+
+	if (flags & DNHT_KEY_IS_OBJ)
+		str = ((struct x *)key)->buf;
+	else
+		str = (char *)key;
+	return *str;
+}
+
+/*
+ * Match callback: returns 1 when the string stored in obj (a struct x)
+ * equals the string designated by key, 0 otherwise. With
+ * DNHT_KEY_IS_OBJ set in flags, key is a struct x pointer; otherwise
+ * it is a plain C string. arg is not used.
+ */
+int matchf(void *obj, uintptr_t key, int flags, void *arg)
+{
+	struct x *elem = obj;
+	char *wanted = (char *)key;
+
+	if (flags & DNHT_KEY_IS_OBJ)
+		wanted = ((struct x *)key)->buf;
+	return (strcmp(elem->buf, wanted) == 0);
+}
+
+/*
+ * Constructor callback: allocate a struct x large enough to hold a copy
+ * of the C string passed as key, and copy the string into it.
+ * key is always treated as a string; flags and arg are not used.
+ * Returns the new object, or NULL if malloc() fails.
+ */
+void *newfn(uintptr_t key, int flags, void *arg)
+{
+	char *src = (char *)key;
+	struct x *node;
+
+	/* header + string + terminating NUL */
+	node = malloc(sizeof(*node) + 1 + strlen(src));
+	if (node == NULL)
+		return node;
+	strcpy(node->buf, src);
+	return node;
+}
+
+/*
+ * NULL-terminated list of keys used by test_hash(). Four words share
+ * only their initial letter with other entries, and the sequence
+ * "uno" .. "sei" appears twice so that duplicate keys are exercised.
+ */
+char *strings[] = {
+	"undici",
+	"unico",
+	"doppio",
+	"devoto",
+	/* first run */
+	"uno", "due", "tre", "quattro", "cinque", "sei",
+	/* same keys again */
+	"uno", "due", "tre", "quattro", "cinque", "sei",
+	NULL,
+};
+
+/*
+ * Scan callback: print the string held by the element _x (a struct x)
+ * and return arg converted to int.
+ */
+int doprint(void *_x, void *arg)
+{
+	struct x *x = _x;
+
+	printf("found element <%s>\n", x->buf);
+	/*
+	 * Go through intptr_t: casting a pointer directly to a narrower
+	 * integer type is not portable.
+	 */
+	return (int)(intptr_t)arg;
+}
+
+/*
+ * Exercise the dn_ht hash table in two passes, printing the contents
+ * with doprint() along the way.
+ *
+ * Pass 1: the table is created with newfn as constructor and every
+ * entry of strings[] is looked up by string key with DNHT_INSERT.
+ * Pass 2: a new table is created without a constructor; objects are
+ * built by hand with newfn() and inserted with DNHT_KEY_IS_OBJ. The
+ * first object is then removed twice by object key, and the second
+ * string of the list twice by string key, printing the result of each
+ * attempt.
+ *
+ * Note: the first table is not released before h is reused.
+ */
+static void
+test_hash(void)
+{
+	char **p;
+	struct dn_ht *h;
+	uintptr_t x = 0;	/* first object inserted in pass 2 */
+	uintptr_t x1 = 0;	/* second string of the list (plain key) */
+
+	/* first, find and allocate */
+	h = dn_ht_init(NULL, 10, 0, hf, matchf, newfn);
+
+	for (p = strings; *p; p++) {
+		dn_ht_find(h, (uintptr_t)*p, DNHT_INSERT, NULL);
+	}
+	dn_ht_scan(h, doprint, 0);
+	printf("/* second -- find without allocate */\n");
+	h = dn_ht_init(NULL, 10, 0, hf, matchf, NULL);
+	for (p = strings; *p; p++) {
+		void *y = newfn((uintptr_t)*p, 0, NULL);
+
+		if (x == 0)
+			x = (uintptr_t)y;
+		else if (x1 == 0)
+			x1 = (uintptr_t)*p;
+		dn_ht_find(h, (uintptr_t)y, DNHT_INSERT | DNHT_KEY_IS_OBJ, NULL);
+	}
+	dn_ht_scan(h, doprint, 0);
+	/* remove by object, twice */
+	printf("remove %p gives %p\n", (void *)x,
+	    dn_ht_find(h, x, DNHT_KEY_IS_OBJ | DNHT_REMOVE, NULL));
+	printf("remove %p gives %p\n", (void *)x,
+	    dn_ht_find(h, x, DNHT_KEY_IS_OBJ | DNHT_REMOVE, NULL));
+	/* remove by string key, twice; report the key actually used (x1) */
+	printf("remove %p gives %p\n", (void *)x1,
+	    dn_ht_find(h, x1, DNHT_REMOVE, NULL));
+	printf("remove %p gives %p\n", (void *)x1,
+	    dn_ht_find(h, x1, DNHT_REMOVE, NULL));
+	dn_ht_scan(h, doprint, 0);
+}
+
+/*
+ * Test driver. As written it runs test_hash() and returns immediately,
+ * so the heap test that follows is never reached.
+ *
+ * Heap test arguments:
+ *   argv[1]  number of elements n (100 if missing or outside 1..1000000)
+ *   argv[2]  number of cycles n2 (1000000 if missing or not positive)
+ *   argv[3]  if non-zero, insert the keys n, n-1, ... instead of
+ *            random() values
+ * Each cycle fills the heap with n keys and then extracts them all,
+ * calling panic() if a key smaller than the previous one comes out.
+ */
+int
+main(int argc, char *argv[])
+{
+	struct dn_heap h;
+	int i, n, n2, n3;
+
+	test_hash();
+	return 0;
+
+	/* n = elements, n2 = cycles */
+	n = (argc > 1) ? atoi(argv[1]) : 0;
+	if (n <= 0 || n > 1000000)
+		n = 100;
+	n2 = (argc > 2) ? atoi(argv[2]) : 0;
+	if (n2 <= 0)
+		n2 = 1000000;	/* default for the cycle count, not for n */
+	n3 = (argc > 3) ? atoi(argv[3]) : 0;
+	bzero(&h, sizeof(h));
+	heap_init(&h, n, -1);
+	while (n2-- > 0) {
+		uint64_t prevk = 0;
+
+		for (i = 0; i < n; i++)
+			heap_insert(&h, n3 ? n - i : random(),
+			    (void *)(uintptr_t)(100 + i));
+
+		for (i = 0; h.elements > 0; i++) {
+			uint64_t k = h.p[0].key;
+
+			if (k < prevk)
+				panic("wrong sequence\n");
+			prevk = k;
+			if (0)
+				printf("%d key %llu, val %p\n",
+				    i, (unsigned long long)h.p[0].key,
+				    h.p[0].object);
+			heap_extract(&h, NULL);
+		}
+	}
+	return 0;
+}
diff --git a/sys/netinet/ipfw/test/test_dn_sched.c b/sys/netinet/ipfw/test/test_dn_sched.c
new file mode 100644
index 0000000..ee46c95
--- /dev/null
+++ b/sys/netinet/ipfw/test/test_dn_sched.c
@@ -0,0 +1,89 @@
+/*
+ * $FreeBSD$
+ *
+ * library functions for userland testing of dummynet schedulers
+ */
+
+#include "dn_test.h"
+
+/*
+ * Test replacement for m_freem(): it only reports the address of the
+ * mbuf on stdout; nothing is released here.
+ */
+void
+m_freem(struct mbuf *m)
+{
+	/* %p requires a pointer to void */
+	printf("free %p\n", (void *)m);
+}
+
+/*
+ * Module event handler stub for the test build: every argument is
+ * ignored and 0 is always returned.
+ */
+int
+dn_sched_modevent(module_t mod, int cmd, void *arg)
+{
+	(void)mod;
+	(void)cmd;
+	(void)arg;
+	return (0);
+}
+
+/*
+ * Walk the chain of packets linked through m_nextpkt, starting at m,
+ * and hand each one to m_freem(). The next pointer is read before the
+ * current packet is passed on.
+ */
+void
+dn_free_pkts(struct mbuf *m)
+{
+	struct mbuf *following;
+
+	for (; m; m = following) {
+		following = m->m_nextpkt;
+		m_freem(m);
+	}
+}
+
+/*
+ * Destroy the queue _q (a struct dn_queue): packets still linked on
+ * its mq list are passed to dn_free_pkts(), then the queue itself is
+ * released with free(). do_free is not used. Always returns 0.
+ */
+int
+dn_delete_queue(void *_q, void *do_free)
+{
+	struct dn_queue *queue = _q;
+	struct mbuf *pending = queue->mq.head;
+
+	if (pending)
+		dn_free_pkts(pending);
+	free(queue);
+	return (0);
+}
+
+/*
+ * Simplified enqueue used for testing; it implements neither the
+ * queue management policies nor random loss.
+ *
+ * The packet m is refused when the caller passes a non-zero drop, or
+ * when the queue already holds 200 or more packets; in that case only
+ * q->ni.drops is incremented and 1 is returned. Note that m itself is
+ * not freed on this path.
+ * Otherwise m is appended to q->mq, the queue's packet count and byte
+ * total in q->ni are updated, and 0 is returned.
+ */
+int
+dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop)
+{
+	if (drop || q->ni.length >= 200) {
+		q->ni.drops++;
+		return 1;
+	}
+
+	mq_append(&q->mq, m);
+	q->ni.length++;
+	q->ni.tot_bytes += m->m_pkthdr.len;
+	return 0;
+}
+
+/*
+ * Bring *v into range: a value below lo is replaced with dflt, a value
+ * above hi is replaced with hi, anything else is left alone.
+ * dflt is stored as given, without being checked against lo and hi.
+ * msg is not used in this test version.
+ * Returns the resulting value of *v.
+ */
+int
+ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg)
+{
+	if (*v < lo) {
+		*v = dflt;
+		return dflt;
+	}
+	if (*v > hi)
+		*v = hi;
+	return *v;
+}
+
+#ifndef __FreeBSD__
+/*
+ * Find last set: 1-based position of the most significant bit set in
+ * mask, or 0 when mask is 0. The value is handled as unsigned, so a
+ * negative mask yields the width of an int.
+ * Only compiled when __FreeBSD__ is not defined.
+ */
+int
+fls(int mask)
+{
+	unsigned int bits = (unsigned int)mask;
+	int pos = 0;
+
+	while (bits != 0) {
+		bits >>= 1;
+		pos++;
+	}
+	return (pos);
+}
+#endif
diff --git a/sys/netinet/raw_ip.c b/sys/netinet/raw_ip.c
index 3573472..9341cf2 100644
--- a/sys/netinet/raw_ip.c
+++ b/sys/netinet/raw_ip.c
@@ -80,14 +80,18 @@ VNET_DEFINE(struct inpcbinfo, ripcbinfo);
#define V_ripcbinfo VNET(ripcbinfo)
/*
- * Control and data hooks for ipfw and dummynet.
+ * Control and data hooks for ipfw, dummynet, divert and so on.
* The data hooks are not used here but it is convenient
* to keep them all in one place.
*/
VNET_DEFINE(ip_fw_chk_ptr_t, ip_fw_chk_ptr) = NULL;
VNET_DEFINE(ip_fw_ctl_ptr_t, ip_fw_ctl_ptr) = NULL;
-int (*ip_dn_ctl_ptr)(struct sockopt *) = NULL;
-int (*ip_dn_io_ptr)(struct mbuf **m, int dir, struct ip_fw_args *fwa) = NULL;
+
+int (*ip_dn_ctl_ptr)(struct sockopt *);
+int (*ip_dn_io_ptr)(struct mbuf **, int, struct ip_fw_args *);
+void (*ip_divert_ptr)(struct mbuf *, int);
+int (*ng_ipfw_input_p)(struct mbuf **, int,
+ struct ip_fw_args *, int);
/*
* Hooks for multicast routing. They all default to NULL, so leave them not
OpenPOWER on IntegriCloud