summaryrefslogtreecommitdiffstats
path: root/sbin/ipfw/dummynet.c
diff options
context:
space:
mode:
authorluigi <luigi@FreeBSD.org>2010-03-02 17:40:48 +0000
committerluigi <luigi@FreeBSD.org>2010-03-02 17:40:48 +0000
commit5ceeac4aa882074e2b1e42fbc20c79ebbd24f4c5 (patch)
tree225c58cb171a7858ca9880ee492dd9f3d281f623 /sbin/ipfw/dummynet.c
parenta8f766213ad48d0cb09214bdf6ac6a3b74221b03 (diff)
downloadFreeBSD-src-5ceeac4aa882074e2b1e42fbc20c79ebbd24f4c5.zip
FreeBSD-src-5ceeac4aa882074e2b1e42fbc20c79ebbd24f4c5.tar.gz
Bring in the most recent version of ipfw and dummynet, developed
and tested over the past two months in the ipfw3-head branch. This also happens to be the same code available in the Linux and Windows ports of ipfw and dummynet. The major enhancement is a completely restructured version of dummynet, with support for different packet scheduling algorithms (loadable at runtime), faster queue/pipe lookup, and a much cleaner internal architecture and kernel/userland ABI which simplifies future extensions. In addition to the existing schedulers (FIFO and WF2Q+), we include a Deficit Round Robin (DRR or RR for brevity) scheduler, and a new, very fast version of WF2Q+ called QFQ. Some test code is also present (in sys/netinet/ipfw/test) that lets you build and test schedulers in userland. Also, we have added a compatibility layer that understands requests from the RELENG_7 and RELENG_8 versions of the /sbin/ipfw binaries, and replies correctly (at least, it does its best; sometimes you just cannot tell who sent the request and how to answer). The compatibility layer should make it possible to MFC this code in a relatively short time. Some minor glitches (e.g. handling of ipfw set enable/disable, and a workaround for a bug in RELENG_7's /sbin/ipfw) will be fixed with separate commits. CREDITS: This work has been partly supported by the ONELAB2 project, and mostly developed by Riccardo Panicucci and myself. The code for the qfq scheduler is mostly from Fabio Checconi, and Marta Carbone and Francesco Magno have helped with testing, debugging and some bug fixes.
Diffstat (limited to 'sbin/ipfw/dummynet.c')
-rw-r--r--sbin/ipfw/dummynet.c848
1 files changed, 517 insertions, 331 deletions
diff --git a/sbin/ipfw/dummynet.c b/sbin/ipfw/dummynet.c
index 490aa53..68b2220 100644
--- a/sbin/ipfw/dummynet.c
+++ b/sbin/ipfw/dummynet.c
@@ -1,10 +1,5 @@
/*
- * Copyright (c) 2002-2003 Luigi Rizzo
- * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp
- * Copyright (c) 1994 Ugen J.S.Antsilevich
- *
- * Idea and grammar partially left from:
- * Copyright (c) 1993 Daniel Boulet
+ * Copyright (c) 2002-2003,2010 Luigi Rizzo
*
* Redistribution and use in source forms, with and without modification,
* are permitted provided that this entire comment appears intact.
@@ -24,7 +19,6 @@
#include <sys/types.h>
#include <sys/socket.h>
-#include <sys/queue.h>
/* XXX there are several sysctl leftover here */
#include <sys/sysctl.h>
@@ -46,6 +40,7 @@
#include <netinet/ip_dummynet.h>
#include <arpa/inet.h> /* inet_ntoa */
+
static struct _s_x dummynet_params[] = {
{ "plr", TOK_PLR },
{ "noerror", TOK_NOERROR },
@@ -56,27 +51,59 @@ static struct _s_x dummynet_params[] = {
{ "src-port", TOK_SRCPORT },
{ "proto", TOK_PROTO },
{ "weight", TOK_WEIGHT },
+ { "lmax", TOK_LMAX },
+ { "maxlen", TOK_LMAX },
{ "all", TOK_ALL },
- { "mask", TOK_MASK },
+ { "mask", TOK_MASK }, /* alias for both */
+ { "sched_mask", TOK_SCHED_MASK },
+ { "flow_mask", TOK_FLOW_MASK },
{ "droptail", TOK_DROPTAIL },
{ "red", TOK_RED },
{ "gred", TOK_GRED },
{ "bw", TOK_BW },
{ "bandwidth", TOK_BW },
{ "delay", TOK_DELAY },
+ { "link", TOK_LINK },
{ "pipe", TOK_PIPE },
{ "queue", TOK_QUEUE },
+ { "flowset", TOK_FLOWSET },
+ { "sched", TOK_SCHED },
+ { "pri", TOK_PRI },
+ { "priority", TOK_PRI },
+ { "type", TOK_TYPE },
{ "flow-id", TOK_FLOWID},
{ "dst-ipv6", TOK_DSTIP6},
{ "dst-ip6", TOK_DSTIP6},
{ "src-ipv6", TOK_SRCIP6},
{ "src-ip6", TOK_SRCIP6},
- { "profile", TOK_PIPE_PROFILE},
+ { "profile", TOK_PROFILE},
{ "burst", TOK_BURST},
{ "dummynet-params", TOK_NULL },
{ NULL, 0 } /* terminator */
};
+#define O_NEXT(p, len) ((void *)((char *)p + len))
+
+static void
+oid_fill(struct dn_id *oid, int len, int type, uintptr_t id)
+{
+ oid->len = len;
+ oid->type = type;
+ oid->subtype = 0;
+ oid->id = id;
+}
+
+/* make room in the buffer and move the pointer forward */
+static void *
+o_next(struct dn_id **o, int len, int type)
+{
+ struct dn_id *ret = *o;
+ oid_fill(ret, len, type, 0);
+ *o = O_NEXT(*o, len);
+ return ret;
+}
+
+#if 0
static int
sort_q(void *arg, const void *pa, const void *pb)
{
@@ -108,117 +135,81 @@ sort_q(void *arg, const void *pa, const void *pb)
res = 1;
return (int)(rev ? res : -res);
}
+#endif
+/* print a mask and header for the subsequent list of flows */
static void
-list_queues(struct dn_flow_set *fs, struct dn_flow_queue *q)
+print_mask(struct ipfw_flow_id *id)
+{
+ if (!IS_IP6_FLOW_ID(id)) {
+ printf(" "
+ "mask: 0x%02x 0x%08x/0x%04x -> 0x%08x/0x%04x\n",
+ id->proto,
+ id->src_ip, id->src_port,
+ id->dst_ip, id->dst_port);
+
+ printf("BKT Prot ___Source IP/port____ "
+ "____Dest. IP/port____ "
+ "Tot_pkt/bytes Pkt/Byte Drp\n");
+ } else {
+ char buf[255];
+ printf("\n mask: proto: 0x%02x, flow_id: 0x%08x, ",
+ id->proto, id->flow_id6);
+ inet_ntop(AF_INET6, &(id->src_ip6), buf, sizeof(buf));
+ printf("%s/0x%04x -> ", buf, id->src_port);
+ inet_ntop(AF_INET6, &(id->dst_ip6), buf, sizeof(buf));
+ printf("%s/0x%04x\n", buf, id->dst_port);
+
+ printf("BKT ___Prot___ _flow-id_ "
+ "______________Source IPv6/port_______________ "
+ "_______________Dest. IPv6/port_______________ "
+ "Tot_pkt/bytes Pkt/Byte Drp\n");
+ }
+}
+
+static void
+list_flow(struct dn_flow *ni)
{
- int l;
- int index_printed, indexes = 0;
char buff[255];
struct protoent *pe;
+ struct in_addr ina;
+ struct ipfw_flow_id *id = &ni->fid;
- if (fs->rq_elements == 0)
- return;
-
- if (co.do_sort != 0)
- qsort_r(q, fs->rq_elements, sizeof *q, NULL, sort_q);
-
- /* Print IPv4 flows */
- index_printed = 0;
- for (l = 0; l < fs->rq_elements; l++) {
- struct in_addr ina;
-
+ pe = getprotobynumber(id->proto);
/* XXX: Should check for IPv4 flows */
- if (IS_IP6_FLOW_ID(&(q[l].id)))
- continue;
-
- if (!index_printed) {
- index_printed = 1;
- if (indexes > 0) /* currently a no-op */
- printf("\n");
- indexes++;
- printf(" "
- "mask: 0x%02x 0x%08x/0x%04x -> 0x%08x/0x%04x\n",
- fs->flow_mask.proto,
- fs->flow_mask.src_ip, fs->flow_mask.src_port,
- fs->flow_mask.dst_ip, fs->flow_mask.dst_port);
-
- printf("BKT Prot ___Source IP/port____ "
- "____Dest. IP/port____ "
- "Tot_pkt/bytes Pkt/Byte Drp\n");
- }
-
- printf("%3d ", q[l].hash_slot);
- pe = getprotobynumber(q[l].id.proto);
+ printf("%3u ", (ni->oid.id) & 0xff);
+ if (!IS_IP6_FLOW_ID(id)) {
if (pe)
printf("%-4s ", pe->p_name);
else
- printf("%4u ", q[l].id.proto);
- ina.s_addr = htonl(q[l].id.src_ip);
+ printf("%4u ", id->proto);
+ ina.s_addr = htonl(id->src_ip);
printf("%15s/%-5d ",
- inet_ntoa(ina), q[l].id.src_port);
- ina.s_addr = htonl(q[l].id.dst_ip);
+ inet_ntoa(ina), id->src_port);
+ ina.s_addr = htonl(id->dst_ip);
printf("%15s/%-5d ",
- inet_ntoa(ina), q[l].id.dst_port);
- printf("%4llu %8llu %2u %4u %3u\n",
- align_uint64(&q[l].tot_pkts),
- align_uint64(&q[l].tot_bytes),
- q[l].len, q[l].len_bytes, q[l].drops);
- if (co.verbose)
- printf(" S %20llu F %20llu\n",
- align_uint64(&q[l].S), align_uint64(&q[l].F));
- }
-
- /* Print IPv6 flows */
- index_printed = 0;
- for (l = 0; l < fs->rq_elements; l++) {
- if (!IS_IP6_FLOW_ID(&(q[l].id)))
- continue;
-
- if (!index_printed) {
- index_printed = 1;
- if (indexes > 0)
- printf("\n");
- indexes++;
- printf("\n mask: proto: 0x%02x, flow_id: 0x%08x, ",
- fs->flow_mask.proto, fs->flow_mask.flow_id6);
- inet_ntop(AF_INET6, &(fs->flow_mask.src_ip6),
- buff, sizeof(buff));
- printf("%s/0x%04x -> ", buff, fs->flow_mask.src_port);
- inet_ntop( AF_INET6, &(fs->flow_mask.dst_ip6),
- buff, sizeof(buff) );
- printf("%s/0x%04x\n", buff, fs->flow_mask.dst_port);
-
- printf("BKT ___Prot___ _flow-id_ "
- "______________Source IPv6/port_______________ "
- "_______________Dest. IPv6/port_______________ "
- "Tot_pkt/bytes Pkt/Byte Drp\n");
- }
- printf("%3d ", q[l].hash_slot);
- pe = getprotobynumber(q[l].id.proto);
+ inet_ntoa(ina), id->dst_port);
+ } else {
+ /* Print IPv6 flows */
if (pe != NULL)
printf("%9s ", pe->p_name);
else
- printf("%9u ", q[l].id.proto);
- printf("%7d %39s/%-5d ", q[l].id.flow_id6,
- inet_ntop(AF_INET6, &(q[l].id.src_ip6), buff, sizeof(buff)),
- q[l].id.src_port);
+ printf("%9u ", id->proto);
+ printf("%7d %39s/%-5d ", id->flow_id6,
+ inet_ntop(AF_INET6, &(id->src_ip6), buff, sizeof(buff)),
+ id->src_port);
printf(" %39s/%-5d ",
- inet_ntop(AF_INET6, &(q[l].id.dst_ip6), buff, sizeof(buff)),
- q[l].id.dst_port);
- printf(" %4llu %8llu %2u %4u %3u\n",
- align_uint64(&q[l].tot_pkts),
- align_uint64(&q[l].tot_bytes),
- q[l].len, q[l].len_bytes, q[l].drops);
- if (co.verbose)
- printf(" S %20llu F %20llu\n",
- align_uint64(&q[l].S),
- align_uint64(&q[l].F));
+ inet_ntop(AF_INET6, &(id->dst_ip6), buff, sizeof(buff)),
+ id->dst_port);
}
+ printf("%4llu %8llu %2u %4u %3u\n",
+ align_uint64(&ni->tot_pkts),
+ align_uint64(&ni->tot_bytes),
+ ni->length, ni->len_bytes, ni->drops);
}
static void
-print_flowset_parms(struct dn_flow_set *fs, char *prefix)
+print_flowset_parms(struct dn_fs *fs, char *prefix)
{
int l;
char qs[30];
@@ -226,7 +217,7 @@ print_flowset_parms(struct dn_flow_set *fs, char *prefix)
char red[90]; /* Display RED parameters */
l = fs->qsize;
- if (fs->flags_fs & DN_QSIZE_IS_BYTES) {
+ if (fs->flags & DN_QSIZE_BYTES) {
if (l >= 8192)
sprintf(qs, "%d KB", l / 1024);
else
@@ -237,23 +228,34 @@ print_flowset_parms(struct dn_flow_set *fs, char *prefix)
sprintf(plr, "plr %f", 1.0 * fs->plr / (double)(0x7fffffff));
else
plr[0] = '\0';
- if (fs->flags_fs & DN_IS_RED) /* RED parameters */
+
+ if (fs->flags & DN_IS_RED) /* RED parameters */
sprintf(red,
"\n\t %cRED w_q %f min_th %d max_th %d max_p %f",
- (fs->flags_fs & DN_IS_GENTLE_RED) ? 'G' : ' ',
+ (fs->flags & DN_IS_GENTLE_RED) ? 'G' : ' ',
1.0 * fs->w_q / (double)(1 << SCALE_RED),
- SCALE_VAL(fs->min_th),
- SCALE_VAL(fs->max_th),
+ fs->min_th,
+ fs->max_th,
1.0 * fs->max_p / (double)(1 << SCALE_RED));
else
sprintf(red, "droptail");
- printf("%s %s%s %d queues (%d buckets) %s\n",
- prefix, qs, plr, fs->rq_elements, fs->rq_size, red);
+ if (prefix[0]) {
+ printf("%s %s%s %d queues (%d buckets) %s\n",
+ prefix, qs, plr, fs->oid.id, fs->buckets, red);
+ prefix[0] = '\0';
+ } else {
+ printf("q%05d %s%s %d flows (%d buckets) sched %d "
+ "weight %d lmax %d pri %d %s\n",
+ fs->fs_nr, qs, plr, fs->oid.id, fs->buckets,
+ fs->sched_nr, fs->par[0], fs->par[1], fs->par[2], red);
+ if (fs->flags & DN_HAVE_MASK)
+ print_mask(&fs->flow_mask);
+ }
}
static void
-print_extra_delay_parms(struct dn_pipe *p)
+print_extra_delay_parms(struct dn_profile *p)
{
double loss;
if (p->samples_no <= 0)
@@ -265,105 +267,126 @@ print_extra_delay_parms(struct dn_pipe *p)
p->name, loss, p->samples_no);
}
-void
-ipfw_list_pipes(void *data, uint nbytes, int ac, char *av[])
+static void
+flush_buf(char *buf)
{
- int rulenum;
- void *next = data;
- struct dn_pipe *p = (struct dn_pipe *) data;
- struct dn_flow_set *fs;
- struct dn_flow_queue *q;
- int l;
-
- if (ac > 0)
- rulenum = strtoul(*av++, NULL, 10);
- else
- rulenum = 0;
- for (; nbytes >= sizeof *p; p = (struct dn_pipe *)next) {
- double b = p->bandwidth;
- char buf[30];
- char prefix[80];
- char burst[5 + 7];
-
- if (SLIST_NEXT(p, next) != (struct dn_pipe *)DN_IS_PIPE)
- break; /* done with pipes, now queues */
-
- /*
- * compute length, as pipe have variable size
- */
- l = sizeof(*p) + p->fs.rq_elements * sizeof(*q);
- next = (char *)p + l;
- nbytes -= l;
-
- if ((rulenum != 0 && rulenum != p->pipe_nr) || co.do_pipe == 2)
- continue;
-
- /*
- * Print rate (or clocking interface)
- */
- if (p->if_name[0] != '\0')
- sprintf(buf, "%s", p->if_name);
- else if (b == 0)
- sprintf(buf, "unlimited");
- else if (b >= 1000000)
- sprintf(buf, "%7.3f Mbit/s", b/1000000);
- else if (b >= 1000)
- sprintf(buf, "%7.3f Kbit/s", b/1000);
- else
- sprintf(buf, "%7.3f bit/s ", b);
-
- sprintf(prefix, "%05d: %s %4d ms ",
- p->pipe_nr, buf, p->delay);
-
- print_flowset_parms(&(p->fs), prefix);
-
- if (humanize_number(burst, sizeof(burst), p->burst,
- "Byte", HN_AUTOSCALE, 0) < 0 || co.verbose)
- printf("\t burst: %ju Byte\n", p->burst);
- else
- printf("\t burst: %s\n", burst);
-
- print_extra_delay_parms(p);
-
- q = (struct dn_flow_queue *)(p+1);
- list_queues(&(p->fs), q);
- }
- for (fs = next; nbytes >= sizeof *fs; fs = next) {
- char prefix[80];
-
- if (SLIST_NEXT(fs, next) != (struct dn_flow_set *)DN_IS_QUEUE)
- break;
- l = sizeof(*fs) + fs->rq_elements * sizeof(*q);
- next = (char *)fs + l;
- nbytes -= l;
-
- if (rulenum != 0 && ((rulenum != fs->fs_nr && co.do_pipe == 2) ||
- (rulenum != fs->parent_nr && co.do_pipe == 1))) {
- continue;
- }
-
- q = (struct dn_flow_queue *)(fs+1);
- sprintf(prefix, "q%05d: weight %d pipe %d ",
- fs->fs_nr, fs->weight, fs->parent_nr);
- print_flowset_parms(fs, prefix);
- list_queues(fs, q);
+ if (buf[0])
+ printf("%s\n", buf);
+ buf[0] = '\0';
+}
+
+/*
+ * generic list routine. We expect objects in a specific order, i.e.
+ * PIPES AND SCHEDULERS:
+ * link; scheduler; internal flowset if any; instances
+ * we can tell a pipe from the number.
+ *
+ * FLOWSETS:
+ * flowset; queues;
+ * link i (int queue); scheduler i; si(i) { flowsets() : queues }
+ */
+static void
+list_pipes(struct dn_id *oid, struct dn_id *end)
+{
+ char buf[160]; /* pending buffer */
+ buf[0] = '\0';
+
+ for (; oid != end; oid = O_NEXT(oid, oid->len)) {
+ if (oid->len < sizeof(*oid))
+ errx(1, "invalid oid len %d\n", oid->len);
+
+ switch (oid->type) {
+ default:
+ flush_buf(buf);
+ printf("unrecognized object %d size %d\n", oid->type, oid->len);
+ break;
+ case DN_TEXT: /* list of attached flowsets */
+ {
+ int i, l;
+ struct {
+ struct dn_id id;
+ uint32_t p[0];
+ } *d = (void *)oid;
+ l = (oid->len - sizeof(*oid))/sizeof(d->p[0]);
+ if (l == 0)
+ break;
+ printf(" Children flowsets: ");
+ for (i = 0; i < l; i++)
+ printf("%u ", d->p[i]);
+ printf("\n");
+ break;
+ }
+ case DN_CMD_GET:
+ if (co.verbose)
+ printf("answer for cmd %d, len %d\n", oid->type, oid->id);
+ break;
+ case DN_SCH: {
+ struct dn_sch *s = (struct dn_sch *)oid;
+ flush_buf(buf);
+ printf(" sched %d type %s flags 0x%x %d buckets %d active\n",
+ s->sched_nr,
+ s->name, s->flags, s->buckets, s->oid.id);
+ if (s->flags & DN_HAVE_MASK)
+ print_mask(&s->sched_mask);
+ }
+ break;
+
+ case DN_FLOW:
+ list_flow((struct dn_flow *)oid);
+ break;
+
+ case DN_LINK: {
+ struct dn_link *p = (struct dn_link *)oid;
+ double b = p->bandwidth;
+ char bwbuf[30];
+ char burst[5 + 7];
+
+ /* This starts a new object so flush buffer */
+ flush_buf(buf);
+ /* data rate */
+ if (b == 0)
+ sprintf(bwbuf, "unlimited ");
+ else if (b >= 1000000)
+ sprintf(bwbuf, "%7.3f Mbit/s", b/1000000);
+ else if (b >= 1000)
+ sprintf(bwbuf, "%7.3f Kbit/s", b/1000);
+ else
+ sprintf(bwbuf, "%7.3f bit/s ", b);
+
+ if (humanize_number(burst, sizeof(burst), p->burst,
+ "", HN_AUTOSCALE, 0) < 0 || co.verbose)
+ sprintf(burst, "%d", (int)p->burst);
+ sprintf(buf, "%05d: %s %4d ms burst %s",
+ p->link_nr % DN_MAX_ID, bwbuf, p->delay, burst);
+ }
+ break;
+
+ case DN_FS:
+ print_flowset_parms((struct dn_fs *)oid, buf);
+ break;
+ case DN_PROFILE:
+ flush_buf(buf);
+ print_extra_delay_parms((struct dn_profile *)oid);
}
+ flush_buf(buf); // XXX does it really go here ?
+ }
}
/*
- * Delete pipe or queue i
+ * Delete pipe, queue or scheduler i
*/
int
-ipfw_delete_pipe(int pipe_or_queue, int i)
+ipfw_delete_pipe(int do_pipe, int i)
{
- struct dn_pipe p;
-
- memset(&p, 0, sizeof p);
- if (pipe_or_queue == 1)
- p.pipe_nr = i; /* pipe */
- else
- p.fs.fs_nr = i; /* queue */
- i = do_cmd(IP_DUMMYNET_DEL, &p, sizeof p);
+ struct {
+ struct dn_id oid;
+ uintptr_t a[1]; /* add more if we want a list */
+ } cmd;
+ oid_fill((void *)&cmd, sizeof(cmd), DN_CMD_DELETE, DN_API_VERSION);
+ cmd.oid.subtype = (do_pipe == 1) ? DN_LINK :
+ ( (do_pipe == 2) ? DN_FS : DN_SCH);
+ cmd.a[0] = i;
+ i = do_cmd(IP_DUMMYNET3, &cmd, cmd.oid.len);
if (i) {
i = 1;
warn("rule %u: setsockopt(IP_DUMMYNET_DEL)", i);
@@ -400,7 +423,7 @@ ipfw_delete_pipe(int pipe_or_queue, int i)
* The empirical curve may have both vertical and horizontal lines.
* Vertical lines represent constant delay for a range of
* probabilities; horizontal lines correspond to a discontinuty
- * in the delay distribution: the pipe will use the largest delay
+ * in the delay distribution: the link will use the largest delay
* for a given probability.
*
* To pass the curve to dummynet, we must store the parameters
@@ -490,9 +513,12 @@ static void
read_bandwidth(char *arg, int *bandwidth, char *if_name, int namelen)
{
if (*bandwidth != -1)
- warn("duplicate token, override bandwidth value!");
+ warnx("duplicate token, override bandwidth value!");
if (arg[0] >= 'a' && arg[0] <= 'z') {
+ if (!if_name) {
+ errx(1, "no if support");
+ }
if (namelen >= IFNAMSIZ)
warn("interface name truncated");
namelen--;
@@ -521,7 +547,8 @@ read_bandwidth(char *arg, int *bandwidth, char *if_name, int namelen)
errx(EX_DATAERR, "bandwidth too large");
*bandwidth = bw;
- if_name[0] = '\0';
+ if (if_name)
+ if_name[0] = '\0';
}
}
@@ -551,7 +578,8 @@ compare_points(const void *vp1, const void *vp2)
#define ED_EFMT(s) EX_DATAERR,"error in %s at line %d: "#s,filename,lineno
static void
-load_extra_delays(const char *filename, struct dn_pipe *p)
+load_extra_delays(const char *filename, struct dn_profile *p,
+ struct dn_link *link)
{
char line[ED_MAX_LINE_LEN];
FILE *f;
@@ -566,6 +594,9 @@ load_extra_delays(const char *filename, struct dn_pipe *p)
struct point points[ED_MAX_SAMPLES_NO];
int points_no = 0;
+ /* XXX link never NULL? */
+ p->link_nr = link->link_nr;
+
profile_name[0] = '\0';
f = fopen(filename, "r");
if (f == NULL)
@@ -606,7 +637,8 @@ load_extra_delays(const char *filename, struct dn_pipe *p)
ED_MAX_SAMPLES_NO);
do_points = 0;
} else if (!strcasecmp(name, ED_TOK_BW)) {
- read_bandwidth(arg, &p->bandwidth, p->if_name, sizeof(p->if_name));
+ char buf[IFNAMSIZ];
+ read_bandwidth(arg, &link->bandwidth, buf, sizeof(buf));
} else if (!strcasecmp(name, ED_TOK_LOSS)) {
if (loss != -1.0)
errx(ED_EFMT("duplicated token: %s"), name);
@@ -676,17 +708,17 @@ load_extra_delays(const char *filename, struct dn_pipe *p)
double y2 = points[i+1].prob * samples;
double x2 = points[i+1].delay;
- int index = y1;
+ int ix = y1;
int stop = y2;
if (x1 == x2) {
- for (; index<stop; ++index)
- p->samples[index] = x1;
+ for (; ix<stop; ++ix)
+ p->samples[ix] = x1;
} else {
double m = (y2-y1)/(x2-x1);
double c = y1 - m*x1;
- for (; index<stop ; ++index)
- p->samples[index] = (index - c)/m;
+ for (; ix<stop ; ++ix)
+ p->samples[ix] = (ix - c)/m;
}
}
p->samples_no = samples;
@@ -694,27 +726,120 @@ load_extra_delays(const char *filename, struct dn_pipe *p)
strncpy(p->name, profile_name, sizeof(p->name));
}
+/*
+ * configuration of pipes, schedulers, flowsets.
+ * When we configure a new scheduler, an empty pipe is created, so:
+ *
+ * do_pipe = 1 -> "pipe N config ..." only for backward compatibility
+ * sched N+Delta type fifo sched_mask ...
+ * pipe N+Delta <parameters>
+ * flowset N+Delta pipe N+Delta (no parameters)
+ * sched N type wf2q+ sched_mask ...
+ * pipe N <parameters>
+ *
+ * do_pipe = 2 -> flowset N config
+ * flowset N parameters
+ *
+ * do_pipe = 3 -> sched N config
+ * sched N parameters (default no pipe)
+ * optional Pipe N config ...
+ * pipe ==>
+ */
void
ipfw_config_pipe(int ac, char **av)
{
- int samples[ED_MAX_SAMPLES_NO];
- struct dn_pipe p;
- int i;
+ int i, j;
char *end;
void *par = NULL;
-
- memset(&p, 0, sizeof p);
- p.bandwidth = -1;
+ struct dn_id *buf, *base;
+ struct dn_sch *sch = NULL;
+ struct dn_link *p = NULL;
+ struct dn_fs *fs = NULL;
+ struct dn_profile *pf = NULL;
+ struct ipfw_flow_id *mask = NULL;
+ int lmax;
+ uint32_t _foo = 0, *flags = &_foo , *buckets = &_foo;
+
+ /*
+ * allocate space for 1 header,
+ * 1 scheduler, 1 link, 1 flowset, 1 profile
+ */
+ lmax = sizeof(struct dn_id); /* command header */
+ lmax += sizeof(struct dn_sch) + sizeof(struct dn_link) +
+ sizeof(struct dn_fs) + sizeof(struct dn_profile);
av++; ac--;
/* Pipe number */
if (ac && isdigit(**av)) {
i = atoi(*av); av++; ac--;
- if (co.do_pipe == 1)
- p.pipe_nr = i;
- else
- p.fs.fs_nr = i;
+ } else
+ i = -1;
+ if (i <= 0)
+ errx(EX_USAGE, "need a pipe/flowset/sched number");
+ base = buf = safe_calloc(1, lmax);
+ /* all commands start with a 'CONFIGURE' and a version */
+ o_next(&buf, sizeof(struct dn_id), DN_CMD_CONFIG);
+ base->id = DN_API_VERSION;
+
+ switch (co.do_pipe) {
+ case 1: /* "pipe N config ..." */
+ /* Allocate space for the WF2Q+ scheduler, its link
+ * and the FIFO flowset. Set the number, but leave
+ * the scheduler subtype and other parameters to 0
+ * so the kernel will use appropriate defaults.
+ * XXX todo: add a flag to record if a parameter
+ * is actually configured.
+ * If we do a 'pipe config' mask -> sched_mask.
+ * The FIFO scheduler and link are derived from the
+ * WF2Q+ one in the kernel.
+ */
+ sch = o_next(&buf, sizeof(*sch), DN_SCH);
+ p = o_next(&buf, sizeof(*p), DN_LINK);
+ fs = o_next(&buf, sizeof(*fs), DN_FS);
+
+ sch->sched_nr = i;
+ sch->oid.subtype = 0; /* defaults to WF2Q+ */
+ mask = &sch->sched_mask;
+ flags = &sch->flags;
+ buckets = &sch->buckets;
+ *flags |= DN_PIPE_CMD;
+
+ p->link_nr = i;
+
+ /* This flowset is only for the FIFO scheduler */
+ fs->fs_nr = i + 2*DN_MAX_ID;
+ fs->sched_nr = i + DN_MAX_ID;
+ break;
+
+ case 2: /* "queue N config ... " */
+ fs = o_next(&buf, sizeof(*fs), DN_FS);
+ fs->fs_nr = i;
+ mask = &fs->flow_mask;
+ flags = &fs->flags;
+ buckets = &fs->buckets;
+ break;
+
+ case 3: /* "sched N config ..." */
+ sch = o_next(&buf, sizeof(*sch), DN_SCH);
+ fs = o_next(&buf, sizeof(*fs), DN_FS);
+ sch->sched_nr = i;
+ mask = &sch->sched_mask;
+ flags = &sch->flags;
+ buckets = &sch->buckets;
+ /* fs is used only with !MULTIQUEUE schedulers */
+ fs->fs_nr = i + DN_MAX_ID;
+ fs->sched_nr = i;
+ break;
}
+ /* set to -1 those fields for which we want to reuse existing
+ * values from the kernel.
+ * Also, *_nr and subtype = 0 mean reuse the value from the kernel.
+ * XXX todo: support reuse of the mask.
+ */
+ if (p)
+ p->bandwidth = -1;
+ for (j = 0; j < sizeof(fs->par)/sizeof(fs->par[0]); j++)
+ fs->par[j] = -1;
while (ac > 0) {
double d;
int tok = match_token(dummynet_params, *av);
@@ -722,41 +847,48 @@ ipfw_config_pipe(int ac, char **av)
switch(tok) {
case TOK_NOERROR:
- p.fs.flags_fs |= DN_NOERROR;
+ NEED(fs, "noerror is only for pipes");
+ fs->flags |= DN_NOERROR;
break;
case TOK_PLR:
+ NEED(fs, "plr is only for pipes");
NEED1("plr needs argument 0..1\n");
d = strtod(av[0], NULL);
if (d > 1)
d = 1;
else if (d < 0)
d = 0;
- p.fs.plr = (int)(d*0x7fffffff);
+ fs->plr = (int)(d*0x7fffffff);
ac--; av++;
break;
case TOK_QUEUE:
+ NEED(fs, "queue is only for pipes or flowsets");
NEED1("queue needs queue size\n");
end = NULL;
- p.fs.qsize = strtoul(av[0], &end, 0);
+ fs->qsize = strtoul(av[0], &end, 0);
if (*end == 'K' || *end == 'k') {
- p.fs.flags_fs |= DN_QSIZE_IS_BYTES;
- p.fs.qsize *= 1024;
+ fs->flags |= DN_QSIZE_BYTES;
+ fs->qsize *= 1024;
} else if (*end == 'B' ||
_substrcmp2(end, "by", "bytes") == 0) {
- p.fs.flags_fs |= DN_QSIZE_IS_BYTES;
+ fs->flags |= DN_QSIZE_BYTES;
}
ac--; av++;
break;
case TOK_BUCKETS:
+ NEED(fs, "buckets is only for pipes or flowsets");
NEED1("buckets needs argument\n");
- p.fs.rq_size = strtoul(av[0], NULL, 0);
+ *buckets = strtoul(av[0], NULL, 0);
ac--; av++;
break;
+ case TOK_FLOW_MASK:
+ case TOK_SCHED_MASK:
case TOK_MASK:
+ NEED(mask, "tok_mask");
NEED1("mask needs mask specifier\n");
/*
* per-flow queue, mask is dst_ip, dst_port,
@@ -764,7 +896,7 @@ ipfw_config_pipe(int ac, char **av)
*/
par = NULL;
- bzero(&p.fs.flow_mask, sizeof(p.fs.flow_mask));
+ bzero(mask, sizeof(*mask));
end = NULL;
while (ac >= 1) {
@@ -781,43 +913,48 @@ ipfw_config_pipe(int ac, char **av)
/*
* special case, all bits significant
*/
- p.fs.flow_mask.dst_ip = ~0;
- p.fs.flow_mask.src_ip = ~0;
- p.fs.flow_mask.dst_port = ~0;
- p.fs.flow_mask.src_port = ~0;
- p.fs.flow_mask.proto = ~0;
- n2mask(&(p.fs.flow_mask.dst_ip6), 128);
- n2mask(&(p.fs.flow_mask.src_ip6), 128);
- p.fs.flow_mask.flow_id6 = ~0;
- p.fs.flags_fs |= DN_HAVE_FLOW_MASK;
+ mask->dst_ip = ~0;
+ mask->src_ip = ~0;
+ mask->dst_port = ~0;
+ mask->src_port = ~0;
+ mask->proto = ~0;
+ n2mask(&mask->dst_ip6, 128);
+ n2mask(&mask->src_ip6, 128);
+ mask->flow_id6 = ~0;
+ *flags |= DN_HAVE_MASK;
goto end_mask;
case TOK_DSTIP:
- p32 = &p.fs.flow_mask.dst_ip;
+ mask->addr_type = 4;
+ p32 = &mask->dst_ip;
break;
case TOK_SRCIP:
- p32 = &p.fs.flow_mask.src_ip;
+ mask->addr_type = 4;
+ p32 = &mask->src_ip;
break;
case TOK_DSTIP6:
- pa6 = &(p.fs.flow_mask.dst_ip6);
+ mask->addr_type = 6;
+ pa6 = &mask->dst_ip6;
break;
case TOK_SRCIP6:
- pa6 = &(p.fs.flow_mask.src_ip6);
+ mask->addr_type = 6;
+ pa6 = &mask->src_ip6;
break;
case TOK_FLOWID:
- p20 = &p.fs.flow_mask.flow_id6;
+ mask->addr_type = 6;
+ p20 = &mask->flow_id6;
break;
case TOK_DSTPORT:
- p16 = &p.fs.flow_mask.dst_port;
+ p16 = &mask->dst_port;
break;
case TOK_SRCPORT:
- p16 = &p.fs.flow_mask.src_port;
+ p16 = &mask->src_port;
break;
case TOK_PROTO:
@@ -857,10 +994,10 @@ ipfw_config_pipe(int ac, char **av)
if (a > 0xFF)
errx(EX_DATAERR,
"proto mask must be 8 bit");
- p.fs.flow_mask.proto = (uint8_t)a;
+ fs->flow_mask.proto = (uint8_t)a;
}
if (a != 0)
- p.fs.flags_fs |= DN_HAVE_FLOW_MASK;
+ *flags |= DN_HAVE_MASK;
ac--; av++;
} /* end while, config masks */
end_mask:
@@ -869,9 +1006,9 @@ end_mask:
case TOK_RED:
case TOK_GRED:
NEED1("red/gred needs w_q/min_th/max_th/max_p\n");
- p.fs.flags_fs |= DN_IS_RED;
+ fs->flags |= DN_IS_RED;
if (tok == TOK_GRED)
- p.fs.flags_fs |= DN_IS_GENTLE_RED;
+ fs->flags |= DN_IS_GENTLE_RED;
/*
* the format for parameters is w_q/min_th/max_th/max_p
*/
@@ -879,82 +1016,108 @@ end_mask:
double w_q = strtod(end, NULL);
if (w_q > 1 || w_q <= 0)
errx(EX_DATAERR, "0 < w_q <= 1");
- p.fs.w_q = (int) (w_q * (1 << SCALE_RED));
+ fs->w_q = (int) (w_q * (1 << SCALE_RED));
}
if ((end = strsep(&av[0], "/"))) {
- p.fs.min_th = strtoul(end, &end, 0);
+ fs->min_th = strtoul(end, &end, 0);
if (*end == 'K' || *end == 'k')
- p.fs.min_th *= 1024;
+ fs->min_th *= 1024;
}
if ((end = strsep(&av[0], "/"))) {
- p.fs.max_th = strtoul(end, &end, 0);
+ fs->max_th = strtoul(end, &end, 0);
if (*end == 'K' || *end == 'k')
- p.fs.max_th *= 1024;
+ fs->max_th *= 1024;
}
if ((end = strsep(&av[0], "/"))) {
double max_p = strtod(end, NULL);
if (max_p > 1 || max_p <= 0)
errx(EX_DATAERR, "0 < max_p <= 1");
- p.fs.max_p = (int)(max_p * (1 << SCALE_RED));
+ fs->max_p = (int)(max_p * (1 << SCALE_RED));
}
ac--; av++;
break;
case TOK_DROPTAIL:
- p.fs.flags_fs &= ~(DN_IS_RED|DN_IS_GENTLE_RED);
+ NEED(fs, "droptail is only for flowsets");
+ fs->flags &= ~(DN_IS_RED|DN_IS_GENTLE_RED);
break;
case TOK_BW:
+ NEED(p, "bw is only for links");
NEED1("bw needs bandwidth or interface\n");
- if (co.do_pipe != 1)
- errx(EX_DATAERR, "bandwidth only valid for pipes");
- read_bandwidth(av[0], &p.bandwidth, p.if_name, sizeof(p.if_name));
+ read_bandwidth(av[0], &p->bandwidth, NULL, 0);
ac--; av++;
break;
case TOK_DELAY:
- if (co.do_pipe != 1)
- errx(EX_DATAERR, "delay only valid for pipes");
+ NEED(p, "delay is only for links");
NEED1("delay needs argument 0..10000ms\n");
- p.delay = strtoul(av[0], NULL, 0);
+ p->delay = strtoul(av[0], NULL, 0);
+ ac--; av++;
+ break;
+
+ case TOK_TYPE: {
+ int l;
+ NEED(sch, "type is only for schedulers");
+ NEED1("type needs a string");
+ l = strlen(av[0]);
+ if (l == 0 || l > 15)
+ errx(1, "type %s too long\n", av[0]);
+ strcpy(sch->name, av[0]);
+ sch->oid.subtype = 0; /* use string */
ac--; av++;
break;
+ }
case TOK_WEIGHT:
- if (co.do_pipe == 1)
- errx(EX_DATAERR,"weight only valid for queues");
- NEED1("weight needs argument 0..100\n");
- p.fs.weight = strtoul(av[0], &end, 0);
+ NEED(fs, "weight is only for flowsets");
+ NEED1("weight needs argument\n");
+ fs->par[0] = strtol(av[0], &end, 0);
+ ac--; av++;
+ break;
+
+ case TOK_LMAX:
+ NEED(fs, "lmax is only for flowsets");
+ NEED1("lmax needs argument\n");
+ fs->par[1] = strtol(av[0], &end, 0);
ac--; av++;
break;
+ case TOK_PRI:
+ NEED(fs, "priority is only for flowsets");
+ NEED1("priority needs argument\n");
+ fs->par[2] = strtol(av[0], &end, 0);
+ ac--; av++;
+ break;
+
+ case TOK_SCHED:
case TOK_PIPE:
- if (co.do_pipe == 1)
- errx(EX_DATAERR,"pipe only valid for queues");
- NEED1("pipe needs pipe_number\n");
- p.fs.parent_nr = strtoul(av[0], &end, 0);
+ NEED(fs, "pipe/sched");
+ NEED1("pipe/link/sched needs number\n");
+ fs->sched_nr = strtoul(av[0], &end, 0);
ac--; av++;
break;
- case TOK_PIPE_PROFILE:
- if (co.do_pipe != 1)
- errx(EX_DATAERR, "extra delay only valid for pipes");
+ case TOK_PROFILE:
+ NEED((!pf), "profile already set");
+ NEED(p, "profile");
+ {
NEED1("extra delay needs the file name\n");
- p.samples = &samples[0];
- load_extra_delays(av[0], &p);
+ pf = o_next(&buf, sizeof(*pf), DN_PROFILE);
+ load_extra_delays(av[0], pf, p); //XXX can't fail?
--ac; ++av;
+ }
break;
case TOK_BURST:
- if (co.do_pipe != 1)
- errx(EX_DATAERR, "burst only valid for pipes");
+ NEED(p, "burst");
NEED1("burst needs argument\n");
errno = 0;
- if (expand_number(av[0], (int64_t *)&p.burst) < 0)
+ if (expand_number(av[0], (int64_t *)&p->burst) < 0)
if (errno != ERANGE)
errx(EX_DATAERR,
"burst: invalid argument");
- if (errno || p.burst > (1ULL << 48) - 1)
+ if (errno || p->burst > (1ULL << 48) - 1)
errx(EX_DATAERR,
"burst: out of range (0..2^48-1)");
ac--; av++;
@@ -964,26 +1127,17 @@ end_mask:
errx(EX_DATAERR, "unrecognised option ``%s''", av[-1]);
}
}
- if (co.do_pipe == 1) {
- if (p.pipe_nr == 0)
- errx(EX_DATAERR, "pipe_nr must be > 0");
- if (p.delay > 10000)
- errx(EX_DATAERR, "delay must be < 10000");
- } else { /* co.do_pipe == 2, queue */
- if (p.fs.parent_nr == 0)
- errx(EX_DATAERR, "pipe must be > 0");
- if (p.fs.weight >100)
- errx(EX_DATAERR, "weight must be <= 100");
- }
- /* check for bandwidth value */
- if (p.bandwidth == -1) {
- p.bandwidth = 0;
- if (p.samples_no > 0)
- errx(EX_DATAERR, "profile requires a bandwidth limit");
+ /* check validity of parameters */
+ if (p) {
+ if (p->delay > 10000)
+ errx(EX_DATAERR, "delay must be < 10000");
+ if (p->bandwidth == -1)
+ p->bandwidth = 0;
}
-
- if (p.fs.flags_fs & DN_QSIZE_IS_BYTES) {
+ if (fs) {
+ /* XXX accept a 0 scheduler to keep the default */
+ if (fs->flags & DN_QSIZE_BYTES) {
size_t len;
long limit;
@@ -991,9 +1145,9 @@ end_mask:
if (sysctlbyname("net.inet.ip.dummynet.pipe_byte_limit",
&limit, &len, NULL, 0) == -1)
limit = 1024*1024;
- if (p.fs.qsize > limit)
+ if (fs->qsize > limit)
errx(EX_DATAERR, "queue size must be < %ldB", limit);
- } else {
+ } else {
size_t len;
long limit;
@@ -1001,27 +1155,25 @@ end_mask:
if (sysctlbyname("net.inet.ip.dummynet.pipe_slot_limit",
&limit, &len, NULL, 0) == -1)
limit = 100;
- if (p.fs.qsize > limit)
+ if (fs->qsize > limit)
errx(EX_DATAERR, "2 <= queue size <= %ld", limit);
- }
- if (p.fs.flags_fs & DN_IS_RED) {
+ }
+
+ if (fs->flags & DN_IS_RED) {
size_t len;
int lookup_depth, avg_pkt_size;
- double s, idle, weight, w_q;
- struct clockinfo ck;
- int t;
+ double w_q;
- if (p.fs.min_th >= p.fs.max_th)
+ if (fs->min_th >= fs->max_th)
errx(EX_DATAERR, "min_th %d must be < than max_th %d",
- p.fs.min_th, p.fs.max_th);
- if (p.fs.max_th == 0)
+ fs->min_th, fs->max_th);
+ if (fs->max_th == 0)
errx(EX_DATAERR, "max_th must be > 0");
len = sizeof(int);
if (sysctlbyname("net.inet.ip.dummynet.red_lookup_depth",
&lookup_depth, &len, NULL, 0) == -1)
- errx(1, "sysctlbyname(\"%s\")",
- "net.inet.ip.dummynet.red_lookup_depth");
+ lookup_depth = 256;
if (lookup_depth == 0)
errx(EX_DATAERR, "net.inet.ip.dummynet.red_lookup_depth"
" must be greater than zero");
@@ -1029,18 +1181,13 @@ end_mask:
len = sizeof(int);
if (sysctlbyname("net.inet.ip.dummynet.red_avg_pkt_size",
&avg_pkt_size, &len, NULL, 0) == -1)
+ avg_pkt_size = 512;
- errx(1, "sysctlbyname(\"%s\")",
- "net.inet.ip.dummynet.red_avg_pkt_size");
if (avg_pkt_size == 0)
errx(EX_DATAERR,
"net.inet.ip.dummynet.red_avg_pkt_size must"
" be greater than zero");
- len = sizeof(struct clockinfo);
- if (sysctlbyname("kern.clockrate", &ck, &len, NULL, 0) == -1)
- errx(1, "sysctlbyname(\"%s\")", "kern.clockrate");
-
/*
* Ticks needed for sending a medium-sized packet.
* Unfortunately, when we are configuring a WF2Q+ queue, we
@@ -1050,38 +1197,77 @@ end_mask:
* correct. But on the other hand, why do we want RED with
* WF2Q+ ?
*/
+#if 0
if (p.bandwidth==0) /* this is a WF2Q+ queue */
s = 0;
else
s = (double)ck.hz * avg_pkt_size * 8 / p.bandwidth;
-
+#endif
/*
* max idle time (in ticks) before avg queue size becomes 0.
* NOTA: (3/w_q) is approx the value x so that
* (1-w_q)^x < 10^-3.
*/
- w_q = ((double)p.fs.w_q) / (1 << SCALE_RED);
+ w_q = ((double)fs->w_q) / (1 << SCALE_RED);
+#if 0 // go in kernel
idle = s * 3. / w_q;
- p.fs.lookup_step = (int)idle / lookup_depth;
- if (!p.fs.lookup_step)
- p.fs.lookup_step = 1;
+ fs->lookup_step = (int)idle / lookup_depth;
+ if (!fs->lookup_step)
+ fs->lookup_step = 1;
weight = 1 - w_q;
- for (t = p.fs.lookup_step; t > 1; --t)
+ for (t = fs->lookup_step; t > 1; --t)
weight *= 1 - w_q;
- p.fs.lookup_weight = (int)(weight * (1 << SCALE_RED));
+ fs->lookup_weight = (int)(weight * (1 << SCALE_RED));
+#endif
+ }
}
- if (p.samples_no <= 0) {
- i = do_cmd(IP_DUMMYNET_CONFIGURE, &p, sizeof p);
- } else {
- struct dn_pipe_max pm;
- int len = sizeof(pm);
-
- memcpy(&pm.pipe, &p, sizeof(pm.pipe));
- memcpy(&pm.samples, samples, sizeof(pm.samples));
- i = do_cmd(IP_DUMMYNET_CONFIGURE, &pm, len);
- }
+ i = do_cmd(IP_DUMMYNET3, base, (char *)buf - (char *)base);
if (i)
err(1, "setsockopt(%s)", "IP_DUMMYNET_CONFIGURE");
}
+
+void
+dummynet_flush(void)
+{
+ struct dn_id oid;
+ oid_fill(&oid, sizeof(oid), DN_CMD_FLUSH, DN_API_VERSION);
+ do_cmd(IP_DUMMYNET3, &oid, oid.len);
+}
+
+/* main entry point for dummynet list functions. co.do_pipe indicates
+ * which function we want to support.
+ * XXX todo- accept filtering arguments.
+ */
+void
+dummynet_list(int ac, char *av[], int show_counters)
+{
+ struct dn_id oid, *x;
+ int ret, l = sizeof(oid);
+
+ oid_fill(&oid, l, DN_CMD_GET, DN_API_VERSION);
+ switch (co.do_pipe) {
+ case 1:
+ oid.subtype = DN_LINK; /* list pipe */
+ break;
+ case 2:
+ oid.subtype = DN_FS; /* list queue */
+ break;
+ case 3:
+ oid.subtype = DN_SCH; /* list sched */
+ break;
+ }
+ ret = do_cmd(-IP_DUMMYNET3, &oid, (uintptr_t)&l);
+ // printf("%s returns %d need %d\n", __FUNCTION__, ret, oid.id);
+ if (ret != 0 || oid.id <= sizeof(oid))
+ return;
+ l = oid.id;
+ x = safe_calloc(1, l);
+ *x = oid;
+ ret = do_cmd(-IP_DUMMYNET3, x, (uintptr_t)&l);
+ // printf("%s returns %d need %d\n", __FUNCTION__, ret, oid.id);
+ // XXX filter on ac, av
+ list_pipes(x, O_NEXT(x, l));
+ free(x);
+}
OpenPOWER on IntegriCloud