diff options
author | luigi <luigi@FreeBSD.org> | 2010-03-23 09:58:59 +0000 |
---|---|---|
committer | luigi <luigi@FreeBSD.org> | 2010-03-23 09:58:59 +0000 |
commit | 153fa4f49e7ae4d39851638cfb970d383c0f8b91 (patch) | |
tree | 003b5213e062bbdc0758d00a9edc114e010723bd | |
parent | 237d6f0e4a51d1e1fc9572f2c25fb79712126d1f (diff) | |
download | FreeBSD-src-153fa4f49e7ae4d39851638cfb970d383c0f8b91.zip FreeBSD-src-153fa4f49e7ae4d39851638cfb970d383c0f8b91.tar.gz |
MFC of a large number of ipfw and dummynet fixes and enhancements
done in CURRENT over the last 4 months.
HEAD and RELENG_8 are almost in sync now for ipfw, dummynet
the pfil hooks and related components.
Among the most noticeable changes:
- r200855 more efficient lookup of skipto rules, and remove O(N)
blocks from critical sections in the kernel;
- r204591 large restructuring of the dummynet module, with support
for multiple scheduling algorithms (4 available so far)
See the original commit logs for details.
Changes in the kernel/userland ABI should be harmless because the
kernel is able to understand previous requests from RELENG_8 and
RELENG_7. For this reason, this changeset would be applicable
to RELENG_7 as well, but i am not sure if it is worthwhile.
48 files changed, 14723 insertions, 6911 deletions
diff --git a/sbin/ipfw/Makefile b/sbin/ipfw/Makefile index c09ebca..b25f38c 100644 --- a/sbin/ipfw/Makefile +++ b/sbin/ipfw/Makefile @@ -3,6 +3,7 @@ PROG= ipfw SRCS= ipfw2.c dummynet.c ipv6.c main.c nat.c altq.c WARNS?= 2 +DPADD= ${LIBUTIL} LDADD= -lutil MAN= ipfw.8 diff --git a/sbin/ipfw/altq.c b/sbin/ipfw/altq.c index b00a1e0..8cf19e5 100644 --- a/sbin/ipfw/altq.c +++ b/sbin/ipfw/altq.c @@ -39,6 +39,7 @@ #include <net/if.h> /* IFNAMSIZ */ #include <net/pfvar.h> +#include <netinet/in.h> /* in_addr */ #include <netinet/ip_fw.h> /* diff --git a/sbin/ipfw/dummynet.c b/sbin/ipfw/dummynet.c index 9e68e65..eb6547a 100644 --- a/sbin/ipfw/dummynet.c +++ b/sbin/ipfw/dummynet.c @@ -1,10 +1,5 @@ /* - * Copyright (c) 2002-2003 Luigi Rizzo - * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp - * Copyright (c) 1994 Ugen J.S.Antsilevich - * - * Idea and grammar partially left from: - * Copyright (c) 1993 Daniel Boulet + * Copyright (c) 2002-2003,2010 Luigi Rizzo * * Redistribution and use in source forms, with and without modification, * are permitted provided that this entire comment appears intact. @@ -15,8 +10,6 @@ * * This software is provided ``AS IS'' without any warranties of any kind. * - * NEW command line interface for IP firewall facility - * * $FreeBSD$ * * dummynet support @@ -24,7 +17,6 @@ #include <sys/types.h> #include <sys/socket.h> -#include <sys/queue.h> /* XXX there are several sysctl leftover here */ #include <sys/sysctl.h> @@ -46,6 +38,7 @@ #include <netinet/ip_dummynet.h> #include <arpa/inet.h> /* inet_ntoa */ + static struct _s_x dummynet_params[] = { { "plr", TOK_PLR }, { "noerror", TOK_NOERROR }, @@ -56,27 +49,59 @@ static struct _s_x dummynet_params[] = { { "src-port", TOK_SRCPORT }, { "proto", TOK_PROTO }, { "weight", TOK_WEIGHT }, + { "lmax", TOK_LMAX }, + { "maxlen", TOK_LMAX }, { "all", TOK_ALL }, - { "mask", TOK_MASK }, + { "mask", TOK_MASK }, /* alias for both */ + { "sched_mask", TOK_SCHED_MASK }, + { "flow_mask", TOK_FLOW_MASK }, { "droptail", TOK_DROPTAIL }, { "red", TOK_RED }, { "gred", TOK_GRED }, { "bw", TOK_BW }, { "bandwidth", TOK_BW }, { "delay", TOK_DELAY }, + { "link", TOK_LINK }, { "pipe", TOK_PIPE }, { "queue", TOK_QUEUE }, + { "flowset", TOK_FLOWSET }, + { "sched", TOK_SCHED }, + { "pri", TOK_PRI }, + { "priority", TOK_PRI }, + { "type", TOK_TYPE }, { "flow-id", TOK_FLOWID}, { "dst-ipv6", TOK_DSTIP6}, { "dst-ip6", TOK_DSTIP6}, { "src-ipv6", TOK_SRCIP6}, { "src-ip6", TOK_SRCIP6}, - { "profile", TOK_PIPE_PROFILE}, + { "profile", TOK_PROFILE}, { "burst", TOK_BURST}, { "dummynet-params", TOK_NULL }, { NULL, 0 } /* terminator */ }; +#define O_NEXT(p, len) ((void *)((char *)p + len)) + +static void +oid_fill(struct dn_id *oid, int len, int type, uintptr_t id) +{ + oid->len = len; + oid->type = type; + oid->subtype = 0; + oid->id = id; +} + +/* make room in the buffer and move the pointer forward */ +static void * +o_next(struct dn_id **o, int len, int type) +{ + struct dn_id *ret = *o; + oid_fill(ret, len, type, 0); + *o = O_NEXT(*o, len); + return ret; +} + +#if 0 static int sort_q(void *arg, const void *pa, const void *pb) { @@ -108,117 +133,84 @@ sort_q(void *arg, const void *pa, const void *pb) res = 1; return (int)(rev ? res : -res); } +#endif +/* print a mask and header for the subsequent list of flows */ static void -list_queues(struct dn_flow_set *fs, struct dn_flow_queue *q) +print_mask(struct ipfw_flow_id *id) +{ + if (!IS_IP6_FLOW_ID(id)) { + printf(" " + "mask: %s 0x%02x 0x%08x/0x%04x -> 0x%08x/0x%04x\n", + id->extra ? "queue," : "", + id->proto, + id->src_ip, id->src_port, + id->dst_ip, id->dst_port); + + printf("BKT Prot ___Source IP/port____ " + "____Dest. IP/port____ " + "Tot_pkt/bytes Pkt/Byte Drp\n"); + } else { + char buf[255]; + printf("\n mask: %sproto: 0x%02x, flow_id: 0x%08x, ", + id->extra ? "queue," : "", + id->proto, id->flow_id6); + inet_ntop(AF_INET6, &(id->src_ip6), buf, sizeof(buf)); + printf("%s/0x%04x -> ", buf, id->src_port); + inet_ntop(AF_INET6, &(id->dst_ip6), buf, sizeof(buf)); + printf("%s/0x%04x\n", buf, id->dst_port); + + printf("BKT ___Prot___ _flow-id_ " + "______________Source IPv6/port_______________ " + "_______________Dest. IPv6/port_______________ " + "Tot_pkt/bytes Pkt/Byte Drp\n"); + } +} + +static void +list_flow(struct dn_flow *ni) { - int l; - int index_printed, indexes = 0; char buff[255]; struct protoent *pe; + struct in_addr ina; + struct ipfw_flow_id *id = &ni->fid; - if (fs->rq_elements == 0) - return; - - if (co.do_sort != 0) - qsort_r(q, fs->rq_elements, sizeof *q, NULL, sort_q); - - /* Print IPv4 flows */ - index_printed = 0; - for (l = 0; l < fs->rq_elements; l++) { - struct in_addr ina; - + pe = getprotobynumber(id->proto); /* XXX: Should check for IPv4 flows */ - if (IS_IP6_FLOW_ID(&(q[l].id))) - continue; - - if (!index_printed) { - index_printed = 1; - if (indexes > 0) /* currently a no-op */ - printf("\n"); - indexes++; - printf(" " - "mask: 0x%02x 0x%08x/0x%04x -> 0x%08x/0x%04x\n", - fs->flow_mask.proto, - fs->flow_mask.src_ip, fs->flow_mask.src_port, - fs->flow_mask.dst_ip, fs->flow_mask.dst_port); - - printf("BKT Prot ___Source IP/port____ " - "____Dest. IP/port____ " - "Tot_pkt/bytes Pkt/Byte Drp\n"); - } - - printf("%3d ", q[l].hash_slot); - pe = getprotobynumber(q[l].id.proto); + printf("%3u%c", (ni->oid.id) & 0xff, + id->extra ? '*' : ' '); + if (!IS_IP6_FLOW_ID(id)) { if (pe) printf("%-4s ", pe->p_name); else - printf("%4u ", q[l].id.proto); - ina.s_addr = htonl(q[l].id.src_ip); + printf("%4u ", id->proto); + ina.s_addr = htonl(id->src_ip); printf("%15s/%-5d ", - inet_ntoa(ina), q[l].id.src_port); - ina.s_addr = htonl(q[l].id.dst_ip); + inet_ntoa(ina), id->src_port); + ina.s_addr = htonl(id->dst_ip); printf("%15s/%-5d ", - inet_ntoa(ina), q[l].id.dst_port); - printf("%4llu %8llu %2u %4u %3u\n", - align_uint64(&q[l].tot_pkts), - align_uint64(&q[l].tot_bytes), - q[l].len, q[l].len_bytes, q[l].drops); - if (co.verbose) - printf(" S %20llu F %20llu\n", - align_uint64(&q[l].S), align_uint64(&q[l].F)); - } - - /* Print IPv6 flows */ - index_printed = 0; - for (l = 0; l < fs->rq_elements; l++) { - if (!IS_IP6_FLOW_ID(&(q[l].id))) - continue; - - if (!index_printed) { - index_printed = 1; - if (indexes > 0) - printf("\n"); - indexes++; - printf("\n mask: proto: 0x%02x, flow_id: 0x%08x, ", - fs->flow_mask.proto, fs->flow_mask.flow_id6); - inet_ntop(AF_INET6, &(fs->flow_mask.src_ip6), - buff, sizeof(buff)); - printf("%s/0x%04x -> ", buff, fs->flow_mask.src_port); - inet_ntop( AF_INET6, &(fs->flow_mask.dst_ip6), - buff, sizeof(buff) ); - printf("%s/0x%04x\n", buff, fs->flow_mask.dst_port); - - printf("BKT ___Prot___ _flow-id_ " - "______________Source IPv6/port_______________ " - "_______________Dest. IPv6/port_______________ " - "Tot_pkt/bytes Pkt/Byte Drp\n"); - } - printf("%3d ", q[l].hash_slot); - pe = getprotobynumber(q[l].id.proto); + inet_ntoa(ina), id->dst_port); + } else { + /* Print IPv6 flows */ if (pe != NULL) printf("%9s ", pe->p_name); else - printf("%9u ", q[l].id.proto); - printf("%7d %39s/%-5d ", q[l].id.flow_id6, - inet_ntop(AF_INET6, &(q[l].id.src_ip6), buff, sizeof(buff)), - q[l].id.src_port); + printf("%9u ", id->proto); + printf("%7d %39s/%-5d ", id->flow_id6, + inet_ntop(AF_INET6, &(id->src_ip6), buff, sizeof(buff)), + id->src_port); printf(" %39s/%-5d ", - inet_ntop(AF_INET6, &(q[l].id.dst_ip6), buff, sizeof(buff)), - q[l].id.dst_port); - printf(" %4llu %8llu %2u %4u %3u\n", - align_uint64(&q[l].tot_pkts), - align_uint64(&q[l].tot_bytes), - q[l].len, q[l].len_bytes, q[l].drops); - if (co.verbose) - printf(" S %20llu F %20llu\n", - align_uint64(&q[l].S), - align_uint64(&q[l].F)); + inet_ntop(AF_INET6, &(id->dst_ip6), buff, sizeof(buff)), + id->dst_port); } + printf("%4llu %8llu %2u %4u %3u\n", + align_uint64(&ni->tot_pkts), + align_uint64(&ni->tot_bytes), + ni->length, ni->len_bytes, ni->drops); } static void -print_flowset_parms(struct dn_flow_set *fs, char *prefix) +print_flowset_parms(struct dn_fs *fs, char *prefix) { int l; char qs[30]; @@ -226,7 +218,7 @@ print_flowset_parms(struct dn_flow_set *fs, char *prefix) char red[90]; /* Display RED parameters */ l = fs->qsize; - if (fs->flags_fs & DN_QSIZE_IS_BYTES) { + if (fs->flags & DN_QSIZE_BYTES) { if (l >= 8192) sprintf(qs, "%d KB", l / 1024); else @@ -237,23 +229,34 @@ print_flowset_parms(struct dn_flow_set *fs, char *prefix) sprintf(plr, "plr %f", 1.0 * fs->plr / (double)(0x7fffffff)); else plr[0] = '\0'; - if (fs->flags_fs & DN_IS_RED) /* RED parameters */ + + if (fs->flags & DN_IS_RED) /* RED parameters */ sprintf(red, "\n\t %cRED w_q %f min_th %d max_th %d max_p %f", - (fs->flags_fs & DN_IS_GENTLE_RED) ? 'G' : ' ', + (fs->flags & DN_IS_GENTLE_RED) ? 'G' : ' ', 1.0 * fs->w_q / (double)(1 << SCALE_RED), - SCALE_VAL(fs->min_th), - SCALE_VAL(fs->max_th), + fs->min_th, + fs->max_th, 1.0 * fs->max_p / (double)(1 << SCALE_RED)); else sprintf(red, "droptail"); - printf("%s %s%s %d queues (%d buckets) %s\n", - prefix, qs, plr, fs->rq_elements, fs->rq_size, red); + if (prefix[0]) { + printf("%s %s%s %d queues (%d buckets) %s\n", + prefix, qs, plr, fs->oid.id, fs->buckets, red); + prefix[0] = '\0'; + } else { + printf("q%05d %s%s %d flows (%d buckets) sched %d " + "weight %d lmax %d pri %d %s\n", + fs->fs_nr, qs, plr, fs->oid.id, fs->buckets, + fs->sched_nr, fs->par[0], fs->par[1], fs->par[2], red); + if (fs->flags & DN_HAVE_MASK) + print_mask(&fs->flow_mask); + } } static void -print_extra_delay_parms(struct dn_pipe *p) +print_extra_delay_parms(struct dn_profile *p) { double loss; if (p->samples_no <= 0) @@ -265,105 +268,126 @@ print_extra_delay_parms(struct dn_pipe *p) p->name, loss, p->samples_no); } -void -ipfw_list_pipes(void *data, uint nbytes, int ac, char *av[]) +static void +flush_buf(char *buf) { - int rulenum; - void *next = data; - struct dn_pipe *p = (struct dn_pipe *) data; - struct dn_flow_set *fs; - struct dn_flow_queue *q; - int l; - - if (ac > 0) - rulenum = strtoul(*av++, NULL, 10); - else - rulenum = 0; - for (; nbytes >= sizeof *p; p = (struct dn_pipe *)next) { - double b = p->bandwidth; - char buf[30]; - char prefix[80]; - char burst[5 + 7]; - - if (SLIST_NEXT(p, next) != (struct dn_pipe *)DN_IS_PIPE) - break; /* done with pipes, now queues */ - - /* - * compute length, as pipe have variable size - */ - l = sizeof(*p) + p->fs.rq_elements * sizeof(*q); - next = (char *)p + l; - nbytes -= l; - - if ((rulenum != 0 && rulenum != p->pipe_nr) || co.do_pipe == 2) - continue; - - /* - * Print rate (or clocking interface) - */ - if (p->if_name[0] != '\0') - sprintf(buf, "%s", p->if_name); - else if (b == 0) - sprintf(buf, "unlimited"); - else if (b >= 1000000) - sprintf(buf, "%7.3f Mbit/s", b/1000000); - else if (b >= 1000) - sprintf(buf, "%7.3f Kbit/s", b/1000); - else - sprintf(buf, "%7.3f bit/s ", b); - - sprintf(prefix, "%05d: %s %4d ms ", - p->pipe_nr, buf, p->delay); - - print_flowset_parms(&(p->fs), prefix); - - if (humanize_number(burst, sizeof(burst), p->burst, - "Byte", HN_AUTOSCALE, 0) < 0 || co.verbose) - printf("\t burst: %ju Byte\n", p->burst); - else - printf("\t burst: %s\n", burst); - - print_extra_delay_parms(p); - - q = (struct dn_flow_queue *)(p+1); - list_queues(&(p->fs), q); - } - for (fs = next; nbytes >= sizeof *fs; fs = next) { - char prefix[80]; - - if (SLIST_NEXT(fs, next) != (struct dn_flow_set *)DN_IS_QUEUE) - break; - l = sizeof(*fs) + fs->rq_elements * sizeof(*q); - next = (char *)fs + l; - nbytes -= l; - - if (rulenum != 0 && ((rulenum != fs->fs_nr && co.do_pipe == 2) || - (rulenum != fs->parent_nr && co.do_pipe == 1))) { - continue; - } - - q = (struct dn_flow_queue *)(fs+1); - sprintf(prefix, "q%05d: weight %d pipe %d ", - fs->fs_nr, fs->weight, fs->parent_nr); - print_flowset_parms(fs, prefix); - list_queues(fs, q); + if (buf[0]) + printf("%s\n", buf); + buf[0] = '\0'; +} + +/* + * generic list routine. We expect objects in a specific order, i.e. + * PIPES AND SCHEDULERS: + * link; scheduler; internal flowset if any; instances + * we can tell a pipe from the number. + * + * FLOWSETS: + * flowset; queues; + * link i (int queue); scheduler i; si(i) { flowsets() : queues } + */ +static void +list_pipes(struct dn_id *oid, struct dn_id *end) +{ + char buf[160]; /* pending buffer */ + buf[0] = '\0'; + + for (; oid != end; oid = O_NEXT(oid, oid->len)) { + if (oid->len < sizeof(*oid)) + errx(1, "invalid oid len %d\n", oid->len); + + switch (oid->type) { + default: + flush_buf(buf); + printf("unrecognized object %d size %d\n", oid->type, oid->len); + break; + case DN_TEXT: /* list of attached flowsets */ + { + int i, l; + struct { + struct dn_id id; + uint32_t p[0]; + } *d = (void *)oid; + l = (oid->len - sizeof(*oid))/sizeof(d->p[0]); + if (l == 0) + break; + printf(" Children flowsets: "); + for (i = 0; i < l; i++) + printf("%u ", d->p[i]); + printf("\n"); + break; + } + case DN_CMD_GET: + if (co.verbose) + printf("answer for cmd %d, len %d\n", oid->type, oid->id); + break; + case DN_SCH: { + struct dn_sch *s = (struct dn_sch *)oid; + flush_buf(buf); + printf(" sched %d type %s flags 0x%x %d buckets %d active\n", + s->sched_nr, + s->name, s->flags, s->buckets, s->oid.id); + if (s->flags & DN_HAVE_MASK) + print_mask(&s->sched_mask); + } + break; + + case DN_FLOW: + list_flow((struct dn_flow *)oid); + break; + + case DN_LINK: { + struct dn_link *p = (struct dn_link *)oid; + double b = p->bandwidth; + char bwbuf[30]; + char burst[5 + 7]; + + /* This starts a new object so flush buffer */ + flush_buf(buf); + /* data rate */ + if (b == 0) + sprintf(bwbuf, "unlimited "); + else if (b >= 1000000) + sprintf(bwbuf, "%7.3f Mbit/s", b/1000000); + else if (b >= 1000) + sprintf(bwbuf, "%7.3f Kbit/s", b/1000); + else + sprintf(bwbuf, "%7.3f bit/s ", b); + + if (humanize_number(burst, sizeof(burst), p->burst, + "", HN_AUTOSCALE, 0) < 0 || co.verbose) + sprintf(burst, "%d", (int)p->burst); + sprintf(buf, "%05d: %s %4d ms burst %s", + p->link_nr % DN_MAX_ID, bwbuf, p->delay, burst); + } + break; + + case DN_FS: + print_flowset_parms((struct dn_fs *)oid, buf); + break; + case DN_PROFILE: + flush_buf(buf); + print_extra_delay_parms((struct dn_profile *)oid); } + flush_buf(buf); // XXX does it really go here ? + } } /* - * Delete pipe or queue i + * Delete pipe, queue or scheduler i */ int -ipfw_delete_pipe(int pipe_or_queue, int i) +ipfw_delete_pipe(int do_pipe, int i) { - struct dn_pipe p; - - memset(&p, 0, sizeof p); - if (pipe_or_queue == 1) - p.pipe_nr = i; /* pipe */ - else - p.fs.fs_nr = i; /* queue */ - i = do_cmd(IP_DUMMYNET_DEL, &p, sizeof p); + struct { + struct dn_id oid; + uintptr_t a[1]; /* add more if we want a list */ + } cmd; + oid_fill((void *)&cmd, sizeof(cmd), DN_CMD_DELETE, DN_API_VERSION); + cmd.oid.subtype = (do_pipe == 1) ? DN_LINK : + ( (do_pipe == 2) ? DN_FS : DN_SCH); + cmd.a[0] = i; + i = do_cmd(IP_DUMMYNET3, &cmd, cmd.oid.len); if (i) { i = 1; warn("rule %u: setsockopt(IP_DUMMYNET_DEL)", i); @@ -400,7 +424,7 @@ ipfw_delete_pipe(int pipe_or_queue, int i) * The empirical curve may have both vertical and horizontal lines. * Vertical lines represent constant delay for a range of * probabilities; horizontal lines correspond to a discontinuty - * in the delay distribution: the pipe will use the largest delay + * in the delay distribution: the link will use the largest delay * for a given probability. * * To pass the curve to dummynet, we must store the parameters @@ -490,9 +514,12 @@ static void read_bandwidth(char *arg, int *bandwidth, char *if_name, int namelen) { if (*bandwidth != -1) - warn("duplicate token, override bandwidth value!"); + warnx("duplicate token, override bandwidth value!"); if (arg[0] >= 'a' && arg[0] <= 'z') { + if (!if_name) { + errx(1, "no if support"); + } if (namelen >= IFNAMSIZ) warn("interface name truncated"); namelen--; @@ -508,7 +535,7 @@ read_bandwidth(char *arg, int *bandwidth, char *if_name, int namelen) if (*end == 'K' || *end == 'k') { end++; bw *= 1000; - } else if (*end == 'M') { + } else if (*end == 'M' || *end == 'm') { end++; bw *= 1000000; } @@ -521,7 +548,8 @@ read_bandwidth(char *arg, int *bandwidth, char *if_name, int namelen) errx(EX_DATAERR, "bandwidth too large"); *bandwidth = bw; - if_name[0] = '\0'; + if (if_name) + if_name[0] = '\0'; } } @@ -551,7 +579,8 @@ compare_points(const void *vp1, const void *vp2) #define ED_EFMT(s) EX_DATAERR,"error in %s at line %d: "#s,filename,lineno static void -load_extra_delays(const char *filename, struct dn_pipe *p) +load_extra_delays(const char *filename, struct dn_profile *p, + struct dn_link *link) { char line[ED_MAX_LINE_LEN]; FILE *f; @@ -566,6 +595,9 @@ load_extra_delays(const char *filename, struct dn_pipe *p) struct point points[ED_MAX_SAMPLES_NO]; int points_no = 0; + /* XXX link never NULL? */ + p->link_nr = link->link_nr; + profile_name[0] = '\0'; f = fopen(filename, "r"); if (f == NULL) @@ -606,7 +638,8 @@ load_extra_delays(const char *filename, struct dn_pipe *p) ED_MAX_SAMPLES_NO); do_points = 0; } else if (!strcasecmp(name, ED_TOK_BW)) { - read_bandwidth(arg, &p->bandwidth, p->if_name, sizeof(p->if_name)); + char buf[IFNAMSIZ]; + read_bandwidth(arg, &link->bandwidth, buf, sizeof(buf)); } else if (!strcasecmp(name, ED_TOK_LOSS)) { if (loss != -1.0) errx(ED_EFMT("duplicated token: %s"), name); @@ -676,17 +709,17 @@ load_extra_delays(const char *filename, struct dn_pipe *p) double y2 = points[i+1].prob * samples; double x2 = points[i+1].delay; - int index = y1; + int ix = y1; int stop = y2; if (x1 == x2) { - for (; index<stop; ++index) - p->samples[index] = x1; + for (; ix<stop; ++ix) + p->samples[ix] = x1; } else { double m = (y2-y1)/(x2-x1); double c = y1 - m*x1; - for (; index<stop ; ++index) - p->samples[index] = (index - c)/m; + for (; ix<stop ; ++ix) + p->samples[ix] = (ix - c)/m; } } p->samples_no = samples; @@ -694,27 +727,120 @@ load_extra_delays(const char *filename, struct dn_pipe *p) strncpy(p->name, profile_name, sizeof(p->name)); } +/* + * configuration of pipes, schedulers, flowsets. + * When we configure a new scheduler, an empty pipe is created, so: + * + * do_pipe = 1 -> "pipe N config ..." only for backward compatibility + * sched N+Delta type fifo sched_mask ... + * pipe N+Delta <parameters> + * flowset N+Delta pipe N+Delta (no parameters) + * sched N type wf2q+ sched_mask ... + * pipe N <parameters> + * + * do_pipe = 2 -> flowset N config + * flowset N parameters + * + * do_pipe = 3 -> sched N config + * sched N parameters (default no pipe) + * optional Pipe N config ... + * pipe ==> + */ void ipfw_config_pipe(int ac, char **av) { - int samples[ED_MAX_SAMPLES_NO]; - struct dn_pipe p; - int i; + int i, j; char *end; void *par = NULL; - - memset(&p, 0, sizeof p); - p.bandwidth = -1; + struct dn_id *buf, *base; + struct dn_sch *sch = NULL; + struct dn_link *p = NULL; + struct dn_fs *fs = NULL; + struct dn_profile *pf = NULL; + struct ipfw_flow_id *mask = NULL; + int lmax; + uint32_t _foo = 0, *flags = &_foo , *buckets = &_foo; + + /* + * allocate space for 1 header, + * 1 scheduler, 1 link, 1 flowset, 1 profile + */ + lmax = sizeof(struct dn_id); /* command header */ + lmax += sizeof(struct dn_sch) + sizeof(struct dn_link) + + sizeof(struct dn_fs) + sizeof(struct dn_profile); av++; ac--; /* Pipe number */ if (ac && isdigit(**av)) { i = atoi(*av); av++; ac--; - if (co.do_pipe == 1) - p.pipe_nr = i; - else - p.fs.fs_nr = i; + } else + i = -1; + if (i <= 0) + errx(EX_USAGE, "need a pipe/flowset/sched number"); + base = buf = safe_calloc(1, lmax); + /* all commands start with a 'CONFIGURE' and a version */ + o_next(&buf, sizeof(struct dn_id), DN_CMD_CONFIG); + base->id = DN_API_VERSION; + + switch (co.do_pipe) { + case 1: /* "pipe N config ..." */ + /* Allocate space for the WF2Q+ scheduler, its link + * and the FIFO flowset. Set the number, but leave + * the scheduler subtype and other parameters to 0 + * so the kernel will use appropriate defaults. + * XXX todo: add a flag to record if a parameter + * is actually configured. + * If we do a 'pipe config' mask -> sched_mask. + * The FIFO scheduler and link are derived from the + * WF2Q+ one in the kernel. + */ + sch = o_next(&buf, sizeof(*sch), DN_SCH); + p = o_next(&buf, sizeof(*p), DN_LINK); + fs = o_next(&buf, sizeof(*fs), DN_FS); + + sch->sched_nr = i; + sch->oid.subtype = 0; /* defaults to WF2Q+ */ + mask = &sch->sched_mask; + flags = &sch->flags; + buckets = &sch->buckets; + *flags |= DN_PIPE_CMD; + + p->link_nr = i; + + /* This flowset is only for the FIFO scheduler */ + fs->fs_nr = i + 2*DN_MAX_ID; + fs->sched_nr = i + DN_MAX_ID; + break; + + case 2: /* "queue N config ... " */ + fs = o_next(&buf, sizeof(*fs), DN_FS); + fs->fs_nr = i; + mask = &fs->flow_mask; + flags = &fs->flags; + buckets = &fs->buckets; + break; + + case 3: /* "sched N config ..." */ + sch = o_next(&buf, sizeof(*sch), DN_SCH); + fs = o_next(&buf, sizeof(*fs), DN_FS); + sch->sched_nr = i; + mask = &sch->sched_mask; + flags = &sch->flags; + buckets = &sch->buckets; + /* fs is used only with !MULTIQUEUE schedulers */ + fs->fs_nr = i + DN_MAX_ID; + fs->sched_nr = i; + break; } + /* set to -1 those fields for which we want to reuse existing + * values from the kernel. + * Also, *_nr and subtype = 0 mean reuse the value from the kernel. + * XXX todo: support reuse of the mask. + */ + if (p) + p->bandwidth = -1; + for (j = 0; j < sizeof(fs->par)/sizeof(fs->par[0]); j++) + fs->par[j] = -1; while (ac > 0) { double d; int tok = match_token(dummynet_params, *av); @@ -722,41 +848,48 @@ ipfw_config_pipe(int ac, char **av) switch(tok) { case TOK_NOERROR: - p.fs.flags_fs |= DN_NOERROR; + NEED(fs, "noerror is only for pipes"); + fs->flags |= DN_NOERROR; break; case TOK_PLR: + NEED(fs, "plr is only for pipes"); NEED1("plr needs argument 0..1\n"); d = strtod(av[0], NULL); if (d > 1) d = 1; else if (d < 0) d = 0; - p.fs.plr = (int)(d*0x7fffffff); + fs->plr = (int)(d*0x7fffffff); ac--; av++; break; case TOK_QUEUE: + NEED(fs, "queue is only for pipes or flowsets"); NEED1("queue needs queue size\n"); end = NULL; - p.fs.qsize = strtoul(av[0], &end, 0); + fs->qsize = strtoul(av[0], &end, 0); if (*end == 'K' || *end == 'k') { - p.fs.flags_fs |= DN_QSIZE_IS_BYTES; - p.fs.qsize *= 1024; + fs->flags |= DN_QSIZE_BYTES; + fs->qsize *= 1024; } else if (*end == 'B' || _substrcmp2(end, "by", "bytes") == 0) { - p.fs.flags_fs |= DN_QSIZE_IS_BYTES; + fs->flags |= DN_QSIZE_BYTES; } ac--; av++; break; case TOK_BUCKETS: + NEED(fs, "buckets is only for pipes or flowsets"); NEED1("buckets needs argument\n"); - p.fs.rq_size = strtoul(av[0], NULL, 0); + *buckets = strtoul(av[0], NULL, 0); ac--; av++; break; + case TOK_FLOW_MASK: + case TOK_SCHED_MASK: case TOK_MASK: + NEED(mask, "tok_mask"); NEED1("mask needs mask specifier\n"); /* * per-flow queue, mask is dst_ip, dst_port, @@ -764,7 +897,7 @@ ipfw_config_pipe(int ac, char **av) */ par = NULL; - bzero(&p.fs.flow_mask, sizeof(p.fs.flow_mask)); + bzero(mask, sizeof(*mask)); end = NULL; while (ac >= 1) { @@ -780,44 +913,55 @@ ipfw_config_pipe(int ac, char **av) case TOK_ALL: /* * special case, all bits significant + * except 'extra' (the queue number) */ - p.fs.flow_mask.dst_ip = ~0; - p.fs.flow_mask.src_ip = ~0; - p.fs.flow_mask.dst_port = ~0; - p.fs.flow_mask.src_port = ~0; - p.fs.flow_mask.proto = ~0; - n2mask(&(p.fs.flow_mask.dst_ip6), 128); - n2mask(&(p.fs.flow_mask.src_ip6), 128); - p.fs.flow_mask.flow_id6 = ~0; - p.fs.flags_fs |= DN_HAVE_FLOW_MASK; + mask->dst_ip = ~0; + mask->src_ip = ~0; + mask->dst_port = ~0; + mask->src_port = ~0; + mask->proto = ~0; + n2mask(&mask->dst_ip6, 128); + n2mask(&mask->src_ip6, 128); + mask->flow_id6 = ~0; + *flags |= DN_HAVE_MASK; + goto end_mask; + + case TOK_QUEUE: + mask->extra = ~0; + *flags |= DN_HAVE_MASK; goto end_mask; case TOK_DSTIP: - p32 = &p.fs.flow_mask.dst_ip; + mask->addr_type = 4; + p32 = &mask->dst_ip; break; case TOK_SRCIP: - p32 = &p.fs.flow_mask.src_ip; + mask->addr_type = 4; + p32 = &mask->src_ip; break; case TOK_DSTIP6: - pa6 = &(p.fs.flow_mask.dst_ip6); + mask->addr_type = 6; + pa6 = &mask->dst_ip6; break; case TOK_SRCIP6: - pa6 = &(p.fs.flow_mask.src_ip6); + mask->addr_type = 6; + pa6 = &mask->src_ip6; break; case TOK_FLOWID: - p20 = &p.fs.flow_mask.flow_id6; + mask->addr_type = 6; + p20 = &mask->flow_id6; break; case TOK_DSTPORT: - p16 = &p.fs.flow_mask.dst_port; + p16 = &mask->dst_port; break; case TOK_SRCPORT: - p16 = &p.fs.flow_mask.src_port; + p16 = &mask->src_port; break; case TOK_PROTO: @@ -857,10 +1001,10 @@ ipfw_config_pipe(int ac, char **av) if (a > 0xFF) errx(EX_DATAERR, "proto mask must be 8 bit"); - p.fs.flow_mask.proto = (uint8_t)a; + mask->proto = (uint8_t)a; } if (a != 0) - p.fs.flags_fs |= DN_HAVE_FLOW_MASK; + *flags |= DN_HAVE_MASK; ac--; av++; } /* end while, config masks */ end_mask: @@ -869,9 +1013,9 @@ end_mask: case TOK_RED: case TOK_GRED: NEED1("red/gred needs w_q/min_th/max_th/max_p\n"); - p.fs.flags_fs |= DN_IS_RED; + fs->flags |= DN_IS_RED; if (tok == TOK_GRED) - p.fs.flags_fs |= DN_IS_GENTLE_RED; + fs->flags |= DN_IS_GENTLE_RED; /* * the format for parameters is w_q/min_th/max_th/max_p */ @@ -879,82 +1023,108 @@ end_mask: double w_q = strtod(end, NULL); if (w_q > 1 || w_q <= 0) errx(EX_DATAERR, "0 < w_q <= 1"); - p.fs.w_q = (int) (w_q * (1 << SCALE_RED)); + fs->w_q = (int) (w_q * (1 << SCALE_RED)); } if ((end = strsep(&av[0], "/"))) { - p.fs.min_th = strtoul(end, &end, 0); + fs->min_th = strtoul(end, &end, 0); if (*end == 'K' || *end == 'k') - p.fs.min_th *= 1024; + fs->min_th *= 1024; } if ((end = strsep(&av[0], "/"))) { - p.fs.max_th = strtoul(end, &end, 0); + fs->max_th = strtoul(end, &end, 0); if (*end == 'K' || *end == 'k') - p.fs.max_th *= 1024; + fs->max_th *= 1024; } if ((end = strsep(&av[0], "/"))) { double max_p = strtod(end, NULL); if (max_p > 1 || max_p <= 0) errx(EX_DATAERR, "0 < max_p <= 1"); - p.fs.max_p = (int)(max_p * (1 << SCALE_RED)); + fs->max_p = (int)(max_p * (1 << SCALE_RED)); } ac--; av++; break; case TOK_DROPTAIL: - p.fs.flags_fs &= ~(DN_IS_RED|DN_IS_GENTLE_RED); + NEED(fs, "droptail is only for flowsets"); + fs->flags &= ~(DN_IS_RED|DN_IS_GENTLE_RED); break; case TOK_BW: + NEED(p, "bw is only for links"); NEED1("bw needs bandwidth or interface\n"); - if (co.do_pipe != 1) - errx(EX_DATAERR, "bandwidth only valid for pipes"); - read_bandwidth(av[0], &p.bandwidth, p.if_name, sizeof(p.if_name)); + read_bandwidth(av[0], &p->bandwidth, NULL, 0); ac--; av++; break; case TOK_DELAY: - if (co.do_pipe != 1) - errx(EX_DATAERR, "delay only valid for pipes"); + NEED(p, "delay is only for links"); NEED1("delay needs argument 0..10000ms\n"); - p.delay = strtoul(av[0], NULL, 0); + p->delay = strtoul(av[0], NULL, 0); + ac--; av++; + break; + + case TOK_TYPE: { + int l; + NEED(sch, "type is only for schedulers"); + NEED1("type needs a string"); + l = strlen(av[0]); + if (l == 0 || l > 15) + errx(1, "type %s too long\n", av[0]); + strcpy(sch->name, av[0]); + sch->oid.subtype = 0; /* use string */ ac--; av++; break; + } case TOK_WEIGHT: - if (co.do_pipe == 1) - errx(EX_DATAERR,"weight only valid for queues"); - NEED1("weight needs argument 0..100\n"); - p.fs.weight = strtoul(av[0], &end, 0); + NEED(fs, "weight is only for flowsets"); + NEED1("weight needs argument\n"); + fs->par[0] = strtol(av[0], &end, 0); + ac--; av++; + break; + + case TOK_LMAX: + NEED(fs, "lmax is only for flowsets"); + NEED1("lmax needs argument\n"); + fs->par[1] = strtol(av[0], &end, 0); ac--; av++; break; + case TOK_PRI: + NEED(fs, "priority is only for flowsets"); + NEED1("priority needs argument\n"); + fs->par[2] = strtol(av[0], &end, 0); + ac--; av++; + break; + + case TOK_SCHED: case TOK_PIPE: - if (co.do_pipe == 1) - errx(EX_DATAERR,"pipe only valid for queues"); - NEED1("pipe needs pipe_number\n"); - p.fs.parent_nr = strtoul(av[0], &end, 0); + NEED(fs, "pipe/sched"); + NEED1("pipe/link/sched needs number\n"); + fs->sched_nr = strtoul(av[0], &end, 0); ac--; av++; break; - case TOK_PIPE_PROFILE: - if (co.do_pipe != 1) - errx(EX_DATAERR, "extra delay only valid for pipes"); + case TOK_PROFILE: + NEED((!pf), "profile already set"); + NEED(p, "profile"); + { NEED1("extra delay needs the file name\n"); - p.samples = &samples[0]; - load_extra_delays(av[0], &p); + pf = o_next(&buf, sizeof(*pf), DN_PROFILE); + load_extra_delays(av[0], pf, p); //XXX can't fail? --ac; ++av; + } break; case TOK_BURST: - if (co.do_pipe != 1) - errx(EX_DATAERR, "burst only valid for pipes"); + NEED(p, "burst"); NEED1("burst needs argument\n"); errno = 0; - if (expand_number(av[0], &p.burst) < 0) + if (expand_number(av[0], (int64_t *)&p->burst) < 0) if (errno != ERANGE) errx(EX_DATAERR, "burst: invalid argument"); - if (errno || p.burst > (1ULL << 48) - 1) + if (errno || p->burst > (1ULL << 48) - 1) errx(EX_DATAERR, "burst: out of range (0..2^48-1)"); ac--; av++; @@ -964,26 +1134,17 @@ end_mask: errx(EX_DATAERR, "unrecognised option ``%s''", av[-1]); } } - if (co.do_pipe == 1) { - if (p.pipe_nr == 0) - errx(EX_DATAERR, "pipe_nr must be > 0"); - if (p.delay > 10000) - errx(EX_DATAERR, "delay must be < 10000"); - } else { /* co.do_pipe == 2, queue */ - if (p.fs.parent_nr == 0) - errx(EX_DATAERR, "pipe must be > 0"); - if (p.fs.weight >100) - errx(EX_DATAERR, "weight must be <= 100"); - } - /* check for bandwidth value */ - if (p.bandwidth == -1) { - p.bandwidth = 0; - if (p.samples_no > 0) - errx(EX_DATAERR, "profile requires a bandwidth limit"); + /* check validity of parameters */ + if (p) { + if (p->delay > 10000) + errx(EX_DATAERR, "delay must be < 10000"); + if (p->bandwidth == -1) + p->bandwidth = 0; } - - if (p.fs.flags_fs & DN_QSIZE_IS_BYTES) { + if (fs) { + /* XXX accept a 0 scheduler to keep the default */ + if (fs->flags & DN_QSIZE_BYTES) { size_t len; long limit; @@ -991,9 +1152,9 @@ end_mask: if (sysctlbyname("net.inet.ip.dummynet.pipe_byte_limit", &limit, &len, NULL, 0) == -1) limit = 1024*1024; - if (p.fs.qsize > limit) + if (fs->qsize > limit) errx(EX_DATAERR, "queue size must be < %ldB", limit); - } else { + } else { size_t len; long limit; @@ -1001,27 +1162,25 @@ end_mask: if (sysctlbyname("net.inet.ip.dummynet.pipe_slot_limit", &limit, &len, NULL, 0) == -1) limit = 100; - if (p.fs.qsize > limit) + if (fs->qsize > limit) errx(EX_DATAERR, "2 <= queue size <= %ld", limit); - } - if (p.fs.flags_fs & DN_IS_RED) { + } + + if (fs->flags & DN_IS_RED) { size_t len; int lookup_depth, avg_pkt_size; - double s, idle, weight, w_q; - struct clockinfo ck; - int t; + double w_q; - if (p.fs.min_th >= p.fs.max_th) + if (fs->min_th >= fs->max_th) errx(EX_DATAERR, "min_th %d must be < than max_th %d", - p.fs.min_th, p.fs.max_th); - if (p.fs.max_th == 0) + fs->min_th, fs->max_th); + if (fs->max_th == 0) errx(EX_DATAERR, "max_th must be > 0"); len = sizeof(int); if (sysctlbyname("net.inet.ip.dummynet.red_lookup_depth", &lookup_depth, &len, NULL, 0) == -1) - errx(1, "sysctlbyname(\"%s\")", - "net.inet.ip.dummynet.red_lookup_depth"); + lookup_depth = 256; if (lookup_depth == 0) errx(EX_DATAERR, "net.inet.ip.dummynet.red_lookup_depth" " must be greater than zero"); @@ -1029,18 +1188,13 @@ end_mask: len = sizeof(int); if (sysctlbyname("net.inet.ip.dummynet.red_avg_pkt_size", &avg_pkt_size, &len, NULL, 0) == -1) + avg_pkt_size = 512; - errx(1, "sysctlbyname(\"%s\")", - "net.inet.ip.dummynet.red_avg_pkt_size"); if (avg_pkt_size == 0) errx(EX_DATAERR, "net.inet.ip.dummynet.red_avg_pkt_size must" " be greater than zero"); - len = sizeof(struct clockinfo); - if (sysctlbyname("kern.clockrate", &ck, &len, NULL, 0) == -1) - errx(1, "sysctlbyname(\"%s\")", "kern.clockrate"); - /* * Ticks needed for sending a medium-sized packet. * Unfortunately, when we are configuring a WF2Q+ queue, we @@ -1050,38 +1204,181 @@ end_mask: * correct. But on the other hand, why do we want RED with * WF2Q+ ? */ +#if 0 if (p.bandwidth==0) /* this is a WF2Q+ queue */ s = 0; else s = (double)ck.hz * avg_pkt_size * 8 / p.bandwidth; - +#endif /* * max idle time (in ticks) before avg queue size becomes 0. * NOTA: (3/w_q) is approx the value x so that * (1-w_q)^x < 10^-3. */ - w_q = ((double)p.fs.w_q) / (1 << SCALE_RED); + w_q = ((double)fs->w_q) / (1 << SCALE_RED); +#if 0 // go in kernel idle = s * 3. / w_q; - p.fs.lookup_step = (int)idle / lookup_depth; - if (!p.fs.lookup_step) - p.fs.lookup_step = 1; + fs->lookup_step = (int)idle / lookup_depth; + if (!fs->lookup_step) + fs->lookup_step = 1; weight = 1 - w_q; - for (t = p.fs.lookup_step; t > 1; --t) + for (t = fs->lookup_step; t > 1; --t) weight *= 1 - w_q; - p.fs.lookup_weight = (int)(weight * (1 << SCALE_RED)); + fs->lookup_weight = (int)(weight * (1 << SCALE_RED)); +#endif + } } - if (p.samples_no <= 0) { - i = do_cmd(IP_DUMMYNET_CONFIGURE, &p, sizeof p); - } else { - struct dn_pipe_max pm; - int len = sizeof(pm); - memcpy(&pm.pipe, &p, sizeof(pm.pipe)); - memcpy(&pm.samples, samples, sizeof(pm.samples)); - - i = do_cmd(IP_DUMMYNET_CONFIGURE, &pm, len); - } + i = do_cmd(IP_DUMMYNET3, base, (char *)buf - (char *)base); if (i) err(1, "setsockopt(%s)", "IP_DUMMYNET_CONFIGURE"); } + +void +dummynet_flush(void) +{ + struct dn_id oid; + oid_fill(&oid, sizeof(oid), DN_CMD_FLUSH, DN_API_VERSION); + do_cmd(IP_DUMMYNET3, &oid, oid.len); +} + +/* Parse input for 'ipfw [pipe|sched|queue] show [range list]' + * Returns the number of ranges, and possibly stores them + * in the array v of size len. + */ +static int +parse_range(int ac, char *av[], uint32_t *v, int len) +{ + int n = 0; + char *endptr, *s; + uint32_t base[2]; + + if (v == NULL || len < 2) { + v = base; + len = 2; + } + + for (s = *av; s != NULL; av++, ac--) { + v[0] = strtoul(s, &endptr, 10); + v[1] = (*endptr != '-') ? v[0] : + strtoul(endptr+1, &endptr, 10); + if (*endptr == '\0') { /* prepare for next round */ + s = (ac > 0) ? *(av+1) : NULL; + } else { + if (*endptr != ',') { + warn("invalid number: %s", s); + s = ++endptr; + continue; + } + /* continue processing from here */ + s = ++endptr; + ac++; + av--; + } + if (v[1] < v[0] || + v[1] < 0 || v[1] >= DN_MAX_ID-1 || + v[0] < 0 || v[1] >= DN_MAX_ID-1) { + continue; /* invalid entry */ + } + n++; + /* translate if 'pipe list' */ + if (co.do_pipe == 1) { + v[0] += DN_MAX_ID; + v[1] += DN_MAX_ID; + } + v = (n*2 < len) ? v + 2 : base; + } + return n; +} + +/* main entry point for dummynet list functions. co.do_pipe indicates + * which function we want to support. + * av may contain filtering arguments, either individual entries + * or ranges, or lists (space or commas are valid separators). + * Format for a range can be n1-n2 or n3 n4 n5 ... + * In a range n1 must be <= n2, otherwise the range is ignored. + * A number 'n4' is translate in a range 'n4-n4' + * All number must be > 0 and < DN_MAX_ID-1 + */ +void +dummynet_list(int ac, char *av[], int show_counters) +{ + struct dn_id *oid, *x = NULL; + int ret, i, l; + int n; /* # of ranges */ + int buflen; + int max_size; /* largest obj passed up */ + + ac--; + av++; /* skip 'list' | 'show' word */ + + n = parse_range(ac, av, NULL, 0); /* Count # of ranges. */ + + /* Allocate space to store ranges */ + l = sizeof(*oid) + sizeof(uint32_t) * n * 2; + oid = safe_calloc(1, l); + oid_fill(oid, l, DN_CMD_GET, DN_API_VERSION); + + if (n > 0) /* store ranges in idx */ + parse_range(ac, av, (uint32_t *)(oid + 1), n*2); + /* + * Compute the size of the largest object returned. If the + * response leaves at least this much spare space in the + * buffer, then surely the response is complete; otherwise + * there might be a risk of truncation and we will need to + * retry with a larger buffer. + * XXX don't bother with smaller structs. + */ + max_size = sizeof(struct dn_fs); + if (max_size < sizeof(struct dn_sch)) + max_size = sizeof(struct dn_sch); + if (max_size < sizeof(struct dn_flow)) + max_size = sizeof(struct dn_flow); + + switch (co.do_pipe) { + case 1: + oid->subtype = DN_LINK; /* list pipe */ + break; + case 2: + oid->subtype = DN_FS; /* list queue */ + break; + case 3: + oid->subtype = DN_SCH; /* list sched */ + break; + } + + /* + * Ask the kernel an estimate of the required space (result + * in oid.id), unless we are requesting a subset of objects, + * in which case the kernel does not give an exact answer. + * In any case, space might grow in the meantime due to the + * creation of new queues, so we must be prepared to retry. + */ + if (n > 0) { + buflen = 4*1024; + } else { + ret = do_cmd(-IP_DUMMYNET3, oid, (uintptr_t)&l); + if (ret != 0 || oid->id <= sizeof(*oid)) + goto done; + buflen = oid->id + max_size; + oid->len = sizeof(*oid); /* restore */ + } + /* Try a few times, until the buffer fits */ + for (i = 0; i < 20; i++) { + l = buflen; + x = safe_realloc(x, l); + bcopy(oid, x, oid->len); + ret = do_cmd(-IP_DUMMYNET3, x, (uintptr_t)&l); + if (ret != 0 || x->id <= sizeof(*oid)) + goto done; /* no response */ + if (l + max_size <= buflen) + break; /* ok */ + buflen *= 2; /* double for next attempt */ + } + list_pipes(x, O_NEXT(x, l)); +done: + if (x) + free(x); + free(oid); +} diff --git a/sbin/ipfw/ipfw.8 b/sbin/ipfw/ipfw.8 index f8b0746..897cd3f 100644 --- a/sbin/ipfw/ipfw.8 +++ b/sbin/ipfw/ipfw.8 @@ -6,8 +6,10 @@ .Os .Sh NAME .Nm ipfw -.Nd IP firewall and traffic shaper control program +.Nd User interface for firewall, traffic shaper, packet scheduler, +in-kernel NAT. .Sh SYNOPSIS +.Ss FIREWALL CONFIGURATION .Nm .Op Fl cq .Cm add @@ -26,12 +28,6 @@ .Op Cm set Ar N .Brq Cm delete | zero | resetlog .Op Ar number ... -.Nm -.Cm enable -.Brq Cm firewall | altq | one_pass | debug | verbose | dyn_keepalive -.Nm -.Cm disable -.Brq Cm firewall | altq | one_pass | debug | verbose | dyn_keepalive .Pp .Nm .Cm set Oo Cm disable Ar number ... Oc Op Cm enable Ar number ... @@ -43,7 +39,16 @@ .Cm set swap Ar number number .Nm .Cm set show +.Ss SYSCTL SHORTCUTS +.Pp +.Nm +.Cm enable +.Brq Cm firewall | altq | one_pass | debug | verbose | dyn_keepalive +.Nm +.Cm disable +.Brq Cm firewall | altq | one_pass | debug | verbose | dyn_keepalive .Pp +.Ss LOOKUP TABLES .Nm .Cm table Ar number Cm add Ar addr Ns Oo / Ns Ar masklen Oc Op Ar value .Nm @@ -57,17 +62,19 @@ .Brq Ar number | all .Cm list .Pp +.Ss DUMMYNET CONFIGURATION (TRAFFIC SHAPER AND PACKET SCHEDULER) .Nm -.Brq Cm pipe | queue +.Brq Cm pipe | queue | sched .Ar number .Cm config .Ar config-options .Nm .Op Fl s Op Ar field -.Brq Cm pipe | queue +.Brq Cm pipe | queue | sched .Brq Cm delete | list | show .Op Ar number ... .Pp +.Ss IN-KERNEL NAT .Nm .Op Fl q .Cm nat @@ -89,28 +96,27 @@ The .Nm utility is the user interface for controlling the .Xr ipfw 4 -firewall and the +firewall, the .Xr dummynet 4 -traffic shaper in -.Fx . +traffic shaper/packet scheduler, and the +in-kernel NAT services. .Pp -An -.Nm -configuration, or +A firewall configuration, or .Em ruleset , is made of a list of .Em rules numbered from 1 to 65535. -Packets are passed to -.Nm +Packets are passed to the firewall from a number of different places in the protocol stack (depending on the source and destination of the packet, -it is possible that -.Nm -is invoked multiple times on the same packet). +it is possible for the firewall to be +invoked multiple times on the same packet). The packet passed to the firewall is compared -against each of the rules in the firewall -.Em ruleset . +against each of the rules in the +.Em ruleset , +in rule-number order +(multiple rules with the same number are permitted, in which case +they are processed in order of insertion). When a match is found, the action corresponding to the matching rule is performed. .Pp @@ -118,9 +124,7 @@ Depending on the action and certain system settings, packets can be reinjected into the firewall at some rule after the matching one for further processing. .Pp -An -.Nm -ruleset always includes a +A ruleset always includes a .Em default rule (numbered 65535) which cannot be modified or deleted, and matches all packets. @@ -137,14 +141,14 @@ If the ruleset includes one or more rules with the or .Cm limit option, -.Nm -will have a +the firewall will have a .Em stateful -behaviour, i.e., upon a match it will create dynamic rules matching -the exact parameters (source and destination addresses and ports) -of the matching packet. -.Pp -These dynamic rules, which have a limited lifetime, are checked +behaviour, i.e., upon a match it will create +.Em dynamic rules , +i.e. rules that match packets with the same 5-tuple +(protocol, source and destination addresses and ports) +as the packet which caused their creation. +Dynamic rules, which have a limited lifetime, are checked at the first occurrence of a .Cm check-state , .Cm keep-state @@ -283,6 +287,7 @@ When listing, show last match timestamp as seconds from the epoch. This form can be more convenient for postprocessing by scripts. .El .Pp +.Ss LIST OF RULES AND PREPROCESSING To ease configuration, rules can be put into a file which is processed using .Nm @@ -322,14 +327,16 @@ This allows for flexible configuration files (like conditionalizing them on the local hostname) and the use of macros to centralize frequently required arguments like IP addresses. .Pp +.Ss TRAFFIC SHAPER CONFIGURATION The .Nm -.Cm pipe +.Cm pipe , queue and -.Cm queue -commands are used to configure the traffic shaper, as shown in the +.Cm sched +commands are used to configure the traffic shaper and packet scheduler. +See the .Sx TRAFFIC SHAPER (DUMMYNET) CONFIGURATION -Section below. +Section below for details. .Pp If the world and the kernel get out of sync the .Nm @@ -362,7 +369,7 @@ have this picture in mind in order to design a correct ruleset. | to devices | .Ed .Pp -As can be noted from the above picture, the number of +The number of times the same packet goes through the firewall can vary between 0 and 4 depending on packet source and destination, and system configuration. @@ -421,9 +428,9 @@ Keywords are case-sensitive, whereas arguments may or may not be case-sensitive depending on their nature (e.g.\& uid's are, hostnames are not). .Pp -In -.Nm ipfw2 -you can introduce spaces after commas ',' to make +Some arguments (e.g. port or address lists) are comma-separated +lists of values. +In this case, spaces after commas ',' are allowed to make the line more readable. You can also put the entire command (including flags) into a single argument. @@ -434,9 +441,7 @@ ipfw -q add deny src-ip 10.0.0.0/24, 127.0.0.1/8 ipfw "-q add deny src-ip 10.0.0.0/24, 127.0.0.1/8" .Ed .Sh RULE FORMAT -The format of -.Nm -rules is the following: +The format of firewall rules is the following: .Bd -ragged -offset indent .Bk -words .Op Ar rule_number @@ -496,7 +501,7 @@ in future forwarding decisions. .El .Pp Note that some of the above information, e.g.\& source MAC or IP addresses and -TCP/UDP ports, could easily be spoofed, so filtering on those fields +TCP/UDP ports, can be easily spoofed, so filtering on those fields alone might not guarantee the desired results. .Bl -tag -width indent .It Ar rule_number @@ -1002,6 +1007,7 @@ The second format with multiple addresses) is provided for convenience only and its use is discouraged. .It Ar addr : Oo Cm not Oc Bro +.Bl -tag -width indent .Cm any | me | me6 | .Cm table Ns Pq Ar number Ns Op , Ns Ar value .Ar | addr-list | addr-set @@ -1023,6 +1029,7 @@ is also specified, an entry will match only if it has this value. See the .Sx LOOKUP TABLES section below for more information on lookup tables. +.El .It Ar addr-list : ip-addr Ns Op Ns , Ns Ar addr-list .It Ar ip-addr : A host or subnet address specified in one of the following ways: @@ -1389,6 +1396,20 @@ of source and destination addresses and ports can be specified. Currently, only IPv4 flows are supported. +.It Cm lookup Bro Cm dst-ip | dst-port | src-ip | src-port | uid | jail Brc Ar N +Search an entry in lookup table +.Ar N +that matches the field specified as argument. +If not found, the match fails. +Otherwise, the match succeeds and +.Cm tablearg +is set to the value extracted from the table. +.Pp +This option can be useful to quickly dispatch traffic based on +certain packet fields. +See the +.Sx LOOKUP TABLES +section below for more information on lookup tables. .It Cm { MAC | mac } Ar dst-mac src-mac Match packets with a given .Ar dst-mac @@ -1480,7 +1501,7 @@ is invalid) whenever .Cm xmit is used. .Pp -A packet may not have a receive or transmit interface: packets +A packet might not have a receive or transmit interface: packets originating from the local host have no receive interface, while packets destined for the local host have no transmit interface. @@ -1627,15 +1648,17 @@ because it engages only on packets with source addresses of directly connected networks instead of all source addresses. .El .Sh LOOKUP TABLES -Lookup tables are useful to handle large sparse address sets, -typically from a hundred to several thousands of entries. +Lookup tables are useful to handle large sparse sets of +addresses or other search keys (e.g. ports, jail IDs). +In the rest of this section we will use the term ``address'' +to mean any unsigned value of up to 32-bit. There may be up to 128 different lookup tables, numbered 0 to 127. .Pp Each entry is represented by an .Ar addr Ns Op / Ns Ar masklen and will match all addresses with base .Ar addr -(specified as an IP address or a hostname) +(specified as an IP address, a hostname or an unsigned integer) and mask width of .Ar masklen bits. @@ -1653,9 +1676,9 @@ is not specified, it defaults to 0. .Pp An entry can be added to a table .Pq Cm add , -removed from a table -.Pq Cm delete , -a table can be examined +or removed from a table +.Pq Cm delete . +A table can be examined .Pq Cm list or flushed .Pq Cm flush . @@ -1664,7 +1687,7 @@ Internally, each table is stored in a Radix tree, the same way as the routing table (see .Xr route 4 ) . .Pp -Lookup tables currently support IPv4 addresses only. +Lookup tables currently support only ports, jail IDs and IPv4 addresses. .Pp The .Cm tablearg @@ -1822,9 +1845,9 @@ for more examples on how to use dynamic rules. .Nm is also the user interface for the .Nm dummynet -traffic shaper and network emulator, a subsystem that +traffic shaper, packet scheduler and network emulator, a subsystem that can artificially queue, delay or drop packets -emulator the behaviour of certain network links +emulating the behaviour of certain network links or queueing systems. .Pp .Nm dummynet @@ -1836,26 +1859,33 @@ Matching packets are then passed to either of two different objects, which implement the traffic regulation: .Bl -hang -offset XXXX .It Em pipe -A pipe emulates a link with given bandwidth, propagation delay, +A +.Em pipe +emulates a +.Em link +with given bandwidth and propagation delay, +driven by a FIFO scheduler and a single queue with programmable queue size and packet loss rate. -Packets are queued in front of the pipe as they come out from the classifier, -and then transferred to the pipe according to the pipe's parameters. +Packets are appended to the queue as they come out from +.Nm ipfw , +and then transferred in FIFO order to the link at the desired rate. .It Em queue -A queue -is an abstraction used to implement the WF2Q+ -(Worst-case Fair Weighted Fair Queueing) policy, which is -an efficient variant of the WFQ policy. -.Pp -The queue associates a -.Em weight -and a reference pipe to each flow (a flow is a set of packets -with the same addresses and ports after masking). -All backlogged flows (i.e., those -with packets queued) linked to the same pipe share the pipe's -bandwidth proportionally to their weights. -Note that weights are not priorities; a flow with a lower weight -is still guaranteed to get its fraction of the bandwidth even if a -flow with a higher weight is permanently backlogged. +A +.Em queue +is an abstraction used to implement packet scheduling +using one of several packet scheduling algorithms. +Packets sent to a +.Em queue +are first grouped into flows according to a mask on the 5-tuple. +Flows are then passed to the scheduler associated to the +.Em queue , +and each flow uses scheduling parameters (weight and others) +as configured in the +.Em queue +itself. +A scheduler in turn is connected to an emulated link, +and arbitrates the link's bandwidth among backlogged flows according to +weights and to the features of the scheduling algorithm in use. .El .Pp In practice, @@ -1864,6 +1894,52 @@ can be used to set hard limits to the bandwidth that a flow can use, whereas .Em queues can be used to determine how different flows share the available bandwidth. .Pp +A graphical representation of the binding of queues, +flows, schedulers and links is below. +.Bd -literal -offset indent + (flow_mask|sched_mask) sched_mask + +---------+ weight Wx +-------------+ + | |->-[flow]-->--| |-+ + -->--| QUEUE x | ... | | | + | |->-[flow]-->--| SCHEDuler N | | + +---------+ | | | + ... | +--[LINK N]-->-- + +---------+ weight Wy | | +--[LINK N]-->-- + | |->-[flow]-->--| | | + -->--| QUEUE y | ... | | | + | |->-[flow]-->--| | | + +---------+ +-------------+ | + +-------------+ +.Ed +It is important to understand the role of the SCHED_MASK +and FLOW_MASK, which are configured through the commands +.Dl "ipfw sched N config mask SCHED_MASK ..." +and +.Dl "ipfw queue X config mask FLOW_MASK ..." . +.Pp +The SCHED_MASK is used to assign flows to one or more +scheduler instances, one for each +value of the packet's 5-fuple after applying SCHED_MASK. +As an example, using ``src-ip 0xffffff00'' creates one instance +for each /24 destination subnet. +.Pp +The FLOW_MASK, together with the SCHED_MASK, is used to split +packets into flows. As an example, using +``src-ip 0x000000ff'' +together with the previous SCHED_MASK makes a flow for +each individual source address. In turn, flows for each /24 +subnet will be sent to the same scheduler instance. +.Pp +The above diagram holds even for the +.Em pipe +case, with the only restriction that a +.Em pipe +only supports a SCHED_MASK, and forces the use of a FIFO +scheduler (these are for backward compatibility reasons; +in fact, internally, a +.Nm dummynet's +pipe is implemented exactly as above). +.Pp There are two modes of .Nm dummynet operation: @@ -1895,16 +1971,19 @@ mode can be enabled by setting the .Xr sysctl 8 variable to a non-zero value. .Pp -.Ss PIPE AND QUEUE CONFIGURATION +.Ss PIPE, QUEUE AND SCHEDULER CONFIGURATION The -.Em pipe -and +.Em pipe , .Em queue +and +.Em scheduler configuration commands are the following: .Bd -ragged -offset indent .Cm pipe Ar number Cm config Ar pipe-configuration .Pp .Cm queue Ar number Cm config Ar queue-configuration +.Pp +.Cm sched Ar number Cm config Ar sched-configuration .Ed .Pp The following parameters can be configured for a pipe: @@ -2057,6 +2136,41 @@ Specifies the weight to be used for flows matching this queue. The weight must be in the range 1..100, and defaults to 1. .El .Pp +The following parameters can be configured for a scheduler: +.Pp +.Bl -tag -width indent -compact +.It Cm type Ar {fifo | wf2qp | rr | qfq} +specifies the scheduling algorithm to use. +.Bl -tag -width indent -compact +.It cm fifo +is just a FIFO scheduler (which means that all packets +are stored in the same queue as they arrive to the scheduler). +FIFO has O(1) per-packet time complexity, with very low +constants (estimate 60-80ns on a 2Ghz desktop machine) +but gives no service guarantees. +.It Cm wf2qp +implements the WF2Q+ algorithm, which is a Weighted Fair Queueing +algorithm which permits flows to share bandwidth according to +their weights. Note that weights are not priorities; even a flow +with a minuscule weight will never starve. +WF2Q+ has O(log N) per-packet processing cost, where N is the number +of flows, and is the default algorithm used by previous versions +dummynet's queues. +.It Cm rr +implements the Deficit Round Robin algorithm, which has O(1) processing +costs (roughly, 100-150ns per packet) +and permits bandwidth allocation according to weights, but +with poor service guarantees. +.It Cm qfq +implements the QFQ algorithm, which is a very fast variant of +WF2Q+, with similar service guarantees and O(1) processing +costs (roughly, 200-250ns per packet). +.El +.El +.Pp +In addition to the type, all parameters allowed for a pipe can also +be specified for a scheduler. +.Pp Finally, the following parameters can be configured for both pipes and queues: .Pp diff --git a/sbin/ipfw/ipfw2.c b/sbin/ipfw/ipfw2.c index b19f390..1ab827f 100644 --- a/sbin/ipfw/ipfw2.c +++ b/sbin/ipfw/ipfw2.c @@ -57,7 +57,7 @@ struct cmdline_opts co; /* global options */ int resvd_set_number = RESVD_SET; #define GET_UINT_ARG(arg, min, max, tok, s_x) do { \ - if (!ac) \ + if (!av[0]) \ errx(EX_USAGE, "%s: missing argument", match_value(s_x, tok)); \ if (_substrcmp(*av, "tablearg") == 0) { \ arg = IP_FW_TABLEARG; \ @@ -65,23 +65,23 @@ int resvd_set_number = RESVD_SET; } \ \ { \ - long val; \ + long _xval; \ char *end; \ \ - val = strtol(*av, &end, 10); \ + _xval = strtol(*av, &end, 10); \ \ - if (!isdigit(**av) || *end != '\0' || (val == 0 && errno == EINVAL)) \ + if (!isdigit(**av) || *end != '\0' || (_xval == 0 && errno == EINVAL)) \ errx(EX_DATAERR, "%s: invalid argument: %s", \ match_value(s_x, tok), *av); \ \ - if (errno == ERANGE || val < min || val > max) \ + if (errno == ERANGE || _xval < min || _xval > max) \ errx(EX_DATAERR, "%s: argument is out of range (%u..%u): %s", \ match_value(s_x, tok), min, max, *av); \ \ - if (val == IP_FW_TABLEARG) \ + if (_xval == IP_FW_TABLEARG) \ errx(EX_DATAERR, "%s: illegal argument value: %s", \ match_value(s_x, tok), *av); \ - arg = val; \ + arg = _xval; \ } \ } while (0) @@ -224,6 +224,15 @@ static struct _s_x rule_action_params[] = { { NULL, 0 } /* terminator */ }; +/* + * The 'lookup' instruction accepts one of the following arguments. + * -1 is a terminator for the list. + * Arguments are passed as v[1] in O_DST_LOOKUP options. + */ +static int lookup_key[] = { + TOK_DSTIP, TOK_SRCIP, TOK_DSTPORT, TOK_SRCPORT, + TOK_UID, TOK_JAIL, TOK_DSCP, -1 }; + static struct _s_x rule_options[] = { { "tagged", TOK_TAGGED }, { "uid", TOK_UID }, @@ -249,6 +258,7 @@ static struct _s_x rule_options[] = { { "iplen", TOK_IPLEN }, { "ipid", TOK_IPID }, { "ipprecedence", TOK_IPPRECEDENCE }, + { "dscp", TOK_DSCP }, { "iptos", TOK_IPTOS }, { "ipttl", TOK_IPTTL }, { "ipversion", TOK_IPVER }, @@ -290,6 +300,7 @@ static struct _s_x rule_options[] = { { "dst-ip6", TOK_DSTIP6}, { "src-ipv6", TOK_SRCIP6}, { "src-ip6", TOK_SRCIP6}, + { "lookup", TOK_LOOKUP}, { "//", TOK_COMMENT }, { "not", TOK_NOT }, /* pseudo option */ @@ -343,6 +354,7 @@ safe_realloc(void *ptr, size_t size) /* * conditionally runs the command. + * Selected options or negative -> getsockopt */ int do_cmd(int optname, void *optval, uintptr_t optlen) @@ -362,11 +374,15 @@ do_cmd(int optname, void *optval, uintptr_t optlen) optname == IP_FW_ADD || optname == IP_FW_TABLE_LIST || optname == IP_FW_TABLE_GETSIZE || optname == IP_FW_NAT_GET_CONFIG || - optname == IP_FW_NAT_GET_LOG) + optname < 0 || + optname == IP_FW_NAT_GET_LOG) { + if (optname < 0) + optname = -optname; i = getsockopt(s, IPPROTO_IP, optname, optval, (socklen_t *)optlen); - else + } else { i = setsockopt(s, IPPROTO_IP, optname, optval, optlen); + } return i; } @@ -739,9 +755,19 @@ static void print_ip(ipfw_insn_ip *cmd, char const *s) { struct hostent *he = NULL; - int len = F_LEN((ipfw_insn *)cmd); + uint32_t len = F_LEN((ipfw_insn *)cmd); uint32_t *a = ((ipfw_insn_u32 *)cmd)->d; + if (cmd->o.opcode == O_IP_DST_LOOKUP && len > F_INSN_SIZE(ipfw_insn_u32)) { + uint32_t d = a[1]; + const char *arg = "<invalid>"; + + if (d < sizeof(lookup_key)/sizeof(lookup_key[0])) + arg = match_value(rule_options, lookup_key[d]); + printf("%s lookup %s %d", cmd->o.len & F_NOT ? " not": "", + arg, cmd->o.arg1); + return; + } printf("%s%s ", cmd->o.len & F_NOT ? " not": "", s); if (cmd->o.opcode == O_IP_SRC_ME || cmd->o.opcode == O_IP_DST_ME) { @@ -1108,9 +1134,11 @@ show_ipfw(struct ip_fw *rule, int pcwidth, int bcwidth) else printf(" log"); } +#ifndef NO_ALTQ if (altqptr) { print_altq_cmd(altqptr); } +#endif if (tagptr) { if (tagptr->len & F_NOT) PRINT_UINT_ARG(" untag ", tagptr->arg1); @@ -1595,26 +1623,33 @@ show_dyn_ipfw(ipfw_dyn_rule *d, int pcwidth, int bcwidth) * ipfw set move rule X to Y */ void -ipfw_sets_handler(int ac, char *av[]) +ipfw_sets_handler(char *av[]) { uint32_t set_disable, masks[2]; int i, nbytes; uint16_t rulenum; uint8_t cmd, new_set; - ac--; av++; - if (!ac) + if (av[0] == NULL) errx(EX_USAGE, "set needs command"); if (_substrcmp(*av, "show") == 0) { - void *data; + void *data = NULL; char const *msg; - - nbytes = sizeof(struct ip_fw); + int nalloc; + + nalloc = nbytes = sizeof(struct ip_fw); + while (nbytes >= nalloc) { + if (data) + free(data); + nalloc = nalloc * 2 + 200; + nbytes = nalloc; data = safe_calloc(1, nbytes); if (do_cmd(IP_FW_GET, data, (uintptr_t)&nbytes) < 0) err(EX_OSERR, "getsockopt(IP_FW_GET)"); + } + bcopy(&((struct ip_fw *)data)->next_rule, &set_disable, sizeof(set_disable)); @@ -1631,8 +1666,8 @@ ipfw_sets_handler(int ac, char *av[]) } printf("\n"); } else if (_substrcmp(*av, "swap") == 0) { - ac--; av++; - if (ac != 2) + av++; + if ( av[0] == NULL || av[1] == NULL ) errx(EX_USAGE, "set swap needs 2 set numbers\n"); rulenum = atoi(av[0]); new_set = atoi(av[1]); @@ -1643,13 +1678,14 @@ ipfw_sets_handler(int ac, char *av[]) masks[0] = (4 << 24) | (new_set << 16) | (rulenum); i = do_cmd(IP_FW_DEL, masks, sizeof(uint32_t)); } else if (_substrcmp(*av, "move") == 0) { - ac--; av++; - if (ac && _substrcmp(*av, "rule") == 0) { + av++; + if (av[0] && _substrcmp(*av, "rule") == 0) { cmd = 2; - ac--; av++; + av++; } else cmd = 3; - if (ac != 3 || _substrcmp(av[1], "to") != 0) + if (av[0] == NULL || av[1] == NULL || av[2] == NULL || + av[3] != NULL || _substrcmp(av[1], "to") != 0) errx(EX_USAGE, "syntax: set move [rule] X to Y\n"); rulenum = atoi(av[0]); new_set = atoi(av[2]); @@ -1664,10 +1700,10 @@ ipfw_sets_handler(int ac, char *av[]) _substrcmp(*av, "enable") == 0 ) { int which = _substrcmp(*av, "enable") == 0 ? 1 : 0; - ac--; av++; + av++; masks[0] = masks[1] = 0; - while (ac) { + while (av[0]) { if (isdigit(**av)) { i = atoi(*av); if (i < 0 || i > RESVD_SET) @@ -1681,7 +1717,7 @@ ipfw_sets_handler(int ac, char *av[]) else errx(EX_DATAERR, "invalid set command %s\n", *av); - av++; ac--; + av++; } if ( (masks[0] & masks[1]) != 0 ) errx(EX_DATAERR, @@ -1695,12 +1731,11 @@ ipfw_sets_handler(int ac, char *av[]) } void -ipfw_sysctl_handler(int ac, char *av[], int which) +ipfw_sysctl_handler(char *av[], int which) { - ac--; av++; - if (ac == 0) { + if (av[0] == NULL) { warnx("missing keyword to enable/disable\n"); } else if (_substrcmp(*av, "firewall") == 0) { sysctlbyname("net.inet.ip.fw.enable", NULL, 0, @@ -1717,8 +1752,10 @@ ipfw_sysctl_handler(int ac, char *av[], int which) } else if (_substrcmp(*av, "dyn_keepalive") == 0) { sysctlbyname("net.inet.ip.fw.dyn_keepalive", NULL, 0, &which, sizeof(which)); +#ifndef NO_ALTQ } else if (_substrcmp(*av, "altq") == 0) { altq_set_enabled(which); +#endif } else { warnx("unrecognize enable/disable keyword: %s\n", *av); } @@ -1751,6 +1788,10 @@ ipfw_list(int ac, char *av[], int show_counters) fprintf(stderr, "Testing only, list disabled\n"); return; } + if (co.do_pipe) { + dummynet_list(ac, av, show_counters); + return; + } ac--; av++; @@ -1767,11 +1808,6 @@ ipfw_list(int ac, char *av[], int show_counters) co.do_pipe ? "DUMMYNET" : "FW"); } - if (co.do_pipe) { - ipfw_list_pipes(data, nbytes, ac, av); - goto done; - } - /* * Count static rules. They have variable size so we * need to scan the list to count them. @@ -2119,7 +2155,7 @@ fill_ip(ipfw_insn_ip *cmd, char *av) return; } /* A single IP can be stored in an optimized format */ - if (d[1] == ~0 && av == NULL && len == 0) { + if (d[1] == (uint32_t)~0 && av == NULL && len == 0) { cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32); return; } @@ -2188,29 +2224,28 @@ fill_flags(ipfw_insn *cmd, enum ipfw_opcodes opcode, void -ipfw_delete(int ac, char *av[]) +ipfw_delete(char *av[]) { uint32_t rulenum; int i; int exitval = EX_OK; int do_set = 0; - - av++; ac--; + av++; NEED1("missing rule specification"); - if (ac > 0 && _substrcmp(*av, "set") == 0) { + if ( *av && _substrcmp(*av, "set") == 0) { /* Do not allow using the following syntax: * ipfw set N delete set M */ if (co.use_set) errx(EX_DATAERR, "invalid syntax"); do_set = 1; /* delete set */ - ac--; av++; + av++; } /* Rule number */ - while (ac && isdigit(**av)) { - i = atoi(*av); av++; ac--; + while (*av && isdigit(**av)) { + i = atoi(*av); av++; if (co.do_nat) { exitval = do_cmd(IP_FW_NAT_DEL, &i, sizeof i); if (exitval) { @@ -2264,7 +2299,8 @@ fill_iface(ipfw_insn_if *cmd, char *arg) static void get_mac_addr_mask(const char *p, uint8_t *addr, uint8_t *mask) { - int i, l; + int i; + size_t l; char *ap, *ptr, *optr; struct ether_addr *mac; const char *macset = "0123456789abcdefABCDEF:"; @@ -2286,11 +2322,11 @@ get_mac_addr_mask(const char *p, uint8_t *addr, uint8_t *mask) if (ptr != NULL) { /* we have mask? */ if (p[ptr - optr - 1] == '/') { /* mask len */ - l = strtol(ptr, &ap, 10); - if (*ap != 0 || l > ETHER_ADDR_LEN * 8 || l < 0) + long ml = strtol(ptr, &ap, 10); + if (*ap != 0 || ml > ETHER_ADDR_LEN * 8 || ml < 0) errx(EX_DATAERR, "Incorrect mask length"); - for (i = 0; l > 0 && i < ETHER_ADDR_LEN; l -= 8, i++) - mask[i] = (l >= 8) ? 0xff: (~0) << (8 - l); + for (i = 0; ml > 0 && i < ETHER_ADDR_LEN; ml -= 8, i++) + mask[i] = (ml >= 8) ? 0xff: (~0) << (8 - ml); } else { /* mask */ l = strlen(ptr); if (strspn(ptr, macset) != l || @@ -2325,7 +2361,7 @@ next_cmd(ipfw_insn *cmd) * Takes arguments and copies them into a comment */ static void -fill_comment(ipfw_insn *cmd, int ac, char **av) +fill_comment(ipfw_insn *cmd, char **av) { int i, l; char *p = (char *)(cmd + 1); @@ -2334,7 +2370,7 @@ fill_comment(ipfw_insn *cmd, int ac, char **av) cmd->len = (cmd->len & (F_NOT | F_OR)); /* Compute length of comment string. */ - for (i = 0, l = 0; i < ac; i++) + for (i = 0, l = 0; av[i] != NULL; i++) l += strlen(av[i]) + 1; if (l == 0) return; @@ -2343,7 +2379,7 @@ fill_comment(ipfw_insn *cmd, int ac, char **av) "comment too long (max 80 chars)"); l = 1 + (l+3)/4; cmd->len = (cmd->len & (F_NOT | F_OR)) | l; - for (i = 0; i < ac; i++) { + for (i = 0; av[i] != NULL; i++) { strcpy(p, av[i]); p += strlen(av[i]); *p++ = ' '; @@ -2368,11 +2404,11 @@ fill_cmd(ipfw_insn *cmd, enum ipfw_opcodes opcode, int flags, uint16_t arg) * two microinstructions, and returns the pointer to the last one. */ static ipfw_insn * -add_mac(ipfw_insn *cmd, int ac, char *av[]) +add_mac(ipfw_insn *cmd, char *av[]) { ipfw_insn_mac *mac; - if (ac < 2) + if ( ( av[0] == NULL ) || ( av[1] == NULL ) ) errx(EX_DATAERR, "MAC dst src"); cmd->opcode = O_MACADDR2; @@ -2386,9 +2422,9 @@ add_mac(ipfw_insn *cmd, int ac, char *av[]) } static ipfw_insn * -add_mactype(ipfw_insn *cmd, int ac, char *av) +add_mactype(ipfw_insn *cmd, char *av) { - if (ac < 1) + if (!av) errx(EX_DATAERR, "missing MAC type"); if (strcmp(av, "any") != 0) { /* we have a non-null type */ fill_newports((ipfw_insn_u16 *)cmd, av, IPPROTO_ETHERTYPE); @@ -2496,6 +2532,7 @@ add_dstip(ipfw_insn *cmd, char *av) static ipfw_insn * add_ports(ipfw_insn *cmd, char *av, u_char proto, int opcode) { + /* XXX "any" is trapped before. Perhaps "to" */ if (_substrcmp(av, "any") == 0) { return NULL; } else if (fill_newports((ipfw_insn_u16 *)cmd, av, proto)) { @@ -2519,11 +2556,11 @@ add_src(ipfw_insn *cmd, char *av, u_char proto) *ch = '\0'; if (proto == IPPROTO_IPV6 || strcmp(av, "me6") == 0 || - inet_pton(AF_INET6, host, &a)) + inet_pton(AF_INET6, host, &a) == 1) ret = add_srcip6(cmd, av); /* XXX: should check for IPv4, not !IPv6 */ if (ret == NULL && (proto == IPPROTO_IP || strcmp(av, "me") == 0 || - !inet_pton(AF_INET6, host, &a))) + inet_pton(AF_INET6, host, &a) != 1)) ret = add_srcip(cmd, av); if (ret == NULL && strcmp(av, "any") != 0) ret = cmd; @@ -2545,11 +2582,11 @@ add_dst(ipfw_insn *cmd, char *av, u_char proto) *ch = '\0'; if (proto == IPPROTO_IPV6 || strcmp(av, "me6") == 0 || - inet_pton(AF_INET6, host, &a)) + inet_pton(AF_INET6, host, &a) == 1) ret = add_dstip6(cmd, av); /* XXX: should check for IPv4, not !IPv6 */ if (ret == NULL && (proto == IPPROTO_IP || strcmp(av, "me") == 0 || - !inet_pton(AF_INET6, host, &a))) + inet_pton(AF_INET6, host, &a) != 1)) ret = add_dstip(cmd, av); if (ret == NULL && strcmp(av, "any") != 0) ret = cmd; @@ -2571,7 +2608,7 @@ add_dst(ipfw_insn *cmd, char *av, u_char proto) * */ void -ipfw_add(int ac, char *av[]) +ipfw_add(char *av[]) { /* * rules are added into the 'rulebuf' and then copied in @@ -2610,37 +2647,36 @@ ipfw_add(int ac, char *av[]) cmd = (ipfw_insn *)cmdbuf; action = (ipfw_insn *)actbuf; - av++; ac--; + av++; /* [rule N] -- Rule number optional */ - if (ac && isdigit(**av)) { + if (av[0] && isdigit(**av)) { rule->rulenum = atoi(*av); av++; - ac--; } /* [set N] -- set number (0..RESVD_SET), optional */ - if (ac > 1 && _substrcmp(*av, "set") == 0) { + if (av[0] && !av[1] && _substrcmp(*av, "set") == 0) { int set = strtoul(av[1], NULL, 10); if (set < 0 || set > RESVD_SET) errx(EX_DATAERR, "illegal set %s", av[1]); rule->set = set; - av += 2; ac -= 2; + av += 2; } /* [prob D] -- match probability, optional */ - if (ac > 1 && _substrcmp(*av, "prob") == 0) { + if (av[0] && av[1] && _substrcmp(*av, "prob") == 0) { match_prob = strtod(av[1], NULL); if (match_prob <= 0 || match_prob > 1) errx(EX_DATAERR, "illegal match prob. %s", av[1]); - av += 2; ac -= 2; + av += 2; } /* action -- mandatory */ NEED1("missing action"); i = match_token(rule_actions, *av); - ac--; av++; + av++; action->len = 1; /* default */ switch(i) { case TOK_CHECKSTATE: @@ -2676,14 +2712,14 @@ ipfw_add(int ac, char *av[]) action->opcode = O_REJECT; NEED1("missing reject code"); fill_reject_code(&action->arg1, *av); - ac--; av++; + av++; break; case TOK_UNREACH6: action->opcode = O_UNREACH6; NEED1("missing unreach code"); fill_unreach6_code(&action->arg1, *av); - ac--; av++; + av++; break; case TOK_COUNT: @@ -2716,7 +2752,7 @@ ipfw_add(int ac, char *av[]) case TOK_TEE: action->opcode = O_TEE; chkarg: - if (!ac) + if (!av[0]) errx(EX_USAGE, "missing argument for %s", *(av - 1)); if (isdigit(**av)) { action->arg1 = strtoul(*av, NULL, 10); @@ -2735,7 +2771,7 @@ chkarg: errx(EX_DATAERR, "illegal divert/tee port"); } else errx(EX_DATAERR, "illegal argument for %s", *(av - 1)); - ac--; av++; + av++; break; case TOK_FORWARD: { @@ -2773,13 +2809,13 @@ chkarg: p->sa.sin_addr.s_addr = INADDR_ANY; else lookup_host(*av, &(p->sa.sin_addr)); - ac--; av++; + av++; break; } case TOK_COMMENT: /* pretend it is a 'count' rule followed by the comment */ action->opcode = O_COUNT; - ac++; av--; /* go back... */ + av--; /* go back... */ break; case TOK_SETFIB: @@ -2794,7 +2830,7 @@ chkarg: errx(EX_DATAERR, "fibs not suported.\n"); if (action->arg1 >= numfibs) /* Temporary */ errx(EX_DATAERR, "fib too large.\n"); - ac--; av++; + av++; break; } @@ -2814,8 +2850,8 @@ chkarg: * If they exist, it go first in the cmdbuf, but then it is * skipped in the copy section to the end of the buffer. */ - while (ac != 0 && (i = match_token(rule_action_params, *av)) != -1) { - ac--; av++; + while (av[0] != NULL && (i = match_token(rule_action_params, *av)) != -1) { + av++; switch (i) { case TOK_LOG: { @@ -2828,15 +2864,15 @@ chkarg: have_log = (ipfw_insn *)c; cmd->len = F_INSN_SIZE(ipfw_insn_log); cmd->opcode = O_LOG; - if (ac && _substrcmp(*av, "logamount") == 0) { - ac--; av++; + if (av[0] && _substrcmp(*av, "logamount") == 0) { + av++; NEED1("logamount requires argument"); l = atoi(*av); if (l < 0) errx(EX_DATAERR, "logamount must be positive"); c->max_log = l; - ac--; av++; + av++; } else { len = sizeof(c->max_log); if (sysctlbyname("net.inet.ip.fw.verbose_limit", @@ -2847,6 +2883,7 @@ chkarg: } break; +#ifndef NO_ALTQ case TOK_ALTQ: { ipfw_insn_altq *a = (ipfw_insn_altq *)cmd; @@ -2859,9 +2896,10 @@ chkarg: cmd->len = F_INSN_SIZE(ipfw_insn_altq); cmd->opcode = O_ALTQ; a->qid = altq_name_to_qid(*av); - ac--; av++; + av++; } break; +#endif case TOK_TAG: case TOK_UNTAG: { @@ -2874,7 +2912,7 @@ chkarg: rule_action_params); have_tag = cmd; fill_cmd(cmd, O_TAG, (i == TOK_TAG) ? 0: F_NOT, tag); - ac--; av++; + av++; break; } @@ -2888,13 +2926,13 @@ chkarg: goto done; #define OR_START(target) \ - if (ac && (*av[0] == '(' || *av[0] == '{')) { \ + if (av[0] && (*av[0] == '(' || *av[0] == '{')) { \ if (open_par) \ errx(EX_USAGE, "nested \"(\" not allowed\n"); \ prev = NULL; \ open_par = 1; \ if ( (av[0])[1] == '\0') { \ - ac--; av++; \ + av++; \ } else \ (*av)++; \ } \ @@ -2903,30 +2941,30 @@ chkarg: #define CLOSE_PAR \ if (open_par) { \ - if (ac && ( \ + if (av[0] && ( \ strcmp(*av, ")") == 0 || \ strcmp(*av, "}") == 0)) { \ prev = NULL; \ open_par = 0; \ - ac--; av++; \ + av++; \ } else \ errx(EX_USAGE, "missing \")\"\n"); \ } #define NOT_BLOCK \ - if (ac && _substrcmp(*av, "not") == 0) { \ + if (av[0] && _substrcmp(*av, "not") == 0) { \ if (cmd->len & F_NOT) \ errx(EX_USAGE, "double \"not\" not allowed\n"); \ cmd->len |= F_NOT; \ - ac--; av++; \ + av++; \ } #define OR_BLOCK(target) \ - if (ac && _substrcmp(*av, "or") == 0) { \ + if (av[0] && _substrcmp(*av, "or") == 0) { \ if (prev == NULL || open_par == 0) \ errx(EX_DATAERR, "invalid OR block"); \ prev->len |= F_OR; \ - ac--; av++; \ + av++; \ goto target; \ } \ CLOSE_PAR; @@ -2943,15 +2981,15 @@ chkarg: NEED1("missing protocol"); if (_substrcmp(*av, "MAC") == 0 || _substrcmp(*av, "mac") == 0) { - ac--; av++; /* the "MAC" keyword */ - add_mac(cmd, ac, av); /* exits in case of errors */ + av++; /* the "MAC" keyword */ + add_mac(cmd, av); /* exits in case of errors */ cmd = next_cmd(cmd); - ac -= 2; av += 2; /* dst-mac and src-mac */ + av += 2; /* dst-mac and src-mac */ NOT_BLOCK; NEED1("missing mac type"); - if (add_mactype(cmd, ac, av[0])) + if (add_mactype(cmd, av[0])) cmd = next_cmd(cmd); - ac--; av++; /* any or mac-type */ + av++; /* any or mac-type */ goto read_options; } #endif @@ -2963,7 +3001,7 @@ chkarg: NOT_BLOCK; NEED1("missing protocol"); if (add_proto_compat(cmd, *av, &proto)) { - av++; ac--; + av++; if (F_LEN(cmd) != 0) { prev = cmd; cmd = next_cmd(cmd); @@ -2977,9 +3015,9 @@ chkarg: /* * "from", mandatory */ - if (!ac || _substrcmp(*av, "from") != 0) + if ((av[0] == NULL) || _substrcmp(*av, "from") != 0) errx(EX_USAGE, "missing ``from''"); - ac--; av++; + av++; /* * source IP, mandatory @@ -2988,7 +3026,7 @@ chkarg: NOT_BLOCK; /* optional "not" */ NEED1("missing source address"); if (add_src(cmd, *av, proto)) { - ac--; av++; + av++; if (F_LEN(cmd) != 0) { /* ! any */ prev = cmd; cmd = next_cmd(cmd); @@ -3001,10 +3039,10 @@ chkarg: * source ports, optional */ NOT_BLOCK; /* optional "not" */ - if (ac) { + if ( av[0] != NULL ) { if (_substrcmp(*av, "any") == 0 || add_ports(cmd, *av, proto, O_IP_SRCPORT)) { - ac--; av++; + av++; if (F_LEN(cmd) != 0) cmd = next_cmd(cmd); } @@ -3013,9 +3051,9 @@ chkarg: /* * "to", mandatory */ - if (!ac || _substrcmp(*av, "to") != 0) + if ( (av[0] == NULL) || _substrcmp(*av, "to") != 0 ) errx(EX_USAGE, "missing ``to''"); - av++; ac--; + av++; /* * destination, mandatory @@ -3024,7 +3062,7 @@ chkarg: NOT_BLOCK; /* optional "not" */ NEED1("missing dst address"); if (add_dst(cmd, *av, proto)) { - ac--; av++; + av++; if (F_LEN(cmd) != 0) { /* ! any */ prev = cmd; cmd = next_cmd(cmd); @@ -3037,17 +3075,17 @@ chkarg: * dest. ports, optional */ NOT_BLOCK; /* optional "not" */ - if (ac) { + if (av[0]) { if (_substrcmp(*av, "any") == 0 || add_ports(cmd, *av, proto, O_IP_DSTPORT)) { - ac--; av++; + av++; if (F_LEN(cmd) != 0) cmd = next_cmd(cmd); } } read_options: - if (ac && first_cmd == cmd) { + if (av[0] && first_cmd == cmd) { /* * nothing specified so far, store in the rule to ease * printout later. @@ -3055,7 +3093,7 @@ read_options: rule->_pad = 1; } prev = NULL; - while (ac) { + while ( av[0] != NULL ) { char *s; ipfw_insn_u32 *cmd32; /* alias for cmd */ @@ -3069,7 +3107,7 @@ read_options: s++; } i = match_token(rule_options, s); - ac--; av++; + av++; switch(i) { case TOK_NOT: if (cmd->len & F_NOT) @@ -3131,7 +3169,7 @@ read_options: NEED1("recv, xmit, via require interface name" " or address"); fill_iface((ipfw_insn_if *)cmd, av[0]); - ac--; av++; + av++; if (F_LEN(cmd) == 0) /* not a valid address */ break; if (i == TOK_XMIT) @@ -3145,13 +3183,13 @@ read_options: case TOK_ICMPTYPES: NEED1("icmptypes requires list of types"); fill_icmptypes((ipfw_insn_u32 *)cmd, *av); - av++; ac--; + av++; break; case TOK_ICMP6TYPES: NEED1("icmptypes requires list of types"); fill_icmp6types((ipfw_insn_icmp6 *)cmd, *av); - av++; ac--; + av++; break; case TOK_IPTTL: @@ -3161,7 +3199,7 @@ read_options: errx(EX_DATAERR, "invalid ipttl %s", *av); } else fill_cmd(cmd, O_IPTTL, 0, strtoul(*av, NULL, 0)); - ac--; av++; + av++; break; case TOK_IPID: @@ -3171,7 +3209,7 @@ read_options: errx(EX_DATAERR, "invalid ipid %s", *av); } else fill_cmd(cmd, O_IPID, 0, strtoul(*av, NULL, 0)); - ac--; av++; + av++; break; case TOK_IPLEN: @@ -3181,32 +3219,32 @@ read_options: errx(EX_DATAERR, "invalid ip len %s", *av); } else fill_cmd(cmd, O_IPLEN, 0, strtoul(*av, NULL, 0)); - ac--; av++; + av++; break; case TOK_IPVER: NEED1("ipver requires version"); fill_cmd(cmd, O_IPVER, 0, strtoul(*av, NULL, 0)); - ac--; av++; + av++; break; case TOK_IPPRECEDENCE: NEED1("ipprecedence requires value"); fill_cmd(cmd, O_IPPRECEDENCE, 0, (strtoul(*av, NULL, 0) & 7) << 5); - ac--; av++; + av++; break; case TOK_IPOPTS: NEED1("missing argument for ipoptions"); fill_flags(cmd, O_IPOPT, f_ipopts, *av); - ac--; av++; + av++; break; case TOK_IPTOS: NEED1("missing argument for iptos"); fill_flags(cmd, O_IPTOS, f_iptos, *av); - ac--; av++; + av++; break; case TOK_UID: @@ -3223,7 +3261,7 @@ read_options: errx(EX_DATAERR, "uid \"%s\" nonexistent", *av); cmd32->d[0] = pwd->pw_uid; cmd->len |= F_INSN_SIZE(ipfw_insn_u32); - ac--; av++; + av++; } break; @@ -3241,7 +3279,7 @@ read_options: errx(EX_DATAERR, "gid \"%s\" nonexistent", *av); cmd32->d[0] = grp->gr_gid; cmd->len |= F_INSN_SIZE(ipfw_insn_u32); - ac--; av++; + av++; } break; @@ -3257,7 +3295,7 @@ read_options: errx(EX_DATAERR, "jail requires prison ID"); cmd32->d[0] = (uint32_t)jid; cmd->len |= F_INSN_SIZE(ipfw_insn_u32); - ac--; av++; + av++; } break; @@ -3278,13 +3316,13 @@ read_options: } else fill_cmd(cmd, O_TCPDATALEN, 0, strtoul(*av, NULL, 0)); - ac--; av++; + av++; break; case TOK_TCPOPTS: NEED1("missing argument for tcpoptions"); fill_flags(cmd, O_TCPOPTS, f_tcpopts, *av); - ac--; av++; + av++; break; case TOK_TCPSEQ: @@ -3293,21 +3331,21 @@ read_options: cmd->len = F_INSN_SIZE(ipfw_insn_u32); cmd->opcode = (i == TOK_TCPSEQ) ? O_TCPSEQ : O_TCPACK; cmd32->d[0] = htonl(strtoul(*av, NULL, 0)); - ac--; av++; + av++; break; case TOK_TCPWIN: NEED1("tcpwin requires length"); fill_cmd(cmd, O_TCPWIN, 0, htons(strtoul(*av, NULL, 0))); - ac--; av++; + av++; break; case TOK_TCPFLAGS: NEED1("missing argument for tcpflags"); cmd->opcode = O_TCPFLAGS; fill_flags(cmd, O_TCPFLAGS, f_tcpflags, *av); - ac--; av++; + av++; break; case TOK_KEEPSTATE: @@ -3337,11 +3375,11 @@ read_options: cmd->opcode = O_LIMIT; c->limit_mask = c->conn_limit = 0; - while (ac > 0) { + while ( av[0] != NULL ) { if ((val = match_token(limit_masks, *av)) <= 0) break; c->limit_mask |= val; - ac--; av++; + av++; } if (c->limit_mask == 0) @@ -3350,14 +3388,14 @@ read_options: GET_UINT_ARG(c->conn_limit, IPFW_ARG_MIN, IPFW_ARG_MAX, TOK_LIMIT, rule_options); - ac--; av++; + av++; break; } case TOK_PROTO: NEED1("missing protocol"); if (add_proto(cmd, *av, &proto)) { - ac--; av++; + av++; } else errx(EX_DATAERR, "invalid protocol ``%s''", *av); @@ -3366,28 +3404,28 @@ read_options: case TOK_SRCIP: NEED1("missing source IP"); if (add_srcip(cmd, *av)) { - ac--; av++; + av++; } break; case TOK_DSTIP: NEED1("missing destination IP"); if (add_dstip(cmd, *av)) { - ac--; av++; + av++; } break; case TOK_SRCIP6: NEED1("missing source IP6"); if (add_srcip6(cmd, *av)) { - ac--; av++; + av++; } break; case TOK_DSTIP6: NEED1("missing destination IP6"); if (add_dstip6(cmd, *av)) { - ac--; av++; + av++; } break; @@ -3395,7 +3433,7 @@ read_options: NEED1("missing source port"); if (_substrcmp(*av, "any") == 0 || add_ports(cmd, *av, proto, O_IP_SRCPORT)) { - ac--; av++; + av++; } else errx(EX_DATAERR, "invalid source port %s", *av); break; @@ -3404,23 +3442,22 @@ read_options: NEED1("missing destination port"); if (_substrcmp(*av, "any") == 0 || add_ports(cmd, *av, proto, O_IP_DSTPORT)) { - ac--; av++; + av++; } else errx(EX_DATAERR, "invalid destination port %s", *av); break; case TOK_MAC: - if (add_mac(cmd, ac, av)) { - ac -= 2; av += 2; - } + if (add_mac(cmd, av)) + av += 2; break; case TOK_MACTYPE: NEED1("missing mac type"); - if (!add_mactype(cmd, ac, *av)) + if (!add_mactype(cmd, *av)) errx(EX_DATAERR, "invalid mac type %s", *av); - ac--; av++; + av++; break; case TOK_VERREVPATH: @@ -3449,7 +3486,7 @@ read_options: case TOK_EXT6HDR: fill_ext6hdr( cmd, *av ); - ac--; av++; + av++; break; case TOK_FLOWID: @@ -3457,17 +3494,16 @@ read_options: errx( EX_USAGE, "flow-id filter is active " "only for ipv6 protocol\n"); fill_flow6( (ipfw_insn_u32 *) cmd, *av ); - ac--; av++; + av++; break; case TOK_COMMENT: - fill_comment(cmd, ac, av); - av += ac; - ac = 0; + fill_comment(cmd, av); + av[0]=NULL; break; case TOK_TAGGED: - if (ac > 0 && strpbrk(*av, "-,")) { + if (av[0] && strpbrk(*av, "-,")) { if (!add_ports(cmd, *av, 0, O_TAGGED)) errx(EX_DATAERR, "tagged: invalid tag" " list: %s", *av); @@ -3479,13 +3515,38 @@ read_options: TOK_TAGGED, rule_options); fill_cmd(cmd, O_TAGGED, 0, tag); } - ac--; av++; + av++; break; case TOK_FIB: NEED1("fib requires fib number"); fill_cmd(cmd, O_FIB, 0, strtoul(*av, NULL, 0)); - ac--; av++; + av++; + break; + + case TOK_LOOKUP: { + ipfw_insn_u32 *c = (ipfw_insn_u32 *)cmd; + char *p; + int j; + + if (!av[0] || !av[1]) + errx(EX_USAGE, "format: lookup argument tablenum"); + cmd->opcode = O_IP_DST_LOOKUP; + cmd->len |= F_INSN_SIZE(ipfw_insn) + 2; + i = match_token(rule_options, *av); + for (j = 0; lookup_key[j] >= 0 ; j++) { + if (i == lookup_key[j]) + break; + } + if (lookup_key[j] <= 0) + errx(EX_USAGE, "format: cannot lookup on %s", *av); + c->d[1] = j; // i converted to option + av++; + cmd->arg1 = strtoul(*av, &p, 0); + if (p && *p) + errx(EX_USAGE, "format: lookup argument tablenum"); + av++; + } break; default: @@ -3662,6 +3723,10 @@ ipfw_flush(int force) if (c == 'N') /* user said no */ return; } + if (co.do_pipe) { + dummynet_flush(); + return; + } /* `ipfw set N flush` - is the same that `ipfw delete set N` */ if (co.use_set) { uint32_t arg = ((co.use_set - 1) & 0xffff) | (1 << 24); @@ -3775,14 +3840,14 @@ ipfw_table_handler(int ac, char *av[]) } } } else if (_substrcmp(*av, "flush") == 0) { - a = is_all ? tables_max : (ent.tbl + 1); + a = is_all ? tables_max : (uint32_t)(ent.tbl + 1); do { if (do_cmd(IP_FW_TABLE_FLUSH, &ent.tbl, sizeof(ent.tbl)) < 0) err(EX_OSERR, "setsockopt(IP_FW_TABLE_FLUSH)"); } while (++ent.tbl < a); } else if (_substrcmp(*av, "list") == 0) { - a = is_all ? tables_max : (ent.tbl + 1); + a = is_all ? tables_max : (uint32_t)(ent.tbl + 1); do { table_list(ent, is_all); } while (++ent.tbl < a); diff --git a/sbin/ipfw/ipfw2.h b/sbin/ipfw/ipfw2.h index d3ce7fb..d172984 100644 --- a/sbin/ipfw/ipfw2.h +++ b/sbin/ipfw/ipfw2.h @@ -35,7 +35,7 @@ struct cmdline_opts { int do_resolv; /* try to resolve all ip to names */ int do_time; /* Show time stamps */ int do_quiet; /* Be quiet in add and flush */ - int do_pipe; /* this cmd refers to a pipe */ + int do_pipe; /* this cmd refers to a pipe/queue/sched */ int do_nat; /* this cmd refers to a nat config */ int do_dynamic; /* display dynamic rules */ int do_expired; /* display expired dynamic rules */ @@ -82,7 +82,10 @@ enum tokens { TOK_ACCEPT, TOK_COUNT, TOK_PIPE, + TOK_LINK, TOK_QUEUE, + TOK_FLOWSET, + TOK_SCHED, TOK_DIVERT, TOK_TEE, TOK_NETGRAPH, @@ -122,6 +125,7 @@ enum tokens { TOK_IPLEN, TOK_IPID, TOK_IPPRECEDENCE, + TOK_DSCP, TOK_IPTOS, TOK_IPTTL, TOK_IPVER, @@ -151,15 +155,23 @@ enum tokens { TOK_SRCPORT, TOK_ALL, TOK_MASK, + TOK_FLOW_MASK, + TOK_SCHED_MASK, TOK_BW, TOK_DELAY, - TOK_PIPE_PROFILE, + TOK_PROFILE, TOK_BURST, TOK_RED, TOK_GRED, TOK_DROPTAIL, TOK_PROTO, + /* dummynet tokens */ TOK_WEIGHT, + TOK_LMAX, + TOK_PRI, + TOK_TYPE, + TOK_SLOTSIZE, + TOK_IP, TOK_IF, TOK_ALOG, @@ -186,12 +198,14 @@ enum tokens { TOK_FIB, TOK_SETFIB, + TOK_LOOKUP, }; /* * the following macro returns an error message if we run out of * arguments. */ -#define NEED1(msg) {if (!ac) errx(EX_USAGE, msg);} +#define NEED(_p, msg) {if (!_p) errx(EX_USAGE, msg);} +#define NEED1(msg) {if (!(*av)) errx(EX_USAGE, msg);} unsigned long long align_uint64(const uint64_t *pll); @@ -235,14 +249,14 @@ struct _ipfw_insn_icmp6; extern int resvd_set_number; /* first-level command handlers */ -void ipfw_add(int ac, char *av[]); +void ipfw_add(char *av[]); void ipfw_show_nat(int ac, char **av); void ipfw_config_pipe(int ac, char **av); void ipfw_config_nat(int ac, char **av); -void ipfw_sets_handler(int ac, char *av[]); +void ipfw_sets_handler(char *av[]); void ipfw_table_handler(int ac, char *av[]); -void ipfw_sysctl_handler(int ac, char *av[], int which); -void ipfw_delete(int ac, char *av[]); +void ipfw_sysctl_handler(char *av[], int which); +void ipfw_delete(char *av[]); void ipfw_flush(int force); void ipfw_zero(int ac, char *av[], int optname); void ipfw_list(int ac, char *av[], int show_counters); @@ -254,7 +268,8 @@ u_int32_t altq_name_to_qid(const char *name); void print_altq_cmd(struct _ipfw_insn_altq *altqptr); /* dummynet.c */ -void ipfw_list_pipes(void *data, uint nbytes, int ac, char *av[]); +void dummynet_list(int ac, char *av[], int show_counters); +void dummynet_flush(void); int ipfw_delete_pipe(int pipe_or_queue, int n); /* ipv6.c */ diff --git a/sbin/ipfw/main.c b/sbin/ipfw/main.c index 3916057..cd39cf1 100644 --- a/sbin/ipfw/main.c +++ b/sbin/ipfw/main.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2002-2003 Luigi Rizzo + * Copyright (c) 2002-2003,2010 Luigi Rizzo * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp * Copyright (c) 1994 Ugen J.S.Antsilevich * @@ -80,31 +80,27 @@ help(void) } /* - * Free a the (locally allocated) copy of command line arguments. - */ -static void -free_args(int ac, char **av) -{ - int i; - - for (i=0; i < ac; i++) - free(av[i]); - free(av); -} - -/* * Called with the arguments, including program name because getopt * wants it to be present. * Returns 0 if successful, 1 if empty command, errx() in case of errors. + * First thing we do is process parameters creating an argv[] array + * which includes the program name and a NULL entry at the end. + * If we are called with a single string, we split it on whitespace. + * Also, arguments with a trailing ',' are joined to the next one. + * The pointers (av[]) and data are in a a single chunk of memory. + * av[0] points to the original program name, all other entries + * point into the allocated chunk. */ static int ipfw_main(int oldac, char **oldav) { - int ch, ac, save_ac; + int ch, ac; const char *errstr; char **av, **save_av; int do_acct = 0; /* Show packet/byte count */ int try_next = 0; /* set if pipe cmd not found */ + int av_size; /* compute the av size */ + char *av_p; /* used to build the av list */ #define WHITESP " \t\f\v\n\r" if (oldac < 2) @@ -112,10 +108,9 @@ ipfw_main(int oldac, char **oldav) if (oldac == 2) { /* - * If we are called with a single string, try to split it into - * arguments for subsequent parsing. - * But first, remove spaces after a ',', by copying the string - * in-place. + * If we are called with one argument, try to split it into + * words for subsequent parsing. Spaces after a ',' are + * removed by copying the string in-place. */ char *arg = oldav[1]; /* The string is the first arg. */ int l = strlen(arg); @@ -150,31 +145,59 @@ ipfw_main(int oldac, char **oldav) ac++; /* - * Allocate the argument list, including one entry for - * the program name because getopt expects it. + * Allocate the argument list structure as a single block + * of memory, containing pointers and the argument + * strings. We include one entry for the program name + * because getopt expects it, and a NULL at the end + * to simplify further parsing. */ - av = safe_calloc(ac + 1, sizeof(char *)); + ac++; /* add 1 for the program name */ + av_size = (ac+1) * sizeof(char *) + l + 1; + av = safe_calloc(av_size, 1); /* - * Second, copy arguments from arg[] to av[]. For each one, + * Init the argument pointer to the end of the array + * and copy arguments from arg[] to av[]. For each one, * j is the initial character, i is the one past the end. */ - for (ac = 1, i = j = 0; i < l; i++) + av_p = (char *)&av[ac+1]; + for (ac = 1, i = j = 0; i < l; i++) { if (index(WHITESP, arg[i]) != NULL || i == l-1) { if (i == l-1) i++; - av[ac] = safe_calloc(i-j+1, 1); - bcopy(arg+j, av[ac], i-j); + bcopy(arg+j, av_p, i-j); + av[ac] = av_p; + av_p += i-j; /* the lenght of the string */ + *av_p++ = '\0'; ac++; j = i + 1; } + } } else { /* * If an argument ends with ',' join with the next one. */ - int first, i, l; + int first, i, l=0; + + /* + * Allocate the argument list structure as a single block + * of memory, containing both pointers and the argument + * strings. We include some space for the program name + * because getopt expects it. + * We add an extra pointer to the end of the array, + * to make simpler further parsing. + */ + for (i=0; i<oldac; i++) + l += strlen(oldav[i]); - av = safe_calloc(oldac, sizeof(char *)); + av_size = (oldac+1) * sizeof(char *) + l + oldac; + av = safe_calloc(av_size, 1); + + /* + * Init the argument pointer to the end of the array + * and copy arguments from arg[] to av[] + */ + av_p = (char *)&av[oldac+1]; for (first = i = ac = 1, l = 0; i < oldac; i++) { char *arg = oldav[i]; int k = strlen(arg); @@ -182,11 +205,12 @@ ipfw_main(int oldac, char **oldav) l += k; if (arg[k-1] != ',' || i == oldac-1) { /* Time to copy. */ - av[ac] = safe_calloc(l+1, 1); + av[ac] = av_p; for (l=0; first <= i; first++) { - strcat(av[ac]+l, oldav[first]); - l += strlen(oldav[first]); + strcat(av_p, oldav[first]); + av_p += strlen(oldav[first]); } + *av_p++ = '\0'; ac++; l = 0; first = i+1; @@ -194,13 +218,47 @@ ipfw_main(int oldac, char **oldav) } } - av[0] = strdup(oldav[0]); /* copy progname from the caller */ + /* + * set the progname pointer to the original string + * and terminate the array with null + */ + av[0] = oldav[0]; + av[ac] = NULL; + /* Set the force flag for non-interactive processes */ if (!co.do_force) co.do_force = !isatty(STDIN_FILENO); +#ifdef EMULATE_SYSCTL /* sysctl emulation */ + if ( ac >= 2 && !strcmp(av[1], "sysctl")) { + char *s; + int i; + + if (ac != 3) { + printf( "sysctl emulation usage:\n" + " ipfw sysctl name[=value]\n" + " ipfw sysctl -a\n"); + return 0; + } + s = index(av[2], '='); + if (s == NULL) { + s = !strcmp(av[2], "-a") ? NULL : av[2]; + sysctlbyname(s, NULL, NULL, NULL, 0); + } else { /* ipfw sysctl x.y.z=value */ + /* assume an INT value, will extend later */ + if (s[1] == '\0') { + printf("ipfw sysctl: missing value\n\n"); + return 0; + } + *s = '\0'; + i = strtol(s+1, NULL, 0); + sysctlbyname(av[2], NULL, NULL, &i, sizeof(int)); + } + return 0; + } +#endif + /* Save arguments for final freeing of memory. */ - save_ac = ac; save_av = av; optind = optreset = 1; /* restart getopt() */ @@ -232,7 +290,7 @@ ipfw_main(int oldac, char **oldav) break; case 'h': /* help */ - free_args(save_ac, save_av); + free(save_av); help(); break; /* NOTREACHED */ @@ -273,7 +331,7 @@ ipfw_main(int oldac, char **oldav) break; default: - free_args(save_ac, save_av); + free(save_av); return 1; } @@ -304,6 +362,10 @@ ipfw_main(int oldac, char **oldav) co.do_pipe = 1; else if (_substrcmp(*av, "queue") == 0) co.do_pipe = 2; + else if (_substrcmp(*av, "flowset") == 0) + co.do_pipe = 2; + else if (_substrcmp(*av, "sched") == 0) + co.do_pipe = 3; else if (!strncmp(*av, "set", strlen(*av))) { if (ac > 1 && isdigit(av[1][0])) { co.use_set = strtonum(av[1], 0, resvd_set_number, @@ -335,7 +397,7 @@ ipfw_main(int oldac, char **oldav) if (co.use_set == 0) { if (_substrcmp(*av, "add") == 0) - ipfw_add(ac, av); + ipfw_add(av); else if (co.do_nat && _substrcmp(*av, "show") == 0) ipfw_show_nat(ac, av); else if (co.do_pipe && _substrcmp(*av, "config") == 0) @@ -343,20 +405,20 @@ ipfw_main(int oldac, char **oldav) else if (co.do_nat && _substrcmp(*av, "config") == 0) ipfw_config_nat(ac, av); else if (_substrcmp(*av, "set") == 0) - ipfw_sets_handler(ac, av); + ipfw_sets_handler(av); else if (_substrcmp(*av, "table") == 0) ipfw_table_handler(ac, av); else if (_substrcmp(*av, "enable") == 0) - ipfw_sysctl_handler(ac, av, 1); + ipfw_sysctl_handler(av, 1); else if (_substrcmp(*av, "disable") == 0) - ipfw_sysctl_handler(ac, av, 0); + ipfw_sysctl_handler(av, 0); else try_next = 1; } if (co.use_set || try_next) { if (_substrcmp(*av, "delete") == 0) - ipfw_delete(ac, av); + ipfw_delete(av); else if (_substrcmp(*av, "flush") == 0) ipfw_flush(co.do_force); else if (_substrcmp(*av, "zero") == 0) @@ -373,7 +435,7 @@ ipfw_main(int oldac, char **oldav) } /* Free memory allocated in the argument parsing. */ - free_args(save_ac, save_av); + free(save_av); return 0; } @@ -521,6 +583,20 @@ ipfw_readfile(int ac, char *av[]) int main(int ac, char *av[]) { +#if defined(_WIN32) && defined(TCC) + { + WSADATA wsaData; + int ret=0; + unsigned short wVersionRequested = MAKEWORD(2, 2); + ret = WSAStartup(wVersionRequested, &wsaData); + if (ret != 0) { + /* Tell the user that we could not find a usable */ + /* Winsock DLL. */ + printf("WSAStartup failed with error: %d\n", ret); + return 1; + } + } +#endif /* * If the last argument is an absolute pathname, interpret it * as a file to be preprocessed. diff --git a/sys/conf/files b/sys/conf/files index 313b51e..b488012 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -2474,13 +2474,24 @@ netinet/in_proto.c optional inet \ compile-with "${NORMAL_C} -I$S/contrib/pf" netinet/in_rmx.c optional inet netinet/ip_divert.c optional inet ipdivert ipfirewall +netinet/ipfw/dn_heap.c optional inet dummynet +netinet/ipfw/dn_sched_fifo.c optional inet dummynet +netinet/ipfw/dn_sched_rr.c optional inet dummynet +netinet/ipfw/dn_sched_wf2q.c optional inet dummynet +netinet/ipfw/dn_sched_qfq.c optional inet dummynet netinet/ipfw/ip_dummynet.c optional inet dummynet +netinet/ipfw/ip_dn_io.c optional inet dummynet +netinet/ipfw/ip_dn_glue.c optional inet dummynet netinet/ip_ecn.c optional inet | inet6 netinet/ip_encap.c optional inet | inet6 netinet/ip_fastfwd.c optional inet netinet/ipfw/ip_fw2.c optional inet ipfirewall \ compile-with "${NORMAL_C} -I$S/contrib/pf" +netinet/ipfw/ip_fw_dynamic.c optional inet ipfirewall +netinet/ipfw/ip_fw_log.c optional inet ipfirewall netinet/ipfw/ip_fw_pfil.c optional inet ipfirewall +netinet/ipfw/ip_fw_sockopt.c optional inet ipfirewall +netinet/ipfw/ip_fw_table.c optional inet ipfirewall netinet/ipfw/ip_fw_nat.c optional inet ipfirewall_nat netinet/ip_icmp.c optional inet netinet/ip_input.c optional inet diff --git a/sys/net/if_bridge.c b/sys/net/if_bridge.c index 1873095..fe4e18b 100644 --- a/sys/net/if_bridge.c +++ b/sys/net/if_bridge.c @@ -134,7 +134,7 @@ __FBSDID("$FreeBSD$"); #include <net/route.h> #include <netinet/ip_fw.h> -#include <netinet/ip_dummynet.h> +#include <netinet/ipfw/ip_fw_private.h> /* * Size of the route hash table. Must be a power of two. @@ -3058,20 +3058,28 @@ bridge_pfil(struct mbuf **mp, struct ifnet *bifp, struct ifnet *ifp, int dir) goto bad; } - if (V_ip_fw_chk_ptr && pfil_ipfw != 0 && dir == PFIL_OUT && ifp != NULL) { - struct dn_pkt_tag *dn_tag; + /* XXX this section is also in if_ethersubr.c */ + // XXX PFIL_OUT or DIR_OUT ? + if (V_ip_fw_chk_ptr && pfil_ipfw != 0 && + dir == PFIL_OUT && ifp != NULL) { + struct m_tag *mtag; error = -1; - dn_tag = ip_dn_claim_tag(*mp); - if (dn_tag != NULL) { - if (dn_tag->rule != NULL && V_fw_one_pass) - /* packet already partially processed */ + /* fetch the start point from existing tags, if any */ + mtag = m_tag_locate(*mp, MTAG_IPFW_RULE, 0, NULL); + if (mtag == NULL) { + args.rule.slot = 0; + } else { + struct ipfw_rule_ref *r; + + /* XXX can we free the tag after use ? */ + mtag->m_tag_id = PACKET_TAG_NONE; + r = (struct ipfw_rule_ref *)(mtag + 1); + /* packet already partially processed ? */ + if (r->info & IPFW_ONEPASS) goto ipfwpass; - args.rule = dn_tag->rule; /* matching rule to restart */ - args.rule_id = dn_tag->rule_id; - args.chain_id = dn_tag->chain_id; - } else - args.rule = NULL; + args.rule = *r; + } args.m = *mp; args.oif = ifp; @@ -3097,7 +3105,7 @@ bridge_pfil(struct mbuf **mp, struct ifnet *bifp, struct ifnet *ifp, int dir) * packet will return to us via bridge_dummynet(). */ args.oif = ifp; - ip_dn_io_ptr(mp, DN_TO_IFB_FWD, &args); + ip_dn_io_ptr(mp, DIR_FWD | PROTO_IFB, &args); return (error); } diff --git a/sys/net/if_ethersubr.c b/sys/net/if_ethersubr.c index 5dff9bc..46c44e9 100644 --- a/sys/net/if_ethersubr.c +++ b/sys/net/if_ethersubr.c @@ -70,9 +70,9 @@ #include <netinet/in.h> #include <netinet/in_var.h> #include <netinet/if_ether.h> -#include <netinet/ip_fw.h> -#include <netinet/ip_dummynet.h> #include <netinet/ip_var.h> +#include <netinet/ip_fw.h> +#include <netinet/ipfw/ip_fw_private.h> #endif #ifdef INET6 #include <netinet6/nd6.h> @@ -466,19 +466,23 @@ ether_ipfw_chk(struct mbuf **m0, struct ifnet *dst, int shared) struct mbuf *m; int i; struct ip_fw_args args; - struct dn_pkt_tag *dn_tag; - - dn_tag = ip_dn_claim_tag(*m0); + struct m_tag *mtag; - if (dn_tag != NULL) { - if (dn_tag->rule != NULL && V_fw_one_pass) + /* fetch start point from rule, if any */ + mtag = m_tag_locate(*m0, MTAG_IPFW_RULE, 0, NULL); + if (mtag == NULL) { + args.rule.slot = 0; + } else { /* dummynet packet, already partially processed */ + struct ipfw_rule_ref *r; + + /* XXX can we free it after use ? */ + mtag->m_tag_id = PACKET_TAG_NONE; + r = (struct ipfw_rule_ref *)(mtag + 1); + if (r->info & IPFW_ONEPASS) return (1); - args.rule = dn_tag->rule; /* matching rule to restart */ - args.rule_id = dn_tag->rule_id; - args.chain_id = dn_tag->chain_id; - } else - args.rule = NULL; + args.rule = *r; + } /* * I need some amt of data to be contiguous, and in case others need @@ -529,6 +533,7 @@ ether_ipfw_chk(struct mbuf **m0, struct ifnet *dst, int shared) return 1; if (ip_dn_io_ptr && (i == IP_FW_DUMMYNET)) { + int dir; /* * Pass the pkt to dummynet, which consumes it. * If shared, make a copy and keep the original. @@ -544,7 +549,8 @@ ether_ipfw_chk(struct mbuf **m0, struct ifnet *dst, int shared) */ *m0 = NULL ; } - ip_dn_io_ptr(&m, dst ? DN_TO_ETH_OUT: DN_TO_ETH_DEMUX, &args); + dir = PROTO_LAYER2 | (dst ? DIR_OUT : DIR_IN); + ip_dn_io_ptr(&m, dir, &args); return 0; } /* diff --git a/sys/net/radix.c b/sys/net/radix.c index 39b198e..9f2383d 100644 --- a/sys/net/radix.c +++ b/sys/net/radix.c @@ -33,7 +33,6 @@ /* * Routines to build and maintain radix trees for routing lookups. */ -#ifndef _RADIX_H_ #include <sys/param.h> #ifdef _KERNEL #include <sys/lock.h> @@ -41,20 +40,21 @@ #include <sys/rwlock.h> #include <sys/systm.h> #include <sys/malloc.h> -#include <sys/domain.h> -#else -#include <stdlib.h> -#endif #include <sys/syslog.h> #include <net/radix.h> -#endif - #include "opt_mpath.h" - #ifdef RADIX_MPATH #include <net/radix_mpath.h> #endif - +#else /* !_KERNEL */ +#include <stdio.h> +#include <strings.h> +#include <stdlib.h> +#define log(x, arg...) fprintf(stderr, ## arg) +#define panic(x) fprintf(stderr, "PANIC: %s", x), exit(1) +#define min(a, b) ((a) < (b) ? (a) : (b) ) +#include <net/radix.h> +#endif /* !_KERNEL */ static int rn_walktree_from(struct radix_node_head *h, void *a, void *m, walktree_f_t *f, void *w); @@ -72,6 +72,8 @@ static struct radix_node_head *mask_rnhead; /* * Work area -- the following point to 3 buffers of size max_keylen, * allocated in this order in a block of memory malloc'ed by rn_init. + * rn_zeros, rn_ones are set in rn_init and used in readonly afterwards. + * addmask_key is used in rn_addmask in rw mode and not thread-safe. */ static char *rn_zeros, *rn_ones, *addmask_key; @@ -135,8 +137,9 @@ static int rn_satisfies_leaf(char *trial, struct radix_node *leaf, * To make the assumption more explicit, we use the LEN() macro to access * this field. It is safe to pass an expression with side effects * to LEN() as the argument is evaluated only once. + * We cast the result to int as this is the dominant usage. */ -#define LEN(x) (*(const u_char *)(x)) +#define LEN(x) ( (int) (*(const u_char *)(x)) ) /* * XXX THIS NEEDS TO BE FIXED @@ -197,7 +200,7 @@ rn_refines(m_arg, n_arg) { register caddr_t m = m_arg, n = n_arg; register caddr_t lim, lim2 = lim = n + LEN(n); - int longer = LEN(n++) - (int)LEN(m++); + int longer = LEN(n++) - LEN(m++); int masks_are_equal = 1; if (longer > 0) @@ -250,10 +253,10 @@ rn_satisfies_leaf(trial, leaf, skip) char *cplim; int length = min(LEN(cp), LEN(cp2)); - if (cp3 == 0) + if (cp3 == NULL) cp3 = rn_ones; else - length = min(length, *(u_char *)cp3); + length = min(length, LEN(cp3)); cplim = cp + length; cp3 += skip; cp2 += skip; for (cp += skip; cp < cplim; cp++, cp2++, cp3++) if ((*cp ^ *cp2) & *cp3) @@ -424,7 +427,7 @@ rn_insert(v_arg, head, dupentry, nodes) { caddr_t v = v_arg; struct radix_node *top = head->rnh_treetop; - int head_off = top->rn_offset, vlen = (int)LEN(v); + int head_off = top->rn_offset, vlen = LEN(v); register struct radix_node *t = rn_search(v_arg, top); register caddr_t cp = v + head_off; register int b; @@ -933,7 +936,7 @@ on1: if (m) log(LOG_ERR, "rn_delete: Orphaned Mask %p at %p\n", - (void *)m, (void *)x); + m, x); } } /* @@ -1158,17 +1161,28 @@ rn_inithead(head, off) return (1); } +int +rn_detachhead(void **head) +{ + struct radix_node_head *rnh; + + KASSERT((head != NULL && *head != NULL), + ("%s: head already freed", __func__)); + rnh = *head; + + /* Free <left,root,right> nodes. */ + Free(rnh); + + *head = NULL; + return (1); +} + void -rn_init() +rn_init(int maxk) { char *cp, *cplim; -#ifdef _KERNEL - struct domain *dom; - for (dom = domains; dom; dom = dom->dom_next) - if (dom->dom_maxrtkey > max_keylen) - max_keylen = dom->dom_maxrtkey; -#endif + max_keylen = maxk; if (max_keylen == 0) { log(LOG_ERR, "rn_init: radix functions require max_keylen be set\n"); diff --git a/sys/net/radix.h b/sys/net/radix.h index e84072f..29659b5 100644 --- a/sys/net/radix.h +++ b/sys/net/radix.h @@ -160,8 +160,9 @@ struct radix_node_head { #define RADIX_NODE_HEAD_WLOCK_ASSERT(rnh) rw_assert(&(rnh)->rnh_lock, RA_WLOCKED) #endif /* _KERNEL */ -void rn_init(void); +void rn_init(int); int rn_inithead(void **, int); +int rn_detachhead(void **); int rn_refines(void *, void *); struct radix_node *rn_addmask(void *, int, int), diff --git a/sys/net/route.c b/sys/net/route.c index b00ea69..a938c9c 100644 --- a/sys/net/route.c +++ b/sys/net/route.c @@ -169,13 +169,20 @@ rt_tables_get_rnh(int table, int fam) static void route_init(void) { + struct domain *dom; + int max_keylen = 0; /* whack the tunable ints into line. */ if (rt_numfibs > RT_MAXFIBS) rt_numfibs = RT_MAXFIBS; if (rt_numfibs == 0) rt_numfibs = 1; - rn_init(); /* initialize all zeroes, all ones, mask table */ + + for (dom = domains; dom; dom = dom->dom_next) + if (dom->dom_maxrtkey > max_keylen) + max_keylen = dom->dom_maxrtkey; + + rn_init(max_keylen); /* init all zeroes, all ones, mask table */ } SYSINIT(route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, 0); diff --git a/sys/netgraph/ng_ipfw.c b/sys/netgraph/ng_ipfw.c index 46bac8e..d331828 100644 --- a/sys/netgraph/ng_ipfw.c +++ b/sys/netgraph/ng_ipfw.c @@ -43,9 +43,10 @@ #include <netinet/in.h> #include <netinet/in_systm.h> #include <netinet/in_var.h> +#include <netinet/ip_var.h> #include <netinet/ip_fw.h> +#include <netinet/ipfw/ip_fw_private.h> #include <netinet/ip.h> -#include <netinet/ip_var.h> #include <netgraph/ng_message.h> #include <netgraph/ng_parse.h> @@ -220,21 +221,23 @@ ng_ipfw_findhook1(node_p node, u_int16_t rulenum) static int ng_ipfw_rcvdata(hook_p hook, item_p item) { - struct ng_ipfw_tag *ngit; + struct ipfw_rule_ref *tag; struct mbuf *m; NGI_GET_M(item, m); NG_FREE_ITEM(item); - if ((ngit = (struct ng_ipfw_tag *)m_tag_locate(m, NGM_IPFW_COOKIE, 0, - NULL)) == NULL) { + tag = (struct ipfw_rule_ref *) + m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL); + if (tag == NULL) { NG_FREE_M(m); return (EINVAL); /* XXX: find smth better */ }; - switch (ngit->dir) { - case NG_IPFW_OUT: - { + if (tag->info & IPFW_INFO_IN) { + ip_input(m); + return (0); + } else { struct ip *ip; if (m->m_len < sizeof(struct ip) && @@ -243,27 +246,16 @@ ng_ipfw_rcvdata(hook_p hook, item_p item) ip = mtod(m, struct ip *); - ip->ip_len = ntohs(ip->ip_len); - ip->ip_off = ntohs(ip->ip_off); + SET_HOST_IPLEN(ip); return ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL); } - case NG_IPFW_IN: - ip_input(m); - return (0); - default: - panic("ng_ipfw_rcvdata: bad dir %u", ngit->dir); - } - - /* not reached */ - return (0); } static int ng_ipfw_input(struct mbuf **m0, int dir, struct ip_fw_args *fwa, int tee) { struct mbuf *m; - struct ng_ipfw_tag *ngit; struct ip *ip; hook_p hook; int error = 0; @@ -272,7 +264,7 @@ ng_ipfw_input(struct mbuf **m0, int dir, struct ip_fw_args *fwa, int tee) * Node must be loaded and corresponding hook must be present. */ if (fw_node == NULL || - (hook = ng_ipfw_findhook1(fw_node, fwa->cookie)) == NULL) { + (hook = ng_ipfw_findhook1(fw_node, fwa->rule.info)) == NULL) { if (tee == 0) m_freem(*m0); return (ESRCH); /* no hook associated with this rule */ @@ -284,20 +276,21 @@ ng_ipfw_input(struct mbuf **m0, int dir, struct ip_fw_args *fwa, int tee) * a copy of a packet and forward it into netgraph without a tag. */ if (tee == 0) { + struct m_tag *tag; + struct ipfw_rule_ref *r; m = *m0; *m0 = NULL; /* it belongs now to netgraph */ - if ((ngit = (struct ng_ipfw_tag *)m_tag_alloc(NGM_IPFW_COOKIE, - 0, TAGSIZ, M_NOWAIT|M_ZERO)) == NULL) { + tag = m_tag_alloc(MTAG_IPFW_RULE, 0, sizeof(*r), + M_NOWAIT|M_ZERO); + if (tag == NULL) { m_freem(m); return (ENOMEM); } - ngit->rule = fwa->rule; - ngit->rule_id = fwa->rule_id; - ngit->chain_id = fwa->chain_id; - ngit->dir = dir; - ngit->ifp = fwa->oif; - m_tag_prepend(m, &ngit->mt); + r = (struct ipfw_rule_ref *)(tag + 1); + *r = fwa->rule; + r->info = dir ? IPFW_INFO_IN : IPFW_INFO_OUT; + m_tag_prepend(m, tag); } else if ((m = m_dup(*m0, M_DONTWAIT)) == NULL) @@ -308,8 +301,6 @@ ng_ipfw_input(struct mbuf **m0, int dir, struct ip_fw_args *fwa, int tee) return (EINVAL); ip = mtod(m, struct ip *); - ip->ip_len = htons(ip->ip_len); - ip->ip_off = htons(ip->ip_off); NG_SEND_DATA_ONLY(error, hook, m); diff --git a/sys/netgraph/ng_ipfw.h b/sys/netgraph/ng_ipfw.h index 29039f2..c2cab6a 100644 --- a/sys/netgraph/ng_ipfw.h +++ b/sys/netgraph/ng_ipfw.h @@ -26,26 +26,8 @@ * $FreeBSD$ */ +#ifndef _NG_IPFW_H +#define _NG_IPFW_H #define NG_IPFW_NODE_TYPE "ipfw" #define NGM_IPFW_COOKIE 1105988990 - -#ifdef _KERNEL - -typedef int ng_ipfw_input_t(struct mbuf **, int, struct ip_fw_args *, int); -extern ng_ipfw_input_t *ng_ipfw_input_p; -#define NG_IPFW_LOADED (ng_ipfw_input_p != NULL) - -struct ng_ipfw_tag { - struct m_tag mt; /* tag header */ - struct ip_fw *rule; /* matching rule */ - uint32_t rule_id; /* matching rule id */ - uint32_t chain_id; /* ruleset id */ - struct ifnet *ifp; /* interface, for ip_output */ - int dir; -#define NG_IPFW_OUT 0 -#define NG_IPFW_IN 1 -}; - -#define TAGSIZ (sizeof(struct ng_ipfw_tag) - sizeof(struct m_tag)) - -#endif /* _KERNEL */ +#endif /* _NG_IPFW_H */ diff --git a/sys/netinet/in.h b/sys/netinet/in.h index 7aa1645..4a7e11a 100644 --- a/sys/netinet/in.h +++ b/sys/netinet/in.h @@ -754,6 +754,32 @@ void in_ifdetach(struct ifnet *); #define sintosa(sin) ((struct sockaddr *)(sin)) #define ifatoia(ifa) ((struct in_ifaddr *)(ifa)) +/* + * Historically, BSD keeps ip_len and ip_off in host format + * when doing layer 3 processing, and this often requires + * to translate the format back and forth. + * To make the process explicit, we define a couple of macros + * that also take into account the fact that at some point + * we may want to keep those fields always in net format. + */ + +#if (BYTE_ORDER == BIG_ENDIAN) || defined(HAVE_NET_IPLEN) +#define SET_NET_IPLEN(p) do {} while (0) +#define SET_HOST_IPLEN(p) do {} while (0) +#else +#define SET_NET_IPLEN(p) do { \ + struct ip *h_ip = (p); \ + h_ip->ip_len = htons(h_ip->ip_len); \ + h_ip->ip_off = htons(h_ip->ip_off); \ + } while (0) + +#define SET_HOST_IPLEN(p) do { \ + struct ip *h_ip = (p); \ + h_ip->ip_len = ntohs(h_ip->ip_len); \ + h_ip->ip_off = ntohs(h_ip->ip_off); \ + } while (0) +#endif /* !HAVE_NET_IPLEN */ + #endif /* _KERNEL */ /* INET6 stuff */ diff --git a/sys/netinet/ip_divert.c b/sys/netinet/ip_divert.c index 401c090..9e7f062 100644 --- a/sys/netinet/ip_divert.c +++ b/sys/netinet/ip_divert.c @@ -32,14 +32,10 @@ __FBSDID("$FreeBSD$"); #if !defined(KLD_MODULE) #include "opt_inet.h" -#include "opt_ipfw.h" #include "opt_sctp.h" #ifndef INET #error "IPDIVERT requires INET." #endif -#ifndef IPFIREWALL -#error "IPDIVERT requires IPFIREWALL" -#endif #endif #include <sys/param.h> @@ -72,9 +68,7 @@ __FBSDID("$FreeBSD$"); #include <netinet/in_systm.h> #include <netinet/in_var.h> #include <netinet/ip.h> -#include <netinet/ip_divert.h> #include <netinet/ip_var.h> -#include <netinet/ip_fw.h> #ifdef SCTP #include <netinet/sctp_crc32.h> #endif @@ -92,27 +86,29 @@ __FBSDID("$FreeBSD$"); #define DIVRCVQ (65536 + 100) /* - * Divert sockets work in conjunction with ipfw, see the divert(4) - * manpage for features. - * Internally, packets selected by ipfw in ip_input() or ip_output(), - * and never diverted before, are passed to the input queue of the - * divert socket with a given 'divert_port' number (as specified in - * the matching ipfw rule), and they are tagged with a 16 bit cookie - * (representing the rule number of the matching ipfw rule), which - * is passed to process reading from the socket. + * Divert sockets work in conjunction with ipfw or other packet filters, + * see the divert(4) manpage for features. + * Packets are selected by the packet filter and tagged with an + * MTAG_IPFW_RULE tag carrying the 'divert port' number (as set by + * the packet filter) and information on the matching filter rule for + * subsequent reinjection. The divert_port is used to put the packet + * on the corresponding divert socket, while the rule number is passed + * up (at least partially) as the sin_port in the struct sockaddr. * - * Packets written to the divert socket are again tagged with a cookie - * (usually the same as above) and a destination address. - * If the destination address is INADDR_ANY then the packet is - * treated as outgoing and sent to ip_output(), otherwise it is - * treated as incoming and sent to ip_input(). - * In both cases, the packet is tagged with the cookie. + * Packets written to the divert socket carry in sin_addr a + * destination address, and in sin_port the number of the filter rule + * after which to continue processing. + * If the destination address is INADDR_ANY, the packet is treated as + * as outgoing and sent to ip_output(); otherwise it is treated as + * incoming and sent to ip_input(). + * Further, sin_zero carries some information on the interface, + * which can be used in the reinject -- see comments in the code. * * On reinjection, processing in ip_input() and ip_output() * will be exactly the same as for the original packet, except that - * ipfw processing will start at the rule number after the one - * written in the cookie (so, tagging a packet with a cookie of 0 - * will cause it to be effectively considered as a standard packet). + * packet filter processing will start at the rule number after the one + * written in the sin_port (ipfw does not allow a rule #0, so sin_port=0 + * will apply the entire ruleset to the packet). */ /* Internal variables. */ @@ -193,7 +189,7 @@ div_destroy(void) * IPPROTO_DIVERT is not in the real IP protocol number space; this * function should never be called. Just in case, drop any packets. */ -void +static void div_input(struct mbuf *m, int off) { @@ -217,9 +213,8 @@ divert_packet(struct mbuf *m, int incoming) struct sockaddr_in divsrc; struct m_tag *mtag; - mtag = m_tag_find(m, PACKET_TAG_DIVERT, NULL); + mtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL); if (mtag == NULL) { - printf("%s: no divert tag\n", __func__); m_freem(m); return; } @@ -244,14 +239,15 @@ divert_packet(struct mbuf *m, int incoming) ip->ip_len = htons(ip->ip_len); } #endif + bzero(&divsrc, sizeof(divsrc)); + divsrc.sin_len = sizeof(divsrc); + divsrc.sin_family = AF_INET; + /* record matching rule, in host format */ + divsrc.sin_port = ((struct ipfw_rule_ref *)(mtag+1))->rulenum; /* * Record receive interface address, if any. * But only for incoming packets. */ - bzero(&divsrc, sizeof(divsrc)); - divsrc.sin_len = sizeof(divsrc); - divsrc.sin_family = AF_INET; - divsrc.sin_port = divert_cookie(mtag); /* record matching rule */ if (incoming) { struct ifaddr *ifa; struct ifnet *ifp; @@ -299,7 +295,7 @@ divert_packet(struct mbuf *m, int incoming) /* Put packet on socket queue, if any */ sa = NULL; - nport = htons((u_int16_t)divert_info(mtag)); + nport = htons((u_int16_t)(((struct ipfw_rule_ref *)(mtag+1))->info)); INP_INFO_RLOCK(&V_divcbinfo); LIST_FOREACH(inp, &V_divcb, inp_list) { /* XXX why does only one socket match? */ @@ -338,7 +334,7 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin, struct mbuf *control) { struct m_tag *mtag; - struct divert_tag *dt; + struct ipfw_rule_ref *dt; int error = 0; struct mbuf *options; @@ -353,23 +349,31 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin, if (control) m_freem(control); /* XXX */ - if ((mtag = m_tag_find(m, PACKET_TAG_DIVERT, NULL)) == NULL) { - mtag = m_tag_get(PACKET_TAG_DIVERT, sizeof(struct divert_tag), - M_NOWAIT | M_ZERO); + mtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL); + if (mtag == NULL) { + /* this should be normal */ + mtag = m_tag_alloc(MTAG_IPFW_RULE, 0, + sizeof(struct ipfw_rule_ref), M_NOWAIT | M_ZERO); if (mtag == NULL) { error = ENOBUFS; goto cantsend; } - dt = (struct divert_tag *)(mtag+1); m_tag_prepend(m, mtag); - } else - dt = (struct divert_tag *)(mtag+1); + } + dt = (struct ipfw_rule_ref *)(mtag+1); /* Loopback avoidance and state recovery */ if (sin) { int i; - dt->cookie = sin->sin_port; + /* set the starting point. We provide a non-zero slot, + * but a non_matching chain_id to skip that info and use + * the rulenum/rule_id. + */ + dt->slot = 1; /* dummy, chain_id is invalid */ + dt->chain_id = 0; + dt->rulenum = sin->sin_port+1; /* host format ? */ + dt->rule_id = 0; /* * Find receive interface with the given name, stuffed * (if it exists) in the sin_zero[] field. @@ -387,7 +391,7 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin, struct ip *const ip = mtod(m, struct ip *); struct inpcb *inp; - dt->info |= IP_FW_DIVERT_OUTPUT_FLAG; + dt->info |= IPFW_IS_DIVERT | IPFW_INFO_OUT; INP_INFO_WLOCK(&V_divcbinfo); inp = sotoinpcb(so); INP_RLOCK(inp); @@ -453,7 +457,7 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin, m_freem(options); } } else { - dt->info |= IP_FW_DIVERT_LOOPBACK_FLAG; + dt->info |= IPFW_IS_DIVERT | IPFW_INFO_IN; if (m->m_pkthdr.rcvif == NULL) { /* * No luck with the name, check by IP address. @@ -587,7 +591,7 @@ div_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, return div_output(so, m, (struct sockaddr_in *)nam, control); } -void +static void div_ctlinput(int cmd, struct sockaddr *sa, void *vip) { struct in_addr faddr; @@ -800,5 +804,5 @@ static moduledata_t ipdivertmod = { }; DECLARE_MODULE(ipdivert, ipdivertmod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); -MODULE_DEPEND(dummynet, ipfw, 2, 2, 2); +MODULE_DEPEND(ipdivert, ipfw, 2, 2, 2); MODULE_VERSION(ipdivert, 1); diff --git a/sys/netinet/ip_divert.h b/sys/netinet/ip_divert.h index 5036355..b8bcf4f 100644 --- a/sys/netinet/ip_divert.h +++ b/sys/netinet/ip_divert.h @@ -36,53 +36,20 @@ #define _NETINET_IP_DIVERT_H_ /* - * Sysctl declaration. - */ -#ifdef SYSCTL_DECL -SYSCTL_DECL(_net_inet_divert); -#endif - -/* - * Divert socket definitions. - */ -struct divert_tag { - u_int32_t info; /* port & flags */ - u_int16_t cookie; /* ipfw rule number */ -}; - -/* - * Return the divert cookie associated with the mbuf; if any. - */ -static __inline u_int16_t -divert_cookie(struct m_tag *mtag) -{ - return ((struct divert_tag *)(mtag+1))->cookie; -} -static __inline u_int16_t -divert_find_cookie(struct mbuf *m) -{ - struct m_tag *mtag = m_tag_find(m, PACKET_TAG_DIVERT, NULL); - return mtag ? divert_cookie(mtag) : 0; -} - -/* - * Return the divert info associated with the mbuf; if any. + * divert has no custom kernel-userland API. + * + * All communication occurs through a sockaddr_in socket where + * + * kernel-->userland + * sin_port = matching rule, host format; + * sin_addr = IN: first address of the incoming interface; + * OUT: INADDR_ANY + * sin_zero = if fits, the interface name (max 7 bytes + NUL) + * + * userland->kernel + * sin_port = restart-rule - 1, host order + * (we restart at sin_port + 1) + * sin_addr = IN: address of the incoming interface; + * OUT: INADDR_ANY */ -static __inline u_int32_t -divert_info(struct m_tag *mtag) -{ - return ((struct divert_tag *)(mtag+1))->info; -} -static __inline u_int32_t -divert_find_info(struct mbuf *m) -{ - struct m_tag *mtag = m_tag_find(m, PACKET_TAG_DIVERT, NULL); - return mtag ? divert_info(mtag) : 0; -} - -typedef void ip_divert_packet_t(struct mbuf *m, int incoming); -extern ip_divert_packet_t *ip_divert_ptr; - -extern void div_input(struct mbuf *, int); -extern void div_ctlinput(int, struct sockaddr *, void *); #endif /* _NETINET_IP_DIVERT_H_ */ diff --git a/sys/netinet/ip_dummynet.h b/sys/netinet/ip_dummynet.h index b5ef19e..0bbc326 100644 --- a/sys/netinet/ip_dummynet.h +++ b/sys/netinet/ip_dummynet.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa + * Copyright (c) 1998-2010 Luigi Rizzo, Universita` di Pisa * Portions Copyright (c) 2000 Akamba Corp. * All rights reserved * @@ -31,262 +31,124 @@ #define _IP_DUMMYNET_H /* - * Definition of dummynet data structures. In the structures, I decided - * not to use the macros in <sys/queue.h> in the hope of making the code - * easier to port to other architectures. The type of lists and queue we - * use here is pretty simple anyways. - */ - -/* - * We start with a heap, which is used in the scheduler to decide when - * to transmit packets etc. - * - * The key for the heap is used for two different values: + * Definition of the kernel-userland API for dummynet. * - * 1. timer ticks- max 10K/second, so 32 bits are enough; + * Setsockopt() and getsockopt() pass a batch of objects, each + * of them starting with a "struct dn_id" which should fully identify + * the object and its relation with others in the sequence. + * The first object in each request should have + * type= DN_CMD_*, id = DN_API_VERSION. + * For other objects, type and subtype specify the object, len indicates + * the total length including the header, and 'id' identifies the specific + * object. * - * 2. virtual times. These increase in steps of len/x, where len is the - * packet length, and x is either the weight of the flow, or the - * sum of all weights. - * If we limit to max 1000 flows and a max weight of 100, then - * x needs 17 bits. The packet size is 16 bits, so we can easily - * overflow if we do not allow errors. - * So we use a key "dn_key" which is 64 bits. Some macros are used to - * compare key values and handle wraparounds. - * MAX64 returns the largest of two key values. - * MY_M is used as a shift count when doing fixed point arithmetic - * (a better name would be useful...). + * Most objects are numbered with an identifier in the range 1..65535. + * DN_MAX_ID indicates the first value outside the range. */ -typedef u_int64_t dn_key ; /* sorting key */ -#define DN_KEY_LT(a,b) ((int64_t)((a)-(b)) < 0) -#define DN_KEY_LEQ(a,b) ((int64_t)((a)-(b)) <= 0) -#define DN_KEY_GT(a,b) ((int64_t)((a)-(b)) > 0) -#define DN_KEY_GEQ(a,b) ((int64_t)((a)-(b)) >= 0) -#define MAX64(x,y) (( (int64_t) ( (y)-(x) )) > 0 ) ? (y) : (x) -#define MY_M 16 /* number of left shift to obtain a larger precision */ -/* - * XXX With this scaling, max 1000 flows, max weight 100, 1Gbit/s, the - * virtual time wraps every 15 days. - */ +#define DN_API_VERSION 12500000 +#define DN_MAX_ID 0x10000 +struct dn_id { + uint16_t len; /* total obj len including this header */ + uint8_t type; + uint8_t subtype; + uint32_t id; /* generic id */ +}; /* - * The maximum hash table size for queues. This value must be a power - * of 2. - */ -#define DN_MAX_HASH_SIZE 65536 - -/* - * A heap entry is made of a key and a pointer to the actual - * object stored in the heap. - * The heap is an array of dn_heap_entry entries, dynamically allocated. - * Current size is "size", with "elements" actually in use. - * The heap normally supports only ordered insert and extract from the top. - * If we want to extract an object from the middle of the heap, we - * have to know where the object itself is located in the heap (or we - * need to scan the whole array). To this purpose, an object has a - * field (int) which contains the index of the object itself into the - * heap. When the object is moved, the field must also be updated. - * The offset of the index in the object is stored in the 'offset' - * field in the heap descriptor. The assumption is that this offset - * is non-zero if we want to support extract from the middle. + * These values are in the type field of struct dn_id. + * To preserve the ABI, never rearrange the list or delete + * entries with the exception of DN_LAST */ -struct dn_heap_entry { - dn_key key ; /* sorting key. Topmost element is smallest one */ - void *object ; /* object pointer */ +enum { + DN_NONE = 0, + DN_LINK = 1, + DN_FS, + DN_SCH, + DN_SCH_I, + DN_QUEUE, + DN_DELAY_LINE, + DN_PROFILE, + DN_FLOW, /* struct dn_flow */ + DN_TEXT, /* opaque text is the object */ + + DN_CMD_CONFIG = 0x80, /* objects follow */ + DN_CMD_DELETE, /* subtype + list of entries */ + DN_CMD_GET, /* subtype + list of entries */ + DN_CMD_FLUSH, + /* for compatibility with FreeBSD 7.2/8 */ + DN_COMPAT_PIPE, + DN_COMPAT_QUEUE, + DN_GET_COMPAT, + + /* special commands for emulation of sysctl variables */ + DN_SYSCTL_GET, + DN_SYSCTL_SET, + + DN_LAST, } ; -struct dn_heap { - int size ; - int elements ; - int offset ; /* XXX if > 0 this is the offset of direct ptr to obj */ - struct dn_heap_entry *p ; /* really an array of "size" entries */ +enum { /* subtype for schedulers, flowset and the like */ + DN_SCHED_UNKNOWN = 0, + DN_SCHED_FIFO = 1, + DN_SCHED_WF2QP = 2, + /* others are in individual modules */ } ; -#ifdef _KERNEL -/* - * Packets processed by dummynet have an mbuf tag associated with - * them that carries their dummynet state. This is used within - * the dummynet code as well as outside when checking for special - * processing requirements. - */ -struct dn_pkt_tag { - struct ip_fw *rule; /* matching rule */ - uint32_t rule_id; /* matching rule id */ - uint32_t chain_id; /* ruleset id */ - int dn_dir; /* action when packet comes out. */ -#define DN_TO_IP_OUT 1 -#define DN_TO_IP_IN 2 -/* Obsolete: #define DN_TO_BDG_FWD 3 */ -#define DN_TO_ETH_DEMUX 4 -#define DN_TO_ETH_OUT 5 -#define DN_TO_IP6_IN 6 -#define DN_TO_IP6_OUT 7 -#define DN_TO_IFB_FWD 8 - - dn_key output_time; /* when the pkt is due for delivery */ - struct ifnet *ifp; /* interface, for ip_output */ - struct _ip6dn_args ip6opt; /* XXX ipv6 options */ +enum { /* user flags */ + DN_HAVE_MASK = 0x0001, /* fs or sched has a mask */ + DN_NOERROR = 0x0002, /* do not report errors */ + DN_QHT_HASH = 0x0004, /* qht is a hash table */ + DN_QSIZE_BYTES = 0x0008, /* queue size is in bytes */ + DN_HAS_PROFILE = 0x0010, /* a link has a profile */ + DN_IS_RED = 0x0020, + DN_IS_GENTLE_RED= 0x0040, + DN_PIPE_CMD = 0x1000, /* pipe config... */ }; -#endif /* _KERNEL */ - -/* - * Overall structure of dummynet (with WF2Q+): - -In dummynet, packets are selected with the firewall rules, and passed -to two different objects: PIPE or QUEUE. - -A QUEUE is just a queue with configurable size and queue management -policy. It is also associated with a mask (to discriminate among -different flows), a weight (used to give different shares of the -bandwidth to different flows) and a "pipe", which essentially -supplies the transmit clock for all queues associated with that -pipe. - -A PIPE emulates a fixed-bandwidth link, whose bandwidth is -configurable. The "clock" for a pipe can come from either an -internal timer, or from the transmit interrupt of an interface. -A pipe is also associated with one (or more, if masks are used) -queue, where all packets for that pipe are stored. - -The bandwidth available on the pipe is shared by the queues -associated with that pipe (only one in case the packet is sent -to a PIPE) according to the WF2Q+ scheduling algorithm and the -configured weights. - -In general, incoming packets are stored in the appropriate queue, -which is then placed into one of a few heaps managed by a scheduler -to decide when the packet should be extracted. -The scheduler (a function called dummynet()) is run at every timer -tick, and grabs queues from the head of the heaps when they are -ready for processing. - -There are three data structures definining a pipe and associated queues: - - + dn_pipe, which contains the main configuration parameters related - to delay and bandwidth; - + dn_flow_set, which contains WF2Q+ configuration, flow - masks, plr and RED configuration; - + dn_flow_queue, which is the per-flow queue (containing the packets) - -Multiple dn_flow_set can be linked to the same pipe, and multiple -dn_flow_queue can be linked to the same dn_flow_set. -All data structures are linked in a linear list which is used for -housekeeping purposes. - -During configuration, we create and initialize the dn_flow_set -and dn_pipe structures (a dn_pipe also contains a dn_flow_set). - -At runtime: packets are sent to the appropriate dn_flow_set (either -WFQ ones, or the one embedded in the dn_pipe for fixed-rate flows), -which in turn dispatches them to the appropriate dn_flow_queue -(created dynamically according to the masks). - -The transmit clock for fixed rate flows (ready_event()) selects the -dn_flow_queue to be used to transmit the next packet. For WF2Q, -wfq_ready_event() extract a pipe which in turn selects the right -flow using a number of heaps defined into the pipe itself. - - * - */ /* - * per flow queue. This contains the flow identifier, the queue - * of packets, counters, and parameters used to support both RED and - * WF2Q+. - * - * A dn_flow_queue is created and initialized whenever a packet for - * a new flow arrives. + * link template. */ -struct dn_flow_queue { - struct dn_flow_queue *next ; - struct ipfw_flow_id id ; - - struct mbuf *head, *tail ; /* queue of packets */ - u_int len ; - u_int len_bytes ; +struct dn_link { + struct dn_id oid; /* - * When we emulate MAC overheads, or channel unavailability due - * to other traffic on a shared medium, we augment the packet at - * the head of the queue with an 'extra_bits' field representsing - * the additional delay the packet will be subject to: - * extra_bits = bw*unavailable_time. - * With large bandwidth and large delays, extra_bits (and also numbytes) - * can become very large, so better play safe and use 64 bit - */ - uint64_t numbytes ; /* credit for transmission (dynamic queues) */ - int64_t extra_bits; /* extra bits simulating unavailable channel */ - - u_int64_t tot_pkts ; /* statistics counters */ - u_int64_t tot_bytes ; - u_int32_t drops ; - - int hash_slot ; /* debugging/diagnostic */ - - /* RED parameters */ - int avg ; /* average queue length est. (scaled) */ - int count ; /* arrivals since last RED drop */ - int random ; /* random value (scaled) */ - dn_key idle_time; /* start of queue idle time */ - - /* WF2Q+ support */ - struct dn_flow_set *fs ; /* parent flow set */ - int heap_pos ; /* position (index) of struct in heap */ - dn_key sched_time ; /* current time when queue enters ready_heap */ - - dn_key S,F ; /* start time, finish time */ - /* - * Setting F < S means the timestamp is invalid. We only need - * to test this when the queue is empty. + * Userland sets bw and delay in bits/s and milliseconds. + * The kernel converts this back and forth to bits/tick and ticks. + * XXX what about burst ? */ + int32_t link_nr; + int bandwidth; /* bit/s or bits/tick. */ + int delay; /* ms and ticks */ + uint64_t burst; /* scaled. bits*Hz XXX */ } ; /* - * flow_set descriptor. Contains the "template" parameters for the - * queue configuration, and pointers to the hash table of dn_flow_queue's. - * - * The hash table is an array of lists -- we identify the slot by - * hashing the flow-id, then scan the list looking for a match. - * The size of the hash table (buckets) is configurable on a per-queue - * basis. - * - * A dn_flow_set is created whenever a new queue or pipe is created (in the - * latter case, the structure is located inside the struct dn_pipe). + * A flowset, which is a template for flows. Contains parameters + * from the command line: id, target scheduler, queue sizes, plr, + * flow masks, buckets for the flow hash, and possibly scheduler- + * specific parameters (weight, quantum and so on). */ -struct dn_flow_set { - SLIST_ENTRY(dn_flow_set) next; /* linked list in a hash slot */ - - u_short fs_nr ; /* flow_set number */ - u_short flags_fs; -#define DN_HAVE_FLOW_MASK 0x0001 -#define DN_IS_RED 0x0002 -#define DN_IS_GENTLE_RED 0x0004 -#define DN_QSIZE_IS_BYTES 0x0008 /* queue size is measured in bytes */ -#define DN_NOERROR 0x0010 /* do not report ENOBUFS on drops */ -#define DN_HAS_PROFILE 0x0020 /* the pipe has a delay profile. */ -#define DN_IS_PIPE 0x4000 -#define DN_IS_QUEUE 0x8000 - - struct dn_pipe *pipe ; /* pointer to parent pipe */ - u_short parent_nr ; /* parent pipe#, 0 if local to a pipe */ - - int weight ; /* WFQ queue weight */ +struct dn_fs { + struct dn_id oid; + uint32_t fs_nr; /* the flowset number */ + uint32_t flags; /* userland flags */ int qsize ; /* queue size in slots or bytes */ - int plr ; /* pkt loss rate (2^31-1 means 100%) */ + int32_t plr; /* PLR, pkt loss rate (2^31-1 means 100%) */ + uint32_t buckets; /* buckets used for the queue hash table */ struct ipfw_flow_id flow_mask ; - - /* hash table of queues onto this flow_set */ - int rq_size ; /* number of slots */ - int rq_elements ; /* active elements */ - struct dn_flow_queue **rq; /* array of rq_size entries */ - - u_int32_t last_expired ; /* do not expire too frequently */ - int backlogged ; /* #active queues for this flowset */ - - /* RED parameters */ + uint32_t sched_nr; /* the scheduler we attach to */ + /* generic scheduler parameters. Leave them at -1 if unset. + * Now we use 0: weight, 1: lmax, 2: priority + */ + int par[4]; + + /* RED/GRED parameters. + * weight and probabilities are in the range 0..1 represented + * in fixed point arithmetic with SCALE_RED decimal bits. + */ #define SCALE_RED 16 #define SCALE(x) ( (x) << SCALE_RED ) #define SCALE_VAL(x) ( (x) >> SCALE_RED ) @@ -295,102 +157,107 @@ struct dn_flow_set { int max_th ; /* maximum threshold for queue (scaled) */ int min_th ; /* minimum threshold for queue (scaled) */ int max_p ; /* maximum value for p_b (scaled) */ - u_int c_1 ; /* max_p/(max_th-min_th) (scaled) */ - u_int c_2 ; /* max_p*min_th/(max_th-min_th) (scaled) */ - u_int c_3 ; /* for GRED, (1-max_p)/max_th (scaled) */ - u_int c_4 ; /* for GRED, 1 - 2*max_p (scaled) */ - u_int * w_q_lookup ; /* lookup table for computing (1-w_q)^t */ - u_int lookup_depth ; /* depth of lookup table */ - int lookup_step ; /* granularity inside the lookup table */ - int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */ - int avg_pkt_size ; /* medium packet size */ - int max_pkt_size ; /* max packet size */ + }; -SLIST_HEAD(dn_flow_set_head, dn_flow_set); /* - * Pipe descriptor. Contains global parameters, delay-line queue, - * and the flow_set used for fixed-rate queues. - * - * For WF2Q+ support it also has 3 heaps holding dn_flow_queue: - * not_eligible_heap, for queues whose start time is higher - * than the virtual time. Sorted by start time. - * scheduler_heap, for queues eligible for scheduling. Sorted by - * finish time. - * idle_heap, all flows that are idle and can be removed. We - * do that on each tick so we do not slow down too much - * operations during forwarding. - * + * dn_flow collects flow_id and stats for queues and scheduler + * instances, and is used to pass these info to userland. + * oid.type/oid.subtype describe the object, oid.id is number + * of the parent object. */ -struct dn_pipe { /* a pipe */ - SLIST_ENTRY(dn_pipe) next; /* linked list in a hash slot */ - - int pipe_nr ; /* number */ - int bandwidth; /* really, bytes/tick. */ - int delay ; /* really, ticks */ - - struct mbuf *head, *tail ; /* packets in delay line */ - - /* WF2Q+ */ - struct dn_heap scheduler_heap ; /* top extract - key Finish time*/ - struct dn_heap not_eligible_heap; /* top extract- key Start time */ - struct dn_heap idle_heap ; /* random extract - key Start=Finish time */ - - dn_key V ; /* virtual time */ - int sum; /* sum of weights of all active sessions */ - - /* Same as in dn_flow_queue, numbytes can become large */ - int64_t numbytes; /* bits I can transmit (more or less). */ - uint64_t burst; /* burst size, scaled: bits * hz */ +struct dn_flow { + struct dn_id oid; + struct ipfw_flow_id fid; + uint64_t tot_pkts; /* statistics counters */ + uint64_t tot_bytes; + uint32_t length; /* Queue lenght, in packets */ + uint32_t len_bytes; /* Queue lenght, in bytes */ + uint32_t drops; +}; - dn_key sched_time ; /* time pipe was scheduled in ready_heap */ - dn_key idle_time; /* start of pipe idle time */ /* - * When the tx clock come from an interface (if_name[0] != '\0'), its name - * is stored below, whereas the ifp is filled when the rule is configured. + * Scheduler template, mostly indicating the name, number, + * sched_mask and buckets. */ - char if_name[IFNAMSIZ]; - struct ifnet *ifp ; - int ready ; /* set if ifp != NULL and we got a signal from it */ +struct dn_sch { + struct dn_id oid; + uint32_t sched_nr; /* N, scheduler number */ + uint32_t buckets; /* number of buckets for the instances */ + uint32_t flags; /* have_mask, ... */ + + char name[16]; /* null terminated */ + /* mask to select the appropriate scheduler instance */ + struct ipfw_flow_id sched_mask; /* M */ +}; - struct dn_flow_set fs ; /* used with fixed-rate flows */ +/* A delay profile is attached to a link. + * Note that a profile, as any other object, cannot be longer than 2^16 + */ +#define ED_MAX_SAMPLES_NO 1024 +struct dn_profile { + struct dn_id oid; /* fields to simulate a delay profile */ - #define ED_MAX_NAME_LEN 32 char name[ED_MAX_NAME_LEN]; + int link_nr; int loss_level; - int samples_no; - int *samples; -}; - -/* dn_pipe_max is used to pass pipe configuration from userland onto - * kernel space and back - */ -#define ED_MAX_SAMPLES_NO 1024 -struct dn_pipe_max { - struct dn_pipe pipe; - int samples[ED_MAX_SAMPLES_NO]; + int bandwidth; // XXX use link bandwidth? + int samples_no; /* actual length of samples[] */ + int samples[ED_MAX_SAMPLES_NO]; /* may be shorter */ }; -SLIST_HEAD(dn_pipe_head, dn_pipe); -#ifdef _KERNEL /* - * Return the dummynet tag; if any. - * Make sure that the dummynet tag is not reused by lower layers. + * Overall structure of dummynet + +In dummynet, packets are selected with the firewall rules, and passed +to two different objects: PIPE or QUEUE (bad name). + +A QUEUE defines a classifier, which groups packets into flows +according to a 'mask', puts them into independent queues (one +per flow) with configurable size and queue management policy, +and passes flows to a scheduler: + + (flow_mask|sched_mask) sched_mask + +---------+ weight Wx +-------------+ + | |->-[flow]-->--| |-+ + -->--| QUEUE x | ... | | | + | |->-[flow]-->--| SCHEDuler N | | + +---------+ | | | + ... | +--[LINK N]-->-- + +---------+ weight Wy | | +--[LINK N]-->-- + | |->-[flow]-->--| | | + -->--| QUEUE y | ... | | | + | |->-[flow]-->--| | | + +---------+ +-------------+ | + +-------------+ + +Many QUEUE objects can connect to the same scheduler, each +QUEUE object can have its own set of parameters. + +In turn, the SCHEDuler 'forks' multiple instances according +to a 'sched_mask', each instance manages its own set of queues +and transmits on a private instance of a configurable LINK. + +A PIPE is a simplified version of the above, where there +is no flow_mask, and each scheduler instance handles a single queue. + +The following data structures (visible from userland) describe +the objects used by dummynet: + + + dn_link, contains the main configuration parameters related + to delay and bandwidth; + + dn_profile describes a delay profile; + + dn_flow describes the flow status (flow id, statistics) + + + dn_sch describes a scheduler + + dn_fs describes a flowset (msk, weight, queue parameters) + + * */ -static __inline struct dn_pkt_tag * -ip_dn_claim_tag(struct mbuf *m) -{ - struct m_tag *mtag = m_tag_find(m, PACKET_TAG_DUMMYNET, NULL); - if (mtag != NULL) { - mtag->m_tag_id = PACKET_TAG_NONE; - return ((struct dn_pkt_tag *)(mtag + 1)); - } else - return (NULL); -} -#endif + #endif /* _IP_DUMMYNET_H */ diff --git a/sys/netinet/ip_fw.h b/sys/netinet/ip_fw.h index 1e6feb4..cf5d8d0 100644 --- a/sys/netinet/ip_fw.h +++ b/sys/netinet/ip_fw.h @@ -461,7 +461,7 @@ typedef struct _ipfw_insn_icmp6 { */ struct ip_fw { - struct ip_fw *next; /* linked list of rules */ + struct ip_fw *x_next; /* linked list of rules */ struct ip_fw *next_rule; /* ptr to next [skipto] rule */ /* 'next_rule' is used to pass up 'set_disable' status */ @@ -487,24 +487,29 @@ struct ip_fw { #define RULESIZE(rule) (sizeof(struct ip_fw) + \ ((struct ip_fw *)(rule))->cmd_len * 4 - 4) +#if 1 // should be moved to in.h /* * This structure is used as a flow mask and a flow id for various * parts of the code. + * addr_type is used in userland and kernel to mark the address type. + * fib is used in the kernel to record the fib in use. + * _flags is used in the kernel to store tcp flags for dynamic rules. */ struct ipfw_flow_id { - u_int32_t dst_ip; - u_int32_t src_ip; - u_int16_t dst_port; - u_int16_t src_port; - u_int8_t fib; - u_int8_t proto; - u_int8_t flags; /* protocol-specific flags */ - uint8_t addr_type; /* 4 = ipv4, 6 = ipv6, 1=ether ? */ - struct in6_addr dst_ip6; /* could also store MAC addr! */ + uint32_t dst_ip; + uint32_t src_ip; + uint16_t dst_port; + uint16_t src_port; + uint8_t fib; + uint8_t proto; + uint8_t _flags; /* protocol-specific flags */ + uint8_t addr_type; /* 4=ip4, 6=ip6, 1=ether ? */ + struct in6_addr dst_ip6; struct in6_addr src_ip6; - u_int32_t flow_id6; - u_int32_t frag_id6; + uint32_t flow_id6; + uint32_t extra; /* queue/pipe or frag_id */ }; +#endif #define IS_IP6_FLOW_ID(id) ((id)->addr_type == 6) @@ -571,133 +576,4 @@ typedef struct _ipfw_table { ipfw_table_entry ent[0]; /* entries */ } ipfw_table; -/* - * Main firewall chains definitions and global var's definitions. - */ -#ifdef _KERNEL - -#define MTAG_IPFW 1148380143 /* IPFW-tagged cookie */ - -/* Return values from ipfw_chk() */ -enum { - IP_FW_PASS = 0, - IP_FW_DENY, - IP_FW_DIVERT, - IP_FW_TEE, - IP_FW_DUMMYNET, - IP_FW_NETGRAPH, - IP_FW_NGTEE, - IP_FW_NAT, - IP_FW_REASS, -}; - -/* flags for divert mtag */ -#define IP_FW_DIVERT_LOOPBACK_FLAG 0x00080000 -#define IP_FW_DIVERT_OUTPUT_FLAG 0x00100000 - -/* - * Structure for collecting parameters to dummynet for ip6_output forwarding - */ -struct _ip6dn_args { - struct ip6_pktopts *opt_or; - struct route_in6 ro_or; - int flags_or; - struct ip6_moptions *im6o_or; - struct ifnet *origifp_or; - struct ifnet *ifp_or; - struct sockaddr_in6 dst_or; - u_long mtu_or; - struct route_in6 ro_pmtu_or; -}; - -/* - * Arguments for calling ipfw_chk() and dummynet_io(). We put them - * all into a structure because this way it is easier and more - * efficient to pass variables around and extend the interface. - */ -struct ip_fw_args { - struct mbuf *m; /* the mbuf chain */ - struct ifnet *oif; /* output interface */ - struct sockaddr_in *next_hop; /* forward address */ - struct ip_fw *rule; /* matching rule */ - uint32_t rule_id; /* matching rule id */ - uint32_t chain_id; /* ruleset id */ - struct ether_header *eh; /* for bridged packets */ - - struct ipfw_flow_id f_id; /* grabbed from IP header */ - uint32_t cookie; /* a cookie depending on rule action */ - struct inpcb *inp; - - struct _ip6dn_args dummypar; /* dummynet->ip6_output */ - struct sockaddr_in hopstore; /* store here if cannot use a pointer */ -}; - -/* - * Function definitions. - */ - -/* Firewall hooks */ -struct sockopt; -struct dn_flow_set; - -int ipfw_check_in(void *, struct mbuf **, struct ifnet *, int, struct inpcb *inp); -int ipfw_check_out(void *, struct mbuf **, struct ifnet *, int, struct inpcb *inp); - -int ipfw_chk(struct ip_fw_args *); - -int ipfw_hook(void); -int ipfw6_hook(void); -int ipfw_unhook(void); -int ipfw6_unhook(void); -#ifdef NOTYET -void ipfw_nat_destroy(void); -#endif - -VNET_DECLARE(int, fw_one_pass); -VNET_DECLARE(int, fw_enable); -#define V_fw_one_pass VNET(fw_one_pass) -#define V_fw_enable VNET(fw_enable) - -#ifdef INET6 -VNET_DECLARE(int, fw6_enable); -#define V_fw6_enable VNET(fw6_enable) -#endif - -struct ip_fw_chain { - struct ip_fw *rules; /* list of rules */ - struct ip_fw *reap; /* list of rules to reap */ - LIST_HEAD(, cfg_nat) nat; /* list of nat entries */ - struct radix_node_head *tables[IPFW_TABLES_MAX]; - struct rwlock rwmtx; - uint32_t id; /* ruleset id */ -}; - -#ifdef IPFW_INTERNAL - -#define IPFW_LOCK_INIT(_chain) \ - rw_init(&(_chain)->rwmtx, "IPFW static rules") -#define IPFW_LOCK_DESTROY(_chain) rw_destroy(&(_chain)->rwmtx) -#define IPFW_WLOCK_ASSERT(_chain) rw_assert(&(_chain)->rwmtx, RA_WLOCKED) - -#define IPFW_RLOCK(p) rw_rlock(&(p)->rwmtx) -#define IPFW_RUNLOCK(p) rw_runlock(&(p)->rwmtx) -#define IPFW_WLOCK(p) rw_wlock(&(p)->rwmtx) -#define IPFW_WUNLOCK(p) rw_wunlock(&(p)->rwmtx) - -#define LOOKUP_NAT(l, i, p) do { \ - LIST_FOREACH((p), &(l.nat), _next) { \ - if ((p)->id == (i)) { \ - break; \ - } \ - } \ - } while (0) - -typedef int ipfw_nat_t(struct ip_fw_args *, struct cfg_nat *, struct mbuf *); -typedef int ipfw_nat_cfg_t(struct sockopt *); -#endif - -VNET_DECLARE(struct ip_fw_chain, layer3_chain); -#define V_layer3_chain VNET(layer3_chain) - -#endif /* _KERNEL */ #endif /* _IPFW2_H */ diff --git a/sys/netinet/ip_var.h b/sys/netinet/ip_var.h index a1d2166..d041dd3 100644 --- a/sys/netinet/ip_var.h +++ b/sys/netinet/ip_var.h @@ -249,7 +249,43 @@ VNET_DECLARE(struct pfil_head, inet_pfil_hook); /* packet filter hooks */ void in_delayed_cksum(struct mbuf *m); -/* ipfw and dummynet hooks. Most are declared in raw_ip.c */ +/* Hooks for ipfw, dummynet, divert etc. Most are declared in raw_ip.c */ +/* + * Reference to an ipfw or packet filter rule that can be carried + * outside critical sections. + * A rule is identified by rulenum:rule_id which is ordered. + * In version chain_id the rule can be found in slot 'slot', so + * we don't need a lookup if chain_id == chain->id. + * + * On exit from the firewall this structure refers to the rule after + * the matching one (slot points to the new rule; rulenum:rule_id-1 + * is the matching rule), and additional info (e.g. info often contains + * the insn argument or tablearg in the low 16 bits, in host format). + * On entry, the structure is valid if slot>0, and refers to the starting + * rules. 'info' contains the reason for reinject, e.g. divert port, + * divert direction, and so on. + */ +struct ipfw_rule_ref { + uint32_t slot; /* slot for matching rule */ + uint32_t rulenum; /* matching rule number */ + uint32_t rule_id; /* matching rule id */ + uint32_t chain_id; /* ruleset id */ + uint32_t info; /* see below */ +}; + +enum { + IPFW_INFO_MASK = 0x0000ffff, + IPFW_INFO_OUT = 0x00000000, /* outgoing, just for convenience */ + IPFW_INFO_IN = 0x80000000, /* incoming, overloads dir */ + IPFW_ONEPASS = 0x40000000, /* One-pass, do not reinject */ + IPFW_IS_MASK = 0x30000000, /* which source ? */ + IPFW_IS_DIVERT = 0x20000000, + IPFW_IS_DUMMYNET =0x10000000, + IPFW_IS_PIPE = 0x08000000, /* pip1=1, queue = 0 */ +}; +#define MTAG_IPFW 1148380143 /* IPFW-tagged cookie */ +#define MTAG_IPFW_RULE 1262273568 /* rule reference */ + struct ip_fw_args; typedef int (*ip_fw_chk_ptr_t)(struct ip_fw_args *args); typedef int (*ip_fw_ctl_ptr_t)(struct sockopt *); @@ -258,9 +294,14 @@ VNET_DECLARE(ip_fw_ctl_ptr_t, ip_fw_ctl_ptr); #define V_ip_fw_chk_ptr VNET(ip_fw_chk_ptr) #define V_ip_fw_ctl_ptr VNET(ip_fw_ctl_ptr) +/* Divert hooks. */ +extern void (*ip_divert_ptr)(struct mbuf *m, int incoming); +/* ng_ipfw hooks -- XXX make it the same as divert and dummynet */ +extern int (*ng_ipfw_input_p)(struct mbuf **, int, + struct ip_fw_args *, int); + extern int (*ip_dn_ctl_ptr)(struct sockopt *); -extern int (*ip_dn_io_ptr)(struct mbuf **m, int dir, struct ip_fw_args *fwa); -extern void (*ip_dn_ruledel_ptr)(void *); /* in ip_fw2.c */ +extern int (*ip_dn_io_ptr)(struct mbuf **, int, struct ip_fw_args *); VNET_DECLARE(int, ip_do_randomid); #define V_ip_do_randomid VNET(ip_do_randomid) diff --git a/sys/netinet/ipfw/dn_heap.c b/sys/netinet/ipfw/dn_heap.c new file mode 100644 index 0000000..6773851 --- /dev/null +++ b/sys/netinet/ipfw/dn_heap.c @@ -0,0 +1,550 @@ +/*- + * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Binary heap and hash tables, used in dummynet + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +#include <sys/param.h> +#ifdef _KERNEL +__FBSDID("$FreeBSD$"); +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/kernel.h> +#include <netinet/ipfw/dn_heap.h> +#ifndef log +#define log(x, arg...) +#endif + +#else /* !_KERNEL */ + +#include <stdio.h> +#include <dn_test.h> +#include <strings.h> +#include <stdlib.h> + +#include "dn_heap.h" +#define log(x, arg...) fprintf(stderr, ## arg) +#define panic(x...) fprintf(stderr, ## x), exit(1) +#define MALLOC_DEFINE(a, b, c) +static void *my_malloc(int s) { return malloc(s); } +static void my_free(void *p) { free(p); } +#define malloc(s, t, w) my_malloc(s) +#define free(p, t) my_free(p) +#endif /* !_KERNEL */ + +MALLOC_DEFINE(M_DN_HEAP, "dummynet", "dummynet heap"); + +/* + * Heap management functions. + * + * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2. + * Some macros help finding parent/children so we can optimize them. + * + * heap_init() is called to expand the heap when needed. + * Increment size in blocks of 16 entries. + * Returns 1 on error, 0 on success + */ +#define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 ) +#define HEAP_LEFT(x) ( (x)+(x) + 1 ) +#define HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; } +#define HEAP_INCREMENT 15 + +static int +heap_resize(struct dn_heap *h, unsigned int new_size) +{ + struct dn_heap_entry *p; + + if (h->size >= new_size ) /* have enough room */ + return 0; +#if 1 /* round to the next power of 2 */ + new_size |= new_size >> 1; + new_size |= new_size >> 2; + new_size |= new_size >> 4; + new_size |= new_size >> 8; + new_size |= new_size >> 16; +#else + new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT; +#endif + p = malloc(new_size * sizeof(*p), M_DN_HEAP, M_NOWAIT); + if (p == NULL) { + printf("--- %s, resize %d failed\n", __func__, new_size ); + return 1; /* error */ + } + if (h->size > 0) { + bcopy(h->p, p, h->size * sizeof(*p) ); + free(h->p, M_DN_HEAP); + } + h->p = p; + h->size = new_size; + return 0; +} + +int +heap_init(struct dn_heap *h, int size, int ofs) +{ + if (heap_resize(h, size)) + return 1; + h->elements = 0; + h->ofs = ofs; + return 0; +} + +/* + * Insert element in heap. Normally, p != NULL, we insert p in + * a new position and bubble up. If p == NULL, then the element is + * already in place, and key is the position where to start the + * bubble-up. + * Returns 1 on failure (cannot allocate new heap entry) + * + * If ofs > 0 the position (index, int) of the element in the heap is + * also stored in the element itself at the given offset in bytes. + */ +#define SET_OFFSET(h, i) do { \ + if (h->ofs > 0) \ + *((int32_t *)((char *)(h->p[i].object) + h->ofs)) = i; \ + } while (0) +/* + * RESET_OFFSET is used for sanity checks. It sets ofs + * to an invalid value. + */ +#define RESET_OFFSET(h, i) do { \ + if (h->ofs > 0) \ + *((int32_t *)((char *)(h->p[i].object) + h->ofs)) = -16; \ + } while (0) + +int +heap_insert(struct dn_heap *h, uint64_t key1, void *p) +{ + int son = h->elements; + + //log("%s key %llu p %p\n", __FUNCTION__, key1, p); + if (p == NULL) { /* data already there, set starting point */ + son = key1; + } else { /* insert new element at the end, possibly resize */ + son = h->elements; + if (son == h->size) /* need resize... */ + // XXX expand by 16 or so + if (heap_resize(h, h->elements+16) ) + return 1; /* failure... */ + h->p[son].object = p; + h->p[son].key = key1; + h->elements++; + } + /* make sure that son >= father along the path */ + while (son > 0) { + int father = HEAP_FATHER(son); + struct dn_heap_entry tmp; + + if (DN_KEY_LT( h->p[father].key, h->p[son].key ) ) + break; /* found right position */ + /* son smaller than father, swap and repeat */ + HEAP_SWAP(h->p[son], h->p[father], tmp); + SET_OFFSET(h, son); + son = father; + } + SET_OFFSET(h, son); + return 0; +} + +/* + * remove top element from heap, or obj if obj != NULL + */ +void +heap_extract(struct dn_heap *h, void *obj) +{ + int child, father, max = h->elements - 1; + + if (max < 0) { + printf("--- %s: empty heap 0x%p\n", __FUNCTION__, h); + return; + } + if (obj == NULL) + father = 0; /* default: move up smallest child */ + else { /* extract specific element, index is at offset */ + if (h->ofs <= 0) + panic("%s: extract from middle not set on %p\n", + __FUNCTION__, h); + father = *((int *)((char *)obj + h->ofs)); + if (father < 0 || father >= h->elements) { + panic("%s: father %d out of bound 0..%d\n", + __FUNCTION__, father, h->elements); + } + } + /* + * below, father is the index of the empty element, which + * we replace at each step with the smallest child until we + * reach the bottom level. + */ + // XXX why removing RESET_OFFSET increases runtime by 10% ? + RESET_OFFSET(h, father); + while ( (child = HEAP_LEFT(father)) <= max ) { + if (child != max && + DN_KEY_LT(h->p[child+1].key, h->p[child].key) ) + child++; /* take right child, otherwise left */ + h->p[father] = h->p[child]; + SET_OFFSET(h, father); + father = child; + } + h->elements--; + if (father != max) { + /* + * Fill hole with last entry and bubble up, + * reusing the insert code + */ + h->p[father] = h->p[max]; + heap_insert(h, father, NULL); + } +} + +#if 0 +/* + * change object position and update references + * XXX this one is never used! + */ +static void +heap_move(struct dn_heap *h, uint64_t new_key, void *object) +{ + int temp, i, max = h->elements-1; + struct dn_heap_entry *p, buf; + + if (h->ofs <= 0) + panic("cannot move items on this heap"); + p = h->p; /* shortcut */ + + i = *((int *)((char *)object + h->ofs)); + if (DN_KEY_LT(new_key, p[i].key) ) { /* must move up */ + p[i].key = new_key; + for (; i>0 && + DN_KEY_LT(new_key, p[(temp = HEAP_FATHER(i))].key); + i = temp ) { /* bubble up */ + HEAP_SWAP(p[i], p[temp], buf); + SET_OFFSET(h, i); + } + } else { /* must move down */ + p[i].key = new_key; + while ( (temp = HEAP_LEFT(i)) <= max ) { + /* found left child */ + if (temp != max && + DN_KEY_LT(p[temp+1].key, p[temp].key)) + temp++; /* select child with min key */ + if (DN_KEY_LT(>p[temp].key, new_key)) { + /* go down */ + HEAP_SWAP(p[i], p[temp], buf); + SET_OFFSET(h, i); + } else + break; + i = temp; + } + } + SET_OFFSET(h, i); +} +#endif /* heap_move, unused */ + +/* + * heapify() will reorganize data inside an array to maintain the + * heap property. It is needed when we delete a bunch of entries. + */ +static void +heapify(struct dn_heap *h) +{ + int i; + + for (i = 0; i < h->elements; i++ ) + heap_insert(h, i , NULL); +} + +int +heap_scan(struct dn_heap *h, int (*fn)(void *, uintptr_t), + uintptr_t arg) +{ + int i, ret, found; + + for (i = found = 0 ; i < h->elements ;) { + ret = fn(h->p[i].object, arg); + if (ret & HEAP_SCAN_DEL) { + h->elements-- ; + h->p[i] = h->p[h->elements] ; + found++ ; + } else + i++ ; + if (ret & HEAP_SCAN_END) + break; + } + if (found) + heapify(h); + return found; +} + +/* + * cleanup the heap and free data structure + */ +void +heap_free(struct dn_heap *h) +{ + if (h->size >0 ) + free(h->p, M_DN_HEAP); + bzero(h, sizeof(*h) ); +} + +/* + * hash table support. + */ + +struct dn_ht { + int buckets; /* how many buckets, really buckets - 1*/ + int entries; /* how many entries */ + int ofs; /* offset of link field */ + uint32_t (*hash)(uintptr_t, int, void *arg); + int (*match)(void *_el, uintptr_t key, int, void *); + void *(*newh)(uintptr_t, int, void *); + void **ht; /* bucket heads */ +}; +/* + * Initialize, allocating bucket pointers inline. + * Recycle previous record if possible. + * If the 'newh' function is not supplied, we assume that the + * key passed to ht_find is the same object to be stored in. + */ +struct dn_ht * +dn_ht_init(struct dn_ht *ht, int buckets, int ofs, + uint32_t (*h)(uintptr_t, int, void *), + int (*match)(void *, uintptr_t, int, void *), + void *(*newh)(uintptr_t, int, void *)) +{ + int l; + + /* + * Notes about rounding bucket size to a power of two. + * Given the original bucket size, we compute the nearest lower and + * higher power of two, minus 1 (respectively b_min and b_max) because + * this value will be used to do an AND with the index returned + * by hash function. + * To choice between these two values, the original bucket size is + * compared with b_min. If the original size is greater than 4/3 b_min, + * we round the bucket size to b_max, else to b_min. + * This ratio try to round to the nearest power of two, advantaging + * the greater size if the different between two power is relatively + * big. + * Rounding the bucket size to a power of two avoid the use of + * module when calculating the correct bucket. + * The ht->buckets variable store the bucket size - 1 to simply + * do an AND between the index returned by hash function and ht->bucket + * instead of a module. + */ + int b_min; /* min buckets */ + int b_max; /* max buckets */ + int b_ori; /* original buckets */ + + if (h == NULL || match == NULL) { + printf("--- missing hash or match function"); + return NULL; + } + if (buckets < 1 || buckets > 65536) + return NULL; + + b_ori = buckets; + /* calculate next power of 2, - 1*/ + buckets |= buckets >> 1; + buckets |= buckets >> 2; + buckets |= buckets >> 4; + buckets |= buckets >> 8; + buckets |= buckets >> 16; + + b_max = buckets; /* Next power */ + b_min = buckets >> 1; /* Previous power */ + + /* Calculate the 'nearest' bucket size */ + if (b_min * 4000 / 3000 < b_ori) + buckets = b_max; + else + buckets = b_min; + + if (ht) { /* see if we can reuse */ + if (buckets <= ht->buckets) { + ht->buckets = buckets; + } else { + /* free pointers if not allocated inline */ + if (ht->ht != (void *)(ht + 1)) + free(ht->ht, M_DN_HEAP); + free(ht, M_DN_HEAP); + ht = NULL; + } + } + if (ht == NULL) { + /* Allocate buckets + 1 entries because buckets is use to + * do the AND with the index returned by hash function + */ + l = sizeof(*ht) + (buckets + 1) * sizeof(void **); + ht = malloc(l, M_DN_HEAP, M_NOWAIT | M_ZERO); + } + if (ht) { + ht->ht = (void **)(ht + 1); + ht->buckets = buckets; + ht->ofs = ofs; + ht->hash = h; + ht->match = match; + ht->newh = newh; + } + return ht; +} + +/* dummy callback for dn_ht_free to unlink all */ +static int +do_del(void *obj, void *arg) +{ + return DNHT_SCAN_DEL; +} + +void +dn_ht_free(struct dn_ht *ht, int flags) +{ + if (ht == NULL) + return; + if (flags & DNHT_REMOVE) { + (void)dn_ht_scan(ht, do_del, NULL); + } else { + if (ht->ht && ht->ht != (void *)(ht + 1)) + free(ht->ht, M_DN_HEAP); + free(ht, M_DN_HEAP); + } +} + +int +dn_ht_entries(struct dn_ht *ht) +{ + return ht ? ht->entries : 0; +} + +/* lookup and optionally create or delete element */ +void * +dn_ht_find(struct dn_ht *ht, uintptr_t key, int flags, void *arg) +{ + int i; + void **pp, *p; + + if (ht == NULL) /* easy on an empty hash */ + return NULL; + i = (ht->buckets == 1) ? 0 : + (ht->hash(key, flags, arg) & ht->buckets); + + for (pp = &ht->ht[i]; (p = *pp); pp = (void **)((char *)p + ht->ofs)) { + if (flags & DNHT_MATCH_PTR) { + if (key == (uintptr_t)p) + break; + } else if (ht->match(p, key, flags, arg)) /* found match */ + break; + } + if (p) { + if (flags & DNHT_REMOVE) { + /* link in the next element */ + *pp = *(void **)((char *)p + ht->ofs); + *(void **)((char *)p + ht->ofs) = NULL; + ht->entries--; + } + } else if (flags & DNHT_INSERT) { + // printf("%s before calling new, bucket %d ofs %d\n", + // __FUNCTION__, i, ht->ofs); + p = ht->newh ? ht->newh(key, flags, arg) : (void *)key; + // printf("%s newh returns %p\n", __FUNCTION__, p); + if (p) { + ht->entries++; + *(void **)((char *)p + ht->ofs) = ht->ht[i]; + ht->ht[i] = p; + } + } + return p; +} + +/* + * do a scan with the option to delete the object. Extract next before + * running the callback because the element may be destroyed there. + */ +int +dn_ht_scan(struct dn_ht *ht, int (*fn)(void *, void *), void *arg) +{ + int i, ret, found = 0; + void **curp, *cur, *next; + + if (ht == NULL || fn == NULL) + return 0; + for (i = 0; i <= ht->buckets; i++) { + curp = &ht->ht[i]; + while ( (cur = *curp) != NULL) { + next = *(void **)((char *)cur + ht->ofs); + ret = fn(cur, arg); + if (ret & DNHT_SCAN_DEL) { + found++; + ht->entries--; + *curp = next; + } else { + curp = (void **)((char *)cur + ht->ofs); + } + if (ret & DNHT_SCAN_END) + return found; + } + } + return found; +} + +/* + * Similar to dn_ht_scan(), except thah the scan is performed only + * in the bucket 'bucket'. The function returns a correct bucket number if + * the original is invalid + */ +int +dn_ht_scan_bucket(struct dn_ht *ht, int *bucket, int (*fn)(void *, void *), + void *arg) +{ + int i, ret, found = 0; + void **curp, *cur, *next; + + if (ht == NULL || fn == NULL) + return 0; + if (*bucket > ht->buckets) + *bucket = 0; + i = *bucket; + + curp = &ht->ht[i]; + while ( (cur = *curp) != NULL) { + next = *(void **)((char *)cur + ht->ofs); + ret = fn(cur, arg); + if (ret & DNHT_SCAN_DEL) { + found++; + ht->entries--; + *curp = next; + } else { + curp = (void **)((char *)cur + ht->ofs); + } + if (ret & DNHT_SCAN_END) + return found; + } + return found; +} + diff --git a/sys/netinet/ipfw/dn_heap.h b/sys/netinet/ipfw/dn_heap.h new file mode 100644 index 0000000..c95473a --- /dev/null +++ b/sys/netinet/ipfw/dn_heap.h @@ -0,0 +1,191 @@ +/*- + * Copyright (c) 1998-2010 Luigi Rizzo, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Binary heap and hash tables, header file + * + * $FreeBSD$ + */ + +#ifndef _IP_DN_HEAP_H +#define _IP_DN_HEAP_H + +#define DN_KEY_LT(a,b) ((int64_t)((a)-(b)) < 0) +#define DN_KEY_LEQ(a,b) ((int64_t)((a)-(b)) <= 0) + +/* + * This module implements a binary heap supporting random extraction. + * + * A heap entry contains an uint64_t key and a pointer to object. + * DN_KEY_LT(a,b) returns true if key 'a' is smaller than 'b' + * + * The heap is a struct dn_heap plus a dynamically allocated + * array of dn_heap_entry entries. 'size' represents the size of + * the array, 'elements' count entries in use. The topmost + * element has the smallest key. + * The heap supports ordered insert, and extract from the top. + * To extract an object from the middle of the heap, we the object + * must reserve an 'int32_t' to store the position of the object + * in the heap itself, and the location of this field must be + * passed as an argument to heap_init() -- use -1 if the feature + * is not used. + */ +struct dn_heap_entry { + uint64_t key; /* sorting key, smallest comes first */ + void *object; /* object pointer */ +}; + +struct dn_heap { + int size; /* the size of the array */ + int elements; /* elements in use */ + int ofs; /* offset in the object of heap index */ + struct dn_heap_entry *p; /* array of "size" entries */ +}; + +enum { + HEAP_SCAN_DEL = 1, + HEAP_SCAN_END = 2, +}; + +/* + * heap_init() reinitializes the heap setting the size and the offset + * of the index for random extraction (use -1 if not used). + * The 'elements' counter is set to 0. + * + * SET_HEAP_OFS() indicates where, in the object, is stored the index + * for random extractions from the heap. + * + * heap_free() frees the memory associated to a heap. + * + * heap_insert() adds a key-pointer pair to the heap + * + * HEAP_TOP() returns a pointer to the top element of the heap, + * but makes no checks on its existance (XXX should we change ?) + * + * heap_extract() removes the entry at the top, returing the pointer. + * (the key should have been read before). + * + * heap_scan() invokes a callback on each entry of the heap. + * The callback can return a combination of HEAP_SCAN_DEL and + * HEAP_SCAN_END. HEAP_SCAN_DEL means the current element must + * be removed, and HEAP_SCAN_END means to terminate the scan. + * heap_scan() returns the number of elements removed. + * Because the order is not guaranteed, we should use heap_scan() + * only as a last resort mechanism. + */ +#define HEAP_TOP(h) ((h)->p) +#define SET_HEAP_OFS(h, n) do { (h)->ofs = n; } while (0) +int heap_init(struct dn_heap *h, int size, int ofs); +int heap_insert(struct dn_heap *h, uint64_t key1, void *p); +void heap_extract(struct dn_heap *h, void *obj); +void heap_free(struct dn_heap *h); +int heap_scan(struct dn_heap *, int (*)(void *, uintptr_t), uintptr_t); + +/*------------------------------------------------------ + * This module implements a generic hash table with support for + * running callbacks on the entire table. To avoid allocating + * memory during hash table operations, objects must reserve + * space for a link field. XXX if the heap is moderately full, + * an SLIST suffices, and we can tolerate the cost of a hash + * computation on each removal. + * + * dn_ht_init() initializes the table, setting the number of + * buckets, the offset of the link field, the main callbacks. + * Callbacks are: + * + * hash(key, flags, arg) called to return a bucket index. + * match(obj, key, flags, arg) called to determine if key + * matches the current 'obj' in the heap + * newh(key, flags, arg) optional, used to allocate a new + * object during insertions. + * + * dn_ht_free() frees the heap or unlink elements. + * DNHT_REMOVE unlink elements, 0 frees the heap. + * You need two calls to do both. + * + * dn_ht_find() is the main lookup function, which can also be + * used to insert or delete elements in the hash table. + * The final 'arg' is passed to all callbacks. + * + * dn_ht_scan() is used to invoke a callback on all entries of + * the heap, or possibly on just one bucket. The callback + * is invoked with a pointer to the object, and must return + * one of DNHT_SCAN_DEL or DNHT_SCAN_END to request the + * removal of the object from the heap and the end of the + * scan, respectively. + * + * dn_ht_scan_bucket() is similar to dn_ht_scan(), except that it scans + * only the specific bucket of the table. The bucket is a in-out + * parameter and return a valid bucket number if the original + * is invalid. + * + * A combination of flags can be used to modify the operation + * of the dn_ht_find(), and of the callbacks: + * + * DNHT_KEY_IS_OBJ means the key is the object pointer. + * It is usally of interest for the hash and match functions. + * + * DNHT_MATCH_PTR during a lookup, match pointers instead + * of calling match(). Normally used when removing specific + * entries. Does not imply KEY_IS_OBJ as the latter _is_ used + * by the match function. + * + * DNHT_INSERT insert the element if not found. + * Calls new() to allocates a new object unless + * DNHT_KEY_IS_OBJ is set. + * + * DNHT_UNIQUE only insert if object not found. + * XXX should it imply DNHT_INSERT ? + * + * DNHT_REMOVE remove objects if we find them. + */ +struct dn_ht; /* should be opaque */ + +struct dn_ht *dn_ht_init(struct dn_ht *, int buckets, int ofs, + uint32_t (*hash)(uintptr_t, int, void *), + int (*match)(void *, uintptr_t, int, void *), + void *(*newh)(uintptr_t, int, void *)); +void dn_ht_free(struct dn_ht *, int flags); + +void *dn_ht_find(struct dn_ht *, uintptr_t, int, void *); +int dn_ht_scan(struct dn_ht *, int (*)(void *, void *), void *); +int dn_ht_scan_bucket(struct dn_ht *, int * , int (*)(void *, void *), void *); +int dn_ht_entries(struct dn_ht *); + +enum { /* flags values. + * first two are returned by the scan callback to indicate + * to delete the matching element or to end the scan + */ + DNHT_SCAN_DEL = 0x0001, + DNHT_SCAN_END = 0x0002, + DNHT_KEY_IS_OBJ = 0x0004, /* key is the obj pointer */ + DNHT_MATCH_PTR = 0x0008, /* match by pointer, not match() */ + DNHT_INSERT = 0x0010, /* insert if not found */ + DNHT_UNIQUE = 0x0020, /* report error if already there */ + DNHT_REMOVE = 0x0040, /* remove on find or dn_ht_free */ +}; + +#endif /* _IP_DN_HEAP_H */ diff --git a/sys/netinet/ipfw/dn_sched.h b/sys/netinet/ipfw/dn_sched.h new file mode 100644 index 0000000..fe54b02 --- /dev/null +++ b/sys/netinet/ipfw/dn_sched.h @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2010 Riccardo Panicucci, Luigi Rizzo, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * The API to write a packet scheduling algorithm for dummynet. + * + * $FreeBSD$ + */ + +#ifndef _DN_SCHED_H +#define _DN_SCHED_H + +#define DN_MULTIQUEUE 0x01 +/* + * Descriptor for a scheduling algorithm. + * Contains all function pointers for a given scheduler + * This is typically created when a module is loaded, and stored + * in a global list of schedulers. + */ +struct dn_alg { + uint32_t type; /* the scheduler type */ + const char *name; /* scheduler name */ + uint32_t flags; /* DN_MULTIQUEUE if supports multiple queues */ + + /* + * The following define the size of 3 optional data structures + * that may need to be allocated at runtime, and are appended + * to each of the base data structures: scheduler, sched.inst, + * and queue. We don't have a per-flowset structure. + */ + /* + parameters attached to the template, e.g. + * default queue sizes, weights, quantum size, and so on; + */ + size_t schk_datalen; + + /* + per-instance parameters, such as timestamps, + * containers for queues, etc; + */ + size_t si_datalen; + + size_t q_datalen; /* per-queue parameters (e.g. S,F) */ + + /* + * Methods implemented by the scheduler: + * enqueue enqueue packet 'm' on scheduler 's', queue 'q'. + * q is NULL for !MULTIQUEUE. + * Return 0 on success, 1 on drop (packet consumed anyways). + * Note that q should be interpreted only as a hint + * on the flow that the mbuf belongs to: while a + * scheduler will normally enqueue m into q, it is ok + * to leave q alone and put the mbuf elsewhere. + * This function is called in two cases: + * - when a new packet arrives to the scheduler; + * - when a scheduler is reconfigured. In this case the + * call is issued by the new_queue callback, with a + * non empty queue (q) and m pointing to the first + * mbuf in the queue. For this reason, the function + * should internally check for (m != q->mq.head) + * before calling dn_enqueue(). + * + * dequeue Called when scheduler instance 's' can + * dequeue a packet. Return NULL if none are available. + * XXX what about non work-conserving ? + * + * config called on 'sched X config ...', normally writes + * in the area of size sch_arg + * + * destroy called on 'sched delete', frees everything + * in sch_arg (other parts are handled by more specific + * functions) + * + * new_sched called when a new instance is created, e.g. + * to create the local queue for !MULTIQUEUE, set V or + * copy parameters for WFQ, and so on. + * + * free_sched called when deleting an instance, cleans + * extra data in the per-instance area. + * + * new_fsk called when a flowset is linked to a scheduler, + * e.g. to validate parameters such as weights etc. + * free_fsk when a flowset is unlinked from a scheduler. + * (probably unnecessary) + * + * new_queue called to set the per-queue parameters, + * e.g. S and F, adjust sum of weights in the parent, etc. + * + * The new_queue callback is normally called from when + * creating a new queue. In some cases (such as a + * scheduler change or reconfiguration) it can be called + * with a non empty queue. In this case, the queue + * In case of non empty queue, the new_queue callback could + * need to call the enqueue function. In this case, + * the callback should eventually call enqueue() passing + * as m the first element in the queue. + * + * free_queue actions related to a queue removal, e.g. undo + * all the above. If the queue has data in it, also remove + * from the scheduler. This can e.g. happen during a reconfigure. + */ + int (*enqueue)(struct dn_sch_inst *, struct dn_queue *, + struct mbuf *); + struct mbuf * (*dequeue)(struct dn_sch_inst *); + + int (*config)(struct dn_schk *); + int (*destroy)(struct dn_schk*); + int (*new_sched)(struct dn_sch_inst *); + int (*free_sched)(struct dn_sch_inst *); + int (*new_fsk)(struct dn_fsk *f); + int (*free_fsk)(struct dn_fsk *f); + int (*new_queue)(struct dn_queue *q); + int (*free_queue)(struct dn_queue *q); + + /* run-time fields */ + int ref_count; /* XXX number of instances in the system */ + SLIST_ENTRY(dn_alg) next; /* Next scheduler in the list */ +}; + +/* MSVC does not support initializers so we need this ugly macro */ +#ifdef _WIN32 +#define _SI(fld) +#else +#define _SI(fld) fld +#endif + +/* + * Additionally, dummynet exports some functions and macros + * to be used by schedulers: + */ + +void dn_free_pkts(struct mbuf *mnext); +int dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop); +/* bound a variable between min and max */ +int ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg); + +/* + * Extract the head of a queue, update stats. Must be the very last + * thing done on a dequeue as the queue itself may go away. + */ +static __inline struct mbuf* +dn_dequeue(struct dn_queue *q) +{ + struct mbuf *m = q->mq.head; + if (m == NULL) + return NULL; + q->mq.head = m->m_nextpkt; + q->ni.length--; + q->ni.len_bytes -= m->m_pkthdr.len; + if (q->_si) { + q->_si->ni.length--; + q->_si->ni.len_bytes -= m->m_pkthdr.len; + } + if (q->ni.length == 0) /* queue is now idle */ + q->q_time = dn_cfg.curr_time; + return m; +} + +int dn_sched_modevent(module_t mod, int cmd, void *arg); + +#define DECLARE_DNSCHED_MODULE(name, dnsched) \ + static moduledata_t name##_mod = { \ + #name, dn_sched_modevent, dnsched \ + }; \ + DECLARE_MODULE(name, name##_mod, \ + SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); \ + MODULE_DEPEND(name, dummynet, 3, 3, 3); +#endif /* _DN_SCHED_H */ diff --git a/sys/netinet/ipfw/dn_sched_fifo.c b/sys/netinet/ipfw/dn_sched_fifo.c new file mode 100644 index 0000000..0bb3800 --- /dev/null +++ b/sys/netinet/ipfw/dn_sched_fifo.c @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD$ + */ + +#ifdef _KERNEL +#include <sys/malloc.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/kernel.h> +#include <sys/mbuf.h> +#include <sys/module.h> +#include <net/if.h> /* IFNAMSIZ */ +#include <netinet/in.h> +#include <netinet/ip_var.h> /* ipfw_rule_ref */ +#include <netinet/ip_fw.h> /* flow_id */ +#include <netinet/ip_dummynet.h> +#include <netinet/ipfw/dn_heap.h> +#include <netinet/ipfw/ip_dn_private.h> +#include <netinet/ipfw/dn_sched.h> +#else +#include <dn_test.h> +#endif + +/* + * This file implements a FIFO scheduler for a single queue. + * The queue is allocated as part of the scheduler instance, + * and there is a single flowset is in the template which stores + * queue size and policy. + * Enqueue and dequeue use the default library functions. + */ +static int +fifo_enqueue(struct dn_sch_inst *si, struct dn_queue *q, struct mbuf *m) +{ + /* XXX if called with q != NULL and m=NULL, this is a + * re-enqueue from an existing scheduler, which we should + * handle. + */ + return dn_enqueue((struct dn_queue *)(si+1), m, 0); +} + +static struct mbuf * +fifo_dequeue(struct dn_sch_inst *si) +{ + return dn_dequeue((struct dn_queue *)(si + 1)); +} + +static int +fifo_new_sched(struct dn_sch_inst *si) +{ + /* This scheduler instance contains the queue */ + struct dn_queue *q = (struct dn_queue *)(si + 1); + + set_oid(&q->ni.oid, DN_QUEUE, sizeof(*q)); + q->_si = si; + q->fs = si->sched->fs; + return 0; +} + +static int +fifo_free_sched(struct dn_sch_inst *si) +{ + struct dn_queue *q = (struct dn_queue *)(si + 1); + dn_free_pkts(q->mq.head); + bzero(q, sizeof(*q)); + return 0; +} + +/* + * FIFO scheduler descriptor + * contains the type of the scheduler, the name, the size of extra + * data structures, and function pointers. + */ +static struct dn_alg fifo_desc = { + _SI( .type = ) DN_SCHED_FIFO, + _SI( .name = ) "FIFO", + _SI( .flags = ) 0, + + _SI( .schk_datalen = ) 0, + _SI( .si_datalen = ) sizeof(struct dn_queue), + _SI( .q_datalen = ) 0, + + _SI( .enqueue = ) fifo_enqueue, + _SI( .dequeue = ) fifo_dequeue, + _SI( .config = ) NULL, + _SI( .destroy = ) NULL, + _SI( .new_sched = ) fifo_new_sched, + _SI( .free_sched = ) fifo_free_sched, + _SI( .new_fsk = ) NULL, + _SI( .free_fsk = ) NULL, + _SI( .new_queue = ) NULL, + _SI( .free_queue = ) NULL, +}; + +DECLARE_DNSCHED_MODULE(dn_fifo, &fifo_desc); diff --git a/sys/netinet/ipfw/dn_sched_qfq.c b/sys/netinet/ipfw/dn_sched_qfq.c new file mode 100644 index 0000000..44555ee --- /dev/null +++ b/sys/netinet/ipfw/dn_sched_qfq.c @@ -0,0 +1,864 @@ +/* + * Copyright (c) 2010 Fabio Checconi, Luigi Rizzo, Paolo Valente + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD$ + */ + +#ifdef _KERNEL +#include <sys/malloc.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/kernel.h> +#include <sys/mbuf.h> +#include <sys/module.h> +#include <net/if.h> /* IFNAMSIZ */ +#include <netinet/in.h> +#include <netinet/ip_var.h> /* ipfw_rule_ref */ +#include <netinet/ip_fw.h> /* flow_id */ +#include <netinet/ip_dummynet.h> +#include <netinet/ipfw/dn_heap.h> +#include <netinet/ipfw/ip_dn_private.h> +#include <netinet/ipfw/dn_sched.h> +#else +#include <dn_test.h> +#endif + +#ifdef QFQ_DEBUG +struct qfq_sched; +static void dump_sched(struct qfq_sched *q, const char *msg); +#define NO(x) x +#else +#define NO(x) +#endif +#define DN_SCHED_QFQ 4 // XXX Where? +typedef unsigned long bitmap; + +/* + * bitmaps ops are critical. Some linux versions have __fls + * and the bitmap ops. Some machines have ffs + */ +#if defined(_WIN32) +int fls(unsigned int n) +{ + int i = 0; + for (i = 0; n > 0; n >>= 1, i++) + ; + return i; +} +#endif + +#if !defined(_KERNEL) || defined( __FreeBSD__ ) || defined(_WIN32) +static inline unsigned long __fls(unsigned long word) +{ + return fls(word) - 1; +} +#endif + +#if !defined(_KERNEL) || !defined(__linux__) +#ifdef QFQ_DEBUG +int test_bit(int ix, bitmap *p) +{ + if (ix < 0 || ix > 31) + D("bad index %d", ix); + return *p & (1<<ix); +} +void __set_bit(int ix, bitmap *p) +{ + if (ix < 0 || ix > 31) + D("bad index %d", ix); + *p |= (1<<ix); +} +void __clear_bit(int ix, bitmap *p) +{ + if (ix < 0 || ix > 31) + D("bad index %d", ix); + *p &= ~(1<<ix); +} +#else /* !QFQ_DEBUG */ +/* XXX do we have fast version, or leave it to the compiler ? */ +#define test_bit(ix, pData) ((*pData) & (1<<(ix))) +#define __set_bit(ix, pData) (*pData) |= (1<<(ix)) +#define __clear_bit(ix, pData) (*pData) &= ~(1<<(ix)) +#endif /* !QFQ_DEBUG */ +#endif /* !__linux__ */ + +#ifdef __MIPSEL__ +#define __clear_bit(ix, pData) (*pData) &= ~(1<<(ix)) +#endif + +/*-------------------------------------------*/ +/* + +Virtual time computations. + +S, F and V are all computed in fixed point arithmetic with +FRAC_BITS decimal bits. + + QFQ_MAX_INDEX is the maximum index allowed for a group. We need + one bit per index. + QFQ_MAX_WSHIFT is the maximum power of two supported as a weight. + The layout of the bits is as below: + + [ MTU_SHIFT ][ FRAC_BITS ] + [ MAX_INDEX ][ MIN_SLOT_SHIFT ] + ^.__grp->index = 0 + *.__grp->slot_shift + + where MIN_SLOT_SHIFT is derived by difference from the others. + +The max group index corresponds to Lmax/w_min, where +Lmax=1<<MTU_SHIFT, w_min = 1 . +From this, and knowing how many groups (MAX_INDEX) we want, +we can derive the shift corresponding to each group. + +Because we often need to compute + F = S + len/w_i and V = V + len/wsum +instead of storing w_i store the value + inv_w = (1<<FRAC_BITS)/w_i +so we can do F = S + len * inv_w * wsum. +We use W_TOT in the formulas so we can easily move between +static and adaptive weight sum. + +The per-scheduler-instance data contain all the data structures +for the scheduler: bitmaps and bucket lists. + + */ +/* + * Maximum number of consecutive slots occupied by backlogged classes + * inside a group. This is approx lmax/lmin + 5. + * XXX check because it poses constraints on MAX_INDEX + */ +#define QFQ_MAX_SLOTS 32 +/* + * Shifts used for class<->group mapping. Class weights are + * in the range [1, QFQ_MAX_WEIGHT], we to map each class i to the + * group with the smallest index that can support the L_i / r_i + * configured for the class. + * + * grp->index is the index of the group; and grp->slot_shift + * is the shift for the corresponding (scaled) sigma_i. + * + * When computing the group index, we do (len<<FP_SHIFT)/weight, + * then compute an FLS (which is like a log2()), and if the result + * is below the MAX_INDEX region we use 0 (which is the same as + * using a larger len). + */ +#define QFQ_MAX_INDEX 19 +#define QFQ_MAX_WSHIFT 16 /* log2(max_weight) */ + +#define QFQ_MAX_WEIGHT (1<<QFQ_MAX_WSHIFT) +#define QFQ_MAX_WSUM (2*QFQ_MAX_WEIGHT) +//#define IWSUM (q->i_wsum) +#define IWSUM ((1<<FRAC_BITS)/QFQ_MAX_WSUM) + +#define FRAC_BITS 30 /* fixed point arithmetic */ +#define ONE_FP (1UL << FRAC_BITS) + +#define QFQ_MTU_SHIFT 11 /* log2(max_len) */ +#define QFQ_MIN_SLOT_SHIFT (FRAC_BITS + QFQ_MTU_SHIFT - QFQ_MAX_INDEX) + +/* + * Possible group states, also indexes for the bitmaps array in + * struct qfq_queue. We rely on ER, IR, EB, IB being numbered 0..3 + */ +enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE }; + +struct qfq_group; +/* + * additional queue info. Some of this info should come from + * the flowset, we copy them here for faster processing. + * This is an overlay of the struct dn_queue + */ +struct qfq_class { + struct dn_queue _q; + uint64_t S, F; /* flow timestamps (exact) */ + struct qfq_class *next; /* Link for the slot list. */ + + /* group we belong to. In principle we would need the index, + * which is log_2(lmax/weight), but we never reference it + * directly, only the group. + */ + struct qfq_group *grp; + + /* these are copied from the flowset. */ + uint32_t inv_w; /* ONE_FP/weight */ + uint32_t lmax; /* Max packet size for this flow. */ +}; + +/* Group descriptor, see the paper for details. + * Basically this contains the bucket lists + */ +struct qfq_group { + uint64_t S, F; /* group timestamps (approx). */ + unsigned int slot_shift; /* Slot shift. */ + unsigned int index; /* Group index. */ + unsigned int front; /* Index of the front slot. */ + bitmap full_slots; /* non-empty slots */ + + /* Array of lists of active classes. */ + struct qfq_class *slots[QFQ_MAX_SLOTS]; +}; + +/* scheduler instance descriptor. */ +struct qfq_sched { + uint64_t V; /* Precise virtual time. */ + uint32_t wsum; /* weight sum */ + NO(uint32_t i_wsum; /* ONE_FP/w_sum */ + uint32_t _queued; /* debugging */ + uint32_t loops; /* debugging */) + bitmap bitmaps[QFQ_MAX_STATE]; /* Group bitmaps. */ + struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */ +}; + +/*---- support functions ----------------------------*/ + +/* Generic comparison function, handling wraparound. */ +static inline int qfq_gt(uint64_t a, uint64_t b) +{ + return (int64_t)(a - b) > 0; +} + +/* Round a precise timestamp to its slotted value. */ +static inline uint64_t qfq_round_down(uint64_t ts, unsigned int shift) +{ + return ts & ~((1ULL << shift) - 1); +} + +/* return the pointer to the group with lowest index in the bitmap */ +static inline struct qfq_group *qfq_ffs(struct qfq_sched *q, + unsigned long bitmap) +{ + int index = ffs(bitmap) - 1; // zero-based + return &q->groups[index]; +} + +/* + * Calculate a flow index, given its weight and maximum packet length. + * index = log_2(maxlen/weight) but we need to apply the scaling. + * This is used only once at flow creation. + */ +static int qfq_calc_index(uint32_t inv_w, unsigned int maxlen) +{ + uint64_t slot_size = (uint64_t)maxlen *inv_w; + unsigned long size_map; + int index = 0; + + size_map = (unsigned long)(slot_size >> QFQ_MIN_SLOT_SHIFT); + if (!size_map) + goto out; + + index = __fls(size_map) + 1; // basically a log_2() + index -= !(slot_size - (1ULL << (index + QFQ_MIN_SLOT_SHIFT - 1))); + + if (index < 0) + index = 0; + +out: + ND("W = %d, L = %d, I = %d\n", ONE_FP/inv_w, maxlen, index); + return index; +} +/*---- end support functions ----*/ + +/*-------- API calls --------------------------------*/ +/* + * Validate and copy parameters from flowset. + */ +static int +qfq_new_queue(struct dn_queue *_q) +{ + struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1); + struct qfq_class *cl = (struct qfq_class *)_q; + int i; + uint32_t w; /* approximated weight */ + + /* import parameters from the flowset. They should be correct + * already. + */ + w = _q->fs->fs.par[0]; + cl->lmax = _q->fs->fs.par[1]; + if (!w || w > QFQ_MAX_WEIGHT) { + w = 1; + D("rounding weight to 1"); + } + cl->inv_w = ONE_FP/w; + w = ONE_FP/cl->inv_w; + if (q->wsum + w > QFQ_MAX_WSUM) + return EINVAL; + + i = qfq_calc_index(cl->inv_w, cl->lmax); + cl->grp = &q->groups[i]; + q->wsum += w; + // XXX cl->S = q->V; ? + // XXX compute q->i_wsum + return 0; +} + +/* remove an empty queue */ +static int +qfq_free_queue(struct dn_queue *_q) +{ + struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1); + struct qfq_class *cl = (struct qfq_class *)_q; + if (cl->inv_w) { + q->wsum -= ONE_FP/cl->inv_w; + cl->inv_w = 0; /* reset weight to avoid run twice */ + } + return 0; +} + +/* Calculate a mask to mimic what would be ffs_from(). */ +static inline unsigned long +mask_from(unsigned long bitmap, int from) +{ + return bitmap & ~((1UL << from) - 1); +} + +/* + * The state computation relies on ER=0, IR=1, EB=2, IB=3 + * First compute eligibility comparing grp->S, q->V, + * then check if someone is blocking us and possibly add EB + */ +static inline unsigned int +qfq_calc_state(struct qfq_sched *q, struct qfq_group *grp) +{ + /* if S > V we are not eligible */ + unsigned int state = qfq_gt(grp->S, q->V); + unsigned long mask = mask_from(q->bitmaps[ER], grp->index); + struct qfq_group *next; + + if (mask) { + next = qfq_ffs(q, mask); + if (qfq_gt(grp->F, next->F)) + state |= EB; + } + + return state; +} + +/* + * In principle + * q->bitmaps[dst] |= q->bitmaps[src] & mask; + * q->bitmaps[src] &= ~mask; + * but we should make sure that src != dst + */ +static inline void +qfq_move_groups(struct qfq_sched *q, unsigned long mask, int src, int dst) +{ + q->bitmaps[dst] |= q->bitmaps[src] & mask; + q->bitmaps[src] &= ~mask; +} + +static inline void +qfq_unblock_groups(struct qfq_sched *q, int index, uint64_t old_finish) +{ + unsigned long mask = mask_from(q->bitmaps[ER], index + 1); + struct qfq_group *next; + + if (mask) { + next = qfq_ffs(q, mask); + if (!qfq_gt(next->F, old_finish)) + return; + } + + mask = (1UL << index) - 1; + qfq_move_groups(q, mask, EB, ER); + qfq_move_groups(q, mask, IB, IR); +} + +/* + * perhaps + * + old_V ^= q->V; + old_V >>= QFQ_MIN_SLOT_SHIFT; + if (old_V) { + ... + } + * + */ +static inline void +qfq_make_eligible(struct qfq_sched *q, uint64_t old_V) +{ + unsigned long mask, vslot, old_vslot; + + vslot = q->V >> QFQ_MIN_SLOT_SHIFT; + old_vslot = old_V >> QFQ_MIN_SLOT_SHIFT; + + if (vslot != old_vslot) { + mask = (2UL << (__fls(vslot ^ old_vslot))) - 1; + qfq_move_groups(q, mask, IR, ER); + qfq_move_groups(q, mask, IB, EB); + } +} + +/* + * XXX we should make sure that slot becomes less than 32. + * This is guaranteed by the input values. + * roundedS is always cl->S rounded on grp->slot_shift bits. + */ +static inline void +qfq_slot_insert(struct qfq_group *grp, struct qfq_class *cl, uint64_t roundedS) +{ + uint64_t slot = (roundedS - grp->S) >> grp->slot_shift; + unsigned int i = (grp->front + slot) % QFQ_MAX_SLOTS; + + cl->next = grp->slots[i]; + grp->slots[i] = cl; + __set_bit(slot, &grp->full_slots); +} + +/* + * remove the entry from the slot + */ +static inline void +qfq_front_slot_remove(struct qfq_group *grp) +{ + struct qfq_class **h = &grp->slots[grp->front]; + + *h = (*h)->next; + if (!*h) + __clear_bit(0, &grp->full_slots); +} + +/* + * Returns the first full queue in a group. As a side effect, + * adjust the bucket list so the first non-empty bucket is at + * position 0 in full_slots. + */ +static inline struct qfq_class * +qfq_slot_scan(struct qfq_group *grp) +{ + int i; + + ND("grp %d full %x", grp->index, grp->full_slots); + if (!grp->full_slots) + return NULL; + + i = ffs(grp->full_slots) - 1; // zero-based + if (i > 0) { + grp->front = (grp->front + i) % QFQ_MAX_SLOTS; + grp->full_slots >>= i; + } + + return grp->slots[grp->front]; +} + +/* + * adjust the bucket list. When the start time of a group decreases, + * we move the index down (modulo QFQ_MAX_SLOTS) so we don't need to + * move the objects. The mask of occupied slots must be shifted + * because we use ffs() to find the first non-empty slot. + * This covers decreases in the group's start time, but what about + * increases of the start time ? + * Here too we should make sure that i is less than 32 + */ +static inline void +qfq_slot_rotate(struct qfq_sched *q, struct qfq_group *grp, uint64_t roundedS) +{ + unsigned int i = (grp->S - roundedS) >> grp->slot_shift; + + grp->full_slots <<= i; + grp->front = (grp->front - i) % QFQ_MAX_SLOTS; +} + + +static inline void +qfq_update_eligible(struct qfq_sched *q, uint64_t old_V) +{ + bitmap ineligible; + + ineligible = q->bitmaps[IR] | q->bitmaps[IB]; + if (ineligible) { + if (!q->bitmaps[ER]) { + struct qfq_group *grp; + grp = qfq_ffs(q, ineligible); + if (qfq_gt(grp->S, q->V)) + q->V = grp->S; + } + qfq_make_eligible(q, old_V); + } +} + +/* + * Updates the class, returns true if also the group needs to be updated. + */ +static inline int +qfq_update_class(struct qfq_sched *q, struct qfq_group *grp, + struct qfq_class *cl) +{ + + cl->S = cl->F; + if (cl->_q.mq.head == NULL) { + qfq_front_slot_remove(grp); + } else { + unsigned int len; + uint64_t roundedS; + + len = cl->_q.mq.head->m_pkthdr.len; + cl->F = cl->S + (uint64_t)len * cl->inv_w; + roundedS = qfq_round_down(cl->S, grp->slot_shift); + if (roundedS == grp->S) + return 0; + + qfq_front_slot_remove(grp); + qfq_slot_insert(grp, cl, roundedS); + } + return 1; +} + +static struct mbuf * +qfq_dequeue(struct dn_sch_inst *si) +{ + struct qfq_sched *q = (struct qfq_sched *)(si + 1); + struct qfq_group *grp; + struct qfq_class *cl; + struct mbuf *m; + uint64_t old_V; + + NO(q->loops++;) + if (!q->bitmaps[ER]) { + NO(if (q->queued) + dump_sched(q, "start dequeue");) + return NULL; + } + + grp = qfq_ffs(q, q->bitmaps[ER]); + + cl = grp->slots[grp->front]; + /* extract from the first bucket in the bucket list */ + m = dn_dequeue(&cl->_q); + + if (!m) { + D("BUG/* non-workconserving leaf */"); + return NULL; + } + NO(q->queued--;) + old_V = q->V; + q->V += (uint64_t)m->m_pkthdr.len * IWSUM; + ND("m is %p F 0x%llx V now 0x%llx", m, cl->F, q->V); + + if (qfq_update_class(q, grp, cl)) { + uint64_t old_F = grp->F; + cl = qfq_slot_scan(grp); + if (!cl) { /* group gone, remove from ER */ + __clear_bit(grp->index, &q->bitmaps[ER]); + // grp->S = grp->F + 1; // XXX debugging only + } else { + uint64_t roundedS = qfq_round_down(cl->S, grp->slot_shift); + unsigned int s; + + if (grp->S == roundedS) + goto skip_unblock; + grp->S = roundedS; + grp->F = roundedS + (2ULL << grp->slot_shift); + /* remove from ER and put in the new set */ + __clear_bit(grp->index, &q->bitmaps[ER]); + s = qfq_calc_state(q, grp); + __set_bit(grp->index, &q->bitmaps[s]); + } + /* we need to unblock even if the group has gone away */ + qfq_unblock_groups(q, grp->index, old_F); + } + +skip_unblock: + qfq_update_eligible(q, old_V); + NO(if (!q->bitmaps[ER] && q->queued) + dump_sched(q, "end dequeue");) + + return m; +} + +/* + * Assign a reasonable start time for a new flow k in group i. + * Admissible values for \hat(F) are multiples of \sigma_i + * no greater than V+\sigma_i . Larger values mean that + * we had a wraparound so we consider the timestamp to be stale. + * + * If F is not stale and F >= V then we set S = F. + * Otherwise we should assign S = V, but this may violate + * the ordering in ER. So, if we have groups in ER, set S to + * the F_j of the first group j which would be blocking us. + * We are guaranteed not to move S backward because + * otherwise our group i would still be blocked. + */ +static inline void +qfq_update_start(struct qfq_sched *q, struct qfq_class *cl) +{ + unsigned long mask; + uint32_t limit, roundedF; + int slot_shift = cl->grp->slot_shift; + + roundedF = qfq_round_down(cl->F, slot_shift); + limit = qfq_round_down(q->V, slot_shift) + (1UL << slot_shift); + + if (!qfq_gt(cl->F, q->V) || qfq_gt(roundedF, limit)) { + /* timestamp was stale */ + mask = mask_from(q->bitmaps[ER], cl->grp->index); + if (mask) { + struct qfq_group *next = qfq_ffs(q, mask); + if (qfq_gt(roundedF, next->F)) { + cl->S = next->F; + return; + } + } + cl->S = q->V; + } else { /* timestamp is not stale */ + cl->S = cl->F; + } +} + +static int +qfq_enqueue(struct dn_sch_inst *si, struct dn_queue *_q, struct mbuf *m) +{ + struct qfq_sched *q = (struct qfq_sched *)(si + 1); + struct qfq_group *grp; + struct qfq_class *cl = (struct qfq_class *)_q; + uint64_t roundedS; + int s; + + NO(q->loops++;) + DX(4, "len %d flow %p inv_w 0x%x grp %d", m->m_pkthdr.len, + _q, cl->inv_w, cl->grp->index); + /* XXX verify that the packet obeys the parameters */ + if (m != _q->mq.head) { + if (dn_enqueue(_q, m, 0)) /* packet was dropped */ + return 1; + NO(q->queued++;) + if (m != _q->mq.head) + return 0; + } + /* If reach this point, queue q was idle */ + grp = cl->grp; + qfq_update_start(q, cl); /* adjust start time */ + /* compute new finish time and rounded start. */ + cl->F = cl->S + (uint64_t)(m->m_pkthdr.len) * cl->inv_w; + roundedS = qfq_round_down(cl->S, grp->slot_shift); + + /* + * insert cl in the correct bucket. + * If cl->S >= grp->S we don't need to adjust the + * bucket list and simply go to the insertion phase. + * Otherwise grp->S is decreasing, we must make room + * in the bucket list, and also recompute the group state. + * Finally, if there were no flows in this group and nobody + * was in ER make sure to adjust V. + */ + if (grp->full_slots) { + if (!qfq_gt(grp->S, cl->S)) + goto skip_update; + /* create a slot for this cl->S */ + qfq_slot_rotate(q, grp, roundedS); + /* group was surely ineligible, remove */ + __clear_bit(grp->index, &q->bitmaps[IR]); + __clear_bit(grp->index, &q->bitmaps[IB]); + } else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V)) + q->V = roundedS; + + grp->S = roundedS; + grp->F = roundedS + (2ULL << grp->slot_shift); // i.e. 2\sigma_i + s = qfq_calc_state(q, grp); + __set_bit(grp->index, &q->bitmaps[s]); + ND("new state %d 0x%x", s, q->bitmaps[s]); + ND("S %llx F %llx V %llx", cl->S, cl->F, q->V); +skip_update: + qfq_slot_insert(grp, cl, roundedS); + + return 0; +} + + +#if 0 +static inline void +qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp, + struct qfq_class *cl, struct qfq_class **pprev) +{ + unsigned int i, offset; + uint64_t roundedS; + + roundedS = qfq_round_down(cl->S, grp->slot_shift); + offset = (roundedS - grp->S) >> grp->slot_shift; + i = (grp->front + offset) % QFQ_MAX_SLOTS; + +#ifdef notyet + if (!pprev) { + pprev = &grp->slots[i]; + while (*pprev && *pprev != cl) + pprev = &(*pprev)->next; + } +#endif + + *pprev = cl->next; + if (!grp->slots[i]) + __clear_bit(offset, &grp->full_slots); +} + +/* + * called to forcibly destroy a queue. + * If the queue is not in the front bucket, or if it has + * other queues in the front bucket, we can simply remove + * the queue with no other side effects. + * Otherwise we must propagate the event up. + * XXX description to be completed. + */ +static void +qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl, + struct qfq_class **pprev) +{ + struct qfq_group *grp = &q->groups[cl->index]; + unsigned long mask; + uint64_t roundedS; + int s; + + cl->F = cl->S; // not needed if the class goes away. + qfq_slot_remove(q, grp, cl, pprev); + + if (!grp->full_slots) { + /* nothing left in the group, remove from all sets. + * Do ER last because if we were blocking other groups + * we must unblock them. + */ + __clear_bit(grp->index, &q->bitmaps[IR]); + __clear_bit(grp->index, &q->bitmaps[EB]); + __clear_bit(grp->index, &q->bitmaps[IB]); + + if (test_bit(grp->index, &q->bitmaps[ER]) && + !(q->bitmaps[ER] & ~((1UL << grp->index) - 1))) { + mask = q->bitmaps[ER] & ((1UL << grp->index) - 1); + if (mask) + mask = ~((1UL << __fls(mask)) - 1); + else + mask = ~0UL; + qfq_move_groups(q, mask, EB, ER); + qfq_move_groups(q, mask, IB, IR); + } + __clear_bit(grp->index, &q->bitmaps[ER]); + } else if (!grp->slots[grp->front]) { + cl = qfq_slot_scan(grp); + roundedS = qfq_round_down(cl->S, grp->slot_shift); + if (grp->S != roundedS) { + __clear_bit(grp->index, &q->bitmaps[ER]); + __clear_bit(grp->index, &q->bitmaps[IR]); + __clear_bit(grp->index, &q->bitmaps[EB]); + __clear_bit(grp->index, &q->bitmaps[IB]); + grp->S = roundedS; + grp->F = roundedS + (2ULL << grp->slot_shift); + s = qfq_calc_state(q, grp); + __set_bit(grp->index, &q->bitmaps[s]); + } + } + qfq_update_eligible(q, q->V); +} +#endif + +static int +qfq_new_fsk(struct dn_fsk *f) +{ + ipdn_bound_var(&f->fs.par[0], 1, 1, QFQ_MAX_WEIGHT, "qfq weight"); + ipdn_bound_var(&f->fs.par[1], 1500, 1, 2000, "qfq maxlen"); + ND("weight %d len %d\n", f->fs.par[0], f->fs.par[1]); + return 0; +} + +/* + * initialize a new scheduler instance + */ +static int +qfq_new_sched(struct dn_sch_inst *si) +{ + struct qfq_sched *q = (struct qfq_sched *)(si + 1); + struct qfq_group *grp; + int i; + + for (i = 0; i <= QFQ_MAX_INDEX; i++) { + grp = &q->groups[i]; + grp->index = i; + grp->slot_shift = QFQ_MTU_SHIFT + FRAC_BITS - + (QFQ_MAX_INDEX - i); + } + return 0; +} + +/* + * QFQ scheduler descriptor + */ +static struct dn_alg qfq_desc = { + _SI( .type = ) DN_SCHED_QFQ, + _SI( .name = ) "QFQ", + _SI( .flags = ) DN_MULTIQUEUE, + + _SI( .schk_datalen = ) 0, + _SI( .si_datalen = ) sizeof(struct qfq_sched), + _SI( .q_datalen = ) sizeof(struct qfq_class) - sizeof(struct dn_queue), + + _SI( .enqueue = ) qfq_enqueue, + _SI( .dequeue = ) qfq_dequeue, + + _SI( .config = ) NULL, + _SI( .destroy = ) NULL, + _SI( .new_sched = ) qfq_new_sched, + _SI( .free_sched = ) NULL, + _SI( .new_fsk = ) qfq_new_fsk, + _SI( .free_fsk = ) NULL, + _SI( .new_queue = ) qfq_new_queue, + _SI( .free_queue = ) qfq_free_queue, +}; + +DECLARE_DNSCHED_MODULE(dn_qfq, &qfq_desc); + +#ifdef QFQ_DEBUG +static void +dump_groups(struct qfq_sched *q, uint32_t mask) +{ + int i, j; + + for (i = 0; i < QFQ_MAX_INDEX + 1; i++) { + struct qfq_group *g = &q->groups[i]; + + if (0 == (mask & (1<<i))) + continue; + for (j = 0; j < QFQ_MAX_SLOTS; j++) { + if (g->slots[j]) + D(" bucket %d %p", j, g->slots[j]); + } + D("full_slots 0x%x", g->full_slots); + D(" %2d S 0x%20llx F 0x%llx %c", i, + g->S, g->F, + mask & (1<<i) ? '1' : '0'); + } +} + +static void +dump_sched(struct qfq_sched *q, const char *msg) +{ + D("--- in %s: ---", msg); + ND("loops %d queued %d V 0x%llx", q->loops, q->queued, q->V); + D(" ER 0x%08x", q->bitmaps[ER]); + D(" EB 0x%08x", q->bitmaps[EB]); + D(" IR 0x%08x", q->bitmaps[IR]); + D(" IB 0x%08x", q->bitmaps[IB]); + dump_groups(q, 0xffffffff); +}; +#endif /* QFQ_DEBUG */ diff --git a/sys/netinet/ipfw/dn_sched_rr.c b/sys/netinet/ipfw/dn_sched_rr.c new file mode 100644 index 0000000..fc7be00 --- /dev/null +++ b/sys/netinet/ipfw/dn_sched_rr.c @@ -0,0 +1,307 @@ +/* + * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD$ + */ + +#ifdef _KERNEL +#include <sys/malloc.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/kernel.h> +#include <sys/mbuf.h> +#include <sys/module.h> +#include <net/if.h> /* IFNAMSIZ */ +#include <netinet/in.h> +#include <netinet/ip_var.h> /* ipfw_rule_ref */ +#include <netinet/ip_fw.h> /* flow_id */ +#include <netinet/ip_dummynet.h> +#include <netinet/ipfw/dn_heap.h> +#include <netinet/ipfw/ip_dn_private.h> +#include <netinet/ipfw/dn_sched.h> +#else +#include <dn_test.h> +#endif + +#define DN_SCHED_RR 3 // XXX Where? + +struct rr_queue { + struct dn_queue q; /* Standard queue */ + int status; /* 1: queue is in the list */ + int credit; /* Number of bytes to transmit */ + int quantum; /* quantum * C */ + struct rr_queue *qnext; /* */ +}; + +/* struct rr_schk contains global config parameters + * and is right after dn_schk + */ +struct rr_schk { + int min_q; /* Min quantum */ + int max_q; /* Max quantum */ + int q_bytes; /* Bytes per quantum */ +}; + +/* per-instance round robin list, right after dn_sch_inst */ +struct rr_si { + struct rr_queue *head, *tail; /* Pointer to current queue */ +}; + +/* Append a queue to the rr list */ +static inline void +rr_append(struct rr_queue *q, struct rr_si *si) +{ + q->status = 1; /* mark as in-rr_list */ + q->credit = q->quantum; /* initialize credit */ + + /* append to the tail */ + if (si->head == NULL) + si->head = q; + else + si->tail->qnext = q; + si->tail = q; /* advance the tail pointer */ + q->qnext = si->head; /* make it circular */ +} + +/* Remove the head queue from circular list. */ +static inline void +rr_remove_head(struct rr_si *si) +{ + if (si->head == NULL) + return; /* empty queue */ + si->head->status = 0; + + if (si->head == si->tail) { + si->head = si->tail = NULL; + return; + } + + si->head = si->head->qnext; + si->tail->qnext = si->head; +} + +/* Remove a queue from circular list. + * XXX see if ti can be merge with remove_queue() + */ +static inline void +remove_queue_q(struct rr_queue *q, struct rr_si *si) +{ + struct rr_queue *prev; + + if (q->status != 1) + return; + if (q == si->head) { + rr_remove_head(si); + return; + } + + for (prev = si->head; prev; prev = prev->qnext) { + if (prev->qnext != q) + continue; + prev->qnext = q->qnext; + if (q == si->tail) + si->tail = prev; + q->status = 0; + break; + } +} + + +static inline void +next_pointer(struct rr_si *si) +{ + if (si->head == NULL) + return; /* empty queue */ + + si->head = si->head->qnext; + si->tail = si->tail->qnext; +} + +static int +rr_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m) +{ + struct rr_si *si; + struct rr_queue *rrq; + + if (m != q->mq.head) { + if (dn_enqueue(q, m, 0)) /* packet was dropped */ + return 1; + if (m != q->mq.head) + return 0; + } + + /* If reach this point, queue q was idle */ + si = (struct rr_si *)(_si + 1); + rrq = (struct rr_queue *)q; + + if (rrq->status == 1) /* Queue is already in the queue list */ + return 0; + + /* Insert the queue in the queue list */ + rr_append(rrq, si); + + return 0; +} + +static struct mbuf * +rr_dequeue(struct dn_sch_inst *_si) +{ + /* Access scheduler instance private data */ + struct rr_si *si = (struct rr_si *)(_si + 1); + struct rr_queue *rrq; + uint64_t len; + + while ( (rrq = si->head) ) { + struct mbuf *m = rrq->q.mq.head; + if ( m == NULL) { + /* empty queue, remove from list */ + rr_remove_head(si); + continue; + } + len = m->m_pkthdr.len; + + if (len > rrq->credit) { + /* Packet too big */ + rrq->credit += rrq->quantum; + /* Try next queue */ + next_pointer(si); + } else { + rrq->credit -= len; + return dn_dequeue(&rrq->q); + } + } + + /* no packet to dequeue*/ + return NULL; +} + +static int +rr_config(struct dn_schk *_schk) +{ + struct rr_schk *schk = (struct rr_schk *)(_schk + 1); + ND("called"); + + /* use reasonable quantums (64..2k bytes, default 1500) */ + schk->min_q = 64; + schk->max_q = 2048; + schk->q_bytes = 1500; /* quantum */ + + return 0; +} + +static int +rr_new_sched(struct dn_sch_inst *_si) +{ + struct rr_si *si = (struct rr_si *)(_si + 1); + + ND("called"); + si->head = si->tail = NULL; + + return 0; +} + +static int +rr_free_sched(struct dn_sch_inst *_si) +{ + ND("called"); + /* Nothing to do? */ + return 0; +} + +static int +rr_new_fsk(struct dn_fsk *fs) +{ + struct rr_schk *schk = (struct rr_schk *)(fs->sched + 1); + /* par[0] is the weight, par[1] is the quantum step */ + ipdn_bound_var(&fs->fs.par[0], 1, + 1, 65536, "RR weight"); + ipdn_bound_var(&fs->fs.par[1], schk->q_bytes, + schk->min_q, schk->max_q, "RR quantum"); + return 0; +} + +static int +rr_new_queue(struct dn_queue *_q) +{ + struct rr_queue *q = (struct rr_queue *)_q; + + _q->ni.oid.subtype = DN_SCHED_RR; + + q->quantum = _q->fs->fs.par[0] * _q->fs->fs.par[1]; + ND("called, q->quantum %d", q->quantum); + q->credit = q->quantum; + q->status = 0; + + if (_q->mq.head != NULL) { + /* Queue NOT empty, insert in the queue list */ + rr_append(q, (struct rr_si *)(_q->_si + 1)); + } + return 0; +} + +static int +rr_free_queue(struct dn_queue *_q) +{ + struct rr_queue *q = (struct rr_queue *)_q; + + ND("called"); + if (q->status == 1) { + struct rr_si *si = (struct rr_si *)(_q->_si + 1); + remove_queue_q(q, si); + } + return 0; +} + +/* + * RR scheduler descriptor + * contains the type of the scheduler, the name, the size of the + * structures and function pointers. + */ +static struct dn_alg rr_desc = { + _SI( .type = ) DN_SCHED_RR, + _SI( .name = ) "RR", + _SI( .flags = ) DN_MULTIQUEUE, + + _SI( .schk_datalen = ) 0, + _SI( .si_datalen = ) sizeof(struct rr_si), + _SI( .q_datalen = ) sizeof(struct rr_queue) - sizeof(struct dn_queue), + + _SI( .enqueue = ) rr_enqueue, + _SI( .dequeue = ) rr_dequeue, + + _SI( .config = ) rr_config, + _SI( .destroy = ) NULL, + _SI( .new_sched = ) rr_new_sched, + _SI( .free_sched = ) rr_free_sched, + _SI( .new_fsk = ) rr_new_fsk, + _SI( .free_fsk = ) NULL, + _SI( .new_queue = ) rr_new_queue, + _SI( .free_queue = ) rr_free_queue, +}; + + +DECLARE_DNSCHED_MODULE(dn_rr, &rr_desc); diff --git a/sys/netinet/ipfw/dn_sched_wf2q.c b/sys/netinet/ipfw/dn_sched_wf2q.c new file mode 100644 index 0000000..1fbc120 --- /dev/null +++ b/sys/netinet/ipfw/dn_sched_wf2q.c @@ -0,0 +1,373 @@ +/* + * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa + * Copyright (c) 2000-2002 Luigi Rizzo, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD$ + */ + +#ifdef _KERNEL +#include <sys/malloc.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/kernel.h> +#include <sys/mbuf.h> +#include <sys/module.h> +#include <net/if.h> /* IFNAMSIZ */ +#include <netinet/in.h> +#include <netinet/ip_var.h> /* ipfw_rule_ref */ +#include <netinet/ip_fw.h> /* flow_id */ +#include <netinet/ip_dummynet.h> +#include <netinet/ipfw/dn_heap.h> +#include <netinet/ipfw/ip_dn_private.h> +#include <netinet/ipfw/dn_sched.h> +#else +#include <dn_test.h> +#endif + +#ifndef MAX64 +#define MAX64(x,y) (( (int64_t) ( (y)-(x) )) > 0 ) ? (y) : (x) +#endif + +/* + * timestamps are computed on 64 bit using fixed point arithmetic. + * LMAX_BITS, WMAX_BITS are the max number of bits for the packet len + * and sum of weights, respectively. FRAC_BITS is the number of + * fractional bits. We want FRAC_BITS >> WMAX_BITS to avoid too large + * errors when computing the inverse, FRAC_BITS < 32 so we can do 1/w + * using an unsigned 32-bit division, and to avoid wraparounds we need + * LMAX_BITS + WMAX_BITS + FRAC_BITS << 64 + * As an example + * FRAC_BITS = 26, LMAX_BITS=14, WMAX_BITS = 19 + */ +#ifndef FRAC_BITS +#define FRAC_BITS 28 /* shift for fixed point arithmetic */ +#define ONE_FP (1UL << FRAC_BITS) +#endif + +/* + * Private information for the scheduler instance: + * sch_heap (key is Finish time) returns the next queue to serve + * ne_heap (key is Start time) stores not-eligible queues + * idle_heap (key=start/finish time) stores idle flows. It must + * support extract-from-middle. + * A flow is only in 1 of the three heaps. + * XXX todo: use a more efficient data structure, e.g. a tree sorted + * by F with min_subtree(S) in each node + */ +struct wf2qp_si { + struct dn_heap sch_heap; /* top extract - key Finish time */ + struct dn_heap ne_heap; /* top extract - key Start time */ + struct dn_heap idle_heap; /* random extract - key Start=Finish time */ + uint64_t V; /* virtual time */ + uint32_t inv_wsum; /* inverse of sum of weights */ + uint32_t wsum; /* sum of weights */ +}; + +struct wf2qp_queue { + struct dn_queue _q; + uint64_t S, F; /* start time, finish time */ + uint32_t inv_w; /* ONE_FP / weight */ + int32_t heap_pos; /* position (index) of struct in heap */ +}; + +/* + * This file implements a WF2Q+ scheduler as it has been in dummynet + * since 2000. + * The scheduler supports per-flow queues and has O(log N) complexity. + * + * WF2Q+ needs to drain entries from the idle heap so that we + * can keep the sum of weights up to date. We can do it whenever + * we get a chance, or periodically, or following some other + * strategy. The function idle_check() drains at most N elements + * from the idle heap. + */ +static void +idle_check(struct wf2qp_si *si, int n, int force) +{ + struct dn_heap *h = &si->idle_heap; + while (n-- > 0 && h->elements > 0 && + (force || DN_KEY_LT(HEAP_TOP(h)->key, si->V))) { + struct dn_queue *q = HEAP_TOP(h)->object; + struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q; + + heap_extract(h, NULL); + /* XXX to let the flowset delete the queue we should + * mark it as 'unused' by the scheduler. + */ + alg_fq->S = alg_fq->F + 1; /* Mark timestamp as invalid. */ + si->wsum -= q->fs->fs.par[0]; /* adjust sum of weights */ + if (si->wsum > 0) + si->inv_wsum = ONE_FP/si->wsum; + } +} + +static int +wf2qp_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m) +{ + struct dn_fsk *fs = q->fs; + struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1); + struct wf2qp_queue *alg_fq; + uint64_t len = m->m_pkthdr.len; + + if (m != q->mq.head) { + if (dn_enqueue(q, m, 0)) /* packet was dropped */ + return 1; + if (m != q->mq.head) /* queue was already busy */ + return 0; + } + + /* If reach this point, queue q was idle */ + alg_fq = (struct wf2qp_queue *)q; + + if (DN_KEY_LT(alg_fq->F, alg_fq->S)) { + /* F<S means timestamps are invalid ->brand new queue. */ + alg_fq->S = si->V; /* init start time */ + si->wsum += fs->fs.par[0]; /* add weight of new queue. */ + si->inv_wsum = ONE_FP/si->wsum; + } else { /* if it was idle then it was in the idle heap */ + heap_extract(&si->idle_heap, q); + alg_fq->S = MAX64(alg_fq->F, si->V); /* compute new S */ + } + alg_fq->F = alg_fq->S + len * alg_fq->inv_w; + + /* if nothing is backlogged, make sure this flow is eligible */ + if (si->ne_heap.elements == 0 && si->sch_heap.elements == 0) + si->V = MAX64(alg_fq->S, si->V); + + /* + * Look at eligibility. A flow is not eligibile if S>V (when + * this happens, it means that there is some other flow already + * scheduled for the same pipe, so the sch_heap cannot be + * empty). If the flow is not eligible we just store it in the + * ne_heap. Otherwise, we store in the sch_heap. + * Note that for all flows in sch_heap (SCH), S_i <= V, + * and for all flows in ne_heap (NEH), S_i > V. + * So when we need to compute max(V, min(S_i)) forall i in + * SCH+NEH, we only need to look into NEH. + */ + if (DN_KEY_LT(si->V, alg_fq->S)) { + /* S>V means flow Not eligible. */ + if (si->sch_heap.elements == 0) + D("++ ouch! not eligible but empty scheduler!"); + heap_insert(&si->ne_heap, alg_fq->S, q); + } else { + heap_insert(&si->sch_heap, alg_fq->F, q); + } + return 0; +} + +/* XXX invariant: sch > 0 || V >= min(S in neh) */ +static struct mbuf * +wf2qp_dequeue(struct dn_sch_inst *_si) +{ + /* Access scheduler instance private data */ + struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1); + struct mbuf *m; + struct dn_queue *q; + struct dn_heap *sch = &si->sch_heap; + struct dn_heap *neh = &si->ne_heap; + struct wf2qp_queue *alg_fq; + + if (sch->elements == 0 && neh->elements == 0) { + /* we have nothing to do. We could kill the idle heap + * altogether and reset V + */ + idle_check(si, 0x7fffffff, 1); + si->V = 0; + si->wsum = 0; /* should be set already */ + return NULL; /* quick return if nothing to do */ + } + idle_check(si, 1, 0); /* drain something from the idle heap */ + + /* make sure at least one element is eligible, bumping V + * and moving entries that have become eligible. + * We need to repeat the first part twice, before and + * after extracting the candidate, or enqueue() will + * find the data structure in a wrong state. + */ + m = NULL; + for(;;) { + /* + * Compute V = max(V, min(S_i)). Remember that all elements + * in sch have by definition S_i <= V so if sch is not empty, + * V is surely the max and we must not update it. Conversely, + * if sch is empty we only need to look at neh. + * We don't need to move the queues, as it will be done at the + * next enqueue + */ + if (sch->elements == 0 && neh->elements > 0) { + si->V = MAX64(si->V, HEAP_TOP(neh)->key); + } + while (neh->elements > 0 && + DN_KEY_LEQ(HEAP_TOP(neh)->key, si->V)) { + q = HEAP_TOP(neh)->object; + alg_fq = (struct wf2qp_queue *)q; + heap_extract(neh, NULL); + heap_insert(sch, alg_fq->F, q); + } + if (m) /* pkt found in previous iteration */ + break; + /* ok we have at least one eligible pkt */ + q = HEAP_TOP(sch)->object; + alg_fq = (struct wf2qp_queue *)q; + m = dn_dequeue(q); + heap_extract(sch, NULL); /* Remove queue from heap. */ + si->V += (uint64_t)(m->m_pkthdr.len) * si->inv_wsum; + alg_fq->S = alg_fq->F; /* Update start time. */ + if (q->mq.head == 0) { /* not backlogged any more. */ + heap_insert(&si->idle_heap, alg_fq->F, q); + } else { /* Still backlogged. */ + /* Update F, store in neh or sch */ + uint64_t len = q->mq.head->m_pkthdr.len; + alg_fq->F += len * alg_fq->inv_w; + if (DN_KEY_LEQ(alg_fq->S, si->V)) { + heap_insert(sch, alg_fq->F, q); + } else { + heap_insert(neh, alg_fq->S, q); + } + } + } + return m; +} + +static int +wf2qp_new_sched(struct dn_sch_inst *_si) +{ + struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1); + int ofs = offsetof(struct wf2qp_queue, heap_pos); + + /* all heaps support extract from middle */ + if (heap_init(&si->idle_heap, 16, ofs) || + heap_init(&si->sch_heap, 16, ofs) || + heap_init(&si->ne_heap, 16, ofs)) { + heap_free(&si->ne_heap); + heap_free(&si->sch_heap); + heap_free(&si->idle_heap); + return ENOMEM; + } + return 0; +} + +static int +wf2qp_free_sched(struct dn_sch_inst *_si) +{ + struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1); + + heap_free(&si->sch_heap); + heap_free(&si->ne_heap); + heap_free(&si->idle_heap); + + return 0; +} + +static int +wf2qp_new_fsk(struct dn_fsk *fs) +{ + ipdn_bound_var(&fs->fs.par[0], 1, + 1, 100, "WF2Q+ weight"); + return 0; +} + +static int +wf2qp_new_queue(struct dn_queue *_q) +{ + struct wf2qp_queue *q = (struct wf2qp_queue *)_q; + + _q->ni.oid.subtype = DN_SCHED_WF2QP; + q->F = 0; /* not strictly necessary */ + q->S = q->F + 1; /* mark timestamp as invalid. */ + q->inv_w = ONE_FP / _q->fs->fs.par[0]; + if (_q->mq.head != NULL) { + wf2qp_enqueue(_q->_si, _q, _q->mq.head); + } + return 0; +} + +/* + * Called when the infrastructure removes a queue (e.g. flowset + * is reconfigured). Nothing to do if we did not 'own' the queue, + * otherwise remove it from the right heap and adjust the sum + * of weights. + */ +static int +wf2qp_free_queue(struct dn_queue *q) +{ + struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q; + struct wf2qp_si *si = (struct wf2qp_si *)(q->_si + 1); + + if (alg_fq->S >= alg_fq->F + 1) + return 0; /* nothing to do, not in any heap */ + si->wsum -= q->fs->fs.par[0]; + if (si->wsum > 0) + si->inv_wsum = ONE_FP/si->wsum; + + /* extract from the heap. XXX TODO we may need to adjust V + * to make sure the invariants hold. + */ + if (q->mq.head == NULL) { + heap_extract(&si->idle_heap, q); + } else if (DN_KEY_LT(si->V, alg_fq->S)) { + heap_extract(&si->ne_heap, q); + } else { + heap_extract(&si->sch_heap, q); + } + return 0; +} + +/* + * WF2Q+ scheduler descriptor + * contains the type of the scheduler, the name, the size of the + * structures and function pointers. + */ +static struct dn_alg wf2qp_desc = { + _SI( .type = ) DN_SCHED_WF2QP, + _SI( .name = ) "WF2Q+", + _SI( .flags = ) DN_MULTIQUEUE, + + /* we need extra space in the si and the queue */ + _SI( .schk_datalen = ) 0, + _SI( .si_datalen = ) sizeof(struct wf2qp_si), + _SI( .q_datalen = ) sizeof(struct wf2qp_queue) - + sizeof(struct dn_queue), + + _SI( .enqueue = ) wf2qp_enqueue, + _SI( .dequeue = ) wf2qp_dequeue, + + _SI( .config = ) NULL, + _SI( .destroy = ) NULL, + _SI( .new_sched = ) wf2qp_new_sched, + _SI( .free_sched = ) wf2qp_free_sched, + + _SI( .new_fsk = ) wf2qp_new_fsk, + _SI( .free_fsk = ) NULL, + + _SI( .new_queue = ) wf2qp_new_queue, + _SI( .free_queue = ) wf2qp_free_queue, +}; + + +DECLARE_DNSCHED_MODULE(dn_wf2qp, &wf2qp_desc); diff --git a/sys/netinet/ipfw/dummynet.txt b/sys/netinet/ipfw/dummynet.txt new file mode 100644 index 0000000..0ed6ad1 --- /dev/null +++ b/sys/netinet/ipfw/dummynet.txt @@ -0,0 +1,860 @@ +# +# $FreeBSD$ +# + +Notes on the internal structure of dummynet (2010 version) +by Riccardo Panicucci and Luigi Rizzo +Work supported by the EC project ONELAB2 + + +********* +* INDEX * +********* +Implementation of new dummynet + Internal structure + Files +Packet arrival + The reconfiguration routine +dummynet_task() +Configuration + Add a pipe + Add a scheduler + Add a flowset +Listing object +Delete of object + Delete a pipe + Delete a flowset + Delete a scheduler +Compatibility with FreeBSD7.2 and FreeBSD 8 ipfw binary + ip_dummynet_glue.c + ip_fw_glue.c +How to configure dummynet +How to implement a new scheduler + + + +OPEN ISSUES +------------------------------ +20100131 deleting RR causes infinite loop + presumably in the rr_free_queue() call -- seems to hang + forever when deleting a live flow +------------------------------ + +Dummynet is a traffic shaper and network emulator. Packets are +selected by an external filter such as ipfw, and passed to the emulator +with a tag such as "pipe 10" or "queue 5" which tells what to +do with the packet. As an example + + ipfw add queue 5 icmp from 10.0.0.2 to all + +All packets with the same tag belong to a "flowset", or a set +of flows which can be further partitioned according to a mask. +Flowsets are then passed to a scheduler for processing. The +association of flowsets and schedulers is configurable e.g. + + ipfw queue 5 config sched 10 weight 3 flow_mask xxxx + ipfw queue 8 config sched 10 weight 1 ... + ipfw queue 3 config sched 20 weight 1 ... + +"sched 10" represents one or more scheduler instances, +selected through a mask on the 5-tuple itself. + + ipfw sched 20 config type FIFO sched_mask yyy ... + +There are in fact two masks applied to each packet: ++ the "sched_mask" sends packets arriving to a scheduler_id to + one of many instances. ++ the "flow_mask" together with the flowset_id is used to + collect packets into independent flows on each scheduler. + +As an example, we can have + ipfw queue 5 config sched 10 flow_mask src-ip 0x000000ff + ipfw sched 10 config type WF2Q+ sched_mask src-ip 0xffffff00 + +means that sched 10 will have one instance per /24 source subnet, +and within that, each individual source will be a flow. + +Internal structure +----------------- +Dummynet-related data is split into several data structures, +part of them constituting the userland-kernel API, and others +specific to the kernel. +NOTE: for up-to-date details please look at the relevant source + headers (ip_dummynet.h, ip_dn_private.h, dn_sched.h) + +USERLAND-KERNEL API (ip_dummynet.h) + + struct dn_link: + contains data about the physical link such as + bandwith, delay, burst size; + + struct dn_fs: + describes a flowset, i.e. a template for queues. + Main parameters are the scheduler we attach to, a flow_mask, + buckets, queue size, plr, weight, and other scheduler-specific + parameters. + + struct dn_flow + contains information on a flow, including masks and + statistics + + struct dn_sch: + defines a scheduler (and a link attached to it). + Parameters include scheduler type, sched_mask, number of + buckets, and possibly other scheduler-specific parameters, + + struct dn_profile: + fields to simulate a delay profile + + +KERNEL REPRESENTATION (ip_dn_private.h) + + struct mq + a queue of mbufs with head and tail. + + struct dn_queue + individual queue of packets, created by a flowset using + flow_mask and attached to a scheduler instance selected + through sched_mask. + A dn_queue has a pointer to the dn_fsk (which in turn counts + how many queues point to it), a pointer to the + dn_sch_inst it attaches to, and is in a hash table in the + flowset. scheduler instances also should store queues in + their own containers used for scheduling (lists, trees, etc.) + CREATE: done on packet arrivals when a flow matches a flowset. + DELETE: done only when deleting the parent dn_sch_inst + or draining memory. + + struct dn_fsk + includes a dn_fs; a pointer to the dn_schk; a link field + for the list of dn_fsk attached to the same scheduler, + or for the unlinked list; + a refcount for the number of queues pointing to it; + The dn_fsk is in a hash table, fshash. + CREATE: done on configuration commands. + DELETE: on configuration commands. + + struct dn_sch_inst + a scheduler instance, created from a dn_schk applying sched_mask. + Contains a delay line, a reference to the parent, and scheduler- + specific info. Both dn_sch_inst and its delay line can be in the + evheap if they have events to be processed. + CREATE: created from a dn_schk applying sched_mask + DELETE: configuration command delete a scheduler which in turn + sweeps the hash table of instances deleting them + + struct dn_schk + includes dn_sch, dn_link, a pointer to dn_profile, + a hash table of dn_sch_inst, a list of dn_fsk + attached to it. + CREATE: configuration command. If there are flowsets that + refer to this number, they are attached and moved + to the hash table + DELETE: manual, see dn_sch_inst + + + fshash schedhash + +---------------+ sched +--------------+ + | sched-------------------->| NEW_SCHK| + -<----*sch_chain |<-----------------*fsk_list | + |NEW_FSK |<----. | [dn_link] | + +---------------+ | +--------------+ + |qht (hash) | | | siht(hash) | + | [dn_queue] | | | [dn_si] | + | [dn_queue] | | | [dn_si] | + | ... | | | ... | + | +--------+ | | | +---------+ | + | |dn_queue| | | | |dn_si | | + | | fs *----------' | | | | + | | si *---------------------->| | | + | +---------+ | | +---------+ | + +---------------+ +--------------+ + +The following global data structures contain all +schedulers and flowsets. + +- schedhash[x]: contains all scheduler templates in the system. + Looked up only on manual configurations, where flowsets + are attached to matching schedulers. + We have one entry per 'sched X config' command + (plus one for each 'pipe X config'). + +- fshash[x]: contains all flowsets. + We do a lookup on this for each packet. + We have one entry for each 'queue X config' + (plus one for each 'pipe X config'). + +Additionally, a list that contains all unlinked flowset: +- fsu: contains flowset that are not linked with any scheduler. + Flowset are put in this list when they refer to a non + existing scheduler. + We don't need an efficient data structure as we never search + here on a packet arrivals. + +Scheduler instances and the delay lines associated with each scheduler +instance need to be woken up at certain times. Because we have many +such objects, we keep them in a priority heap (system_heap). + +Almost all objects in this implementation are preceded by a structure +(struct dn_id) which makes it easier to identify them. + + +Files +----- +The dummynet code is split in several files. +All kernel code is in sys/netinet/ipfw except ip_dummynet.h +All userland code is in sbin/ipfw. +Files are +- sys/netinet/ip_dummynet.h defines the kernel-userland API +- ip_dn_private.h contains the kernel-specific APIs + and data structures +- dn_sched.h defines the scheduler API +- ip_dummynet.c cointains module glue and sockopt handlers, with all + functions to configure and list objects. +- ip_dn_io.c contains the functions directly related to packet processing, + and run in the critical path. It also contains some functions + exported to the schedulers. +- dn_heap.[ch] implement a binary heap and a generic hash table +- dn_sched_* implement the various scheduler modules + +- dummynet.c is the file used to implement the user side of dummynet. + It contains the function to parsing command line, and functions to + show the output of dummynet objects. +Moreover, there are two new file (ip_dummynet_glue.c and ip_fw_glue.c) that +are used to allow compatibility with the "ipfw" binary from FreeBSD 7.2 and +FreeBSD 8. + +LOCKING +======= +At the moment the entire processing occurs under a single lock +which is expected to be acquired in exclusive mode +DN_BH_WLOCK() / DN_BH_WUNLOCK(). + +In perspective we aim at the following: +- the 'busy' flag, 'pending' list and all structures modified by packet + arrivals and departures are protected by the BH_WLOCK. + This is normally acquired in exclusive mode by the packet processing + functions for short sections of code (exception -- the timer). + If 'busy' is not set, we can do regular packet processing. + If 'busy' is set, no pieces can be accessed. + We must enqueue the packet on 'pending' and return immediately. + +- the 'busy' flag is set/cleared by long sections of code as follows: + UH_WLOCK(); KASSERT(busy == 0); + BH_WLOCK(); busy=1; BH_WUNLOCK(); + ... do processing ... + BH_WLOCK(); busy=0; drain_queue(pending); BH_WUNLOCK(); + UH_WUNLOCK(); + this normally happens when the upper half has something heavy + to do. The prologue and epilogue are not in the critical path. + +- the main containers (fshash, schedhash, ...) are protected by + UH_WLOCK. + +Packet processing +================= +A packet enters dummynet through dummynet_io(). We first lookup +the flowset number in fshash using dn_ht_find(), then find the scheduler +instance using ipdn_si_find(), then possibly identify the correct +queue with ipdn_q_find(). +If successful, we call the scheduler's enqueue function(), and +if needed start I/O on the link calling serve_sched(). +If the packet can be returned immediately, this is done by +leaving *m0 set. Otherwise, the packet is absorbed by dummynet +and we simply return, possibly with some appropriate error code. + +Reconfiguration +--------------- +Reconfiguration is the complex part of the system because we need to +keep track of the various objects and containers. +At the moment we do not use reference counts for objects so all +processing must be done under a lock. + +The main entry points for configuration is the ip_dn_ctl() handler +for the IP_DUMMYNET3 sockopt (others are provided only for backward +compatibility). Modifications to the configuration call do_config(). +The argument is a sequence of blocks each starting with a struct dn_id +which specifies its content. +The first dn_id must contain as obj.id the DN_API_VERSION +The obj.type is DN_CMD_CONFIG (followed by actual objects), +DN_CMD_DELETE (with the correct subtype and list of objects), or +DN_CMD_FLUSH. + +DN_CMD_CONFIG is followed by objects to add/reconfigure. In general, +if an object already exists it is reconfigured, otherwise it is +created in a way that keeps the structure consistent. +We have the following objects in the system, normally numbered with +an identifier N between 1 and 65535. For certain objects we have +"shadow" copies numbered I+NMAX and I+ 2*NMAX which are used to +implement certain backward compatibility features. + +In general we have the following linking + + TRADITIONAL DUMMYNET QUEUES "queue N config ... pipe M ..." + corresponds to a dn_fs object numbered N + + TRADITIONAL DUMMYNET PIPES "pipe N config ..." + dn_fs N+2*NMAX --> dn_sch N+NMAX type FIFO --> dn_link N+NMAX + + GENERIC SCHEDULER "sched N config ... " + [dn_fs N+NMAX] --> dn_sch N --> dn_link N + The flowset N+NMAX is created only if the scheduler is not + of type MULTIQUEUE. + + DELAY PROFILE "pipe N config profile ..." + it is always attached to an existing dn_link N + +Because traditional dummynet pipes actually configure both a +'standalone' instance and one that can be used by queues, +we do the following: + + "pipe N config ..." configures: + dn_sched N type WF2Q+ + dn_sched N+NMAX type FIFO + dn_fs N+2NMAX attached to dn_sched N+NMAX + dn_pipe N + dn_pipe N+NMAX + + "queue N config" configures + dn_fs N + + "sched N config" configures + dn_sched N type as desired + dn_fs N+NMAX attached to dn_sched N + + +dummynet_task() +=============== +The dummynet_task() is the the main dummynet processing function and is +called every tick. This function first calculate the new current time, then +it checks if it is the time to wake up object from the system_heap comparing +the current time and the key of the heap. Two types of object (really the +heap contains pointer to objects) are in the +system_heap: + +- scheduler instance: if a scheduler instance is waked up, the dequeue() + function is called until it has credit. If the dequeue() returns packets, + the scheduler instance is inserted in the heap with a new key depending of + the data that will be send out. If the scheduler instance remains with + some credit, it means that is hasn't other packet to send and so the + instance is no longer inserted in the heap. + + If the scheduler instance extracted from the heap has the DELETE flag set, + the dequeue() is not called and the instance is destroyed now. + +- delay line: when extracting a delay line, the function transmit_event() is + called to send out packet from delay line. + + If the scheduler instance associated with this delay line doesn't exists, + the delay line will be delete now. + +Configuration +============= +To create a pipe, queue or scheduler, the user should type commands like: +"ipfw pipe x config" +"ipfw queue y config pipe x" +"ipfw pipe x config sched <type>" + +The userland side of dummynet will prepare a buffer contains data to pass to +kernel side. +The buffer contains all struct needed to configure an object. In more detail, +to configure a pipe all three structs (dn_link, dn_sch, dn_fs) are needed, +plus the delay profile struct if the pipe has a delay profile. + +If configuring a scheduler only the struct dn_sch is wrote in the buffer, +while if configuring a flowset only the dn_fs struct is wrote. + +The first struct in the buffer contains the type of command request, that is +if it is configuring a pipe, a queue, or a scheduler. Then there are structs +need to configure the object, and finally there is the struct that mark +the end of the buffer. + +To support the insertion of pipe and queue using the old syntax, when adding +a pipe it's necessary to create a FIFO flowset and a FIFO scheduler, which +have a number x + DN_PIPEOFFSET. + +Add a pipe +---------- +A pipe is only a template for a link. +If the pipe already exists, parameters are updated. If a delay profile exists +it is deleted and a new one is created. +If the pipe doesn't exist a new one is created. After the creation, the +flowset unlinked list is scanned to see if there are some flowset that would +be linked with this pipe. If so, these flowset will be of wf2q+ type (for +compatibility) and a new wf2q+ scheduler is created now. + +Add a scheduler +--------------- +If the scheduler already exists, and the type and the mask are the same, the +scheduler is simply reconfigured calling the config_scheduler() scheduler +function with the RECONFIGURE flag active. +If the type or the mask differ, it is necessary to delete the old scheduler +and create a new one. +If the scheduler doesn't exists, a new one is created. If the scheduler has +a mask, the hash table is created to store pointers to scheduler instances. +When a new scheduler is created, it is necessary to scan the unlinked +flowset list to search eventually flowset that would be linked with this +scheduler number. If some are found, flowsets became of the type of this +scheduler and they are configured properly. + +Add a flowset +------------- +Flowset pointers are store in the system in two list. The unlinked flowset list +contains all flowset that aren't linked with a scheduler, the flowset list +contains flowset linked to a scheduler, and so they have a type. +When adding a new flowset, first it is checked if the flowset exists (that is, +it is in the flowset list) and if it doesn't exists a new flowset is created +and added to unlinked flowset list if the scheduler which the flowset would be +linked doesn't exists, or added in the flowset list and configured properly if +the scheduler exists. If the flowset (before to be created) was in the +unlinked flowset list, it is removed and deleted, and then recreated. +If the flowset exists, to allow reconfiguration of this flowset, the +scheduler number and types must match with the one in memory. If this isn't +so, the flowset is deleted and a new one will be created. Really, the flowset +it isn't deleted now, but it is removed from flowset list and it will be +deleted later because there could be some queues that are using it. + +Listing of object +================= +The user can request a list of object present in dummynet through the command +"ipfw [-v] pipe|queue [x] list|show" +The kernel side of dummynet send a buffer to user side that contains all +pipe, all scheduler, all flowset, plus all scheduler instances and all queues. +The dummynet user land will format the output and show only the relevant +information. +The buffer sent start with all pipe from the system. The entire struct dn_link +is passed, except the delay_profile struct that is useless in user space. +After pipes, all flowset are wrote in the buffer. The struct contains +scheduler flowset specific data is linked with the flowset writing the +'obj' id of the extension into the 'alg_fs' pointer. +Then schedulers are wrote. If a scheduler has one or more scheduler instance, +these are linked to the parent scheduler writing the id of the parent in the +'ptr_sched' pointer. If a scheduler instance has queues, there are wrote in +the buffer and linked thorugh the 'obj' and 'sched_inst' pointer. +Finally, flowsets in the unlinked flowset list are write in the buffer, and +then a struct gen in saved in the buffer to mark the last struct in the buffer. + + +Delete of object +================ +An object is usually removed by user through a command like +"ipfw pipe|queue x delete". XXX sched? +ipfw pass to the kernel a struct gen that contains the type and the number +of the object to remove + +Delete of pipe x +---------------- +A pipe can be deleted by the user throught the command 'ipfw pipe x delete'. +To delete a pipe, the pipe is removed from the pipe list, and then deleted. +Also the scheduler associated with this pipe should be deleted. +For compatibility with old dummynet syntax, the associated FIFO scheduler and +FIFO flowset must be deleted. + +Delete of flowset x +------------------- +To remove a flowset, we must be sure that is no loger referenced by any object. +If the flowset to remove is in the unlinked flowset list, there is not any +issue, the flowset can be safely removed calling a free() (the flowset +extension is not yet created if the flowset is in this list). +If the flowset is in the flowset list, first we remove from it so new packet +are discarded when arrive. Next, the flowset is marked as delete. +Now we must check if some queue is using this flowset. +To do this, a counter (active_f) is provided. This counter indicate how many +queues exist using this flowset. +The active_f counter is automatically incremented when a queue is created +and decremented when a queue is deleted. +If the counter is 0, the flowset can be safely deleted, and the delete_alg_fs() +scheduler function is called before deallocate memory. +If the counter is not 0, the flowset remain in memory until the counter become +zero. When a queue is delete (by dn_delete_queue() function) it is checked if +the linked flowset is deleting and if so the counter is decrementing. If the +counter reaches 0, the flowset is deleted. +The deletion of a queue can be done only by the scheduler, or when the scheduler +is destroyed. + +Delete of scheduler x +--------------------- +To delete a scheduler we must be sure that any scheduler instance of this type +are in the system_heap. To do so, a counter (inst_counter) is provided. +This counter is managed by the system: it is incremented every time it is +inserted in the system_heap, and decremented every time it is extracted from it. +To delete the scheduler, first we remove it from the scheduler list, so new +packet are discarded when they arrive, and mark the scheduler as deleting. + +If the counter is 0, we can remove the scheduler safely calling the +really_deletescheduler() function. This function will scan all scheduler +instances and call the delete_scheduler_instance() function that will delete +the instance. When all instance are deleted, the scheduler template is +deleted calling the delete_scheduler_template(). If the delay line associate +with the scheduler is empty, it is deleted now, else it will be deleted when +it will became empy. +If the counter was not 0, we wait for it. Every time the dummynet_task() +function extract a scheduler from the system_heap, the counter is decremented. +If the scheduler has the delete flag enabled the dequeue() is not called and +delete_scheduler_instance() is called to delete the instance. +Obviously this scheduler instance is no loger inserted in the system_heap. +If the counter reaches 0, the delete_scheduler_template() function is called +all memory is released. +NOTE: Flowsets that belong to this scheduler are not deleted, so if a new + scheduler with the same number is inserted will use these flowsets. + To do so, the best approach would be insert these flowset in the + unlinked flowset list, but doing this now will be very expensive. + So flowsets will remain in memory and linked with a scheduler that no + longer exists until a packet belonging to this flowset arrives. When + this packet arrives, the reconfigure() function is called because the + generation number mismatch with one contains in the flowset and so + the flowset will be moved into the flowset unlinked list, or will be + linked with the new scheduler if a new one was created. + + +COMPATIBILITY WITH FREEBSD 7.2 AND FREEBSD 8 'IPFW' BINARY +========================================================== +Dummynet is not compatible with old ipfw binary because internal structs are +changed. Moreover, the old ipfw binary is not compatible with new kernels +because the struct that represents a firewall rule has changed. So, if a user +install a new kernel on a FreeBSD 7.2, the ipfw (and possibly many other +commands) will not work. +New dummynet uses a new socket option: IP_DUMMYNET3, used for both set and get. +The old option can be used to allow compatibility with the 'ipfw' binary of +older version (tested with 7.2 and 8.0) of FreeBSD. +Two file are provided for this purpose: +- ip_dummynet_glue.c translates old dummynet requests to the new ones, +- ip_fw_glue.c converts the rule format between 7.2 and 8 versions. +Let see in detail these two files. + +IP_DUMMYNET_GLUE.C +------------------ +The internal structs of new dummynet are very different from the original. +Because of there are some difference from between dummynet in FreeBSD 7.2 and +dummynet in FreeBSD 8 (the FreeBSD 8 version includes support to pipe delay +profile and burst option), I have to include both header files. I copied +the revision 191715 (for version 7.2) and the revision 196045 (for version 8) +and I appended a number to each struct to mark them. + +The main function of this file is ip_dummynet_compat() that is called by +ip_dn_ctl() when it receive a request of old socket option. + +A global variabile ('is7') store the version of 'ipfw' that FreeBSD is using. +This variable is set every time a request of configuration is done, because +with this request we receive a buffer of which size depending of ipfw version. +Because of in general the first action is a configuration, this variable is +usually set accordly. If the first action is a request of listing of pipes +or queues, the system cannot know the version of ipfw, and we suppose that +version 7.2 is used. If version is wrong, the output can be senseless, but +the application should not crash. + +There are four request for old dummynet: +- IP_DUMMYNET_FLUSH: the flush options have no parameter, so simply the + dummynet_flush() function is called; +- IP_DUMMYNET_DEL: the delete option need to be translate. + It is only necessary to extract the number and the type of the object + (pipe or queue) to delete from the buffer received and build a new struct + gen contains the right parameters, then call the delete_object() function; +- IP_DUMMYNET_CONFIGURE: the configure command receive a buffer depending of + the ipfw version. After the properly extraction of all data, that depends + by the ipfw version used, new structures are filled and then the dummynet + config_link() function is properly called. Note that the 7.2 version does + not support some parameter as burst or delay profile. +- IP_DUMMYNET_GET: The get command should send to the ipfw the correct buffer + depending of its version. There are two function that build the + corrected buffer, ip_dummynet_get7() and ip_dummynet_get8(). These + functions reproduce the buffer exactly as 'ipfw' expect. The only difference + is that the weight parameter for a queue is no loger sent by dummynet and so + it is set to 0. + Moreover, because of the internal structure has changed, the bucket size + of a queue could not be correct, because now all flowset share the hash + table. + If the version of ipfw is wrong, the output could be senseless or truncated, + but the application should not crash. + +IP_FW_GLUE.C +------------ +The ipfw binary also is used to add rules to FreeBSD firewall. Because of the +struct ip_fw is changed from FreeBsd 7.2 to FreeBSD 8, it is necessary +to write some glue code to allow use ipfw from FreeBSD 7.2 with the kernel +provided with FreeBSD 8. +This file contains two functions to convert a rule from FreeBSD 7.2 format to +FreeBSD 8 format, and viceversa. +The conversion should be done when a rule passes from userspace to kernel space +and viceversa. +I have to modify the ip_fw2.c file to manage these two case, and added a +variable (is7) to store the ipfw version used, using an approach like the +previous file: +- when a new rule is added (option IP_FW_ADD) the is7 variable is set if the + size of the rule received corrispond to FreeBSD 7.2 ipfw version. If so, the + rule is converted to version 8 calling the function convert_rule_to_8(). + Moreover, after the insertion of the rule, the rule is now reconverted to + version 7 because the ipfw binary will print it. +- when the user request a list of rules (option IP_FW_GET) the is7 variable + should be set correctly because we suppose that a configure command was done, + else we suppose that the FreeBSD version is 8. The function ipfw_getrules() + in ip_fw2.c file return all rules, eventually converted to version 7 (if + the is7 is set) to the ipfw binary. +The conversion of a rule is quite simple. The only difference between the +two structures (struct ip_fw) is that in the new there is a new field +(uint32_t id). So, I copy the entire rule in a buffer and the copy the rule in +the right position in the new (or old) struct. The size of commands are not +changed, and the copy is done into a cicle. + +How to configure dummynet +========================= +It is possible to configure dummynet through two main commands: +'ipfw pipe' and 'ipfw queue'. +To allow compatibility with old version, it is possible configure dummynet +using the old command syntax. Doing so, obviously, it is only possible to +configure a FIFO scheduler or a wf2q+ scheduler. +A new command, 'ipfw pipe x config sched <type>' is supported to add a new +scheduler to the system. + +- ipfw pipe x config ... + create a new pipe with the link parameters + create a new scheduler fifo (x + offset) + create a new flowset fifo (x + offset) + the mask is eventually stored in the FIFO scheduler + +- ipfw queue y config pipe x ... + create a new flowset y linked to sched x. + The type of flowset depends by the specified scheduler. + If the scheduler does not exist, this flowset is inserted in a special + list and will be not active. + If pipe x exists and sched does not exist, a new wf2q+ scheduler is + created and the flowset will be linked to this new scheduler (this is + done for compatibility with old syntax). + +- ipfw pipe x config sched <type> ... + create a new scheduler x of type <type>. + Search into the flowset unlinked list if there are some flowset that + should be linked with this new scheduler. + +- ipfw pipe x delete + delete the pipe x + delete the scheduler fifo (x + offset) + delete the scheduler x + delete the flowset fifo (x + offset) + +- ipfw queue x delete + delete the flowset x + +- ipfw sched x delete ///XXX + delete the scheduler x + +Follow now some examples to how configure dummynet: +- Ex1: + ipfw pipe 10 config bw 1M delay 15 // create a pipe with band and delay + A FIFO flowset and scheduler is + also created + ipfw queue 5 config pipe 10 weight 56 // create a flowset. This flowset + will be of wf2q+ because a pipe 10 + exists. Moreover, the wf2q+ + scheduler is created now. +- Ex2: + ipfw queue 5 config pipe 10 weight 56 // Create a flowset. Scheduler 10 + does not exist, so this flowset + is inserted in the unlinked + flowset list. + ipfw pipe 10 config bw... // Create a pipe, a FIFO flowset and scheduler. + Because of a flowset with 'pipe 10' exists, + a wf2q+ scheduler is created now and that + flowset is linked with this sceduler. + +- Ex3: + ipfw pipe 10 config bw... // Create a pipe, a FIFO flowset and scheduler. + ipfw pipe 10 config sched rr // Create a scheduler of type RR, linked to + pipe 10 + ipfw queue 5 config pipe 10 weight 56 // Create a flowset 5. This flowset + will belong to scheduler 10 and + it is of type RR + +- Ex4: + ipfw pipe 10 config sched rr // Create a scheduler of type RR, linked to + pipe 10 (not exist yet) + ipfw pipe 10 config bw... // Create a pipe, a FIFO flowset and scheduler. + ipfw queue 5 config pipe 10 weight 56 // Create a flowset 5.This flowset + will belong to scheduler 10 and + it is of type RR + ipfw pipe 10 config sched wf2q+ // Modify the type of scheduler 10. It + becomes a wf2q+ scheduler. + When a new packet of flowset 5 arrives, + the flowset 5 becomes to wf2q+ type. + +How to implement a new scheduler +================================ +In dummynet, a scheduler algorithm is represented by two main structs, some +functions and other minor structs. +- A struct dn_sch_xyz (where xyz is the 'type' of scheduler algorithm + implemented) contains data relative to scheduler, as global parameter that + are common to all instances of the scheduler +- A struct dn_sch_inst_xyz contains data relative to a single scheduler + instance, as local status variable depending for example by flows that + are linked with the scheduler, and so on. +To add a scheduler to dummynet, the user should type a command like: +'ipfw pipe x config sched <type> [mask ... ...]' +This command creates a new struct dn_sch_xyz of type <type>, and +store the optional parameter in that struct. + +The parameter mask determines how many scheduler instance of this +scheduler may exist. For example, it is possible to divide traffic +depending on the source port (or destination, or ip address...), +so that every scheduler instance act as an independent scheduler. +If the mask is not set, all traffic goes to the same instance. + +When a packet arrives to a scheduler, the system search the corrected +scheduler instance, and if it does not exist it is created now (the +struct dn_sch_inst_xyz is allocated by the system, and the scheduler +fills the field correctly). It is a task of the scheduler to create +the struct that contains all queues for a scheduler instance. +Dummynet provides some function to create an hash table to store +queues, but the schedule algorithm can choice the own struct. + +To link a flow to a scheduler, the user should type a command like: +'ipfw queue z config pipe x [mask... ...]' + +This command creates a new 'dn_fs' struct that will be inserted +in the system. If the scheduler x exists, this flowset will be +linked to that scheduler and the flowset type become the same as +the scheduler type. At this point, the function create_alg_fs_xyz() +is called to allow store eventually parameter for the flowset that +depend by scheduler (for example the 'weight' parameter for a wf2q+ +scheduler, or some priority...). A parameter mask can be used for +a flowset. If the mask parameter is set, the scheduler instance can +separate packet according to its flow id (src and dst ip, ports...) +and assign it to a separate queue. This is done by the scheduler, +so it can ignore the mask if it wants. + +See now the two main structs: +struct dn_sch_xyz { + struct gen g; /* important the name g */ + /* global params */ +}; +struct dn_sch_inst_xyz { + struct gen g; /* important the name g */ + /* params of the instance */ +}; +It is important to embed the struct gen as first parameter. The struct gen +contains some values that the scheduler instance must fill (the 'type' of +scheduler, the 'len' of the struct...) +The function create_scheduler_xyz() should be implemented to initialize global +parameters in the first struct, and if memory allocation is done it is +mandatory to implement the delete_scheduler_template() function to free that +memory. +The function create_scheduler_instance_xyz() must be implemented even if the +scheduler instance does not use extra parameters. In this function the struct +gen fields must be filled with corrected infos. The +delete_scheduler_instance_xyz() function must bu implemented if the instance +has allocated some memory in the previous function. + +To store data belonging to a flowset the follow struct is used: +struct alg_fs_xyz { + struct gen g; + /* fill correctly the gen struct + g.subtype = DN_XYZ; + g.len = sizeof(struct alg_fs_xyz) + ... + */ + /* params for the flow */ +}; +The create_alg_fs_xyz() function is mandatory, because it must fill the struct +gen, but the delete_alg_fs_xyz() is mandatory only if the previous function +has allocated some memory. + +A struct dn_queue contains packets belonging to a queue and some statistical +data. The scheduler could have to store data in this struct, so it must define +a dn_queue_xyz struct: +struct dn_queue_xyz { + struct dn_queue q; + /* parameter for a queue */ +} + +All structures are allocated by the system. To do so, the scheduler must +set the size of its structs in the scheduler descriptor: +scheduler_size: sizeof(dn_sch_xyz) +scheduler_i_size: sizeof(dn_sch_inst_xyz) +flowset_size: sizeof(alg_fs_xyz) +queue_size: sizeof(dn_queue_xyz); +The scheduler_size could be 0, but other struct must have at least a struct gen. + + +After the definition of structs, it is necessary to implement the +scheduler functions. + +- int (*config_scheduler)(char *command, void *sch, int reconfigure); + Configure a scheduler, or reconfigure if 'reconfigure' == 1. + This function performs additional allocation and initialization of global + parameter for this scheduler. + If memory is allocated here, the delete_scheduler_template() function + should be implemented to remove this memory. +- int (*delete_scheduler_template)(void* sch); + Delete a scheduler template. This function is mandatory if the scheduler + uses extra data respect the struct dn_sch. +- int (*create_scheduler_instance)(void *s); + Create a new scheduler instance. The system allocate the necessary memory + and the schedulet can access it using the 's' pointer. + The scheduler instance stores all queues, and to do this can use the + hash table provided by the system. +- int (*delete_scheduler_instance)(void *s); + Delete a scheduler instance. It is important to free memory allocated + by create_scheduler_instance() function. The memory allocated by system + is freed by the system itself. The struct contains all queue also has + to be deleted. +- int (*enqueue)(void *s, struct gen *f, struct mbuf *m, + struct ipfw_flow_id *id); + Called when a packet arrives. The packet 'm' belongs to the scheduler + instance 's', has a flowset 'f' and the flowid 'id' has already been + masked. The enqueue() must call dn_queue_packet(q, m) function to really + enqueue packet in the queue q. The queue 'q' is chosen by the scheduler + and if it does not exist should be created calling the dn_create_queue() + function. If the schedule want to drop the packet, it must call the + dn_drop_packet() function and then return 1. +- struct mbuf * (*dequeue)(void *s); + Called when the timer expires (or when a packet arrives and the scheduler + instance is idle). + This function is called when at least a packet can be send out. The + scheduler choices the packet and returns it; if no packet are in the + schedulerinstance, the function must return NULL. + Before return a packet, it is important to call the function + dn_return_packet() to update some statistic of the queue and update the + queue counters. +- int (*drain_queue)(void *s, int flag); + The system request to scheduler to delete all queues that is not using + to free memory. The flag parameter indicate if a queue must be deleted + even if it is active. + +- int (*create_alg_fs)(char *command, struct gen *g, int reconfigure); + It is called when a flowset is linked with a scheduler. This is done + when the scheduler is defined, so we can know the type of flowset. + The function initialize the flowset paramenter parsing the command + line. The parameter will be stored in the g struct that have the right + size allocated by the system. If the reconfigure flag is set, it means + that the flowset is reconfiguring +- int (*delete_alg_fs)(struct gen *f); + It is called when a flowset is deleting. Must remove the memory allocate + by the create_alg_fs() function. + +- int (*create_queue_alg)(struct dn_queue *q, struct gen *f); + Called when a queue is created. The function should link the queue + to the struct used by the scheduler instance to store all queues. +- int (*delete_queue_alg)(struct dn_queue *q); + Called when a queue is deleting. The function should remove extra data + and update the struct contains all queues in the scheduler instance. + +The struct scheduler represent the scheduler descriptor that is passed to +dummynet when a scheduler module is loaded. +This struct contains the type of scheduler, the lenght of all structs and +all function pointers. +If a function is not implemented should be initialize to NULL. Some functions +are mandatory, other are mandatory if some memory should be freed. +Mandatory functions: +- create_scheduler_instance() +- enqueue() +- dequeue() +- create_alg_fs() +- drain_queue() +Optional functions: +- config_scheduler() +- create_queue_alg() +Mandatory functions if the corresponding create...() has allocated memory: +- delete_scheduler_template() +- delete_scheduler_instance() +- delete_alg_fs() +- delete_queue_alg() + diff --git a/sys/netinet/ipfw/ip_dn_glue.c b/sys/netinet/ipfw/ip_dn_glue.c new file mode 100644 index 0000000..c0df1fc --- /dev/null +++ b/sys/netinet/ipfw/ip_dn_glue.c @@ -0,0 +1,845 @@ +/*- + * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD$ + * + * Binary compatibility support for /sbin/ipfw RELENG_7 and RELENG_8 + */ + +#include "opt_inet6.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/module.h> +#include <sys/priv.h> +#include <sys/proc.h> +#include <sys/rwlock.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/time.h> +#include <sys/taskqueue.h> +#include <net/if.h> /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */ +#include <netinet/in.h> +#include <netinet/ip_var.h> /* ip_output(), IP_FORWARDING */ +#include <netinet/ip_fw.h> +#include <netinet/ipfw/ip_fw_private.h> +#include <netinet/ipfw/dn_heap.h> +#include <netinet/ip_dummynet.h> +#include <netinet/ipfw/ip_dn_private.h> +#include <netinet/ipfw/dn_sched.h> + +/* FREEBSD7.2 ip_dummynet.h r191715*/ + +struct dn_heap_entry7 { + int64_t key; /* sorting key. Topmost element is smallest one */ + void *object; /* object pointer */ +}; + +struct dn_heap7 { + int size; + int elements; + int offset; /* XXX if > 0 this is the offset of direct ptr to obj */ + struct dn_heap_entry7 *p; /* really an array of "size" entries */ +}; + +/* Common to 7.2 and 8 */ +struct dn_flow_set { + SLIST_ENTRY(dn_flow_set) next; /* linked list in a hash slot */ + + u_short fs_nr ; /* flow_set number */ + u_short flags_fs; +#define DNOLD_HAVE_FLOW_MASK 0x0001 +#define DNOLD_IS_RED 0x0002 +#define DNOLD_IS_GENTLE_RED 0x0004 +#define DNOLD_QSIZE_IS_BYTES 0x0008 /* queue size is measured in bytes */ +#define DNOLD_NOERROR 0x0010 /* do not report ENOBUFS on drops */ +#define DNOLD_HAS_PROFILE 0x0020 /* the pipe has a delay profile. */ +#define DNOLD_IS_PIPE 0x4000 +#define DNOLD_IS_QUEUE 0x8000 + + struct dn_pipe7 *pipe ; /* pointer to parent pipe */ + u_short parent_nr ; /* parent pipe#, 0 if local to a pipe */ + + int weight ; /* WFQ queue weight */ + int qsize ; /* queue size in slots or bytes */ + int plr ; /* pkt loss rate (2^31-1 means 100%) */ + + struct ipfw_flow_id flow_mask ; + + /* hash table of queues onto this flow_set */ + int rq_size ; /* number of slots */ + int rq_elements ; /* active elements */ + struct dn_flow_queue7 **rq; /* array of rq_size entries */ + + u_int32_t last_expired ; /* do not expire too frequently */ + int backlogged ; /* #active queues for this flowset */ + + /* RED parameters */ +#define SCALE_RED 16 +#define SCALE(x) ( (x) << SCALE_RED ) +#define SCALE_VAL(x) ( (x) >> SCALE_RED ) +#define SCALE_MUL(x,y) ( ( (x) * (y) ) >> SCALE_RED ) + int w_q ; /* queue weight (scaled) */ + int max_th ; /* maximum threshold for queue (scaled) */ + int min_th ; /* minimum threshold for queue (scaled) */ + int max_p ; /* maximum value for p_b (scaled) */ + u_int c_1 ; /* max_p/(max_th-min_th) (scaled) */ + u_int c_2 ; /* max_p*min_th/(max_th-min_th) (scaled) */ + u_int c_3 ; /* for GRED, (1-max_p)/max_th (scaled) */ + u_int c_4 ; /* for GRED, 1 - 2*max_p (scaled) */ + u_int * w_q_lookup ; /* lookup table for computing (1-w_q)^t */ + u_int lookup_depth ; /* depth of lookup table */ + int lookup_step ; /* granularity inside the lookup table */ + int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */ + int avg_pkt_size ; /* medium packet size */ + int max_pkt_size ; /* max packet size */ +}; +SLIST_HEAD(dn_flow_set_head, dn_flow_set); + +#define DN_IS_PIPE 0x4000 +#define DN_IS_QUEUE 0x8000 +struct dn_flow_queue7 { + struct dn_flow_queue7 *next ; + struct ipfw_flow_id id ; + + struct mbuf *head, *tail ; /* queue of packets */ + u_int len ; + u_int len_bytes ; + + u_long numbytes; + + u_int64_t tot_pkts ; /* statistics counters */ + u_int64_t tot_bytes ; + u_int32_t drops ; + + int hash_slot ; /* debugging/diagnostic */ + + /* RED parameters */ + int avg ; /* average queue length est. (scaled) */ + int count ; /* arrivals since last RED drop */ + int random ; /* random value (scaled) */ + u_int32_t q_time; /* start of queue idle time */ + + /* WF2Q+ support */ + struct dn_flow_set *fs ; /* parent flow set */ + int heap_pos ; /* position (index) of struct in heap */ + int64_t sched_time ; /* current time when queue enters ready_heap */ + + int64_t S,F ; /* start time, finish time */ +}; + +struct dn_pipe7 { /* a pipe */ + SLIST_ENTRY(dn_pipe7) next; /* linked list in a hash slot */ + + int pipe_nr ; /* number */ + int bandwidth; /* really, bytes/tick. */ + int delay ; /* really, ticks */ + + struct mbuf *head, *tail ; /* packets in delay line */ + + /* WF2Q+ */ + struct dn_heap7 scheduler_heap ; /* top extract - key Finish time*/ + struct dn_heap7 not_eligible_heap; /* top extract- key Start time */ + struct dn_heap7 idle_heap ; /* random extract - key Start=Finish time */ + + int64_t V ; /* virtual time */ + int sum; /* sum of weights of all active sessions */ + + int numbytes; + + int64_t sched_time ; /* time pipe was scheduled in ready_heap */ + + /* + * When the tx clock come from an interface (if_name[0] != '\0'), its name + * is stored below, whereas the ifp is filled when the rule is configured. + */ + char if_name[IFNAMSIZ]; + struct ifnet *ifp ; + int ready ; /* set if ifp != NULL and we got a signal from it */ + + struct dn_flow_set fs ; /* used with fixed-rate flows */ +}; +SLIST_HEAD(dn_pipe_head7, dn_pipe7); + + +/* FREEBSD8 ip_dummynet.h r196045 */ +struct dn_flow_queue8 { + struct dn_flow_queue8 *next ; + struct ipfw_flow_id id ; + + struct mbuf *head, *tail ; /* queue of packets */ + u_int len ; + u_int len_bytes ; + + uint64_t numbytes ; /* credit for transmission (dynamic queues) */ + int64_t extra_bits; /* extra bits simulating unavailable channel */ + + u_int64_t tot_pkts ; /* statistics counters */ + u_int64_t tot_bytes ; + u_int32_t drops ; + + int hash_slot ; /* debugging/diagnostic */ + + /* RED parameters */ + int avg ; /* average queue length est. (scaled) */ + int count ; /* arrivals since last RED drop */ + int random ; /* random value (scaled) */ + int64_t idle_time; /* start of queue idle time */ + + /* WF2Q+ support */ + struct dn_flow_set *fs ; /* parent flow set */ + int heap_pos ; /* position (index) of struct in heap */ + int64_t sched_time ; /* current time when queue enters ready_heap */ + + int64_t S,F ; /* start time, finish time */ +}; + +struct dn_pipe8 { /* a pipe */ + SLIST_ENTRY(dn_pipe8) next; /* linked list in a hash slot */ + + int pipe_nr ; /* number */ + int bandwidth; /* really, bytes/tick. */ + int delay ; /* really, ticks */ + + struct mbuf *head, *tail ; /* packets in delay line */ + + /* WF2Q+ */ + struct dn_heap7 scheduler_heap ; /* top extract - key Finish time*/ + struct dn_heap7 not_eligible_heap; /* top extract- key Start time */ + struct dn_heap7 idle_heap ; /* random extract - key Start=Finish time */ + + int64_t V ; /* virtual time */ + int sum; /* sum of weights of all active sessions */ + + /* Same as in dn_flow_queue, numbytes can become large */ + int64_t numbytes; /* bits I can transmit (more or less). */ + uint64_t burst; /* burst size, scaled: bits * hz */ + + int64_t sched_time ; /* time pipe was scheduled in ready_heap */ + int64_t idle_time; /* start of pipe idle time */ + + char if_name[IFNAMSIZ]; + struct ifnet *ifp ; + int ready ; /* set if ifp != NULL and we got a signal from it */ + + struct dn_flow_set fs ; /* used with fixed-rate flows */ + + /* fields to simulate a delay profile */ +#define ED_MAX_NAME_LEN 32 + char name[ED_MAX_NAME_LEN]; + int loss_level; + int samples_no; + int *samples; +}; + +#define ED_MAX_SAMPLES_NO 1024 +struct dn_pipe_max8 { + struct dn_pipe8 pipe; + int samples[ED_MAX_SAMPLES_NO]; +}; +SLIST_HEAD(dn_pipe_head8, dn_pipe8); + +/* + * Changes from 7.2 to 8: + * dn_pipe: + * numbytes from int to int64_t + * add burst (int64_t) + * add idle_time (int64_t) + * add profile + * add struct dn_pipe_max + * add flag DN_HAS_PROFILE + * + * dn_flow_queue + * numbytes from u_long to int64_t + * add extra_bits (int64_t) + * q_time from u_int32_t to int64_t and name idle_time + * + * dn_flow_set unchanged + * + */ + +/* NOTE:XXX copied from dummynet.c */ +#define O_NEXT(p, len) ((void *)((char *)p + len)) +static void +oid_fill(struct dn_id *oid, int len, int type, uintptr_t id) +{ + oid->len = len; + oid->type = type; + oid->subtype = 0; + oid->id = id; +} +/* make room in the buffer and move the pointer forward */ +static void * +o_next(struct dn_id **o, int len, int type) +{ + struct dn_id *ret = *o; + oid_fill(ret, len, type, 0); + *o = O_NEXT(*o, len); + return ret; +} + + +static size_t pipesize7 = sizeof(struct dn_pipe7); +static size_t pipesize8 = sizeof(struct dn_pipe8); +static size_t pipesizemax8 = sizeof(struct dn_pipe_max8); + +/* Indicate 'ipfw' version + * 1: from FreeBSD 7.2 + * 0: from FreeBSD 8 + * -1: unknow (for now is unused) + * + * It is update when a IP_DUMMYNET_DEL or IP_DUMMYNET_CONFIGURE request arrives + * NOTE: if a IP_DUMMYNET_GET arrives and the 'ipfw' version is unknow, + * it is suppose to be the FreeBSD 8 version. + */ +static int is7 = 0; + +static int +convertflags2new(int src) +{ + int dst = 0; + + if (src & DNOLD_HAVE_FLOW_MASK) + dst |= DN_HAVE_MASK; + if (src & DNOLD_QSIZE_IS_BYTES) + dst |= DN_QSIZE_BYTES; + if (src & DNOLD_NOERROR) + dst |= DN_NOERROR; + if (src & DNOLD_IS_RED) + dst |= DN_IS_RED; + if (src & DNOLD_IS_GENTLE_RED) + dst |= DN_IS_GENTLE_RED; + if (src & DNOLD_HAS_PROFILE) + dst |= DN_HAS_PROFILE; + + return dst; +} + +static int +convertflags2old(int src) +{ + int dst = 0; + + if (src & DN_HAVE_MASK) + dst |= DNOLD_HAVE_FLOW_MASK; + if (src & DN_IS_RED) + dst |= DNOLD_IS_RED; + if (src & DN_IS_GENTLE_RED) + dst |= DNOLD_IS_GENTLE_RED; + if (src & DN_NOERROR) + dst |= DNOLD_NOERROR; + if (src & DN_HAS_PROFILE) + dst |= DNOLD_HAS_PROFILE; + if (src & DN_QSIZE_BYTES) + dst |= DNOLD_QSIZE_IS_BYTES; + + return dst; +} + +static int +dn_compat_del(void *v) +{ + struct dn_pipe7 *p = (struct dn_pipe7 *) v; + struct dn_pipe8 *p8 = (struct dn_pipe8 *) v; + struct { + struct dn_id oid; + uintptr_t a[1]; /* add more if we want a list */ + } cmd; + + /* XXX DN_API_VERSION ??? */ + oid_fill((void *)&cmd, sizeof(cmd), DN_CMD_DELETE, DN_API_VERSION); + + if (is7) { + if (p->pipe_nr == 0 && p->fs.fs_nr == 0) + return EINVAL; + if (p->pipe_nr != 0 && p->fs.fs_nr != 0) + return EINVAL; + } else { + if (p8->pipe_nr == 0 && p8->fs.fs_nr == 0) + return EINVAL; + if (p8->pipe_nr != 0 && p8->fs.fs_nr != 0) + return EINVAL; + } + + if (p->pipe_nr != 0) { /* pipe x delete */ + cmd.a[0] = p->pipe_nr; + cmd.oid.subtype = DN_LINK; + } else { /* queue x delete */ + cmd.oid.subtype = DN_FS; + cmd.a[0] = (is7) ? p->fs.fs_nr : p8->fs.fs_nr; + } + + return do_config(&cmd, cmd.oid.len); +} + +static int +dn_compat_config_queue(struct dn_fs *fs, void* v) +{ + struct dn_pipe7 *p7 = (struct dn_pipe7 *)v; + struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; + struct dn_flow_set *f; + + if (is7) + f = &p7->fs; + else + f = &p8->fs; + + fs->fs_nr = f->fs_nr; + fs->sched_nr = f->parent_nr; + fs->flow_mask = f->flow_mask; + fs->buckets = f->rq_size; + fs->qsize = f->qsize; + fs->plr = f->plr; + fs->par[0] = f->weight; + fs->flags = convertflags2new(f->flags_fs); + if (fs->flags & DN_IS_GENTLE_RED || fs->flags & DN_IS_RED) { + fs->w_q = f->w_q; + fs->max_th = f->max_th; + fs->min_th = f->min_th; + fs->max_p = f->max_p; + } + + return 0; +} + +static int +dn_compat_config_pipe(struct dn_sch *sch, struct dn_link *p, + struct dn_fs *fs, void* v) +{ + struct dn_pipe7 *p7 = (struct dn_pipe7 *)v; + struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; + int i = p7->pipe_nr; + + sch->sched_nr = i; + sch->oid.subtype = 0; + p->link_nr = i; + fs->fs_nr = i + 2*DN_MAX_ID; + fs->sched_nr = i + DN_MAX_ID; + + /* Common to 7 and 8 */ + p->bandwidth = p7->bandwidth; + p->delay = p7->delay; + if (!is7) { + /* FreeBSD 8 has burst */ + p->burst = p8->burst; + } + + /* fill the fifo flowset */ + dn_compat_config_queue(fs, v); + fs->fs_nr = i + 2*DN_MAX_ID; + fs->sched_nr = i + DN_MAX_ID; + + /* Move scheduler related parameter from fs to sch */ + sch->buckets = fs->buckets; /*XXX*/ + fs->buckets = 0; + if (fs->flags & DN_HAVE_MASK) { + sch->flags |= DN_HAVE_MASK; + fs->flags &= ~DN_HAVE_MASK; + sch->sched_mask = fs->flow_mask; + bzero(&fs->flow_mask, sizeof(struct ipfw_flow_id)); + } + + return 0; +} + +static int +dn_compat_config_profile(struct dn_profile *pf, struct dn_link *p, + void *v) +{ + struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; + + p8->samples = &(((struct dn_pipe_max8 *)p8)->samples[0]); + + pf->link_nr = p->link_nr; + pf->loss_level = p8->loss_level; +// pf->bandwidth = p->bandwidth; //XXX bandwidth redundant? + pf->samples_no = p8->samples_no; + strncpy(pf->name, p8->name,sizeof(pf->name)); + bcopy(p8->samples, pf->samples, sizeof(pf->samples)); + + return 0; +} + +/* + * If p->pipe_nr != 0 the command is 'pipe x config', so need to create + * the three main struct, else only a flowset is created + */ +static int +dn_compat_configure(void *v) +{ + struct dn_id *buf = NULL, *base; + struct dn_sch *sch = NULL; + struct dn_link *p = NULL; + struct dn_fs *fs = NULL; + struct dn_profile *pf = NULL; + int lmax; + int error; + + struct dn_pipe7 *p7 = (struct dn_pipe7 *)v; + struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; + + int i; /* number of object to configure */ + + lmax = sizeof(struct dn_id); /* command header */ + lmax += sizeof(struct dn_sch) + sizeof(struct dn_link) + + sizeof(struct dn_fs) + sizeof(struct dn_profile); + + base = buf = malloc(lmax, M_DUMMYNET, M_WAIT|M_ZERO); + o_next(&buf, sizeof(struct dn_id), DN_CMD_CONFIG); + base->id = DN_API_VERSION; + + /* pipe_nr is the same in p7 and p8 */ + i = p7->pipe_nr; + if (i != 0) { /* pipe config */ + sch = o_next(&buf, sizeof(*sch), DN_SCH); + p = o_next(&buf, sizeof(*p), DN_LINK); + fs = o_next(&buf, sizeof(*fs), DN_FS); + + error = dn_compat_config_pipe(sch, p, fs, v); + if (error) { + free(buf, M_DUMMYNET); + return error; + } + if (!is7 && p8->samples_no > 0) { + /* Add profiles*/ + pf = o_next(&buf, sizeof(*pf), DN_PROFILE); + error = dn_compat_config_profile(pf, p, v); + if (error) { + free(buf, M_DUMMYNET); + return error; + } + } + } else { /* queue config */ + fs = o_next(&buf, sizeof(*fs), DN_FS); + error = dn_compat_config_queue(fs, v); + if (error) { + free(buf, M_DUMMYNET); + return error; + } + } + error = do_config(base, (char *)buf - (char *)base); + + if (buf) + free(buf, M_DUMMYNET); + return error; +} + +int +dn_compat_calc_size(struct dn_parms dn_cfg) +{ + int need = 0; + /* XXX use FreeBSD 8 struct size */ + /* NOTE: + * - half scheduler: schk_count/2 + * - all flowset: fsk_count + * - all flowset queues: queue_count + * - all pipe queue: si_count + */ + need += dn_cfg.schk_count * sizeof(struct dn_pipe8) / 2; + need += dn_cfg.fsk_count * sizeof(struct dn_flow_set); + need += dn_cfg.si_count * sizeof(struct dn_flow_queue8); + need += dn_cfg.queue_count * sizeof(struct dn_flow_queue8); + + return need; +} + +int +dn_c_copy_q (void *_ni, void *arg) +{ + struct copy_args *a = arg; + struct dn_flow_queue7 *fq7 = (struct dn_flow_queue7 *)*a->start; + struct dn_flow_queue8 *fq8 = (struct dn_flow_queue8 *)*a->start; + struct dn_flow *ni = (struct dn_flow *)_ni; + int size = 0; + + /* XXX hash slot not set */ + /* No difference between 7.2/8 */ + fq7->len = ni->length; + fq7->len_bytes = ni->len_bytes; + fq7->id = ni->fid; + + if (is7) { + size = sizeof(struct dn_flow_queue7); + fq7->tot_pkts = ni->tot_pkts; + fq7->tot_bytes = ni->tot_bytes; + fq7->drops = ni->drops; + } else { + size = sizeof(struct dn_flow_queue8); + fq8->tot_pkts = ni->tot_pkts; + fq8->tot_bytes = ni->tot_bytes; + fq8->drops = ni->drops; + } + + *a->start += size; + return 0; +} + +int +dn_c_copy_pipe(struct dn_schk *s, struct copy_args *a, int nq) +{ + struct dn_link *l = &s->link; + struct dn_fsk *f = s->fs; + + struct dn_pipe7 *pipe7 = (struct dn_pipe7 *)*a->start; + struct dn_pipe8 *pipe8 = (struct dn_pipe8 *)*a->start; + struct dn_flow_set *fs; + int size = 0; + + if (is7) { + fs = &pipe7->fs; + size = sizeof(struct dn_pipe7); + } else { + fs = &pipe8->fs; + size = sizeof(struct dn_pipe8); + } + + /* These 4 field are the same in pipe7 and pipe8 */ + pipe7->next.sle_next = (struct dn_pipe7 *)DN_IS_PIPE; + pipe7->bandwidth = l->bandwidth; + pipe7->delay = l->delay; + pipe7->pipe_nr = l->link_nr - DN_MAX_ID; + + if (!is7) { + if (s->profile) { + struct dn_profile *pf = s->profile; + strncpy(pipe8->name, pf->name, sizeof(pf->name)); + pipe8->loss_level = pf->loss_level; + pipe8->samples_no = pf->samples_no; + } + pipe8->burst = div64(l->burst , 8 * hz); + } + + fs->flow_mask = s->sch.sched_mask; + fs->rq_size = s->sch.buckets ? s->sch.buckets : 1; + + fs->parent_nr = l->link_nr - DN_MAX_ID; + fs->qsize = f->fs.qsize; + fs->plr = f->fs.plr; + fs->w_q = f->fs.w_q; + fs->max_th = f->max_th; + fs->min_th = f->min_th; + fs->max_p = f->fs.max_p; + fs->rq_elements = nq; + + fs->flags_fs = convertflags2old(f->fs.flags); + + *a->start += size; + return 0; +} + + +int +dn_compat_copy_pipe(struct copy_args *a, void *_o) +{ + int have = a->end - *a->start; + int need = 0; + int pipe_size = sizeof(struct dn_pipe8); + int queue_size = sizeof(struct dn_flow_queue8); + int n_queue = 0; /* number of queues */ + + struct dn_schk *s = (struct dn_schk *)_o; + /* calculate needed space: + * - struct dn_pipe + * - if there are instances, dn_queue * n_instances + */ + n_queue = (s->sch.flags & DN_HAVE_MASK ? dn_ht_entries(s->siht) : + (s->siht ? 1 : 0)); + need = pipe_size + queue_size * n_queue; + if (have < need) { + D("have %d < need %d", have, need); + return 1; + } + /* copy pipe */ + dn_c_copy_pipe(s, a, n_queue); + + /* copy queues */ + if (s->sch.flags & DN_HAVE_MASK) + dn_ht_scan(s->siht, dn_c_copy_q, a); + else if (s->siht) + dn_c_copy_q(s->siht, a); + return 0; +} + +int +dn_c_copy_fs(struct dn_fsk *f, struct copy_args *a, int nq) +{ + struct dn_flow_set *fs = (struct dn_flow_set *)*a->start; + + fs->next.sle_next = (struct dn_flow_set *)DN_IS_QUEUE; + fs->fs_nr = f->fs.fs_nr; + fs->qsize = f->fs.qsize; + fs->plr = f->fs.plr; + fs->w_q = f->fs.w_q; + fs->max_th = f->max_th; + fs->min_th = f->min_th; + fs->max_p = f->fs.max_p; + fs->flow_mask = f->fs.flow_mask; + fs->rq_elements = nq; + fs->rq_size = (f->fs.buckets ? f->fs.buckets : 1); + fs->parent_nr = f->fs.sched_nr; + fs->weight = f->fs.par[0]; + + fs->flags_fs = convertflags2old(f->fs.flags); + *a->start += sizeof(struct dn_flow_set); + return 0; +} + +int +dn_compat_copy_queue(struct copy_args *a, void *_o) +{ + int have = a->end - *a->start; + int need = 0; + int fs_size = sizeof(struct dn_flow_set); + int queue_size = sizeof(struct dn_flow_queue8); + + struct dn_fsk *fs = (struct dn_fsk *)_o; + int n_queue = 0; /* number of queues */ + + n_queue = (fs->fs.flags & DN_HAVE_MASK ? dn_ht_entries(fs->qht) : + (fs->qht ? 1 : 0)); + + need = fs_size + queue_size * n_queue; + if (have < need) { + D("have < need"); + return 1; + } + + /* copy flowset */ + dn_c_copy_fs(fs, a, n_queue); + + /* copy queues */ + if (fs->fs.flags & DN_HAVE_MASK) + dn_ht_scan(fs->qht, dn_c_copy_q, a); + else if (fs->qht) + dn_c_copy_q(fs->qht, a); + + return 0; +} + +int +copy_data_helper_compat(void *_o, void *_arg) +{ + struct copy_args *a = _arg; + + if (a->type == DN_COMPAT_PIPE) { + struct dn_schk *s = _o; + if (s->sch.oid.subtype != 1 || s->sch.sched_nr <= DN_MAX_ID) { + return 0; /* not old type */ + } + /* copy pipe parameters, and if instance exists, copy + * other parameters and eventually queues. + */ + if(dn_compat_copy_pipe(a, _o)) + return DNHT_SCAN_END; + } else if (a->type == DN_COMPAT_QUEUE) { + struct dn_fsk *fs = _o; + if (fs->fs.fs_nr >= DN_MAX_ID) + return 0; + if (dn_compat_copy_queue(a, _o)) + return DNHT_SCAN_END; + } + return 0; +} + +/* Main function to manage old requests */ +int +ip_dummynet_compat(struct sockopt *sopt) +{ + int error=0; + void *v = NULL; + struct dn_id oid; + + /* Lenght of data, used to found ipfw version... */ + int len = sopt->sopt_valsize; + + /* len can be 0 if command was dummynet_flush */ + if (len == pipesize7) { + D("setting compatibility with FreeBSD 7.2"); + is7 = 1; + } + else if (len == pipesize8 || len == pipesizemax8) { + D("setting compatibility with FreeBSD 8"); + is7 = 0; + } + + switch (sopt->sopt_name) { + default: + printf("dummynet: -- unknown option %d", sopt->sopt_name); + error = EINVAL; + break; + + case IP_DUMMYNET_FLUSH: + oid_fill(&oid, sizeof(oid), DN_CMD_FLUSH, DN_API_VERSION); + do_config(&oid, oid.len); + break; + + case IP_DUMMYNET_DEL: + v = malloc(len, M_TEMP, M_WAITOK); + error = sooptcopyin(sopt, v, len, len); + if (error) + break; + error = dn_compat_del(v); + free(v, M_DUMMYNET); + break; + + case IP_DUMMYNET_CONFIGURE: + v = malloc(len, M_TEMP, M_WAITOK); + error = sooptcopyin(sopt, v, len, len); + if (error) + break; + error = dn_compat_configure(v); + free(v, M_DUMMYNET); + break; + + case IP_DUMMYNET_GET: { + void *buf; + int ret; + int original_size = sopt->sopt_valsize; + int size; + + ret = dummynet_get(sopt, &buf); + if (ret) + return 0;//XXX ? + size = sopt->sopt_valsize; + sopt->sopt_valsize = original_size; + D("size=%d, buf=%p", size, buf); + ret = sooptcopyout(sopt, buf, size); + if (ret) + printf(" %s ERROR sooptcopyout\n", __FUNCTION__); + if (buf) + free(buf, M_DUMMYNET); + } + } + + return error; +} + + diff --git a/sys/netinet/ipfw/ip_dn_io.c b/sys/netinet/ipfw/ip_dn_io.c new file mode 100644 index 0000000..70f14ca --- /dev/null +++ b/sys/netinet/ipfw/ip_dn_io.c @@ -0,0 +1,788 @@ +/*- + * Copyright (c) 2010 Luigi Rizzo, Riccardo Panicucci, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Dummynet portions related to packet handling. + */ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include "opt_inet6.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/module.h> +#include <sys/priv.h> +#include <sys/proc.h> +#include <sys/rwlock.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <sys/sysctl.h> +#include <net/if.h> /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */ +#include <net/netisr.h> +#include <netinet/in.h> +#include <netinet/ip.h> /* ip_len, ip_off */ +#include <netinet/ip_var.h> /* ip_output(), IP_FORWARDING */ +#include <netinet/ip_fw.h> +#include <netinet/ipfw/ip_fw_private.h> +#include <netinet/ipfw/dn_heap.h> +#include <netinet/ip_dummynet.h> +#include <netinet/ipfw/ip_dn_private.h> +#include <netinet/ipfw/dn_sched.h> + +#include <netinet/if_ether.h> /* various ether_* routines */ + +#include <netinet/ip6.h> /* for ip6_input, ip6_output prototypes */ +#include <netinet6/ip6_var.h> + +/* + * We keep a private variable for the simulation time, but we could + * probably use an existing one ("softticks" in sys/kern/kern_timeout.c) + * instead of dn_cfg.curr_time + */ + +struct dn_parms dn_cfg; + +static long tick_last; /* Last tick duration (usec). */ +static long tick_delta; /* Last vs standard tick diff (usec). */ +static long tick_delta_sum; /* Accumulated tick difference (usec).*/ +static long tick_adjustment; /* Tick adjustments done. */ +static long tick_lost; /* Lost(coalesced) ticks number. */ +/* Adjusted vs non-adjusted curr_time difference (ticks). */ +static long tick_diff; + +static unsigned long io_pkt; +static unsigned long io_pkt_fast; +static unsigned long io_pkt_drop; + +/* + * We use a heap to store entities for which we have pending timer events. + * The heap is checked at every tick and all entities with expired events + * are extracted. + */ + +MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap"); + +extern void (*bridge_dn_p)(struct mbuf *, struct ifnet *); + +#ifdef SYSCTL_NODE + +SYSBEGIN(f4) + +SYSCTL_DECL(_net_inet); +SYSCTL_DECL(_net_inet_ip); +SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet"); + +/* parameters */ +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size, + CTLFLAG_RW, &dn_cfg.hash_size, 0, "Default hash table size"); +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_slot_limit, + CTLFLAG_RW, &dn_cfg.slot_limit, 0, + "Upper limit in slots for pipe queue."); +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_byte_limit, + CTLFLAG_RW, &dn_cfg.byte_limit, 0, + "Upper limit in bytes for pipe queue."); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, io_fast, + CTLFLAG_RW, &dn_cfg.io_fast, 0, "Enable fast dummynet io."); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, + CTLFLAG_RW, &dn_cfg.debug, 0, "Dummynet debug level"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire, + CTLFLAG_RW, &dn_cfg.expire, 0, "Expire empty queues/pipes"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire_cycle, + CTLFLAG_RD, &dn_cfg.expire_cycle, 0, "Expire cycle for queues/pipes"); + +/* RED parameters */ +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth, + CTLFLAG_RD, &dn_cfg.red_lookup_depth, 0, "Depth of RED lookup table"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size, + CTLFLAG_RD, &dn_cfg.red_avg_pkt_size, 0, "RED Medium packet size"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size, + CTLFLAG_RD, &dn_cfg.red_max_pkt_size, 0, "RED Max packet size"); + +/* time adjustment */ +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta, + CTLFLAG_RD, &tick_delta, 0, "Last vs standard tick difference (usec)."); +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta_sum, + CTLFLAG_RD, &tick_delta_sum, 0, "Accumulated tick difference (usec)."); +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_adjustment, + CTLFLAG_RD, &tick_adjustment, 0, "Tick adjustments done."); +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_diff, + CTLFLAG_RD, &tick_diff, 0, + "Adjusted vs non-adjusted curr_time difference (ticks)."); +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_lost, + CTLFLAG_RD, &tick_lost, 0, + "Number of ticks coalesced by dummynet taskqueue."); + +/* statistics */ +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, schk_count, + CTLFLAG_RD, &dn_cfg.schk_count, 0, "Number of schedulers"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, si_count, + CTLFLAG_RD, &dn_cfg.si_count, 0, "Number of scheduler instances"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, fsk_count, + CTLFLAG_RD, &dn_cfg.fsk_count, 0, "Number of flowsets"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, queue_count, + CTLFLAG_RD, &dn_cfg.queue_count, 0, "Number of queues"); +SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt, + CTLFLAG_RD, &io_pkt, 0, + "Number of packets passed to dummynet."); +SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_fast, + CTLFLAG_RD, &io_pkt_fast, 0, + "Number of packets bypassed dummynet scheduler."); +SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_drop, + CTLFLAG_RD, &io_pkt_drop, 0, + "Number of packets dropped by dummynet."); + +SYSEND + +#endif + +static void dummynet_send(struct mbuf *); + +/* + * Packets processed by dummynet have an mbuf tag associated with + * them that carries their dummynet state. + * Outside dummynet, only the 'rule' field is relevant, and it must + * be at the beginning of the structure. + */ +struct dn_pkt_tag { + struct ipfw_rule_ref rule; /* matching rule */ + + /* second part, dummynet specific */ + int dn_dir; /* action when packet comes out.*/ + /* see ip_fw_private.h */ + uint64_t output_time; /* when the pkt is due for delivery*/ + struct ifnet *ifp; /* interface, for ip_output */ + struct _ip6dn_args ip6opt; /* XXX ipv6 options */ +}; + +/* + * Return the mbuf tag holding the dummynet state (it should + * be the first one on the list). + */ +static struct dn_pkt_tag * +dn_tag_get(struct mbuf *m) +{ + struct m_tag *mtag = m_tag_first(m); + KASSERT(mtag != NULL && + mtag->m_tag_cookie == MTAG_ABI_COMPAT && + mtag->m_tag_id == PACKET_TAG_DUMMYNET, + ("packet on dummynet queue w/o dummynet tag!")); + return (struct dn_pkt_tag *)(mtag+1); +} + +static inline void +mq_append(struct mq *q, struct mbuf *m) +{ + if (q->head == NULL) + q->head = m; + else + q->tail->m_nextpkt = m; + q->tail = m; + m->m_nextpkt = NULL; +} + +/* + * Dispose a list of packet. Use a functions so if we need to do + * more work, this is a central point to do it. + */ +void dn_free_pkts(struct mbuf *mnext) +{ + struct mbuf *m; + + while ((m = mnext) != NULL) { + mnext = m->m_nextpkt; + FREE_PKT(m); + } +} + +static int +red_drops (struct dn_queue *q, int len) +{ + /* + * RED algorithm + * + * RED calculates the average queue size (avg) using a low-pass filter + * with an exponential weighted (w_q) moving average: + * avg <- (1-w_q) * avg + w_q * q_size + * where q_size is the queue length (measured in bytes or * packets). + * + * If q_size == 0, we compute the idle time for the link, and set + * avg = (1 - w_q)^(idle/s) + * where s is the time needed for transmitting a medium-sized packet. + * + * Now, if avg < min_th the packet is enqueued. + * If avg > max_th the packet is dropped. Otherwise, the packet is + * dropped with probability P function of avg. + */ + + struct dn_fsk *fs = q->fs; + int64_t p_b = 0; + + /* Queue in bytes or packets? */ + uint32_t q_size = (fs->fs.flags & DN_QSIZE_BYTES) ? + q->ni.len_bytes : q->ni.length; + + /* Average queue size estimation. */ + if (q_size != 0) { + /* Queue is not empty, avg <- avg + (q_size - avg) * w_q */ + int diff = SCALE(q_size) - q->avg; + int64_t v = SCALE_MUL((int64_t)diff, (int64_t)fs->w_q); + + q->avg += (int)v; + } else { + /* + * Queue is empty, find for how long the queue has been + * empty and use a lookup table for computing + * (1 - * w_q)^(idle_time/s) where s is the time to send a + * (small) packet. + * XXX check wraps... + */ + if (q->avg) { + u_int t = div64((dn_cfg.curr_time - q->q_time), fs->lookup_step); + + q->avg = (t < fs->lookup_depth) ? + SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0; + } + } + + /* Should i drop? */ + if (q->avg < fs->min_th) { + q->count = -1; + return (0); /* accept packet */ + } + if (q->avg >= fs->max_th) { /* average queue >= max threshold */ + if (fs->fs.flags & DN_IS_GENTLE_RED) { + /* + * According to Gentle-RED, if avg is greater than + * max_th the packet is dropped with a probability + * p_b = c_3 * avg - c_4 + * where c_3 = (1 - max_p) / max_th + * c_4 = 1 - 2 * max_p + */ + p_b = SCALE_MUL((int64_t)fs->c_3, (int64_t)q->avg) - + fs->c_4; + } else { + q->count = -1; + return (1); + } + } else if (q->avg > fs->min_th) { + /* + * We compute p_b using the linear dropping function + * p_b = c_1 * avg - c_2 + * where c_1 = max_p / (max_th - min_th) + * c_2 = max_p * min_th / (max_th - min_th) + */ + p_b = SCALE_MUL((int64_t)fs->c_1, (int64_t)q->avg) - fs->c_2; + } + + if (fs->fs.flags & DN_QSIZE_BYTES) + p_b = div64((p_b * len) , fs->max_pkt_size); + if (++q->count == 0) + q->random = random() & 0xffff; + else { + /* + * q->count counts packets arrived since last drop, so a greater + * value of q->count means a greater packet drop probability. + */ + if (SCALE_MUL(p_b, SCALE((int64_t)q->count)) > q->random) { + q->count = 0; + /* After a drop we calculate a new random value. */ + q->random = random() & 0xffff; + return (1); /* drop */ + } + } + /* End of RED algorithm. */ + + return (0); /* accept */ + +} + +/* + * Enqueue a packet in q, subject to space and queue management policy + * (whose parameters are in q->fs). + * Update stats for the queue and the scheduler. + * Return 0 on success, 1 on drop. The packet is consumed anyways. + */ +int +dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop) +{ + struct dn_fs *f; + struct dn_flow *ni; /* stats for scheduler instance */ + uint64_t len; + + if (q->fs == NULL || q->_si == NULL) { + printf("%s fs %p si %p, dropping\n", + __FUNCTION__, q->fs, q->_si); + FREE_PKT(m); + return 1; + } + f = &(q->fs->fs); + ni = &q->_si->ni; + len = m->m_pkthdr.len; + /* Update statistics, then check reasons to drop pkt. */ + q->ni.tot_bytes += len; + q->ni.tot_pkts++; + ni->tot_bytes += len; + ni->tot_pkts++; + if (drop) + goto drop; + if (f->plr && random() < f->plr) + goto drop; + if (f->flags & DN_IS_RED && red_drops(q, m->m_pkthdr.len)) + goto drop; + if (f->flags & DN_QSIZE_BYTES) { + if (q->ni.len_bytes > f->qsize) + goto drop; + } else if (q->ni.length >= f->qsize) { + goto drop; + } + mq_append(&q->mq, m); + q->ni.length++; + q->ni.len_bytes += len; + ni->length++; + ni->len_bytes += len; + return 0; + +drop: + io_pkt_drop++; + q->ni.drops++; + ni->drops++; + FREE_PKT(m); + return 1; +} + +/* + * Fetch packets from the delay line which are due now. If there are + * leftover packets, reinsert the delay line in the heap. + * Runs under scheduler lock. + */ +static void +transmit_event(struct mq *q, struct delay_line *dline, uint64_t now) +{ + struct mbuf *m; + struct dn_pkt_tag *pkt = NULL; + + dline->oid.subtype = 0; /* not in heap */ + while ((m = dline->mq.head) != NULL) { + pkt = dn_tag_get(m); + if (!DN_KEY_LEQ(pkt->output_time, now)) + break; + dline->mq.head = m->m_nextpkt; + mq_append(q, m); + } + if (m != NULL) { + dline->oid.subtype = 1; /* in heap */ + heap_insert(&dn_cfg.evheap, pkt->output_time, dline); + } +} + +/* + * Convert the additional MAC overheads/delays into an equivalent + * number of bits for the given data rate. The samples are + * in milliseconds so we need to divide by 1000. + */ +static uint64_t +extra_bits(struct mbuf *m, struct dn_schk *s) +{ + int index; + uint64_t bits; + struct dn_profile *pf = s->profile; + + if (!pf || pf->samples_no == 0) + return 0; + index = random() % pf->samples_no; + bits = div64((uint64_t)pf->samples[index] * s->link.bandwidth, 1000); + if (index >= pf->loss_level) { + struct dn_pkt_tag *dt = dn_tag_get(m); + if (dt) + dt->dn_dir = DIR_DROP; + } + return bits; +} + +/* + * Send traffic from a scheduler instance due by 'now'. + * Return a pointer to the head of the queue. + */ +static struct mbuf * +serve_sched(struct mq *q, struct dn_sch_inst *si, uint64_t now) +{ + struct mq def_q; + struct dn_schk *s = si->sched; + struct mbuf *m = NULL; + int delay_line_idle = (si->dline.mq.head == NULL); + int done, bw; + + if (q == NULL) { + q = &def_q; + q->head = NULL; + } + + bw = s->link.bandwidth; + si->kflags &= ~DN_ACTIVE; + + if (bw > 0) + si->credit += (now - si->sched_time) * bw; + else + si->credit = 0; + si->sched_time = now; + done = 0; + while (si->credit >= 0 && (m = s->fp->dequeue(si)) != NULL) { + uint64_t len_scaled; + done++; + len_scaled = (bw == 0) ? 0 : hz * + (m->m_pkthdr.len * 8 + extra_bits(m, s)); + si->credit -= len_scaled; + /* Move packet in the delay line */ + dn_tag_get(m)->output_time += s->link.delay ; + mq_append(&si->dline.mq, m); + } + /* + * If credit >= 0 the instance is idle, mark time. + * Otherwise put back in the heap, and adjust the output + * time of the last inserted packet, m, which was too early. + */ + if (si->credit >= 0) { + si->idle_time = now; + } else { + uint64_t t; + KASSERT (bw > 0, ("bw=0 and credit<0 ?")); + t = div64(bw - 1 - si->credit, bw); + if (m) + dn_tag_get(m)->output_time += t; + si->kflags |= DN_ACTIVE; + heap_insert(&dn_cfg.evheap, now + t, si); + } + if (delay_line_idle && done) + transmit_event(q, &si->dline, now); + return q->head; +} + +/* + * The timer handler for dummynet. Time is computed in ticks, but + * but the code is tolerant to the actual rate at which this is called. + * Once complete, the function reschedules itself for the next tick. + */ +void +dummynet_task(void *context, int pending) +{ + struct timeval t; + struct mq q = { NULL, NULL }; /* queue to accumulate results */ + + DN_BH_WLOCK(); + + /* Update number of lost(coalesced) ticks. */ + tick_lost += pending - 1; + + getmicrouptime(&t); + /* Last tick duration (usec). */ + tick_last = (t.tv_sec - dn_cfg.prev_t.tv_sec) * 1000000 + + (t.tv_usec - dn_cfg.prev_t.tv_usec); + /* Last tick vs standard tick difference (usec). */ + tick_delta = (tick_last * hz - 1000000) / hz; + /* Accumulated tick difference (usec). */ + tick_delta_sum += tick_delta; + + dn_cfg.prev_t = t; + + /* + * Adjust curr_time if the accumulated tick difference is + * greater than the 'standard' tick. Since curr_time should + * be monotonically increasing, we do positive adjustments + * as required, and throttle curr_time in case of negative + * adjustment. + */ + dn_cfg.curr_time++; + if (tick_delta_sum - tick >= 0) { + int diff = tick_delta_sum / tick; + + dn_cfg.curr_time += diff; + tick_diff += diff; + tick_delta_sum %= tick; + tick_adjustment++; + } else if (tick_delta_sum + tick <= 0) { + dn_cfg.curr_time--; + tick_diff--; + tick_delta_sum += tick; + tick_adjustment++; + } + + /* serve pending events, accumulate in q */ + for (;;) { + struct dn_id *p; /* generic parameter to handler */ + + if (dn_cfg.evheap.elements == 0 || + DN_KEY_LT(dn_cfg.curr_time, HEAP_TOP(&dn_cfg.evheap)->key)) + break; + p = HEAP_TOP(&dn_cfg.evheap)->object; + heap_extract(&dn_cfg.evheap, NULL); + + if (p->type == DN_SCH_I) { + serve_sched(&q, (struct dn_sch_inst *)p, dn_cfg.curr_time); + } else { /* extracted a delay line */ + transmit_event(&q, (struct delay_line *)p, dn_cfg.curr_time); + } + } + if (dn_cfg.expire && ++dn_cfg.expire_cycle >= dn_cfg.expire) { + dn_cfg.expire_cycle = 0; + dn_drain_scheduler(); + dn_drain_queue(); + } + + DN_BH_WUNLOCK(); + dn_reschedule(); + if (q.head != NULL) + dummynet_send(q.head); +} + +/* + * forward a chain of packets to the proper destination. + * This runs outside the dummynet lock. + */ +static void +dummynet_send(struct mbuf *m) +{ + struct mbuf *n; + + for (; m != NULL; m = n) { + struct ifnet *ifp = NULL; /* gcc 3.4.6 complains */ + struct m_tag *tag; + int dst; + + n = m->m_nextpkt; + m->m_nextpkt = NULL; + tag = m_tag_first(m); + if (tag == NULL) { /* should not happen */ + dst = DIR_DROP; + } else { + struct dn_pkt_tag *pkt = dn_tag_get(m); + /* extract the dummynet info, rename the tag + * to carry reinject info. + */ + dst = pkt->dn_dir; + ifp = pkt->ifp; + tag->m_tag_cookie = MTAG_IPFW_RULE; + tag->m_tag_id = 0; + } + + switch (dst) { + case DIR_OUT: + SET_HOST_IPLEN(mtod(m, struct ip *)); + ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL); + break ; + + case DIR_IN : + /* put header in network format for ip_input() */ + //SET_NET_IPLEN(mtod(m, struct ip *)); + netisr_dispatch(NETISR_IP, m); + break; + +#ifdef INET6 + case DIR_IN | PROTO_IPV6: + netisr_dispatch(NETISR_IPV6, m); + break; + + case DIR_OUT | PROTO_IPV6: + SET_HOST_IPLEN(mtod(m, struct ip *)); + ip6_output(m, NULL, NULL, IPV6_FORWARDING, NULL, NULL, NULL); + break; +#endif + + case DIR_FWD | PROTO_IFB: /* DN_TO_IFB_FWD: */ + if (bridge_dn_p != NULL) + ((*bridge_dn_p)(m, ifp)); + else + printf("dummynet: if_bridge not loaded\n"); + + break; + + case DIR_IN | PROTO_LAYER2: /* DN_TO_ETH_DEMUX: */ + /* + * The Ethernet code assumes the Ethernet header is + * contiguous in the first mbuf header. + * Insure this is true. + */ + if (m->m_len < ETHER_HDR_LEN && + (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) { + printf("dummynet/ether: pullup failed, " + "dropping packet\n"); + break; + } + ether_demux(m->m_pkthdr.rcvif, m); + break; + + case DIR_OUT | PROTO_LAYER2: /* N_TO_ETH_OUT: */ + ether_output_frame(ifp, m); + break; + + case DIR_DROP: + /* drop the packet after some time */ + FREE_PKT(m); + break; + + default: + printf("dummynet: bad switch %d!\n", dst); + FREE_PKT(m); + break; + } + } +} + +static inline int +tag_mbuf(struct mbuf *m, int dir, struct ip_fw_args *fwa) +{ + struct dn_pkt_tag *dt; + struct m_tag *mtag; + + mtag = m_tag_get(PACKET_TAG_DUMMYNET, + sizeof(*dt), M_NOWAIT | M_ZERO); + if (mtag == NULL) + return 1; /* Cannot allocate packet header. */ + m_tag_prepend(m, mtag); /* Attach to mbuf chain. */ + dt = (struct dn_pkt_tag *)(mtag + 1); + dt->rule = fwa->rule; + dt->rule.info &= IPFW_ONEPASS; /* only keep this info */ + dt->dn_dir = dir; + dt->ifp = fwa->oif; + /* dt->output tame is updated as we move through */ + dt->output_time = dn_cfg.curr_time; + return 0; +} + + +/* + * dummynet hook for packets. + * We use the argument to locate the flowset fs and the sched_set sch + * associated to it. The we apply flow_mask and sched_mask to + * determine the queue and scheduler instances. + * + * dir where shall we send the packet after dummynet. + * *m0 the mbuf with the packet + * ifp the 'ifp' parameter from the caller. + * NULL in ip_input, destination interface in ip_output, + */ +int +dummynet_io(struct mbuf **m0, int dir, struct ip_fw_args *fwa) +{ + struct mbuf *m = *m0; + struct dn_fsk *fs = NULL; + struct dn_sch_inst *si; + struct dn_queue *q = NULL; /* default */ + + int fs_id = (fwa->rule.info & IPFW_INFO_MASK) + + ((fwa->rule.info & IPFW_IS_PIPE) ? 2*DN_MAX_ID : 0); + DN_BH_WLOCK(); + io_pkt++; + /* we could actually tag outside the lock, but who cares... */ + if (tag_mbuf(m, dir, fwa)) + goto dropit; + if (dn_cfg.busy) { + /* if the upper half is busy doing something expensive, + * lets queue the packet and move forward + */ + mq_append(&dn_cfg.pending, m); + m = *m0 = NULL; /* consumed */ + goto done; /* already active, nothing to do */ + } + /* XXX locate_flowset could be optimised with a direct ref. */ + fs = dn_ht_find(dn_cfg.fshash, fs_id, 0, NULL); + if (fs == NULL) + goto dropit; /* This queue/pipe does not exist! */ + if (fs->sched == NULL) /* should not happen */ + goto dropit; + /* find scheduler instance, possibly applying sched_mask */ + si = ipdn_si_find(fs->sched, &(fwa->f_id)); + if (si == NULL) + goto dropit; + /* + * If the scheduler supports multiple queues, find the right one + * (otherwise it will be ignored by enqueue). + */ + if (fs->sched->fp->flags & DN_MULTIQUEUE) { + q = ipdn_q_find(fs, si, &(fwa->f_id)); + if (q == NULL) + goto dropit; + } + if (fs->sched->fp->enqueue(si, q, m)) { + printf("%s dropped by enqueue\n", __FUNCTION__); + /* packet was dropped by enqueue() */ + m = *m0 = NULL; + goto dropit; + } + + if (si->kflags & DN_ACTIVE) { + m = *m0 = NULL; /* consumed */ + goto done; /* already active, nothing to do */ + } + + /* compute the initial allowance */ + { + struct dn_link *p = &fs->sched->link; + si->credit = dn_cfg.io_fast ? p->bandwidth : 0; + if (p->burst) { + uint64_t burst = (dn_cfg.curr_time - si->idle_time) * p->bandwidth; + if (burst > p->burst) + burst = p->burst; + si->credit += burst; + } + } + /* pass through scheduler and delay line */ + m = serve_sched(NULL, si, dn_cfg.curr_time); + + /* optimization -- pass it back to ipfw for immediate send */ + /* XXX Don't call dummynet_send() if scheduler return the packet + * just enqueued. This avoid a lock order reversal. + * + */ + if (/*dn_cfg.io_fast &&*/ m == *m0 && (dir & PROTO_LAYER2) == 0 ) { + /* fast io */ + io_pkt_fast++; + if (m->m_nextpkt != NULL) { + printf("dummynet: fast io: pkt chain detected!\n"); + m->m_nextpkt = NULL; + } + m = NULL; + } else { + *m0 = NULL; + } +done: + DN_BH_WUNLOCK(); + if (m) + dummynet_send(m); + return 0; + +dropit: + io_pkt_drop++; + DN_BH_WUNLOCK(); + if (m) + FREE_PKT(m); + *m0 = NULL; + return (fs && (fs->fs.flags & DN_NOERROR)) ? 0 : ENOBUFS; +} diff --git a/sys/netinet/ipfw/ip_dn_private.h b/sys/netinet/ipfw/ip_dn_private.h new file mode 100644 index 0000000..270f188 --- /dev/null +++ b/sys/netinet/ipfw/ip_dn_private.h @@ -0,0 +1,402 @@ +/*- + * Copyright (c) 2010 Luigi Rizzo, Riccardo Panicucci, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * internal dummynet APIs. + * + * $FreeBSD$ + */ + +#ifndef _IP_DN_PRIVATE_H +#define _IP_DN_PRIVATE_H + +/* debugging support + * use ND() to remove debugging, D() to print a line, + * DX(level, ...) to print above a certain level + * If you redefine D() you are expected to redefine all. + */ +#ifndef D +#define ND(fmt, ...) do {} while (0) +#define D1(fmt, ...) do {} while (0) +#define D(fmt, ...) printf("%-10s " fmt "\n", \ + __FUNCTION__, ## __VA_ARGS__) +#define DX(lev, fmt, ...) do { \ + if (dn_cfg.debug > lev) D(fmt, ## __VA_ARGS__); } while (0) +#endif + +MALLOC_DECLARE(M_DUMMYNET); + +#ifndef FREE_PKT +#define FREE_PKT(m) m_freem(m) +#endif + +#ifndef __linux__ +#define div64(a, b) ((int64_t)(a) / (int64_t)(b)) +#endif + +#define DN_LOCK_INIT() do { \ + mtx_init(&dn_cfg.uh_mtx, "dn_uh", NULL, MTX_DEF); \ + mtx_init(&dn_cfg.bh_mtx, "dn_bh", NULL, MTX_DEF); \ + } while (0) +#define DN_LOCK_DESTROY() do { \ + mtx_destroy(&dn_cfg.uh_mtx); \ + mtx_destroy(&dn_cfg.bh_mtx); \ + } while (0) +#if 0 /* not used yet */ +#define DN_UH_RLOCK() mtx_lock(&dn_cfg.uh_mtx) +#define DN_UH_RUNLOCK() mtx_unlock(&dn_cfg.uh_mtx) +#define DN_UH_WLOCK() mtx_lock(&dn_cfg.uh_mtx) +#define DN_UH_WUNLOCK() mtx_unlock(&dn_cfg.uh_mtx) +#define DN_UH_LOCK_ASSERT() mtx_assert(&dn_cfg.uh_mtx, MA_OWNED) +#endif + +#define DN_BH_RLOCK() mtx_lock(&dn_cfg.uh_mtx) +#define DN_BH_RUNLOCK() mtx_unlock(&dn_cfg.uh_mtx) +#define DN_BH_WLOCK() mtx_lock(&dn_cfg.uh_mtx) +#define DN_BH_WUNLOCK() mtx_unlock(&dn_cfg.uh_mtx) +#define DN_BH_LOCK_ASSERT() mtx_assert(&dn_cfg.uh_mtx, MA_OWNED) + +SLIST_HEAD(dn_schk_head, dn_schk); +SLIST_HEAD(dn_sch_inst_head, dn_sch_inst); +SLIST_HEAD(dn_fsk_head, dn_fsk); +SLIST_HEAD(dn_queue_head, dn_queue); +SLIST_HEAD(dn_alg_head, dn_alg); + +struct mq { /* a basic queue of packets*/ + struct mbuf *head, *tail; +}; + +static inline void +set_oid(struct dn_id *o, int type, int len) +{ + o->type = type; + o->len = len; + o->subtype = 0; +}; + +/* + * configuration and global data for a dummynet instance + * + * When a configuration is modified from userland, 'id' is incremented + * so we can use the value to check for stale pointers. + */ +struct dn_parms { + uint32_t id; /* configuration version */ + + /* defaults (sysctl-accessible) */ + int red_lookup_depth; + int red_avg_pkt_size; + int red_max_pkt_size; + int hash_size; + int max_hash_size; + long byte_limit; /* max queue sizes */ + long slot_limit; + + int io_fast; + int debug; + + /* timekeeping */ + struct timeval prev_t; /* last time dummynet_tick ran */ + struct dn_heap evheap; /* scheduled events */ + + /* counters of objects -- used for reporting space */ + int schk_count; + int si_count; + int fsk_count; + int queue_count; + + /* ticks and other stuff */ + uint64_t curr_time; + /* flowsets and schedulers are in hash tables, with 'hash_size' + * buckets. fshash is looked up at every packet arrival + * so better be generous if we expect many entries. + */ + struct dn_ht *fshash; + struct dn_ht *schedhash; + /* list of flowsets without a scheduler -- use sch_chain */ + struct dn_fsk_head fsu; /* list of unlinked flowsets */ + struct dn_alg_head schedlist; /* list of algorithms */ + + /* Store the fs/sch to scan when draining. The value is the + * bucket number of the hash table. Expire can be disabled + * with net.inet.ip.dummynet.expire=0, or it happens every + * expire ticks. + **/ + int drain_fs; + int drain_sch; + uint32_t expire; + uint32_t expire_cycle; /* tick count */ + + /* if the upper half is busy doing something long, + * can set the busy flag and we will enqueue packets in + * a queue for later processing. + */ + int busy; + struct mq pending; + +#ifdef _KERNEL + /* + * This file is normally used in the kernel, unless we do + * some userland tests, in which case we do not need a mtx. + * uh_mtx arbitrates between system calls and also + * protects fshash, schedhash and fsunlinked. + * These structures are readonly for the lower half. + * bh_mtx protects all other structures which may be + * modified upon packet arrivals + */ +#if defined( __linux__ ) || defined( _WIN32 ) + spinlock_t uh_mtx; + spinlock_t bh_mtx; +#else + struct mtx uh_mtx; + struct mtx bh_mtx; +#endif + +#endif /* _KERNEL */ +}; + +/* + * Delay line, contains all packets on output from a link. + * Every scheduler instance has one. + */ +struct delay_line { + struct dn_id oid; + struct dn_sch_inst *si; + struct mq mq; +}; + +/* + * The kernel side of a flowset. It is linked in a hash table + * of flowsets, and in a list of children of their parent scheduler. + * qht is either the queue or (if HAVE_MASK) a hash table queues. + * Note that the mask to use is the (flow_mask|sched_mask), which + * changes as we attach/detach schedulers. So we store it here. + * + * XXX If we want to add scheduler-specific parameters, we need to + * put them in external storage because the scheduler may not be + * available when the fsk is created. + */ +struct dn_fsk { /* kernel side of a flowset */ + struct dn_fs fs; + SLIST_ENTRY(dn_fsk) fsk_next; /* hash chain for fshash */ + + struct ipfw_flow_id fsk_mask; + + /* qht is a hash table of queues, or just a single queue + * a bit in fs.flags tells us which one + */ + struct dn_ht *qht; + struct dn_schk *sched; /* Sched we are linked to */ + SLIST_ENTRY(dn_fsk) sch_chain; /* list of fsk attached to sched */ + + /* bucket index used by drain routine to drain queues for this + * flowset + */ + int drain_bucket; + /* Parameter realted to RED / GRED */ + /* original values are in dn_fs*/ + int w_q ; /* queue weight (scaled) */ + int max_th ; /* maximum threshold for queue (scaled) */ + int min_th ; /* minimum threshold for queue (scaled) */ + int max_p ; /* maximum value for p_b (scaled) */ + + u_int c_1 ; /* max_p/(max_th-min_th) (scaled) */ + u_int c_2 ; /* max_p*min_th/(max_th-min_th) (scaled) */ + u_int c_3 ; /* for GRED, (1-max_p)/max_th (scaled) */ + u_int c_4 ; /* for GRED, 1 - 2*max_p (scaled) */ + u_int * w_q_lookup ; /* lookup table for computing (1-w_q)^t */ + u_int lookup_depth ; /* depth of lookup table */ + int lookup_step ; /* granularity inside the lookup table */ + int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */ + int avg_pkt_size ; /* medium packet size */ + int max_pkt_size ; /* max packet size */ +}; + +/* + * A queue is created as a child of a flowset unless it belongs to + * a !MULTIQUEUE scheduler. It is normally in a hash table in the + * flowset. fs always points to the parent flowset. + * si normally points to the sch_inst, unless the flowset has been + * detached from the scheduler -- in this case si == NULL and we + * should not enqueue. + */ +struct dn_queue { + struct dn_flow ni; /* oid, flow_id, stats */ + struct mq mq; /* packets queue */ + struct dn_sch_inst *_si; /* owner scheduler instance */ + SLIST_ENTRY(dn_queue) q_next; /* hash chain list for qht */ + struct dn_fsk *fs; /* parent flowset. */ + + /* RED parameters */ + int avg; /* average queue length est. (scaled) */ + int count; /* arrivals since last RED drop */ + int random; /* random value (scaled) */ + uint64_t q_time; /* start of queue idle time */ + +}; + +/* + * The kernel side of a scheduler. Contains the userland config, + * a link, pointer to extra config arguments from command line, + * kernel flags, and a pointer to the scheduler methods. + * It is stored in a hash table, and holds a list of all + * flowsets and scheduler instances. + * XXX sch must be at the beginning, see schk_hash(). + */ +struct dn_schk { + struct dn_sch sch; + struct dn_alg *fp; /* Pointer to scheduler functions */ + struct dn_link link; /* The link, embedded */ + struct dn_profile *profile; /* delay profile, if any */ + struct dn_id *cfg; /* extra config arguments */ + + SLIST_ENTRY(dn_schk) schk_next; /* hash chain for schedhash */ + + struct dn_fsk_head fsk_list; /* all fsk linked to me */ + struct dn_fsk *fs; /* Flowset for !MULTIQUEUE */ + + /* bucket index used by the drain routine to drain the scheduler + * instance for this flowset. + */ + int drain_bucket; + + /* Hash table of all instances (through sch.sched_mask) + * or single instance if no mask. Always valid. + */ + struct dn_ht *siht; +}; + + +/* + * Scheduler instance. + * Contains variables and all queues relative to a this instance. + * This struct is created a runtime. + */ +struct dn_sch_inst { + struct dn_flow ni; /* oid, flowid and stats */ + SLIST_ENTRY(dn_sch_inst) si_next; /* hash chain for siht */ + struct delay_line dline; + struct dn_schk *sched; /* the template */ + int kflags; /* DN_ACTIVE */ + + int64_t credit; /* bits I can transmit (more or less). */ + uint64_t sched_time; /* time link was scheduled in ready_heap */ + uint64_t idle_time; /* start of scheduler instance idle time */ + + /* q_count is the number of queues that this instance is using. + * The counter is incremented or decremented when + * a reference from the queue is created or deleted. + * It is used to make sure that a scheduler instance can be safely + * deleted by the drain routine. See notes below. + */ + int q_count; + +}; + +/* + * NOTE about object drain. + * The system will automatically (XXX check when) drain queues and + * scheduler instances when they are idle. + * A queue is idle when it has no packets; an instance is idle when + * it is not in the evheap heap, and the corresponding delay line is empty. + * A queue can be safely deleted when it is idle because of the scheduler + * function xxx_free_queue() will remove any references to it. + * An instance can be only deleted when no queues reference it. To be sure + * of that, a counter (q_count) stores the number of queues that are pointing + * to the instance. + * + * XXX + * Order of scan: + * - take all flowset in a bucket for the flowset hash table + * - take all queues in a bucket for the flowset + * - increment the queue bucket + * - scan next flowset bucket + * Nothing is done if a bucket contains no entries. + * + * The same schema is used for sceduler instances + */ + + +/* kernel-side flags. Linux has DN_DELETE in fcntl.h + */ +enum { + /* 1 and 2 are reserved for the SCAN flags */ + DN_DESTROY = 0x0004, /* destroy */ + DN_DELETE_FS = 0x0008, /* destroy flowset */ + DN_DETACH = 0x0010, + DN_ACTIVE = 0x0020, /* object is in evheap */ + DN_F_DLINE = 0x0040, /* object is a delay line */ + DN_F_SCHI = 0x00C0, /* object is a sched.instance */ + DN_QHT_IS_Q = 0x0100, /* in flowset, qht is a single queue */ +}; + +extern struct dn_parms dn_cfg; + +int dummynet_io(struct mbuf **, int , struct ip_fw_args *); +void dummynet_task(void *context, int pending); +void dn_reschedule(void); + +struct dn_queue *ipdn_q_find(struct dn_fsk *, struct dn_sch_inst *, + struct ipfw_flow_id *); +struct dn_sch_inst *ipdn_si_find(struct dn_schk *, struct ipfw_flow_id *); + +/* + * copy_range is a template for requests for ranges of pipes/queues/scheds. + * The number of ranges is variable and can be derived by o.len. + * As a default, we use a small number of entries so that the struct + * fits easily on the stack and is sufficient for most common requests. + */ +#define DEFAULT_RANGES 5 +struct copy_range { + struct dn_id o; + uint32_t r[ 2 * DEFAULT_RANGES ]; +}; + +struct copy_args { + char **start; + char *end; + int flags; + int type; + struct copy_range *extra; /* extra filtering */ +}; + +struct sockopt; +int ip_dummynet_compat(struct sockopt *sopt); +int dummynet_get(struct sockopt *sopt, void **compat); +int dn_c_copy_q (void *_ni, void *arg); +int dn_c_copy_pipe(struct dn_schk *s, struct copy_args *a, int nq); +int dn_c_copy_fs(struct dn_fsk *f, struct copy_args *a, int nq); +int dn_compat_copy_queue(struct copy_args *a, void *_o); +int dn_compat_copy_pipe(struct copy_args *a, void *_o); +int copy_data_helper_compat(void *_o, void *_arg); +int dn_compat_calc_size(struct dn_parms dn_cfg); +int do_config(void *p, int l); + +/* function to drain idle object */ +void dn_drain_scheduler(void); +void dn_drain_queue(void); + +#endif /* _IP_DN_PRIVATE_H */ diff --git a/sys/netinet/ipfw/ip_dummynet.c b/sys/netinet/ipfw/ip_dummynet.c index e961a55..d7073eb 100644 --- a/sys/netinet/ipfw/ip_dummynet.c +++ b/sys/netinet/ipfw/ip_dummynet.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa + * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa * Portions Copyright (c) 2000 Akamba Corp. * All rights reserved * @@ -28,34 +28,12 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); -#define DUMMYNET_DEBUG - -#include "opt_inet6.h" - /* - * This module implements IP dummynet, a bandwidth limiter/delay emulator - * used in conjunction with the ipfw package. - * Description of the data structures used is in ip_dummynet.h - * Here you mainly find the following blocks of code: - * + variable declarations; - * + heap management functions; - * + scheduler and dummynet functions; - * + configuration and initialization. - * - * NOTA BENE: critical sections are protected by the "dummynet lock". - * - * Most important Changes: - * - * 011004: KLDable - * 010124: Fixed WF2Q behaviour - * 010122: Fixed spl protection. - * 000601: WF2Q support - * 000106: large rewrite, use heaps to handle very many pipes. - * 980513: initial release - * - * include files marked with XXX are probably not needed + * Configuration and internal object management for dummynet. */ +#include "opt_inet6.h" + #include <sys/param.h> #include <sys/systm.h> #include <sys/malloc.h> @@ -69,2210 +47,2115 @@ __FBSDID("$FreeBSD$"); #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/time.h> -#include <sys/sysctl.h> #include <sys/taskqueue.h> #include <net/if.h> /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */ -#include <net/netisr.h> #include <netinet/in.h> -#include <netinet/ip.h> /* ip_len, ip_off */ +#include <netinet/ip_var.h> /* ip_output(), IP_FORWARDING */ #include <netinet/ip_fw.h> +#include <netinet/ipfw/ip_fw_private.h> +#include <netinet/ipfw/dn_heap.h> #include <netinet/ip_dummynet.h> -#include <netinet/ip_var.h> /* ip_output(), IP_FORWARDING */ - -#include <netinet/if_ether.h> /* various ether_* routines */ - -#include <netinet/ip6.h> /* for ip6_input, ip6_output prototypes */ -#include <netinet6/ip6_var.h> +#include <netinet/ipfw/ip_dn_private.h> +#include <netinet/ipfw/dn_sched.h> + +/* which objects to copy */ +#define DN_C_LINK 0x01 +#define DN_C_SCH 0x02 +#define DN_C_FLOW 0x04 +#define DN_C_FS 0x08 +#define DN_C_QUEUE 0x10 + +/* we use this argument in case of a schk_new */ +struct schk_new_arg { + struct dn_alg *fp; + struct dn_sch *sch; +}; -/* - * We keep a private variable for the simulation time, but we could - * probably use an existing one ("softticks" in sys/kern/kern_timeout.c) - */ -static dn_key curr_time = 0 ; /* current simulation time */ +/*---- callout hooks. ----*/ +static struct callout dn_timeout; +static struct task dn_task; +static struct taskqueue *dn_tq = NULL; -static int dn_hash_size = 64 ; /* default hash size */ +static void +dummynet(void * __unused unused) +{ -/* statistics on number of queue searches and search steps */ -static long searches, search_steps ; -static int pipe_expire = 1 ; /* expire queue if empty */ -static int dn_max_ratio = 16 ; /* max queues/buckets ratio */ + taskqueue_enqueue(dn_tq, &dn_task); +} -static long pipe_slot_limit = 100; /* Foot shooting limit for pipe queues. */ -static long pipe_byte_limit = 1024 * 1024; +void +dn_reschedule(void) +{ + callout_reset(&dn_timeout, 1, dummynet, NULL); +} +/*----- end of callout hooks -----*/ -static int red_lookup_depth = 256; /* RED - default lookup table depth */ -static int red_avg_pkt_size = 512; /* RED - default medium packet size */ -static int red_max_pkt_size = 1500; /* RED - default max packet size */ +/* Return a scheduler descriptor given the type or name. */ +static struct dn_alg * +find_sched_type(int type, char *name) +{ + struct dn_alg *d; -static struct timeval prev_t, t; -static long tick_last; /* Last tick duration (usec). */ -static long tick_delta; /* Last vs standard tick diff (usec). */ -static long tick_delta_sum; /* Accumulated tick difference (usec).*/ -static long tick_adjustment; /* Tick adjustments done. */ -static long tick_lost; /* Lost(coalesced) ticks number. */ -/* Adjusted vs non-adjusted curr_time difference (ticks). */ -static long tick_diff; + SLIST_FOREACH(d, &dn_cfg.schedlist, next) { + if (d->type == type || (name && !strcmp(d->name, name))) + return d; + } + return NULL; /* not found */ +} -static int io_fast; -static unsigned long io_pkt; -static unsigned long io_pkt_fast; -static unsigned long io_pkt_drop; +int +ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg) +{ + int oldv = *v; + const char *op = NULL; + if (oldv < lo) { + *v = dflt; + op = "Bump"; + } else if (oldv > hi) { + *v = hi; + op = "Clamp"; + } else + return *v; + if (op && msg) + printf("%s %s to %d (was %d)\n", op, msg, *v, oldv); + return *v; +} +/*---- flow_id mask, hash and compare functions ---*/ /* - * Three heaps contain queues and pipes that the scheduler handles: - * - * ready_heap contains all dn_flow_queue related to fixed-rate pipes. - * - * wfq_ready_heap contains the pipes associated with WF2Q flows - * - * extract_heap contains pipes associated with delay lines. - * + * The flow_id includes the 5-tuple, the queue/pipe number + * which we store in the extra area in host order, + * and for ipv6 also the flow_id6. + * XXX see if we want the tos byte (can store in 'flags') */ +static struct ipfw_flow_id * +flow_id_mask(struct ipfw_flow_id *mask, struct ipfw_flow_id *id) +{ + int is_v6 = IS_IP6_FLOW_ID(id); -MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap"); - -static struct dn_heap ready_heap, extract_heap, wfq_ready_heap ; - -static int heap_init(struct dn_heap *h, int size); -static int heap_insert (struct dn_heap *h, dn_key key1, void *p); -static void heap_extract(struct dn_heap *h, void *obj); -static void transmit_event(struct dn_pipe *pipe, struct mbuf **head, - struct mbuf **tail); -static void ready_event(struct dn_flow_queue *q, struct mbuf **head, - struct mbuf **tail); -static void ready_event_wfq(struct dn_pipe *p, struct mbuf **head, - struct mbuf **tail); - -#define HASHSIZE 16 -#define HASH(num) ((((num) >> 8) ^ ((num) >> 4) ^ (num)) & 0x0f) -static struct dn_pipe_head pipehash[HASHSIZE]; /* all pipes */ -static struct dn_flow_set_head flowsethash[HASHSIZE]; /* all flowsets */ - -static struct callout dn_timeout; - -extern void (*bridge_dn_p)(struct mbuf *, struct ifnet *); - -#ifdef SYSCTL_NODE -SYSCTL_DECL(_net_inet); -SYSCTL_DECL(_net_inet_ip); - -SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet"); -SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size, - CTLFLAG_RW, &dn_hash_size, 0, "Default hash table size"); -#if 0 /* curr_time is 64 bit */ -SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, curr_time, - CTLFLAG_RD, &curr_time, 0, "Current tick"); -#endif -SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, ready_heap, - CTLFLAG_RD, &ready_heap.size, 0, "Size of ready heap"); -SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, extract_heap, - CTLFLAG_RD, &extract_heap.size, 0, "Size of extract heap"); -SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, searches, - CTLFLAG_RD, &searches, 0, "Number of queue searches"); -SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, search_steps, - CTLFLAG_RD, &search_steps, 0, "Number of queue search steps"); -SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire, - CTLFLAG_RW, &pipe_expire, 0, "Expire queue if empty"); -SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, max_chain_len, - CTLFLAG_RW, &dn_max_ratio, 0, - "Max ratio between dynamic queues and buckets"); -SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth, - CTLFLAG_RD, &red_lookup_depth, 0, "Depth of RED lookup table"); -SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size, - CTLFLAG_RD, &red_avg_pkt_size, 0, "RED Medium packet size"); -SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size, - CTLFLAG_RD, &red_max_pkt_size, 0, "RED Max packet size"); -SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta, - CTLFLAG_RD, &tick_delta, 0, "Last vs standard tick difference (usec)."); -SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta_sum, - CTLFLAG_RD, &tick_delta_sum, 0, "Accumulated tick difference (usec)."); -SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_adjustment, - CTLFLAG_RD, &tick_adjustment, 0, "Tick adjustments done."); -SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_diff, - CTLFLAG_RD, &tick_diff, 0, - "Adjusted vs non-adjusted curr_time difference (ticks)."); -SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_lost, - CTLFLAG_RD, &tick_lost, 0, - "Number of ticks coalesced by dummynet taskqueue."); -SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, io_fast, - CTLFLAG_RW, &io_fast, 0, "Enable fast dummynet io."); -SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt, - CTLFLAG_RD, &io_pkt, 0, - "Number of packets passed to dummynet."); -SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_fast, - CTLFLAG_RD, &io_pkt_fast, 0, - "Number of packets bypassed dummynet scheduler."); -SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_drop, - CTLFLAG_RD, &io_pkt_drop, 0, - "Number of packets dropped by dummynet."); -SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_slot_limit, - CTLFLAG_RW, &pipe_slot_limit, 0, "Upper limit in slots for pipe queue."); -SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_byte_limit, - CTLFLAG_RW, &pipe_byte_limit, 0, "Upper limit in bytes for pipe queue."); -#endif - -#ifdef DUMMYNET_DEBUG -int dummynet_debug = 0; -#ifdef SYSCTL_NODE -SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, CTLFLAG_RW, &dummynet_debug, - 0, "control debugging printfs"); -#endif -#define DPRINTF(X) if (dummynet_debug) printf X -#else -#define DPRINTF(X) -#endif - -static struct task dn_task; -static struct taskqueue *dn_tq = NULL; -static void dummynet_task(void *, int); - -static struct mtx dummynet_mtx; -#define DUMMYNET_LOCK_INIT() \ - mtx_init(&dummynet_mtx, "dummynet", NULL, MTX_DEF) -#define DUMMYNET_LOCK_DESTROY() mtx_destroy(&dummynet_mtx) -#define DUMMYNET_LOCK() mtx_lock(&dummynet_mtx) -#define DUMMYNET_UNLOCK() mtx_unlock(&dummynet_mtx) -#define DUMMYNET_LOCK_ASSERT() mtx_assert(&dummynet_mtx, MA_OWNED) - -static int config_pipe(struct dn_pipe *p); -static int ip_dn_ctl(struct sockopt *sopt); - -static void dummynet(void *); -static void dummynet_flush(void); -static void dummynet_send(struct mbuf *); -void dummynet_drain(void); -static int dummynet_io(struct mbuf **, int , struct ip_fw_args *); + id->dst_port &= mask->dst_port; + id->src_port &= mask->src_port; + id->proto &= mask->proto; + id->extra &= mask->extra; + if (is_v6) { + APPLY_MASK(&id->dst_ip6, &mask->dst_ip6); + APPLY_MASK(&id->src_ip6, &mask->src_ip6); + id->flow_id6 &= mask->flow_id6; + } else { + id->dst_ip &= mask->dst_ip; + id->src_ip &= mask->src_ip; + } + return id; +} -/* - * Flow queue is idle if: - * 1) it's empty for at least 1 tick - * 2) it has invalid timestamp (WF2Q case) - * 3) parent pipe has no 'exhausted' burst. - */ -#define QUEUE_IS_IDLE(q) ((q)->head == NULL && (q)->S == (q)->F + 1 && \ - curr_time > (q)->idle_time + 1 && \ - ((q)->numbytes + (curr_time - (q)->idle_time - 1) * \ - (q)->fs->pipe->bandwidth >= (q)->fs->pipe->burst)) +/* computes an OR of two masks, result in dst and also returned */ +static struct ipfw_flow_id * +flow_id_or(struct ipfw_flow_id *src, struct ipfw_flow_id *dst) +{ + int is_v6 = IS_IP6_FLOW_ID(dst); -/* - * Heap management functions. - * - * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2. - * Some macros help finding parent/children so we can optimize them. - * - * heap_init() is called to expand the heap when needed. - * Increment size in blocks of 16 entries. - * XXX failure to allocate a new element is a pretty bad failure - * as we basically stall a whole queue forever!! - * Returns 1 on error, 0 on success - */ -#define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 ) -#define HEAP_LEFT(x) ( 2*(x) + 1 ) -#define HEAP_IS_LEFT(x) ( (x) & 1 ) -#define HEAP_RIGHT(x) ( 2*(x) + 2 ) -#define HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; } -#define HEAP_INCREMENT 15 + dst->dst_port |= src->dst_port; + dst->src_port |= src->src_port; + dst->proto |= src->proto; + dst->extra |= src->extra; + if (is_v6) { +#define OR_MASK(_d, _s) \ + (_d)->__u6_addr.__u6_addr32[0] |= (_s)->__u6_addr.__u6_addr32[0]; \ + (_d)->__u6_addr.__u6_addr32[1] |= (_s)->__u6_addr.__u6_addr32[1]; \ + (_d)->__u6_addr.__u6_addr32[2] |= (_s)->__u6_addr.__u6_addr32[2]; \ + (_d)->__u6_addr.__u6_addr32[3] |= (_s)->__u6_addr.__u6_addr32[3]; + OR_MASK(&dst->dst_ip6, &src->dst_ip6); + OR_MASK(&dst->src_ip6, &src->src_ip6); +#undef OR_MASK + dst->flow_id6 |= src->flow_id6; + } else { + dst->dst_ip |= src->dst_ip; + dst->src_ip |= src->src_ip; + } + return dst; +} static int -heap_init(struct dn_heap *h, int new_size) +nonzero_mask(struct ipfw_flow_id *m) { - struct dn_heap_entry *p; + if (m->dst_port || m->src_port || m->proto || m->extra) + return 1; + if (IS_IP6_FLOW_ID(m)) { + return + m->dst_ip6.__u6_addr.__u6_addr32[0] || + m->dst_ip6.__u6_addr.__u6_addr32[1] || + m->dst_ip6.__u6_addr.__u6_addr32[2] || + m->dst_ip6.__u6_addr.__u6_addr32[3] || + m->src_ip6.__u6_addr.__u6_addr32[0] || + m->src_ip6.__u6_addr.__u6_addr32[1] || + m->src_ip6.__u6_addr.__u6_addr32[2] || + m->src_ip6.__u6_addr.__u6_addr32[3] || + m->flow_id6; + } else { + return m->dst_ip || m->src_ip; + } +} - if (h->size >= new_size ) { - printf("dummynet: %s, Bogus call, have %d want %d\n", __func__, - h->size, new_size); - return 0 ; - } - new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT ; - p = malloc(new_size * sizeof(*p), M_DUMMYNET, M_NOWAIT); - if (p == NULL) { - printf("dummynet: %s, resize %d failed\n", __func__, new_size ); - return 1 ; /* error */ - } - if (h->size > 0) { - bcopy(h->p, p, h->size * sizeof(*p) ); - free(h->p, M_DUMMYNET); +/* XXX we may want a better hash function */ +static uint32_t +flow_id_hash(struct ipfw_flow_id *id) +{ + uint32_t i; + + if (IS_IP6_FLOW_ID(id)) { + uint32_t *d = (uint32_t *)&id->dst_ip6; + uint32_t *s = (uint32_t *)&id->src_ip6; + i = (d[0] ) ^ (d[1]) ^ + (d[2] ) ^ (d[3]) ^ + (d[0] >> 15) ^ (d[1] >> 15) ^ + (d[2] >> 15) ^ (d[3] >> 15) ^ + (s[0] << 1) ^ (s[1] << 1) ^ + (s[2] << 1) ^ (s[3] << 1) ^ + (s[0] << 16) ^ (s[1] << 16) ^ + (s[2] << 16) ^ (s[3] << 16) ^ + (id->dst_port << 1) ^ (id->src_port) ^ + (id->extra) ^ + (id->proto ) ^ (id->flow_id6); + } else { + i = (id->dst_ip) ^ (id->dst_ip >> 15) ^ + (id->src_ip << 1) ^ (id->src_ip >> 16) ^ + (id->extra) ^ + (id->dst_port << 1) ^ (id->src_port) ^ (id->proto); } - h->p = p ; - h->size = new_size ; - return 0 ; + return i; } -/* - * Insert element in heap. Normally, p != NULL, we insert p in - * a new position and bubble up. If p == NULL, then the element is - * already in place, and key is the position where to start the - * bubble-up. - * Returns 1 on failure (cannot allocate new heap entry) - * - * If offset > 0 the position (index, int) of the element in the heap is - * also stored in the element itself at the given offset in bytes. - */ -#define SET_OFFSET(heap, node) \ - if (heap->offset > 0) \ - *((int *)((char *)(heap->p[node].object) + heap->offset)) = node ; -/* - * RESET_OFFSET is used for sanity checks. It sets offset to an invalid value. - */ -#define RESET_OFFSET(heap, node) \ - if (heap->offset > 0) \ - *((int *)((char *)(heap->p[node].object) + heap->offset)) = -1 ; +/* Like bcmp, returns 0 if ids match, 1 otherwise. */ static int -heap_insert(struct dn_heap *h, dn_key key1, void *p) -{ - int son = h->elements ; - - if (p == NULL) /* data already there, set starting point */ - son = key1 ; - else { /* insert new element at the end, possibly resize */ - son = h->elements ; - if (son == h->size) /* need resize... */ - if (heap_init(h, h->elements+1) ) - return 1 ; /* failure... */ - h->p[son].object = p ; - h->p[son].key = key1 ; - h->elements++ ; - } - while (son > 0) { /* bubble up */ - int father = HEAP_FATHER(son) ; - struct dn_heap_entry tmp ; - - if (DN_KEY_LT( h->p[father].key, h->p[son].key ) ) - break ; /* found right position */ - /* son smaller than father, swap and repeat */ - HEAP_SWAP(h->p[son], h->p[father], tmp) ; - SET_OFFSET(h, son); - son = father ; - } - SET_OFFSET(h, son); - return 0 ; +flow_id_cmp(struct ipfw_flow_id *id1, struct ipfw_flow_id *id2) +{ + int is_v6 = IS_IP6_FLOW_ID(id1); + + if (!is_v6) { + if (IS_IP6_FLOW_ID(id2)) + return 1; /* different address families */ + + return (id1->dst_ip == id2->dst_ip && + id1->src_ip == id2->src_ip && + id1->dst_port == id2->dst_port && + id1->src_port == id2->src_port && + id1->proto == id2->proto && + id1->extra == id2->extra) ? 0 : 1; + } + /* the ipv6 case */ + return ( + !bcmp(&id1->dst_ip6,&id2->dst_ip6, sizeof(id1->dst_ip6)) && + !bcmp(&id1->src_ip6,&id2->src_ip6, sizeof(id1->src_ip6)) && + id1->dst_port == id2->dst_port && + id1->src_port == id2->src_port && + id1->proto == id2->proto && + id1->extra == id2->extra && + id1->flow_id6 == id2->flow_id6) ? 0 : 1; } +/*--------- end of flow-id mask, hash and compare ---------*/ -/* - * remove top element from heap, or obj if obj != NULL +/*--- support functions for the qht hashtable ---- + * Entries are hashed by flow-id */ -static void -heap_extract(struct dn_heap *h, void *obj) +static uint32_t +q_hash(uintptr_t key, int flags, void *arg) { - int child, father, max = h->elements - 1 ; + /* compute the hash slot from the flow id */ + struct ipfw_flow_id *id = (flags & DNHT_KEY_IS_OBJ) ? + &((struct dn_queue *)key)->ni.fid : + (struct ipfw_flow_id *)key; - if (max < 0) { - printf("dummynet: warning, extract from empty heap 0x%p\n", h); - return ; - } - father = 0 ; /* default: move up smallest child */ - if (obj != NULL) { /* extract specific element, index is at offset */ - if (h->offset <= 0) - panic("dummynet: heap_extract from middle not supported on this heap!!!\n"); - father = *((int *)((char *)obj + h->offset)) ; - if (father < 0 || father >= h->elements) { - printf("dummynet: heap_extract, father %d out of bound 0..%d\n", - father, h->elements); - panic("dummynet: heap_extract"); + return flow_id_hash(id); +} + +static int +q_match(void *obj, uintptr_t key, int flags, void *arg) +{ + struct dn_queue *o = (struct dn_queue *)obj; + struct ipfw_flow_id *id2; + + if (flags & DNHT_KEY_IS_OBJ) { + /* compare pointers */ + id2 = &((struct dn_queue *)key)->ni.fid; + } else { + id2 = (struct ipfw_flow_id *)key; } - } - RESET_OFFSET(h, father); - child = HEAP_LEFT(father) ; /* left child */ - while (child <= max) { /* valid entry */ - if (child != max && DN_KEY_LT(h->p[child+1].key, h->p[child].key) ) - child = child+1 ; /* take right child, otherwise left */ - h->p[father] = h->p[child] ; - SET_OFFSET(h, father); - father = child ; - child = HEAP_LEFT(child) ; /* left child for next loop */ - } - h->elements-- ; - if (father != max) { - /* - * Fill hole with last entry and bubble up, reusing the insert code - */ - h->p[father] = h->p[max] ; - heap_insert(h, father, NULL); /* this one cannot fail */ - } + return (0 == flow_id_cmp(&o->ni.fid, id2)); } -#if 0 /* - * change object position and update references - * XXX this one is never used! + * create a new queue instance for the given 'key'. */ -static void -heap_move(struct dn_heap *h, dn_key new_key, void *object) -{ - int temp; - int i ; - int max = h->elements-1 ; - struct dn_heap_entry buf ; - - if (h->offset <= 0) - panic("cannot move items on this heap"); - - i = *((int *)((char *)object + h->offset)); - if (DN_KEY_LT(new_key, h->p[i].key) ) { /* must move up */ - h->p[i].key = new_key ; - for (; i>0 && DN_KEY_LT(new_key, h->p[(temp = HEAP_FATHER(i))].key) ; - i = temp ) { /* bubble up */ - HEAP_SWAP(h->p[i], h->p[temp], buf) ; - SET_OFFSET(h, i); - } - } else { /* must move down */ - h->p[i].key = new_key ; - while ( (temp = HEAP_LEFT(i)) <= max ) { /* found left child */ - if ((temp != max) && DN_KEY_GT(h->p[temp].key, h->p[temp+1].key)) - temp++ ; /* select child with min key */ - if (DN_KEY_GT(new_key, h->p[temp].key)) { /* go down */ - HEAP_SWAP(h->p[i], h->p[temp], buf) ; - SET_OFFSET(h, i); - } else - break ; - i = temp ; +static void * +q_new(uintptr_t key, int flags, void *arg) +{ + struct dn_queue *q, *template = arg; + struct dn_fsk *fs = template->fs; + int size = sizeof(*q) + fs->sched->fp->q_datalen; + + q = malloc(size, M_DUMMYNET, M_NOWAIT | M_ZERO); + if (q == NULL) { + D("no memory for new queue"); + return NULL; } - } - SET_OFFSET(h, i); + + set_oid(&q->ni.oid, DN_QUEUE, size); + if (fs->fs.flags & DN_QHT_HASH) + q->ni.fid = *(struct ipfw_flow_id *)key; + q->fs = fs; + q->_si = template->_si; + q->_si->q_count++; + + if (fs->sched->fp->new_queue) + fs->sched->fp->new_queue(q); + dn_cfg.queue_count++; + return q; } -#endif /* heap_move, unused */ /* - * heapify() will reorganize data inside an array to maintain the - * heap property. It is needed when we delete a bunch of entries. + * Notify schedulers that a queue is going away. + * If (flags & DN_DESTROY), also free the packets. + * The version for callbacks is called q_delete_cb(). */ static void -heapify(struct dn_heap *h) +dn_delete_queue(struct dn_queue *q, int flags) { - int i ; + struct dn_fsk *fs = q->fs; + + // D("fs %p si %p\n", fs, q->_si); + /* notify the parent scheduler that the queue is going away */ + if (fs && fs->sched->fp->free_queue) + fs->sched->fp->free_queue(q); + q->_si->q_count--; + q->_si = NULL; + if (flags & DN_DESTROY) { + if (q->mq.head) + dn_free_pkts(q->mq.head); + bzero(q, sizeof(*q)); // safety + free(q, M_DUMMYNET); + dn_cfg.queue_count--; + } +} - for (i = 0 ; i < h->elements ; i++ ) - heap_insert(h, i , NULL) ; +static int +q_delete_cb(void *q, void *arg) +{ + int flags = (int)(uintptr_t)arg; + dn_delete_queue(q, flags); + return (flags & DN_DESTROY) ? DNHT_SCAN_DEL : 0; } /* - * cleanup the heap and free data structure + * calls dn_delete_queue/q_delete_cb on all queues, + * which notifies the parent scheduler and possibly drains packets. + * flags & DN_DESTROY: drains queues and destroy qht; */ static void -heap_free(struct dn_heap *h) +qht_delete(struct dn_fsk *fs, int flags) { - if (h->size >0 ) - free(h->p, M_DUMMYNET); - bzero(h, sizeof(*h) ); + ND("fs %d start flags %d qht %p", + fs->fs.fs_nr, flags, fs->qht); + if (!fs->qht) + return; + if (fs->fs.flags & DN_QHT_HASH) { + dn_ht_scan(fs->qht, q_delete_cb, (void *)(uintptr_t)flags); + if (flags & DN_DESTROY) { + dn_ht_free(fs->qht, 0); + fs->qht = NULL; + } + } else { + dn_delete_queue((struct dn_queue *)(fs->qht), flags); + if (flags & DN_DESTROY) + fs->qht = NULL; + } } /* - * --- end of heap management functions --- - */ - -/* - * Return the mbuf tag holding the dummynet state. As an optimization - * this is assumed to be the first tag on the list. If this turns out - * wrong we'll need to search the list. + * Find and possibly create the queue for a MULTIQUEUE scheduler. + * We never call it for !MULTIQUEUE (the queue is in the sch_inst). */ -static struct dn_pkt_tag * -dn_tag_get(struct mbuf *m) +struct dn_queue * +ipdn_q_find(struct dn_fsk *fs, struct dn_sch_inst *si, + struct ipfw_flow_id *id) { - struct m_tag *mtag = m_tag_first(m); - KASSERT(mtag != NULL && - mtag->m_tag_cookie == MTAG_ABI_COMPAT && - mtag->m_tag_id == PACKET_TAG_DUMMYNET, - ("packet on dummynet queue w/o dummynet tag!")); - return (struct dn_pkt_tag *)(mtag+1); + struct dn_queue template; + + template._si = si; + template.fs = fs; + + if (fs->fs.flags & DN_QHT_HASH) { + struct ipfw_flow_id masked_id; + if (fs->qht == NULL) { + fs->qht = dn_ht_init(NULL, fs->fs.buckets, + offsetof(struct dn_queue, q_next), + q_hash, q_match, q_new); + if (fs->qht == NULL) + return NULL; + } + masked_id = *id; + flow_id_mask(&fs->fsk_mask, &masked_id); + return dn_ht_find(fs->qht, (uintptr_t)&masked_id, + DNHT_INSERT, &template); + } else { + if (fs->qht == NULL) + fs->qht = q_new(0, 0, &template); + return (struct dn_queue *)fs->qht; + } } +/*--- end of queue hash table ---*/ -/* - * Scheduler functions: - * - * transmit_event() is called when the delay-line needs to enter - * the scheduler, either because of existing pkts getting ready, - * or new packets entering the queue. The event handled is the delivery - * time of the packet. - * - * ready_event() does something similar with fixed-rate queues, and the - * event handled is the finish time of the head pkt. +/*--- support functions for the sch_inst hashtable ---- * - * wfq_ready_event() does something similar with WF2Q queues, and the - * event handled is the start time of the head pkt. - * - * In all cases, we make sure that the data structures are consistent - * before passing pkts out, because this might trigger recursive - * invocations of the procedures. + * These are hashed by flow-id */ -static void -transmit_event(struct dn_pipe *pipe, struct mbuf **head, struct mbuf **tail) +static uint32_t +si_hash(uintptr_t key, int flags, void *arg) { - struct mbuf *m; - struct dn_pkt_tag *pkt; + /* compute the hash slot from the flow id */ + struct ipfw_flow_id *id = (flags & DNHT_KEY_IS_OBJ) ? + &((struct dn_sch_inst *)key)->ni.fid : + (struct ipfw_flow_id *)key; - DUMMYNET_LOCK_ASSERT(); + return flow_id_hash(id); +} - while ((m = pipe->head) != NULL) { - pkt = dn_tag_get(m); - if (!DN_KEY_LEQ(pkt->output_time, curr_time)) - break; +static int +si_match(void *obj, uintptr_t key, int flags, void *arg) +{ + struct dn_sch_inst *o = obj; + struct ipfw_flow_id *id2; - pipe->head = m->m_nextpkt; - if (*tail != NULL) - (*tail)->m_nextpkt = m; - else - *head = m; - *tail = m; + id2 = (flags & DNHT_KEY_IS_OBJ) ? + &((struct dn_sch_inst *)key)->ni.fid : + (struct ipfw_flow_id *)key; + return flow_id_cmp(&o->ni.fid, id2) == 0; +} + +/* + * create a new instance for the given 'key' + * Allocate memory for instance, delay line and scheduler private data. + */ +static void * +si_new(uintptr_t key, int flags, void *arg) +{ + struct dn_schk *s = arg; + struct dn_sch_inst *si; + int l = sizeof(*si) + s->fp->si_datalen; + + si = malloc(l, M_DUMMYNET, M_NOWAIT | M_ZERO); + if (si == NULL) + goto error; + /* Set length only for the part passed up to userland. */ + set_oid(&si->ni.oid, DN_SCH_I, sizeof(struct dn_flow)); + set_oid(&(si->dline.oid), DN_DELAY_LINE, + sizeof(struct delay_line)); + /* mark si and dline as outside the event queue */ + si->ni.oid.id = si->dline.oid.id = -1; + + si->sched = s; + si->dline.si = si; + + if (s->fp->new_sched && s->fp->new_sched(si)) { + D("new_sched error"); + goto error; } - if (*tail != NULL) - (*tail)->m_nextpkt = NULL; + if (s->sch.flags & DN_HAVE_MASK) + si->ni.fid = *(struct ipfw_flow_id *)key; - /* If there are leftover packets, put into the heap for next event. */ - if ((m = pipe->head) != NULL) { - pkt = dn_tag_get(m); - /* - * XXX Should check errors on heap_insert, by draining the - * whole pipe p and hoping in the future we are more successful. - */ - heap_insert(&extract_heap, pkt->output_time, pipe); + dn_cfg.si_count++; + return si; + +error: + if (si) { + bzero(si, sizeof(*si)); // safety + free(si, M_DUMMYNET); } + return NULL; } -#define div64(a, b) ((int64_t)(a) / (int64_t)(b)) -#define DN_TO_DROP 0xffff /* - * Compute how many ticks we have to wait before being able to send - * a packet. This is computed as the "wire time" for the packet - * (length + extra bits), minus the credit available, scaled to ticks. - * Check that the result is not be negative (it could be if we have - * too much leftover credit in q->numbytes). + * Callback from siht to delete all scheduler instances. Remove + * si and delay line from the system heap, destroy all queues. + * We assume that all flowset have been notified and do not + * point to us anymore. */ -static inline dn_key -set_ticks(struct mbuf *m, struct dn_flow_queue *q, struct dn_pipe *p) +static int +si_destroy(void *_si, void *arg) { - int64_t ret; - - ret = div64( (m->m_pkthdr.len * 8 + q->extra_bits) * hz - - q->numbytes + p->bandwidth - 1 , p->bandwidth); -#if 0 - printf("%s %d extra_bits %d numb %d ret %d\n", - __FUNCTION__, __LINE__, - (int)(q->extra_bits & 0xffffffff), - (int)(q->numbytes & 0xffffffff), - (int)(ret & 0xffffffff)); -#endif - if (ret < 0) - ret = 0; - return ret; + struct dn_sch_inst *si = _si; + struct dn_schk *s = si->sched; + struct delay_line *dl = &si->dline; + + if (dl->oid.subtype) /* remove delay line from event heap */ + heap_extract(&dn_cfg.evheap, dl); + dn_free_pkts(dl->mq.head); /* drain delay line */ + if (si->kflags & DN_ACTIVE) /* remove si from event heap */ + heap_extract(&dn_cfg.evheap, si); + if (s->fp->free_sched) + s->fp->free_sched(si); + bzero(si, sizeof(*si)); /* safety */ + free(si, M_DUMMYNET); + dn_cfg.si_count--; + return DNHT_SCAN_DEL; } /* - * Convert the additional MAC overheads/delays into an equivalent - * number of bits for the given data rate. The samples are in milliseconds - * so we need to divide by 1000. + * Find the scheduler instance for this packet. If we need to apply + * a mask, do on a local copy of the flow_id to preserve the original. + * Assume siht is always initialized if we have a mask. */ -static dn_key -compute_extra_bits(struct mbuf *pkt, struct dn_pipe *p) +struct dn_sch_inst * +ipdn_si_find(struct dn_schk *s, struct ipfw_flow_id *id) { - int index; - dn_key extra_bits; - if (!p->samples || p->samples_no == 0) - return 0; - index = random() % p->samples_no; - extra_bits = ((dn_key)p->samples[index] * p->bandwidth) / 1000; - if (index >= p->loss_level) { - struct dn_pkt_tag *dt = dn_tag_get(pkt); - if (dt) - dt->dn_dir = DN_TO_DROP; + if (s->sch.flags & DN_HAVE_MASK) { + struct ipfw_flow_id id_t = *id; + flow_id_mask(&s->sch.sched_mask, &id_t); + return dn_ht_find(s->siht, (uintptr_t)&id_t, + DNHT_INSERT, s); } - return extra_bits; + if (!s->siht) + s->siht = si_new(0, 0, s); + return (struct dn_sch_inst *)s->siht; } -static void -free_pipe(struct dn_pipe *p) +/* callback to flush credit for the scheduler instance */ +static int +si_reset_credit(void *_si, void *arg) { - if (p->samples) - free(p->samples, M_DUMMYNET); - free(p, M_DUMMYNET); + struct dn_sch_inst *si = _si; + struct dn_link *p = &si->sched->link; + + si->credit = p->burst + (dn_cfg.io_fast ? p->bandwidth : 0); + return 0; } -/* - * extract pkt from queue, compute output time (could be now) - * and put into delay line (p_queue) - */ static void -move_pkt(struct mbuf *pkt, struct dn_flow_queue *q, struct dn_pipe *p, - int len) +schk_reset_credit(struct dn_schk *s) { - struct dn_pkt_tag *dt = dn_tag_get(pkt); - - q->head = pkt->m_nextpkt ; - q->len-- ; - q->len_bytes -= len ; - - dt->output_time = curr_time + p->delay ; - - if (p->head == NULL) - p->head = pkt; - else - p->tail->m_nextpkt = pkt; - p->tail = pkt; - p->tail->m_nextpkt = NULL; + if (s->sch.flags & DN_HAVE_MASK) + dn_ht_scan(s->siht, si_reset_credit, NULL); + else if (s->siht) + si_reset_credit(s->siht, NULL); } +/*---- end of sch_inst hashtable ---------------------*/ -/* - * ready_event() is invoked every time the queue must enter the - * scheduler, either because the first packet arrives, or because - * a previously scheduled event fired. - * On invokation, drain as many pkts as possible (could be 0) and then - * if there are leftover packets reinsert the pkt in the scheduler. +/*------------------------------------------------------- + * flowset hash (fshash) support. Entries are hashed by fs_nr. + * New allocations are put in the fsunlinked list, from which + * they are removed when they point to a specific scheduler. */ -static void -ready_event(struct dn_flow_queue *q, struct mbuf **head, struct mbuf **tail) +static uint32_t +fsk_hash(uintptr_t key, int flags, void *arg) { - struct mbuf *pkt; - struct dn_pipe *p = q->fs->pipe; - int p_was_empty; - - DUMMYNET_LOCK_ASSERT(); + uint32_t i = !(flags & DNHT_KEY_IS_OBJ) ? key : + ((struct dn_fsk *)key)->fs.fs_nr; - if (p == NULL) { - printf("dummynet: ready_event- pipe is gone\n"); - return; - } - p_was_empty = (p->head == NULL); + return ( (i>>8)^(i>>4)^i ); +} - /* - * Schedule fixed-rate queues linked to this pipe: - * account for the bw accumulated since last scheduling, then - * drain as many pkts as allowed by q->numbytes and move to - * the delay line (in p) computing output time. - * bandwidth==0 (no limit) means we can drain the whole queue, - * setting len_scaled = 0 does the job. - */ - q->numbytes += (curr_time - q->sched_time) * p->bandwidth; - while ((pkt = q->head) != NULL) { - int len = pkt->m_pkthdr.len; - dn_key len_scaled = p->bandwidth ? len*8*hz - + q->extra_bits*hz - : 0; - - if (DN_KEY_GT(len_scaled, q->numbytes)) - break; - q->numbytes -= len_scaled; - move_pkt(pkt, q, p, len); - if (q->head) - q->extra_bits = compute_extra_bits(q->head, p); - } - /* - * If we have more packets queued, schedule next ready event - * (can only occur when bandwidth != 0, otherwise we would have - * flushed the whole queue in the previous loop). - * To this purpose we record the current time and compute how many - * ticks to go for the finish time of the packet. - */ - if ((pkt = q->head) != NULL) { /* this implies bandwidth != 0 */ - dn_key t = set_ticks(pkt, q, p); /* ticks i have to wait */ +static int +fsk_match(void *obj, uintptr_t key, int flags, void *arg) +{ + struct dn_fsk *fs = obj; + int i = !(flags & DNHT_KEY_IS_OBJ) ? key : + ((struct dn_fsk *)key)->fs.fs_nr; - q->sched_time = curr_time; - heap_insert(&ready_heap, curr_time + t, (void *)q); - /* - * XXX Should check errors on heap_insert, and drain the whole - * queue on error hoping next time we are luckier. - */ - } else /* RED needs to know when the queue becomes empty. */ - q->idle_time = curr_time; + return (fs->fs.fs_nr == i); +} - /* - * If the delay line was empty call transmit_event() now. - * Otherwise, the scheduler will take care of it. - */ - if (p_was_empty) - transmit_event(p, head, tail); +static void * +fsk_new(uintptr_t key, int flags, void *arg) +{ + struct dn_fsk *fs; + + fs = malloc(sizeof(*fs), M_DUMMYNET, M_NOWAIT | M_ZERO); + if (fs) { + set_oid(&fs->fs.oid, DN_FS, sizeof(fs->fs)); + dn_cfg.fsk_count++; + fs->drain_bucket = 0; + SLIST_INSERT_HEAD(&dn_cfg.fsu, fs, sch_chain); + } + return fs; } /* - * Called when we can transmit packets on WF2Q queues. Take pkts out of - * the queues at their start time, and enqueue into the delay line. - * Packets are drained until p->numbytes < 0. As long as - * len_scaled >= p->numbytes, the packet goes into the delay line - * with a deadline p->delay. For the last packet, if p->numbytes < 0, - * there is an additional delay. + * detach flowset from its current scheduler. Flags as follows: + * DN_DETACH removes from the fsk_list + * DN_DESTROY deletes individual queues + * DN_DELETE_FS destroys the flowset (otherwise goes in unlinked). */ static void -ready_event_wfq(struct dn_pipe *p, struct mbuf **head, struct mbuf **tail) +fsk_detach(struct dn_fsk *fs, int flags) { - int p_was_empty = (p->head == NULL); - struct dn_heap *sch = &(p->scheduler_heap); - struct dn_heap *neh = &(p->not_eligible_heap); - - DUMMYNET_LOCK_ASSERT(); - - if (p->if_name[0] == 0) /* tx clock is simulated */ - p->numbytes += (curr_time - p->sched_time) * p->bandwidth; - else { /* - * tx clock is for real, - * the ifq must be empty or this is a NOP. - */ - if (p->ifp && p->ifp->if_snd.ifq_head != NULL) - return; - else { - DPRINTF(("dummynet: pipe %d ready from %s --\n", - p->pipe_nr, p->if_name)); - } + if (flags & DN_DELETE_FS) + flags |= DN_DESTROY; + ND("fs %d from sched %d flags %s %s %s", + fs->fs.fs_nr, fs->fs.sched_nr, + (flags & DN_DELETE_FS) ? "DEL_FS":"", + (flags & DN_DESTROY) ? "DEL":"", + (flags & DN_DETACH) ? "DET":""); + if (flags & DN_DETACH) { /* detach from the list */ + struct dn_fsk_head *h; + h = fs->sched ? &fs->sched->fsk_list : &dn_cfg.fsu; + SLIST_REMOVE(h, fs, dn_fsk, sch_chain); } - - /* - * While we have backlogged traffic AND credit, we need to do - * something on the queue. + /* Free the RED parameters, they will be recomputed on + * subsequent attach if needed. */ - while (p->numbytes >= 0 && (sch->elements > 0 || neh->elements > 0)) { - if (sch->elements > 0) { - /* Have some eligible pkts to send out. */ - struct dn_flow_queue *q = sch->p[0].object; - struct mbuf *pkt = q->head; - struct dn_flow_set *fs = q->fs; - uint64_t len = pkt->m_pkthdr.len; - int len_scaled = p->bandwidth ? len * 8 * hz : 0; - - heap_extract(sch, NULL); /* Remove queue from heap. */ - p->numbytes -= len_scaled; - move_pkt(pkt, q, p, len); - - p->V += (len << MY_M) / p->sum; /* Update V. */ - q->S = q->F; /* Update start time. */ - if (q->len == 0) { - /* Flow not backlogged any more. */ - fs->backlogged--; - heap_insert(&(p->idle_heap), q->F, q); - } else { - /* Still backlogged. */ - - /* - * Update F and position in backlogged queue, - * then put flow in not_eligible_heap - * (we will fix this later). - */ - len = (q->head)->m_pkthdr.len; - q->F += (len << MY_M) / (uint64_t)fs->weight; - if (DN_KEY_LEQ(q->S, p->V)) - heap_insert(neh, q->S, q); - else - heap_insert(sch, q->F, q); - } - } - /* - * Now compute V = max(V, min(S_i)). Remember that all elements - * in sch have by definition S_i <= V so if sch is not empty, - * V is surely the max and we must not update it. Conversely, - * if sch is empty we only need to look at neh. - */ - if (sch->elements == 0 && neh->elements > 0) - p->V = MAX64(p->V, neh->p[0].key); - /* Move from neh to sch any packets that have become eligible */ - while (neh->elements > 0 && DN_KEY_LEQ(neh->p[0].key, p->V)) { - struct dn_flow_queue *q = neh->p[0].object; - heap_extract(neh, NULL); - heap_insert(sch, q->F, q); - } - - if (p->if_name[0] != '\0') { /* Tx clock is from a real thing */ - p->numbytes = -1; /* Mark not ready for I/O. */ - break; - } - } - if (sch->elements == 0 && neh->elements == 0 && p->numbytes >= 0) { - p->idle_time = curr_time; - /* - * No traffic and no events scheduled. - * We can get rid of idle-heap. - */ - if (p->idle_heap.elements > 0) { - int i; - - for (i = 0; i < p->idle_heap.elements; i++) { - struct dn_flow_queue *q; - - q = p->idle_heap.p[i].object; - q->F = 0; - q->S = q->F + 1; - } - p->sum = 0; - p->V = 0; - p->idle_heap.elements = 0; - } - } - /* - * If we are getting clocks from dummynet (not a real interface) and - * If we are under credit, schedule the next ready event. - * Also fix the delivery time of the last packet. - */ - if (p->if_name[0]==0 && p->numbytes < 0) { /* This implies bw > 0. */ - dn_key t = 0; /* Number of ticks i have to wait. */ - - if (p->bandwidth > 0) - t = (p->bandwidth - 1 - p->numbytes) / p->bandwidth; - dn_tag_get(p->tail)->output_time += t; - p->sched_time = curr_time; - heap_insert(&wfq_ready_heap, curr_time + t, (void *)p); - /* - * XXX Should check errors on heap_insert, and drain the whole - * queue on error hoping next time we are luckier. - */ + if (fs->w_q_lookup) + free(fs->w_q_lookup, M_DUMMYNET); + fs->w_q_lookup = NULL; + qht_delete(fs, flags); + if (fs->sched && fs->sched->fp->free_fsk) + fs->sched->fp->free_fsk(fs); + fs->sched = NULL; + if (flags & DN_DELETE_FS) { + bzero(fs, sizeof(fs)); /* safety */ + free(fs, M_DUMMYNET); + dn_cfg.fsk_count--; + } else { + SLIST_INSERT_HEAD(&dn_cfg.fsu, fs, sch_chain); } - - /* - * If the delay line was empty call transmit_event() now. - * Otherwise, the scheduler will take care of it. - */ - if (p_was_empty) - transmit_event(p, head, tail); } /* - * This is called one tick, after previous run. It is used to - * schedule next run. + * Detach or destroy all flowsets in a list. + * flags specifies what to do: + * DN_DESTROY: flush all queues + * DN_DELETE_FS: DN_DESTROY + destroy flowset + * DN_DELETE_FS implies DN_DESTROY */ static void -dummynet(void * __unused unused) +fsk_detach_list(struct dn_fsk_head *h, int flags) { - - taskqueue_enqueue(dn_tq, &dn_task); + struct dn_fsk *fs; + int n = 0; /* only for stats */ + + ND("head %p flags %x", h, flags); + while ((fs = SLIST_FIRST(h))) { + SLIST_REMOVE_HEAD(h, sch_chain); + n++; + fsk_detach(fs, flags); + } + ND("done %d flowsets", n); } /* - * The main dummynet processing function. + * called on 'queue X delete' -- removes the flowset from fshash, + * deletes all queues for the flowset, and removes the flowset. */ -static void -dummynet_task(void *context, int pending) +static int +delete_fs(int i, int locked) { - struct mbuf *head = NULL, *tail = NULL; - struct dn_pipe *pipe; - struct dn_heap *heaps[3]; - struct dn_heap *h; - void *p; /* generic parameter to handler */ - int i; - - DUMMYNET_LOCK(); - - heaps[0] = &ready_heap; /* fixed-rate queues */ - heaps[1] = &wfq_ready_heap; /* wfq queues */ - heaps[2] = &extract_heap; /* delay line */ - - /* Update number of lost(coalesced) ticks. */ - tick_lost += pending - 1; - - getmicrouptime(&t); - /* Last tick duration (usec). */ - tick_last = (t.tv_sec - prev_t.tv_sec) * 1000000 + - (t.tv_usec - prev_t.tv_usec); - /* Last tick vs standard tick difference (usec). */ - tick_delta = (tick_last * hz - 1000000) / hz; - /* Accumulated tick difference (usec). */ - tick_delta_sum += tick_delta; - - prev_t = t; - - /* - * Adjust curr_time if accumulated tick difference greater than - * 'standard' tick. Since curr_time should be monotonically increasing, - * we do positive adjustment as required and throttle curr_time in - * case of negative adjustment. - */ - curr_time++; - if (tick_delta_sum - tick >= 0) { - int diff = tick_delta_sum / tick; - - curr_time += diff; - tick_diff += diff; - tick_delta_sum %= tick; - tick_adjustment++; - } else if (tick_delta_sum + tick <= 0) { - curr_time--; - tick_diff--; - tick_delta_sum += tick; - tick_adjustment++; - } - - for (i = 0; i < 3; i++) { - h = heaps[i]; - while (h->elements > 0 && DN_KEY_LEQ(h->p[0].key, curr_time)) { - if (h->p[0].key > curr_time) - printf("dummynet: warning, " - "heap %d is %d ticks late\n", - i, (int)(curr_time - h->p[0].key)); - /* store a copy before heap_extract */ - p = h->p[0].object; - /* need to extract before processing */ - heap_extract(h, NULL); - if (i == 0) - ready_event(p, &head, &tail); - else if (i == 1) { - struct dn_pipe *pipe = p; - if (pipe->if_name[0] != '\0') - printf("dummynet: bad ready_event_wfq " - "for pipe %s\n", pipe->if_name); - else - ready_event_wfq(p, &head, &tail); - } else - transmit_event(p, &head, &tail); - } - } - - /* Sweep pipes trying to expire idle flow_queues. */ - for (i = 0; i < HASHSIZE; i++) - SLIST_FOREACH(pipe, &pipehash[i], next) - if (pipe->idle_heap.elements > 0 && - DN_KEY_LT(pipe->idle_heap.p[0].key, pipe->V)) { - struct dn_flow_queue *q = - pipe->idle_heap.p[0].object; - - heap_extract(&(pipe->idle_heap), NULL); - /* Mark timestamp as invalid. */ - q->S = q->F + 1; - pipe->sum -= q->fs->weight; - } - - DUMMYNET_UNLOCK(); - - if (head != NULL) - dummynet_send(head); - - callout_reset(&dn_timeout, 1, dummynet, NULL); + struct dn_fsk *fs; + int err = 0; + + if (!locked) + DN_BH_WLOCK(); + fs = dn_ht_find(dn_cfg.fshash, i, DNHT_REMOVE, NULL); + ND("fs %d found %p", i, fs); + if (fs) { + fsk_detach(fs, DN_DETACH | DN_DELETE_FS); + err = 0; + } else + err = EINVAL; + if (!locked) + DN_BH_WUNLOCK(); + return err; } -static void -dummynet_send(struct mbuf *m) -{ - struct dn_pkt_tag *pkt; - struct mbuf *n; - struct ip *ip; - - for (; m != NULL; m = n) { - n = m->m_nextpkt; - m->m_nextpkt = NULL; - pkt = dn_tag_get(m); - switch (pkt->dn_dir) { - case DN_TO_IP_OUT: - ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL); - break ; - case DN_TO_IP_IN : - ip = mtod(m, struct ip *); - ip->ip_len = htons(ip->ip_len); - ip->ip_off = htons(ip->ip_off); - netisr_dispatch(NETISR_IP, m); - break; -#ifdef INET6 - case DN_TO_IP6_IN: - netisr_dispatch(NETISR_IPV6, m); - break; +/*----- end of flowset hashtable support -------------*/ - case DN_TO_IP6_OUT: - ip6_output(m, NULL, NULL, IPV6_FORWARDING, NULL, NULL, NULL); - break; -#endif - case DN_TO_IFB_FWD: - if (bridge_dn_p != NULL) - ((*bridge_dn_p)(m, pkt->ifp)); - else - printf("dummynet: if_bridge not loaded\n"); - - break; - case DN_TO_ETH_DEMUX: - /* - * The Ethernet code assumes the Ethernet header is - * contiguous in the first mbuf header. - * Insure this is true. - */ - if (m->m_len < ETHER_HDR_LEN && - (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) { - printf("dummynet/ether: pullup failed, " - "dropping packet\n"); - break; - } - ether_demux(m->m_pkthdr.rcvif, m); - break; - case DN_TO_ETH_OUT: - ether_output_frame(pkt->ifp, m); - break; - - case DN_TO_DROP: - /* drop the packet after some time */ - m_freem(m); - break; - - default: - printf("dummynet: bad switch %d!\n", pkt->dn_dir); - m_freem(m); - break; - } - } +/*------------------------------------------------------------ + * Scheduler hash. When searching by index we pass sched_nr, + * otherwise we pass struct dn_sch * which is the first field in + * struct dn_schk so we can cast between the two. We use this trick + * because in the create phase (but it should be fixed). + */ +static uint32_t +schk_hash(uintptr_t key, int flags, void *_arg) +{ + uint32_t i = !(flags & DNHT_KEY_IS_OBJ) ? key : + ((struct dn_schk *)key)->sch.sched_nr; + return ( (i>>8)^(i>>4)^i ); } -/* - * Unconditionally expire empty queues in case of shortage. - * Returns the number of queues freed. - */ static int -expire_queues(struct dn_flow_set *fs) -{ - struct dn_flow_queue *q, *prev ; - int i, initial_elements = fs->rq_elements ; - - if (fs->last_expired == time_uptime) - return 0 ; - fs->last_expired = time_uptime ; - for (i = 0 ; i <= fs->rq_size ; i++) /* last one is overflow */ - for (prev=NULL, q = fs->rq[i] ; q != NULL ; ) - if (!QUEUE_IS_IDLE(q)) { - prev = q ; - q = q->next ; - } else { /* entry is idle, expire it */ - struct dn_flow_queue *old_q = q ; - - if (prev != NULL) - prev->next = q = q->next ; - else - fs->rq[i] = q = q->next ; - fs->rq_elements-- ; - free(old_q, M_DUMMYNET); - } - return initial_elements - fs->rq_elements ; +schk_match(void *obj, uintptr_t key, int flags, void *_arg) +{ + struct dn_schk *s = (struct dn_schk *)obj; + int i = !(flags & DNHT_KEY_IS_OBJ) ? key : + ((struct dn_schk *)key)->sch.sched_nr; + return (s->sch.sched_nr == i); } /* - * If room, create a new queue and put at head of slot i; - * otherwise, create or use the default queue. + * Create the entry and intialize with the sched hash if needed. + * Leave s->fp unset so we can tell whether a dn_ht_find() returns + * a new object or a previously existing one. */ -static struct dn_flow_queue * -create_queue(struct dn_flow_set *fs, int i) +static void * +schk_new(uintptr_t key, int flags, void *arg) { - struct dn_flow_queue *q; - - if (fs->rq_elements > fs->rq_size * dn_max_ratio && - expire_queues(fs) == 0) { - /* No way to get room, use or create overflow queue. */ - i = fs->rq_size; - if (fs->rq[i] != NULL) - return fs->rq[i]; - } - q = malloc(sizeof(*q), M_DUMMYNET, M_NOWAIT | M_ZERO); - if (q == NULL) { - printf("dummynet: sorry, cannot allocate queue for new flow\n"); - return (NULL); + struct schk_new_arg *a = arg; + struct dn_schk *s; + int l = sizeof(*s) +a->fp->schk_datalen; + + s = malloc(l, M_DUMMYNET, M_NOWAIT | M_ZERO); + if (s == NULL) + return NULL; + set_oid(&s->link.oid, DN_LINK, sizeof(s->link)); + s->sch = *a->sch; // copy initial values + s->link.link_nr = s->sch.sched_nr; + SLIST_INIT(&s->fsk_list); + /* initialize the hash table or create the single instance */ + s->fp = a->fp; /* si_new needs this */ + s->drain_bucket = 0; + if (s->sch.flags & DN_HAVE_MASK) { + s->siht = dn_ht_init(NULL, s->sch.buckets, + offsetof(struct dn_sch_inst, si_next), + si_hash, si_match, si_new); + if (s->siht == NULL) { + free(s, M_DUMMYNET); + return NULL; + } } - q->fs = fs; - q->hash_slot = i; - q->next = fs->rq[i]; - q->S = q->F + 1; /* hack - mark timestamp as invalid. */ - q->numbytes = fs->pipe->burst + (io_fast ? fs->pipe->bandwidth : 0); - fs->rq[i] = q; - fs->rq_elements++; - return (q); + s->fp = NULL; /* mark as a new scheduler */ + dn_cfg.schk_count++; + return s; } /* - * Given a flow_set and a pkt in last_pkt, find a matching queue - * after appropriate masking. The queue is moved to front - * so that further searches take less time. + * Callback for sched delete. Notify all attached flowsets to + * detach from the scheduler, destroy the internal flowset, and + * all instances. The scheduler goes away too. + * arg is 0 (only detach flowsets and destroy instances) + * DN_DESTROY (detach & delete queues, delete schk) + * or DN_DELETE_FS (delete queues and flowsets, delete schk) */ -static struct dn_flow_queue * -find_queue(struct dn_flow_set *fs, struct ipfw_flow_id *id) -{ - int i = 0 ; /* we need i and q for new allocations */ - struct dn_flow_queue *q, *prev; - int is_v6 = IS_IP6_FLOW_ID(id); - - if ( !(fs->flags_fs & DN_HAVE_FLOW_MASK) ) - q = fs->rq[0] ; - else { - /* first, do the masking, then hash */ - id->dst_port &= fs->flow_mask.dst_port ; - id->src_port &= fs->flow_mask.src_port ; - id->proto &= fs->flow_mask.proto ; - id->flags = 0 ; /* we don't care about this one */ - if (is_v6) { - APPLY_MASK(&id->dst_ip6, &fs->flow_mask.dst_ip6); - APPLY_MASK(&id->src_ip6, &fs->flow_mask.src_ip6); - id->flow_id6 &= fs->flow_mask.flow_id6; - - i = ((id->dst_ip6.__u6_addr.__u6_addr32[0]) & 0xffff)^ - ((id->dst_ip6.__u6_addr.__u6_addr32[1]) & 0xffff)^ - ((id->dst_ip6.__u6_addr.__u6_addr32[2]) & 0xffff)^ - ((id->dst_ip6.__u6_addr.__u6_addr32[3]) & 0xffff)^ - - ((id->dst_ip6.__u6_addr.__u6_addr32[0] >> 15) & 0xffff)^ - ((id->dst_ip6.__u6_addr.__u6_addr32[1] >> 15) & 0xffff)^ - ((id->dst_ip6.__u6_addr.__u6_addr32[2] >> 15) & 0xffff)^ - ((id->dst_ip6.__u6_addr.__u6_addr32[3] >> 15) & 0xffff)^ - - ((id->src_ip6.__u6_addr.__u6_addr32[0] << 1) & 0xfffff)^ - ((id->src_ip6.__u6_addr.__u6_addr32[1] << 1) & 0xfffff)^ - ((id->src_ip6.__u6_addr.__u6_addr32[2] << 1) & 0xfffff)^ - ((id->src_ip6.__u6_addr.__u6_addr32[3] << 1) & 0xfffff)^ - - ((id->src_ip6.__u6_addr.__u6_addr32[0] << 16) & 0xffff)^ - ((id->src_ip6.__u6_addr.__u6_addr32[1] << 16) & 0xffff)^ - ((id->src_ip6.__u6_addr.__u6_addr32[2] << 16) & 0xffff)^ - ((id->src_ip6.__u6_addr.__u6_addr32[3] << 16) & 0xffff)^ - - (id->dst_port << 1) ^ (id->src_port) ^ - (id->proto ) ^ - (id->flow_id6); - } else { - id->dst_ip &= fs->flow_mask.dst_ip ; - id->src_ip &= fs->flow_mask.src_ip ; - - i = ( (id->dst_ip) & 0xffff ) ^ - ( (id->dst_ip >> 15) & 0xffff ) ^ - ( (id->src_ip << 1) & 0xffff ) ^ - ( (id->src_ip >> 16 ) & 0xffff ) ^ - (id->dst_port << 1) ^ (id->src_port) ^ - (id->proto ); - } - i = i % fs->rq_size ; - /* finally, scan the current list for a match */ - searches++ ; - for (prev=NULL, q = fs->rq[i] ; q ; ) { - search_steps++; - if (is_v6 && - IN6_ARE_ADDR_EQUAL(&id->dst_ip6,&q->id.dst_ip6) && - IN6_ARE_ADDR_EQUAL(&id->src_ip6,&q->id.src_ip6) && - id->dst_port == q->id.dst_port && - id->src_port == q->id.src_port && - id->proto == q->id.proto && - id->flags == q->id.flags && - id->flow_id6 == q->id.flow_id6) - break ; /* found */ - - if (!is_v6 && id->dst_ip == q->id.dst_ip && - id->src_ip == q->id.src_ip && - id->dst_port == q->id.dst_port && - id->src_port == q->id.src_port && - id->proto == q->id.proto && - id->flags == q->id.flags) - break ; /* found */ - - /* No match. Check if we can expire the entry */ - if (pipe_expire && QUEUE_IS_IDLE(q)) { - /* entry is idle and not in any heap, expire it */ - struct dn_flow_queue *old_q = q ; - - if (prev != NULL) - prev->next = q = q->next ; - else - fs->rq[i] = q = q->next ; - fs->rq_elements-- ; - free(old_q, M_DUMMYNET); - continue ; - } - prev = q ; - q = q->next ; - } - if (q && prev != NULL) { /* found and not in front */ - prev->next = q->next ; - q->next = fs->rq[i] ; - fs->rq[i] = q ; +static int +schk_delete_cb(void *obj, void *arg) +{ + struct dn_schk *s = obj; +#if 0 + int a = (int)arg; + ND("sched %d arg %s%s", + s->sch.sched_nr, + a&DN_DESTROY ? "DEL ":"", + a&DN_DELETE_FS ? "DEL_FS":""); +#endif + fsk_detach_list(&s->fsk_list, arg ? DN_DESTROY : 0); + /* no more flowset pointing to us now */ + if (s->sch.flags & DN_HAVE_MASK) + dn_ht_scan(s->siht, si_destroy, NULL); + else if (s->siht) + si_destroy(s->siht, NULL); + if (s->profile) { + free(s->profile, M_DUMMYNET); + s->profile = NULL; } - } - if (q == NULL) { /* no match, need to allocate a new entry */ - q = create_queue(fs, i); - if (q != NULL) - q->id = *id ; - } - return q ; + s->siht = NULL; + if (s->fp->destroy) + s->fp->destroy(s); + bzero(s, sizeof(*s)); // safety + free(obj, M_DUMMYNET); + dn_cfg.schk_count--; + return DNHT_SCAN_DEL; } +/* + * called on a 'sched X delete' command. Deletes a single scheduler. + * This is done by removing from the schedhash, unlinking all + * flowsets and deleting their traffic. + */ static int -red_drops(struct dn_flow_set *fs, struct dn_flow_queue *q, int len) +delete_schk(int i) { - /* - * RED algorithm - * - * RED calculates the average queue size (avg) using a low-pass filter - * with an exponential weighted (w_q) moving average: - * avg <- (1-w_q) * avg + w_q * q_size - * where q_size is the queue length (measured in bytes or * packets). - * - * If q_size == 0, we compute the idle time for the link, and set - * avg = (1 - w_q)^(idle/s) - * where s is the time needed for transmitting a medium-sized packet. - * - * Now, if avg < min_th the packet is enqueued. - * If avg > max_th the packet is dropped. Otherwise, the packet is - * dropped with probability P function of avg. - */ - - int64_t p_b = 0; - - /* Queue in bytes or packets? */ - u_int q_size = (fs->flags_fs & DN_QSIZE_IS_BYTES) ? - q->len_bytes : q->len; - - DPRINTF(("\ndummynet: %d q: %2u ", (int)curr_time, q_size)); - - /* Average queue size estimation. */ - if (q_size != 0) { - /* Queue is not empty, avg <- avg + (q_size - avg) * w_q */ - int diff = SCALE(q_size) - q->avg; - int64_t v = SCALE_MUL((int64_t)diff, (int64_t)fs->w_q); + struct dn_schk *s; + + s = dn_ht_find(dn_cfg.schedhash, i, DNHT_REMOVE, NULL); + ND("%d %p", i, s); + if (!s) + return EINVAL; + delete_fs(i + DN_MAX_ID, 1); /* first delete internal fs */ + /* then detach flowsets, delete traffic */ + schk_delete_cb(s, (void*)(uintptr_t)DN_DESTROY); + return 0; +} +/*--- end of schk hashtable support ---*/ - q->avg += (int)v; - } else { - /* - * Queue is empty, find for how long the queue has been - * empty and use a lookup table for computing - * (1 - * w_q)^(idle_time/s) where s is the time to send a - * (small) packet. - * XXX check wraps... - */ - if (q->avg) { - u_int t = (curr_time - q->idle_time) / fs->lookup_step; +static int +copy_obj(char **start, char *end, void *_o, const char *msg, int i) +{ + struct dn_id *o = _o; + int have = end - *start; - q->avg = (t < fs->lookup_depth) ? - SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0; - } - } - DPRINTF(("dummynet: avg: %u ", SCALE_VAL(q->avg))); - - /* Should i drop? */ - if (q->avg < fs->min_th) { - q->count = -1; - return (0); /* accept packet */ - } - if (q->avg >= fs->max_th) { /* average queue >= max threshold */ - if (fs->flags_fs & DN_IS_GENTLE_RED) { - /* - * According to Gentle-RED, if avg is greater than - * max_th the packet is dropped with a probability - * p_b = c_3 * avg - c_4 - * where c_3 = (1 - max_p) / max_th - * c_4 = 1 - 2 * max_p - */ - p_b = SCALE_MUL((int64_t)fs->c_3, (int64_t)q->avg) - - fs->c_4; - } else { - q->count = -1; - DPRINTF(("dummynet: - drop")); - return (1); - } - } else if (q->avg > fs->min_th) { - /* - * We compute p_b using the linear dropping function - * p_b = c_1 * avg - c_2 - * where c_1 = max_p / (max_th - min_th) - * c_2 = max_p * min_th / (max_th - min_th) - */ - p_b = SCALE_MUL((int64_t)fs->c_1, (int64_t)q->avg) - fs->c_2; + if (have < o->len || o->len == 0 || o->type == 0) { + D("(WARN) type %d %s %d have %d need %d", + o->type, msg, i, have, o->len); + return 1; } - - if (fs->flags_fs & DN_QSIZE_IS_BYTES) - p_b = (p_b * len) / fs->max_pkt_size; - if (++q->count == 0) - q->random = random() & 0xffff; - else { - /* - * q->count counts packets arrived since last drop, so a greater - * value of q->count means a greater packet drop probability. - */ - if (SCALE_MUL(p_b, SCALE((int64_t)q->count)) > q->random) { - q->count = 0; - DPRINTF(("dummynet: - red drop")); - /* After a drop we calculate a new random value. */ - q->random = random() & 0xffff; - return (1); /* drop */ - } + ND("type %d %s %d len %d", o->type, msg, i, o->len); + bcopy(_o, *start, o->len); + if (o->type == DN_LINK) { + /* Adjust burst parameter for link */ + struct dn_link *l = (struct dn_link *)*start; + l->burst = div64(l->burst, 8 * hz); + } else if (o->type == DN_SCH) { + /* Set id->id to the number of instances */ + struct dn_schk *s = _o; + struct dn_id *id = (struct dn_id *)(*start); + id->id = (s->sch.flags & DN_HAVE_MASK) ? + dn_ht_entries(s->siht) : (s->siht ? 1 : 0); } - /* End of RED algorithm. */ - - return (0); /* accept */ + *start += o->len; + return 0; } -static __inline struct dn_flow_set * -locate_flowset(int fs_nr) +/* Specific function to copy a queue. + * Copies only the user-visible part of a queue (which is in + * a struct dn_flow), and sets len accordingly. + */ +static int +copy_obj_q(char **start, char *end, void *_o, const char *msg, int i) { - struct dn_flow_set *fs; - - SLIST_FOREACH(fs, &flowsethash[HASH(fs_nr)], next) - if (fs->fs_nr == fs_nr) - return (fs); - - return (NULL); + struct dn_id *o = _o; + int have = end - *start; + int len = sizeof(struct dn_flow); /* see above comment */ + + if (have < len || o->len == 0 || o->type != DN_QUEUE) { + D("ERROR type %d %s %d have %d need %d", + o->type, msg, i, have, len); + return 1; + } + ND("type %d %s %d len %d", o->type, msg, i, len); + bcopy(_o, *start, len); + ((struct dn_id*)(*start))->len = len; + *start += len; + return 0; } -static __inline struct dn_pipe * -locate_pipe(int pipe_nr) +static int +copy_q_cb(void *obj, void *arg) { - struct dn_pipe *pipe; - - SLIST_FOREACH(pipe, &pipehash[HASH(pipe_nr)], next) - if (pipe->pipe_nr == pipe_nr) - return (pipe); - - return (NULL); + struct dn_queue *q = obj; + struct copy_args *a = arg; + struct dn_flow *ni = (struct dn_flow *)(*a->start); + if (copy_obj_q(a->start, a->end, &q->ni, "queue", -1)) + return DNHT_SCAN_END; + ni->oid.type = DN_FLOW; /* override the DN_QUEUE */ + ni->oid.id = si_hash((uintptr_t)&ni->fid, 0, NULL); + return 0; } -/* - * dummynet hook for packets. Below 'pipe' is a pipe or a queue - * depending on whether WF2Q or fixed bw is used. - * - * pipe_nr pipe or queue the packet is destined for. - * dir where shall we send the packet after dummynet. - * m the mbuf with the packet - * ifp the 'ifp' parameter from the caller. - * NULL in ip_input, destination interface in ip_output, - * rule matching rule, in case of multiple passes - */ static int -dummynet_io(struct mbuf **m0, int dir, struct ip_fw_args *fwa) -{ - struct mbuf *m = *m0, *head = NULL, *tail = NULL; - struct dn_pkt_tag *pkt; - struct m_tag *mtag; - struct dn_flow_set *fs = NULL; - struct dn_pipe *pipe; - uint64_t len = m->m_pkthdr.len; - struct dn_flow_queue *q = NULL; - int is_pipe; - ipfw_insn *cmd = ACTION_PTR(fwa->rule); - - KASSERT(m->m_nextpkt == NULL, - ("dummynet_io: mbuf queue passed to dummynet")); - - if (cmd->opcode == O_LOG) - cmd += F_LEN(cmd); - if (cmd->opcode == O_ALTQ) - cmd += F_LEN(cmd); - if (cmd->opcode == O_TAG) - cmd += F_LEN(cmd); - is_pipe = (cmd->opcode == O_PIPE); - - DUMMYNET_LOCK(); - io_pkt++; - /* - * This is a dummynet rule, so we expect an O_PIPE or O_QUEUE rule. - * - * XXXGL: probably the pipe->fs and fs->pipe logic here - * below can be simplified. - */ - if (is_pipe) { - pipe = locate_pipe(fwa->cookie); - if (pipe != NULL) - fs = &(pipe->fs); - } else - fs = locate_flowset(fwa->cookie); - - if (fs == NULL) - goto dropit; /* This queue/pipe does not exist! */ - pipe = fs->pipe; - if (pipe == NULL) { /* Must be a queue, try find a matching pipe. */ - pipe = locate_pipe(fs->parent_nr); - if (pipe != NULL) - fs->pipe = pipe; - else { - printf("dummynet: no pipe %d for queue %d, drop pkt\n", - fs->parent_nr, fs->fs_nr); - goto dropit; - } - } - q = find_queue(fs, &(fwa->f_id)); - if (q == NULL) - goto dropit; /* Cannot allocate queue. */ - - /* Update statistics, then check reasons to drop pkt. */ - q->tot_bytes += len; - q->tot_pkts++; - if (fs->plr && random() < fs->plr) - goto dropit; /* Random pkt drop. */ - if (fs->flags_fs & DN_QSIZE_IS_BYTES) { - if (q->len_bytes > fs->qsize) - goto dropit; /* Queue size overflow. */ - } else { - if (q->len >= fs->qsize) - goto dropit; /* Queue count overflow. */ - } - if (fs->flags_fs & DN_IS_RED && red_drops(fs, q, len)) - goto dropit; - - /* XXX expensive to zero, see if we can remove it. */ - mtag = m_tag_get(PACKET_TAG_DUMMYNET, - sizeof(struct dn_pkt_tag), M_NOWAIT | M_ZERO); - if (mtag == NULL) - goto dropit; /* Cannot allocate packet header. */ - m_tag_prepend(m, mtag); /* Attach to mbuf chain. */ - - pkt = (struct dn_pkt_tag *)(mtag + 1); - /* - * Ok, i can handle the pkt now... - * Build and enqueue packet + parameters. - */ - pkt->rule = fwa->rule; - pkt->rule_id = fwa->rule_id; - pkt->chain_id = fwa->chain_id; - pkt->dn_dir = dir; - - pkt->ifp = fwa->oif; - - if (q->head == NULL) - q->head = m; +copy_q(struct copy_args *a, struct dn_fsk *fs, int flags) +{ + if (!fs->qht) + return 0; + if (fs->fs.flags & DN_QHT_HASH) + dn_ht_scan(fs->qht, copy_q_cb, a); else - q->tail->m_nextpkt = m; - q->tail = m; - q->len++; - q->len_bytes += len; - - if (q->head != m) /* Flow was not idle, we are done. */ - goto done; - - if (is_pipe) { /* Fixed rate queues. */ - if (q->idle_time < curr_time) { - /* Calculate available burst size. */ - q->numbytes += - (curr_time - q->idle_time - 1) * pipe->bandwidth; - if (q->numbytes > pipe->burst) - q->numbytes = pipe->burst; - if (io_fast) - q->numbytes += pipe->bandwidth; - } - } else { /* WF2Q. */ - if (pipe->idle_time < curr_time) { - /* Calculate available burst size. */ - pipe->numbytes += - (curr_time - pipe->idle_time - 1) * pipe->bandwidth; - if (pipe->numbytes > 0 && pipe->numbytes > pipe->burst) - pipe->numbytes = pipe->burst; - if (io_fast) - pipe->numbytes += pipe->bandwidth; - } - pipe->idle_time = curr_time; - } - /* Necessary for both: fixed rate & WF2Q queues. */ - q->idle_time = curr_time; - - /* - * If we reach this point the flow was previously idle, so we need - * to schedule it. This involves different actions for fixed-rate or - * WF2Q queues. - */ - if (is_pipe) { - /* Fixed-rate queue: just insert into the ready_heap. */ - dn_key t = 0; - - if (pipe->bandwidth) { - q->extra_bits = compute_extra_bits(m, pipe); - t = set_ticks(m, q, pipe); - } - q->sched_time = curr_time; - if (t == 0) /* Must process it now. */ - ready_event(q, &head, &tail); - else - heap_insert(&ready_heap, curr_time + t , q); - } else { - /* - * WF2Q. First, compute start time S: if the flow was - * idle (S = F + 1) set S to the virtual time V for the - * controlling pipe, and update the sum of weights for the pipe; - * otherwise, remove flow from idle_heap and set S to max(F,V). - * Second, compute finish time F = S + len / weight. - * Third, if pipe was idle, update V = max(S, V). - * Fourth, count one more backlogged flow. - */ - if (DN_KEY_GT(q->S, q->F)) { /* Means timestamps are invalid. */ - q->S = pipe->V; - pipe->sum += fs->weight; /* Add weight of new queue. */ - } else { - heap_extract(&(pipe->idle_heap), q); - q->S = MAX64(q->F, pipe->V); - } - q->F = q->S + (len << MY_M) / (uint64_t)fs->weight; - - if (pipe->not_eligible_heap.elements == 0 && - pipe->scheduler_heap.elements == 0) - pipe->V = MAX64(q->S, pipe->V); - fs->backlogged++; - /* - * Look at eligibility. A flow is not eligibile if S>V (when - * this happens, it means that there is some other flow already - * scheduled for the same pipe, so the scheduler_heap cannot be - * empty). If the flow is not eligible we just store it in the - * not_eligible_heap. Otherwise, we store in the scheduler_heap - * and possibly invoke ready_event_wfq() right now if there is - * leftover credit. - * Note that for all flows in scheduler_heap (SCH), S_i <= V, - * and for all flows in not_eligible_heap (NEH), S_i > V. - * So when we need to compute max(V, min(S_i)) forall i in - * SCH+NEH, we only need to look into NEH. - */ - if (DN_KEY_GT(q->S, pipe->V)) { /* Not eligible. */ - if (pipe->scheduler_heap.elements == 0) - printf("dummynet: ++ ouch! not eligible but empty scheduler!\n"); - heap_insert(&(pipe->not_eligible_heap), q->S, q); - } else { - heap_insert(&(pipe->scheduler_heap), q->F, q); - if (pipe->numbytes >= 0) { /* Pipe is idle. */ - if (pipe->scheduler_heap.elements != 1) - printf("dummynet: OUCH! pipe should have been idle!\n"); - DPRINTF(("dummynet: waking up pipe %d at %d\n", - pipe->pipe_nr, (int)(q->F >> MY_M))); - pipe->sched_time = curr_time; - ready_event_wfq(pipe, &head, &tail); - } - } - } -done: - if (head == m && dir != DN_TO_IFB_FWD && dir != DN_TO_ETH_DEMUX && - dir != DN_TO_ETH_OUT) { /* Fast io. */ - io_pkt_fast++; - if (m->m_nextpkt != NULL) - printf("dummynet: fast io: pkt chain detected!\n"); - head = m->m_nextpkt = NULL; - } else - *m0 = NULL; /* Normal io. */ - - DUMMYNET_UNLOCK(); - if (head != NULL) - dummynet_send(head); - return (0); - -dropit: - io_pkt_drop++; - if (q) - q->drops++; - DUMMYNET_UNLOCK(); - m_freem(m); - *m0 = NULL; - return ((fs && (fs->flags_fs & DN_NOERROR)) ? 0 : ENOBUFS); + copy_q_cb(fs->qht, a); + return 0; } /* - * Below, the rt_unref is only needed when (pkt->dn_dir == DN_TO_IP_OUT) - * Doing this would probably save us the initial bzero of dn_pkt + * This routine only copies the initial part of a profile ? XXX */ -#define DN_FREE_PKT(_m) do { \ - m_freem(_m); \ -} while (0) - -/* - * Dispose all packets and flow_queues on a flow_set. - * If all=1, also remove red lookup table and other storage, - * including the descriptor itself. - * For the one in dn_pipe MUST also cleanup ready_heap... - */ -static void -purge_flow_set(struct dn_flow_set *fs, int all) +static int +copy_profile(struct copy_args *a, struct dn_profile *p) { - struct dn_flow_queue *q, *qn; - int i; - - DUMMYNET_LOCK_ASSERT(); - - for (i = 0; i <= fs->rq_size; i++) { - for (q = fs->rq[i]; q != NULL; q = qn) { - struct mbuf *m, *mnext; + int have = a->end - *a->start; + /* XXX here we check for max length */ + int profile_len = sizeof(struct dn_profile) - + ED_MAX_SAMPLES_NO*sizeof(int); - mnext = q->head; - while ((m = mnext) != NULL) { - mnext = m->m_nextpkt; - DN_FREE_PKT(m); - } - qn = q->next; - free(q, M_DUMMYNET); - } - fs->rq[i] = NULL; + if (p == NULL) + return 0; + if (have < profile_len) { + D("error have %d need %d", have, profile_len); + return 1; } + bcopy(p, *a->start, profile_len); + ((struct dn_id *)(*a->start))->len = profile_len; + *a->start += profile_len; + return 0; +} - fs->rq_elements = 0; - if (all) { - /* RED - free lookup table. */ - if (fs->w_q_lookup != NULL) - free(fs->w_q_lookup, M_DUMMYNET); - if (fs->rq != NULL) - free(fs->rq, M_DUMMYNET); - /* If this fs is not part of a pipe, free it. */ - if (fs->pipe == NULL || fs != &(fs->pipe->fs)) - free(fs, M_DUMMYNET); +static int +copy_flowset(struct copy_args *a, struct dn_fsk *fs, int flags) +{ + struct dn_fs *ufs = (struct dn_fs *)(*a->start); + if (!fs) + return 0; + ND("flowset %d", fs->fs.fs_nr); + if (copy_obj(a->start, a->end, &fs->fs, "flowset", fs->fs.fs_nr)) + return DNHT_SCAN_END; + ufs->oid.id = (fs->fs.flags & DN_QHT_HASH) ? + dn_ht_entries(fs->qht) : (fs->qht ? 1 : 0); + if (flags) { /* copy queues */ + copy_q(a, fs, 0); } + return 0; } -/* - * Dispose all packets queued on a pipe (not a flow_set). - * Also free all resources associated to a pipe, which is about - * to be deleted. - */ -static void -purge_pipe(struct dn_pipe *pipe) +static int +copy_si_cb(void *obj, void *arg) { - struct mbuf *m, *mnext; - - purge_flow_set( &(pipe->fs), 1 ); - - mnext = pipe->head; - while ((m = mnext) != NULL) { - mnext = m->m_nextpkt; - DN_FREE_PKT(m); - } + struct dn_sch_inst *si = obj; + struct copy_args *a = arg; + struct dn_flow *ni = (struct dn_flow *)(*a->start); + if (copy_obj(a->start, a->end, &si->ni, "inst", + si->sched->sch.sched_nr)) + return DNHT_SCAN_END; + ni->oid.type = DN_FLOW; /* override the DN_SCH_I */ + ni->oid.id = si_hash((uintptr_t)si, DNHT_KEY_IS_OBJ, NULL); + return 0; +} - heap_free( &(pipe->scheduler_heap) ); - heap_free( &(pipe->not_eligible_heap) ); - heap_free( &(pipe->idle_heap) ); +static int +copy_si(struct copy_args *a, struct dn_schk *s, int flags) +{ + if (s->sch.flags & DN_HAVE_MASK) + dn_ht_scan(s->siht, copy_si_cb, a); + else if (s->siht) + copy_si_cb(s->siht, a); + return 0; } /* - * Delete all pipes and heaps returning memory. Must also - * remove references from all ipfw rules to all pipes. + * compute a list of children of a scheduler and copy up */ -static void -dummynet_flush(void) +static int +copy_fsk_list(struct copy_args *a, struct dn_schk *s, int flags) { - struct dn_pipe *pipe, *pipe1; - struct dn_flow_set *fs, *fs1; - int i; - - DUMMYNET_LOCK(); - /* Free heaps so we don't have unwanted events. */ - heap_free(&ready_heap); - heap_free(&wfq_ready_heap); - heap_free(&extract_heap); + struct dn_fsk *fs; + struct dn_id *o; + uint32_t *p; + + int n = 0, space = sizeof(*o); + SLIST_FOREACH(fs, &s->fsk_list, sch_chain) { + if (fs->fs.fs_nr < DN_MAX_ID) + n++; + } + space += n * sizeof(uint32_t); + DX(3, "sched %d has %d flowsets", s->sch.sched_nr, n); + if (a->end - *(a->start) < space) + return DNHT_SCAN_END; + o = (struct dn_id *)(*(a->start)); + o->len = space; + *a->start += o->len; + o->type = DN_TEXT; + p = (uint32_t *)(o+1); + SLIST_FOREACH(fs, &s->fsk_list, sch_chain) + if (fs->fs.fs_nr < DN_MAX_ID) + *p++ = fs->fs.fs_nr; + return 0; +} - /* - * Now purge all queued pkts and delete all pipes. - * - * XXXGL: can we merge the for(;;) cycles into one or not? - */ - for (i = 0; i < HASHSIZE; i++) - SLIST_FOREACH_SAFE(fs, &flowsethash[i], next, fs1) { - SLIST_REMOVE(&flowsethash[i], fs, dn_flow_set, next); - purge_flow_set(fs, 1); +static int +copy_data_helper(void *_o, void *_arg) +{ + struct copy_args *a = _arg; + uint32_t *r = a->extra->r; /* start of first range */ + uint32_t *lim; /* first invalid pointer */ + int n; + + lim = (uint32_t *)((char *)(a->extra) + a->extra->o.len); + + if (a->type == DN_LINK || a->type == DN_SCH) { + /* pipe|sched show, we receive a dn_schk */ + struct dn_schk *s = _o; + + n = s->sch.sched_nr; + if (a->type == DN_SCH && n >= DN_MAX_ID) + return 0; /* not a scheduler */ + if (a->type == DN_LINK && n <= DN_MAX_ID) + return 0; /* not a pipe */ + + /* see if the object is within one of our ranges */ + for (;r < lim; r += 2) { + if (n < r[0] || n > r[1]) + continue; + /* Found a valid entry, copy and we are done */ + if (a->flags & DN_C_LINK) { + if (copy_obj(a->start, a->end, + &s->link, "link", n)) + return DNHT_SCAN_END; + if (copy_profile(a, s->profile)) + return DNHT_SCAN_END; + if (copy_flowset(a, s->fs, 0)) + return DNHT_SCAN_END; + } + if (a->flags & DN_C_SCH) { + if (copy_obj(a->start, a->end, + &s->sch, "sched", n)) + return DNHT_SCAN_END; + /* list all attached flowsets */ + if (copy_fsk_list(a, s, 0)) + return DNHT_SCAN_END; + } + if (a->flags & DN_C_FLOW) + copy_si(a, s, 0); + break; } - for (i = 0; i < HASHSIZE; i++) - SLIST_FOREACH_SAFE(pipe, &pipehash[i], next, pipe1) { - SLIST_REMOVE(&pipehash[i], pipe, dn_pipe, next); - purge_pipe(pipe); - free_pipe(pipe); + } else if (a->type == DN_FS) { + /* queue show, skip internal flowsets */ + struct dn_fsk *fs = _o; + + n = fs->fs.fs_nr; + if (n >= DN_MAX_ID) + return 0; + /* see if the object is within one of our ranges */ + for (;r < lim; r += 2) { + if (n < r[0] || n > r[1]) + continue; + if (copy_flowset(a, fs, 0)) + return DNHT_SCAN_END; + copy_q(a, fs, 0); + break; /* we are done */ } - DUMMYNET_UNLOCK(); + } + return 0; +} + +static inline struct dn_schk * +locate_scheduler(int i) +{ + return dn_ht_find(dn_cfg.schedhash, i, 0, NULL); } /* - * setup RED parameters + * red parameters are in fixed point arithmetic. */ static int -config_red(struct dn_flow_set *p, struct dn_flow_set *x) +config_red(struct dn_fsk *fs) { - int i; - - x->w_q = p->w_q; - x->min_th = SCALE(p->min_th); - x->max_th = SCALE(p->max_th); - x->max_p = p->max_p; - - x->c_1 = p->max_p / (p->max_th - p->min_th); - x->c_2 = SCALE_MUL(x->c_1, SCALE(p->min_th)); - - if (x->flags_fs & DN_IS_GENTLE_RED) { - x->c_3 = (SCALE(1) - p->max_p) / p->max_th; - x->c_4 = SCALE(1) - 2 * p->max_p; + int64_t s, idle, weight, w0; + int t, i; + + fs->w_q = fs->fs.w_q; + fs->max_p = fs->fs.max_p; + D("called"); + /* Doing stuff that was in userland */ + i = fs->sched->link.bandwidth; + s = (i <= 0) ? 0 : + hz * dn_cfg.red_avg_pkt_size * 8 * SCALE(1) / i; + + idle = div64((s * 3) , fs->w_q); /* s, fs->w_q scaled; idle not scaled */ + fs->lookup_step = div64(idle , dn_cfg.red_lookup_depth); + /* fs->lookup_step not scaled, */ + if (!fs->lookup_step) + fs->lookup_step = 1; + w0 = weight = SCALE(1) - fs->w_q; //fs->w_q scaled + + for (t = fs->lookup_step; t > 1; --t) + weight = SCALE_MUL(weight, w0); + fs->lookup_weight = (int)(weight); // scaled + + /* Now doing stuff that was in kerneland */ + fs->min_th = SCALE(fs->fs.min_th); + fs->max_th = SCALE(fs->fs.max_th); + + fs->c_1 = fs->max_p / (fs->fs.max_th - fs->fs.min_th); + fs->c_2 = SCALE_MUL(fs->c_1, SCALE(fs->fs.min_th)); + + if (fs->fs.flags & DN_IS_GENTLE_RED) { + fs->c_3 = (SCALE(1) - fs->max_p) / fs->fs.max_th; + fs->c_4 = SCALE(1) - 2 * fs->max_p; } /* If the lookup table already exist, free and create it again. */ - if (x->w_q_lookup) { - free(x->w_q_lookup, M_DUMMYNET); - x->w_q_lookup = NULL; + if (fs->w_q_lookup) { + free(fs->w_q_lookup, M_DUMMYNET); + fs->w_q_lookup = NULL; } - if (red_lookup_depth == 0) { + if (dn_cfg.red_lookup_depth == 0) { printf("\ndummynet: net.inet.ip.dummynet.red_lookup_depth" "must be > 0\n"); - free(x, M_DUMMYNET); + fs->fs.flags &= ~DN_IS_RED; + fs->fs.flags &= ~DN_IS_GENTLE_RED; return (EINVAL); } - x->lookup_depth = red_lookup_depth; - x->w_q_lookup = (u_int *)malloc(x->lookup_depth * sizeof(int), + fs->lookup_depth = dn_cfg.red_lookup_depth; + fs->w_q_lookup = (u_int *)malloc(fs->lookup_depth * sizeof(int), M_DUMMYNET, M_NOWAIT); - if (x->w_q_lookup == NULL) { + if (fs->w_q_lookup == NULL) { printf("dummynet: sorry, cannot allocate red lookup table\n"); - free(x, M_DUMMYNET); + fs->fs.flags &= ~DN_IS_RED; + fs->fs.flags &= ~DN_IS_GENTLE_RED; return(ENOSPC); } /* Fill the lookup table with (1 - w_q)^x */ - x->lookup_step = p->lookup_step; - x->lookup_weight = p->lookup_weight; - x->w_q_lookup[0] = SCALE(1) - x->w_q; - - for (i = 1; i < x->lookup_depth; i++) - x->w_q_lookup[i] = - SCALE_MUL(x->w_q_lookup[i - 1], x->lookup_weight); + fs->w_q_lookup[0] = SCALE(1) - fs->w_q; + + for (i = 1; i < fs->lookup_depth; i++) + fs->w_q_lookup[i] = + SCALE_MUL(fs->w_q_lookup[i - 1], fs->lookup_weight); + + if (dn_cfg.red_avg_pkt_size < 1) + dn_cfg.red_avg_pkt_size = 512; + fs->avg_pkt_size = dn_cfg.red_avg_pkt_size; + if (dn_cfg.red_max_pkt_size < 1) + dn_cfg.red_max_pkt_size = 1500; + fs->max_pkt_size = dn_cfg.red_max_pkt_size; + D("exit"); + return 0; +} - if (red_avg_pkt_size < 1) - red_avg_pkt_size = 512; - x->avg_pkt_size = red_avg_pkt_size; - if (red_max_pkt_size < 1) - red_max_pkt_size = 1500; - x->max_pkt_size = red_max_pkt_size; - return (0); +/* Scan all flowset attached to this scheduler and update red */ +static void +update_red(struct dn_schk *s) +{ + struct dn_fsk *fs; + SLIST_FOREACH(fs, &s->fsk_list, sch_chain) { + if (fs && (fs->fs.flags & DN_IS_RED)) + config_red(fs); + } } -static int -alloc_hash(struct dn_flow_set *x, struct dn_flow_set *pfs) -{ - if (x->flags_fs & DN_HAVE_FLOW_MASK) { /* allocate some slots */ - int l = pfs->rq_size; - - if (l == 0) - l = dn_hash_size; - if (l < 4) - l = 4; - else if (l > DN_MAX_HASH_SIZE) - l = DN_MAX_HASH_SIZE; - x->rq_size = l; - } else /* one is enough for null mask */ - x->rq_size = 1; - x->rq = malloc((1 + x->rq_size) * sizeof(struct dn_flow_queue *), - M_DUMMYNET, M_NOWAIT | M_ZERO); - if (x->rq == NULL) { - printf("dummynet: sorry, cannot allocate queue\n"); - return (ENOMEM); - } - x->rq_elements = 0; - return 0 ; +/* attach flowset to scheduler s, possibly requeue */ +static void +fsk_attach(struct dn_fsk *fs, struct dn_schk *s) +{ + ND("remove fs %d from fsunlinked, link to sched %d", + fs->fs.fs_nr, s->sch.sched_nr); + SLIST_REMOVE(&dn_cfg.fsu, fs, dn_fsk, sch_chain); + fs->sched = s; + SLIST_INSERT_HEAD(&s->fsk_list, fs, sch_chain); + if (s->fp->new_fsk) + s->fp->new_fsk(fs); + /* XXX compute fsk_mask */ + fs->fsk_mask = fs->fs.flow_mask; + if (fs->sched->sch.flags & DN_HAVE_MASK) + flow_id_or(&fs->sched->sch.sched_mask, &fs->fsk_mask); + if (fs->qht) { + /* + * we must drain qht according to the old + * type, and reinsert according to the new one. + * The requeue is complex -- in general we need to + * reclassify every single packet. + * For the time being, let's hope qht is never set + * when we reach this point. + */ + D("XXX TODO requeue from fs %d to sch %d", + fs->fs.fs_nr, s->sch.sched_nr); + fs->qht = NULL; + } + /* set the new type for qht */ + if (nonzero_mask(&fs->fsk_mask)) + fs->fs.flags |= DN_QHT_HASH; + else + fs->fs.flags &= ~DN_QHT_HASH; + + /* XXX config_red() can fail... */ + if (fs->fs.flags & DN_IS_RED) + config_red(fs); } +/* update all flowsets which may refer to this scheduler */ static void -set_fs_parms(struct dn_flow_set *x, struct dn_flow_set *src) -{ - x->flags_fs = src->flags_fs; - x->qsize = src->qsize; - x->plr = src->plr; - x->flow_mask = src->flow_mask; - if (x->flags_fs & DN_QSIZE_IS_BYTES) { - if (x->qsize > pipe_byte_limit) - x->qsize = 1024 * 1024; - } else { - if (x->qsize == 0) - x->qsize = 50; - if (x->qsize > pipe_slot_limit) - x->qsize = 50; +update_fs(struct dn_schk *s) +{ + struct dn_fsk *fs, *tmp; + + SLIST_FOREACH_SAFE(fs, &dn_cfg.fsu, sch_chain, tmp) { + if (s->sch.sched_nr != fs->fs.sched_nr) { + D("fs %d for sch %d not %d still unlinked", + fs->fs.fs_nr, fs->fs.sched_nr, + s->sch.sched_nr); + continue; + } + fsk_attach(fs, s); } - /* Configuring RED. */ - if (x->flags_fs & DN_IS_RED) - config_red(src, x); /* XXX should check errors */ } /* - * Setup pipe or queue parameters. + * Configuration -- to preserve backward compatibility we use + * the following scheme (N is 65536) + * NUMBER SCHED LINK FLOWSET + * 1 .. N-1 (1)WFQ (2)WFQ (3)queue + * N+1 .. 2N-1 (4)FIFO (5)FIFO (6)FIFO for sched 1..N-1 + * 2N+1 .. 3N-1 -- -- (7)FIFO for sched N+1..2N-1 + * + * "pipe i config" configures #1, #2 and #3 + * "sched i config" configures #1 and possibly #6 + * "queue i config" configures #3 + * #1 is configured with 'pipe i config' or 'sched i config' + * #2 is configured with 'pipe i config', and created if not + * existing with 'sched i config' + * #3 is configured with 'queue i config' + * #4 is automatically configured after #1, can only be FIFO + * #5 is automatically configured after #2 + * #6 is automatically created when #1 is !MULTIQUEUE, + * and can be updated. + * #7 is automatically configured after #2 + */ + +/* + * configure a link (and its FIFO instance) */ static int -config_pipe(struct dn_pipe *p) +config_link(struct dn_link *p, struct dn_id *arg) { - struct dn_flow_set *pfs = &(p->fs); - struct dn_flow_queue *q; - int i, error; + int i; + if (p->oid.len != sizeof(*p)) { + D("invalid pipe len %d", p->oid.len); + return EINVAL; + } + i = p->link_nr; + if (i <= 0 || i >= DN_MAX_ID) + return EINVAL; /* * The config program passes parameters as follows: * bw = bits/second (0 means no limits), * delay = ms, must be translated into ticks. * qsize = slots/bytes + * burst ??? */ p->delay = (p->delay * hz) / 1000; /* Scale burst size: bytes -> bits * hz */ p->burst *= 8 * hz; - /* We need either a pipe number or a flow_set number. */ - if (p->pipe_nr == 0 && pfs->fs_nr == 0) - return (EINVAL); - if (p->pipe_nr != 0 && pfs->fs_nr != 0) - return (EINVAL); - if (p->pipe_nr != 0) { /* this is a pipe */ - struct dn_pipe *pipe; - - DUMMYNET_LOCK(); - pipe = locate_pipe(p->pipe_nr); /* locate pipe */ - - if (pipe == NULL) { /* new pipe */ - pipe = malloc(sizeof(struct dn_pipe), M_DUMMYNET, - M_NOWAIT | M_ZERO); - if (pipe == NULL) { - DUMMYNET_UNLOCK(); - printf("dummynet: no memory for new pipe\n"); - return (ENOMEM); - } - pipe->pipe_nr = p->pipe_nr; - pipe->fs.pipe = pipe; - /* - * idle_heap is the only one from which - * we extract from the middle. - */ - pipe->idle_heap.size = pipe->idle_heap.elements = 0; - pipe->idle_heap.offset = - offsetof(struct dn_flow_queue, heap_pos); - } else - /* Flush accumulated credit for all queues. */ - for (i = 0; i <= pipe->fs.rq_size; i++) - for (q = pipe->fs.rq[i]; q; q = q->next) { - q->numbytes = p->burst + - (io_fast ? p->bandwidth : 0); - } - - pipe->bandwidth = p->bandwidth; - pipe->burst = p->burst; - pipe->numbytes = pipe->burst + (io_fast ? pipe->bandwidth : 0); - bcopy(p->if_name, pipe->if_name, sizeof(p->if_name)); - pipe->ifp = NULL; /* reset interface ptr */ - pipe->delay = p->delay; - set_fs_parms(&(pipe->fs), pfs); - - /* Handle changes in the delay profile. */ - if (p->samples_no > 0) { - if (pipe->samples_no != p->samples_no) { - if (pipe->samples != NULL) - free(pipe->samples, M_DUMMYNET); - pipe->samples = - malloc(p->samples_no*sizeof(dn_key), - M_DUMMYNET, M_NOWAIT | M_ZERO); - if (pipe->samples == NULL) { - DUMMYNET_UNLOCK(); - printf("dummynet: no memory " - "for new samples\n"); - return (ENOMEM); - } - pipe->samples_no = p->samples_no; - } - strncpy(pipe->name,p->name,sizeof(pipe->name)); - pipe->loss_level = p->loss_level; - for (i = 0; i<pipe->samples_no; ++i) - pipe->samples[i] = p->samples[i]; - } else if (pipe->samples != NULL) { - free(pipe->samples, M_DUMMYNET); - pipe->samples = NULL; - pipe->samples_no = 0; - } + DN_BH_WLOCK(); + /* do it twice, base link and FIFO link */ + for (; i < 2*DN_MAX_ID; i += DN_MAX_ID) { + struct dn_schk *s = locate_scheduler(i); + if (s == NULL) { + DN_BH_WUNLOCK(); + D("sched %d not found", i); + return EINVAL; + } + /* remove profile if exists */ + if (s->profile) { + free(s->profile, M_DUMMYNET); + s->profile = NULL; + } + /* copy all parameters */ + s->link.oid = p->oid; + s->link.link_nr = i; + s->link.delay = p->delay; + if (s->link.bandwidth != p->bandwidth) { + /* XXX bandwidth changes, need to update red params */ + s->link.bandwidth = p->bandwidth; + update_red(s); + } + s->link.burst = p->burst; + schk_reset_credit(s); + } + dn_cfg.id++; + DN_BH_WUNLOCK(); + return 0; +} - if (pipe->fs.rq == NULL) { /* a new pipe */ - error = alloc_hash(&(pipe->fs), pfs); - if (error) { - DUMMYNET_UNLOCK(); - free_pipe(pipe); - return (error); - } - SLIST_INSERT_HEAD(&pipehash[HASH(pipe->pipe_nr)], - pipe, next); - } - DUMMYNET_UNLOCK(); - } else { /* config queue */ - struct dn_flow_set *fs; +/* + * configure a flowset. Can be called from inside with locked=1, + */ +static struct dn_fsk * +config_fs(struct dn_fs *nfs, struct dn_id *arg, int locked) +{ + int i; + struct dn_fsk *fs; - DUMMYNET_LOCK(); - fs = locate_flowset(pfs->fs_nr); /* locate flow_set */ + if (nfs->oid.len != sizeof(*nfs)) { + D("invalid flowset len %d", nfs->oid.len); + return NULL; + } + i = nfs->fs_nr; + if (i <= 0 || i >= 3*DN_MAX_ID) + return NULL; + ND("flowset %d", i); + /* XXX other sanity checks */ + if (nfs->flags & DN_QSIZE_BYTES) { + ipdn_bound_var(&nfs->qsize, 16384, + 1500, dn_cfg.byte_limit, NULL); // "queue byte size"); + } else { + ipdn_bound_var(&nfs->qsize, 50, + 1, dn_cfg.slot_limit, NULL); // "queue slot size"); + } + if (nfs->flags & DN_HAVE_MASK) { + /* make sure we have some buckets */ + ipdn_bound_var(&nfs->buckets, dn_cfg.hash_size, + 1, dn_cfg.max_hash_size, "flowset buckets"); + } else { + nfs->buckets = 1; /* we only need 1 */ + } + if (!locked) + DN_BH_WLOCK(); + do { /* exit with break when done */ + struct dn_schk *s; + int flags = nfs->sched_nr ? DNHT_INSERT : 0; + int j; + int oldc = dn_cfg.fsk_count; + fs = dn_ht_find(dn_cfg.fshash, i, flags, NULL); + if (fs == NULL) { + D("missing sched for flowset %d", i); + break; + } + /* grab some defaults from the existing one */ + if (nfs->sched_nr == 0) /* reuse */ + nfs->sched_nr = fs->fs.sched_nr; + for (j = 0; j < sizeof(nfs->par)/sizeof(nfs->par[0]); j++) { + if (nfs->par[j] == -1) /* reuse */ + nfs->par[j] = fs->fs.par[j]; + } + if (bcmp(&fs->fs, nfs, sizeof(*nfs)) == 0) { + ND("flowset %d unchanged", i); + break; /* no change, nothing to do */ + } + if (oldc != dn_cfg.fsk_count) /* new item */ + dn_cfg.id++; + s = locate_scheduler(nfs->sched_nr); + /* detach from old scheduler if needed, preserving + * queues if we need to reattach. Then update the + * configuration, and possibly attach to the new sched. + */ + DX(2, "fs %d changed sched %d@%p to %d@%p", + fs->fs.fs_nr, + fs->fs.sched_nr, fs->sched, nfs->sched_nr, s); + if (fs->sched) { + int flags = s ? DN_DETACH : (DN_DETACH | DN_DESTROY); + flags |= DN_DESTROY; /* XXX temporary */ + fsk_detach(fs, flags); + } + fs->fs = *nfs; /* copy configuration */ + if (s != NULL) + fsk_attach(fs, s); + } while (0); + if (!locked) + DN_BH_WUNLOCK(); + return fs; +} - if (fs == NULL) { /* new */ - if (pfs->parent_nr == 0) { /* need link to a pipe */ - DUMMYNET_UNLOCK(); - return (EINVAL); +/* + * config/reconfig a scheduler and its FIFO variant. + * For !MULTIQUEUE schedulers, also set up the flowset. + * + * On reconfigurations (detected because s->fp is set), + * detach existing flowsets preserving traffic, preserve link, + * and delete the old scheduler creating a new one. + */ +static int +config_sched(struct dn_sch *_nsch, struct dn_id *arg) +{ + struct dn_schk *s; + struct schk_new_arg a; /* argument for schk_new */ + int i; + struct dn_link p; /* copy of oldlink */ + struct dn_profile *pf = NULL; /* copy of old link profile */ + /* Used to preserv mask parameter */ + struct ipfw_flow_id new_mask; + int new_buckets = 0; + int new_flags = 0; + int pipe_cmd; + int err = ENOMEM; + + a.sch = _nsch; + if (a.sch->oid.len != sizeof(*a.sch)) { + D("bad sched len %d", a.sch->oid.len); + return EINVAL; + } + i = a.sch->sched_nr; + if (i <= 0 || i >= DN_MAX_ID) + return EINVAL; + /* make sure we have some buckets */ + if (a.sch->flags & DN_HAVE_MASK) + ipdn_bound_var(&a.sch->buckets, dn_cfg.hash_size, + 1, dn_cfg.max_hash_size, "sched buckets"); + /* XXX other sanity checks */ + bzero(&p, sizeof(p)); + + pipe_cmd = a.sch->flags & DN_PIPE_CMD; + a.sch->flags &= ~DN_PIPE_CMD; //XXX do it even if is not set? + if (pipe_cmd) { + /* Copy mask parameter */ + new_mask = a.sch->sched_mask; + new_buckets = a.sch->buckets; + new_flags = a.sch->flags; + } + DN_BH_WLOCK(); +again: /* run twice, for wfq and fifo */ + /* + * lookup the type. If not supplied, use the previous one + * or default to WF2Q+. Otherwise, return an error. + */ + dn_cfg.id++; + a.fp = find_sched_type(a.sch->oid.subtype, a.sch->name); + if (a.fp != NULL) { + /* found. Lookup or create entry */ + s = dn_ht_find(dn_cfg.schedhash, i, DNHT_INSERT, &a); + } else if (a.sch->oid.subtype == 0 && !a.sch->name[0]) { + /* No type. search existing s* or retry with WF2Q+ */ + s = dn_ht_find(dn_cfg.schedhash, i, 0, &a); + if (s != NULL) { + a.fp = s->fp; + /* Scheduler exists, skip to FIFO scheduler + * if command was pipe config... + */ + if (pipe_cmd) + goto next; + } else { + /* New scheduler, create a wf2q+ with no mask + * if command was pipe config... + */ + if (pipe_cmd) { + /* clear mask parameter */ + bzero(&a.sch->sched_mask, sizeof(new_mask)); + a.sch->buckets = 0; + a.sch->flags &= ~DN_HAVE_MASK; } - fs = malloc(sizeof(struct dn_flow_set), M_DUMMYNET, - M_NOWAIT | M_ZERO); - if (fs == NULL) { - DUMMYNET_UNLOCK(); - printf( - "dummynet: no memory for new flow_set\n"); - return (ENOMEM); + a.sch->oid.subtype = DN_SCHED_WF2QP; + goto again; + } + } else { + D("invalid scheduler type %d %s", + a.sch->oid.subtype, a.sch->name); + err = EINVAL; + goto error; + } + /* normalize name and subtype */ + a.sch->oid.subtype = a.fp->type; + bzero(a.sch->name, sizeof(a.sch->name)); + strlcpy(a.sch->name, a.fp->name, sizeof(a.sch->name)); + if (s == NULL) { + D("cannot allocate scheduler %d", i); + goto error; + } + /* restore existing link if any */ + if (p.link_nr) { + s->link = p; + if (!pf || pf->link_nr != p.link_nr) { /* no saved value */ + s->profile = NULL; /* XXX maybe not needed */ + } else { + s->profile = malloc(sizeof(struct dn_profile), + M_DUMMYNET, M_NOWAIT | M_ZERO); + if (s->profile == NULL) { + D("cannot allocate profile"); + goto error; //XXX } - fs->fs_nr = pfs->fs_nr; - fs->parent_nr = pfs->parent_nr; - fs->weight = pfs->weight; - if (fs->weight == 0) - fs->weight = 1; - else if (fs->weight > 100) - fs->weight = 100; + bcopy(pf, s->profile, sizeof(*pf)); + } + } + p.link_nr = 0; + if (s->fp == NULL) { + DX(2, "sched %d new type %s", i, a.fp->name); + } else if (s->fp != a.fp || + bcmp(a.sch, &s->sch, sizeof(*a.sch)) ) { + /* already existing. */ + DX(2, "sched %d type changed from %s to %s", + i, s->fp->name, a.fp->name); + DX(4, " type/sub %d/%d -> %d/%d", + s->sch.oid.type, s->sch.oid.subtype, + a.sch->oid.type, a.sch->oid.subtype); + if (s->link.link_nr == 0) + D("XXX WARNING link 0 for sched %d", i); + p = s->link; /* preserve link */ + if (s->profile) {/* preserve profile */ + if (!pf) + pf = malloc(sizeof(*pf), + M_DUMMYNET, M_NOWAIT | M_ZERO); + if (pf) /* XXX should issue a warning otherwise */ + bcopy(s->profile, pf, sizeof(*pf)); + } + /* remove from the hash */ + dn_ht_find(dn_cfg.schedhash, i, DNHT_REMOVE, NULL); + /* Detach flowsets, preserve queues. */ + // schk_delete_cb(s, NULL); + // XXX temporarily, kill queues + schk_delete_cb(s, (void *)DN_DESTROY); + goto again; + } else { + DX(4, "sched %d unchanged type %s", i, a.fp->name); + } + /* complete initialization */ + s->sch = *a.sch; + s->fp = a.fp; + s->cfg = arg; + // XXX schk_reset_credit(s); + /* create the internal flowset if needed, + * trying to reuse existing ones if available + */ + if (!(s->fp->flags & DN_MULTIQUEUE) && !s->fs) { + s->fs = dn_ht_find(dn_cfg.fshash, i, 0, NULL); + if (!s->fs) { + struct dn_fs fs; + bzero(&fs, sizeof(fs)); + set_oid(&fs.oid, DN_FS, sizeof(fs)); + fs.fs_nr = i + DN_MAX_ID; + fs.sched_nr = i; + s->fs = config_fs(&fs, NULL, 1 /* locked */); + } + if (!s->fs) { + schk_delete_cb(s, (void *)DN_DESTROY); + D("error creating internal fs for %d", i); + goto error; + } + } + /* call init function after the flowset is created */ + if (s->fp->config) + s->fp->config(s); + update_fs(s); +next: + if (i < DN_MAX_ID) { /* now configure the FIFO instance */ + i += DN_MAX_ID; + if (pipe_cmd) { + /* Restore mask parameter for FIFO */ + a.sch->sched_mask = new_mask; + a.sch->buckets = new_buckets; + a.sch->flags = new_flags; } else { - /* - * Change parent pipe not allowed; - * must delete and recreate. - */ - if (pfs->parent_nr != 0 && - fs->parent_nr != pfs->parent_nr) { - DUMMYNET_UNLOCK(); - return (EINVAL); + /* sched config shouldn't modify the FIFO scheduler */ + if (dn_ht_find(dn_cfg.schedhash, i, 0, &a) != NULL) { + /* FIFO already exist, don't touch it */ + err = 0; /* and this is not an error */ + goto error; } } + a.sch->sched_nr = i; + a.sch->oid.subtype = DN_SCHED_FIFO; + bzero(a.sch->name, sizeof(a.sch->name)); + goto again; + } + err = 0; +error: + DN_BH_WUNLOCK(); + if (pf) + free(pf, M_DUMMYNET); + return err; +} - set_fs_parms(fs, pfs); +/* + * attach a profile to a link + */ +static int +config_profile(struct dn_profile *pf, struct dn_id *arg) +{ + struct dn_schk *s; + int i, olen, err = 0; - if (fs->rq == NULL) { /* a new flow_set */ - error = alloc_hash(fs, pfs); - if (error) { - DUMMYNET_UNLOCK(); - free(fs, M_DUMMYNET); - return (error); - } - SLIST_INSERT_HEAD(&flowsethash[HASH(fs->fs_nr)], - fs, next); + if (pf->oid.len < sizeof(*pf)) { + D("short profile len %d", pf->oid.len); + return EINVAL; + } + i = pf->link_nr; + if (i <= 0 || i >= DN_MAX_ID) + return EINVAL; + /* XXX other sanity checks */ + DN_BH_WLOCK(); + for (; i < 2*DN_MAX_ID; i += DN_MAX_ID) { + s = locate_scheduler(i); + + if (s == NULL) { + err = EINVAL; + break; + } + dn_cfg.id++; + /* + * If we had a profile and the new one does not fit, + * or it is deleted, then we need to free memory. + */ + if (s->profile && (pf->samples_no == 0 || + s->profile->oid.len < pf->oid.len)) { + free(s->profile, M_DUMMYNET); + s->profile = NULL; + } + if (pf->samples_no == 0) + continue; + /* + * new profile, possibly allocate memory + * and copy data. + */ + if (s->profile == NULL) + s->profile = malloc(pf->oid.len, + M_DUMMYNET, M_NOWAIT | M_ZERO); + if (s->profile == NULL) { + D("no memory for profile %d", i); + err = ENOMEM; + break; } - DUMMYNET_UNLOCK(); + /* preserve larger length XXX double check */ + olen = s->profile->oid.len; + if (olen < pf->oid.len) + olen = pf->oid.len; + bcopy(pf, s->profile, pf->oid.len); + s->profile->oid.len = olen; } - return (0); + DN_BH_WUNLOCK(); + return err; } /* - * Helper function to remove from a heap queues which are linked to - * a flow_set about to be deleted. + * Delete all objects: */ static void -fs_remove_from_heap(struct dn_heap *h, struct dn_flow_set *fs) -{ - int i = 0, found = 0 ; - for (; i < h->elements ;) - if ( ((struct dn_flow_queue *)h->p[i].object)->fs == fs) { - h->elements-- ; - h->p[i] = h->p[h->elements] ; - found++ ; - } else - i++ ; - if (found) - heapify(h); +dummynet_flush(void) +{ + + /* delete all schedulers and related links/queues/flowsets */ + dn_ht_scan(dn_cfg.schedhash, schk_delete_cb, + (void *)(uintptr_t)DN_DELETE_FS); + /* delete all remaining (unlinked) flowsets */ + DX(4, "still %d unlinked fs", dn_cfg.fsk_count); + dn_ht_free(dn_cfg.fshash, DNHT_REMOVE); + fsk_detach_list(&dn_cfg.fsu, DN_DELETE_FS); + /* Reinitialize system heap... */ + heap_init(&dn_cfg.evheap, 16, offsetof(struct dn_id, id)); } /* - * helper function to remove a pipe from a heap (can be there at most once) + * Main handler for configuration. We are guaranteed to be called + * with an oid which is at least a dn_id. + * - the first object is the command (config, delete, flush, ...) + * - config_link must be issued after the corresponding config_sched + * - parameters (DN_TXT) for an object must preceed the object + * processed on a config_sched. */ -static void -pipe_remove_from_heap(struct dn_heap *h, struct dn_pipe *p) -{ - if (h->elements > 0) { - int i = 0 ; - for (i=0; i < h->elements ; i++ ) { - if (h->p[i].object == p) { /* found it */ - h->elements-- ; - h->p[i] = h->p[h->elements] ; - heapify(h); - break ; - } +int +do_config(void *p, int l) +{ + struct dn_id *next, *o; + int err = 0, err2 = 0; + struct dn_id *arg = NULL; + uintptr_t *a; + + o = p; + if (o->id != DN_API_VERSION) { + D("invalid api version got %d need %d", + o->id, DN_API_VERSION); + return EINVAL; } - } + for (; l >= sizeof(*o); o = next) { + struct dn_id *prev = arg; + if (o->len < sizeof(*o) || l < o->len) { + D("bad len o->len %d len %d", o->len, l); + err = EINVAL; + break; + } + l -= o->len; + next = (struct dn_id *)((char *)o + o->len); + err = 0; + switch (o->type) { + default: + D("cmd %d not implemented", o->type); + break; +#ifdef EMULATE_SYSCTL + /* sysctl emulation. + * if we recognize the command, jump to the correct + * handler and return + */ + case DN_SYSCTL_SET: + err = kesysctl_emu_set(p, l); + return err; +#endif + case DN_CMD_CONFIG: /* simply a header */ + break; + + case DN_CMD_DELETE: + /* the argument is in the first uintptr_t after o */ + a = (uintptr_t *)(o+1); + if (o->len < sizeof(*o) + sizeof(*a)) { + err = EINVAL; + break; + } + switch (o->subtype) { + case DN_LINK: + /* delete base and derived schedulers */ + DN_BH_WLOCK(); + err = delete_schk(*a); + err2 = delete_schk(*a + DN_MAX_ID); + DN_BH_WUNLOCK(); + if (!err) + err = err2; + break; + + default: + D("invalid delete type %d", + o->subtype); + err = EINVAL; + break; + + case DN_FS: + err = (*a <1 || *a >= DN_MAX_ID) ? + EINVAL : delete_fs(*a, 0) ; + break; + } + break; + + case DN_CMD_FLUSH: + DN_BH_WLOCK(); + dummynet_flush(); + DN_BH_WUNLOCK(); + break; + case DN_TEXT: /* store argument the next block */ + prev = NULL; + arg = o; + break; + case DN_LINK: + err = config_link((struct dn_link *)o, arg); + break; + case DN_PROFILE: + err = config_profile((struct dn_profile *)o, arg); + break; + case DN_SCH: + err = config_sched((struct dn_sch *)o, arg); + break; + case DN_FS: + err = (NULL==config_fs((struct dn_fs *)o, arg, 0)); + break; + } + if (prev) + arg = NULL; + if (err != 0) + break; + } + return err; +} + +static int +compute_space(struct dn_id *cmd, struct copy_args *a) +{ + int x = 0, need = 0; + int profile_size = sizeof(struct dn_profile) - + ED_MAX_SAMPLES_NO*sizeof(int); + + /* NOTE about compute space: + * NP = dn_cfg.schk_count + * NSI = dn_cfg.si_count + * NF = dn_cfg.fsk_count + * NQ = dn_cfg.queue_count + * - ipfw pipe show + * (NP/2)*(dn_link + dn_sch + dn_id + dn_fs) only half scheduler + * link, scheduler template, flowset + * integrated in scheduler and header + * for flowset list + * (NSI)*(dn_flow) all scheduler instance (includes + * the queue instance) + * - ipfw sched show + * (NP/2)*(dn_link + dn_sch + dn_id + dn_fs) only half scheduler + * link, scheduler template, flowset + * integrated in scheduler and header + * for flowset list + * (NSI * dn_flow) all scheduler instances + * (NF * sizeof(uint_32)) space for flowset list linked to scheduler + * (NQ * dn_queue) all queue [XXXfor now not listed] + * - ipfw queue show + * (NF * dn_fs) all flowset + * (NQ * dn_queue) all queues + */ + switch (cmd->subtype) { + default: + return -1; + /* XXX where do LINK and SCH differ ? */ + /* 'ipfw sched show' could list all queues associated to + * a scheduler. This feature for now is disabled + */ + case DN_LINK: /* pipe show */ + x = DN_C_LINK | DN_C_SCH | DN_C_FLOW; + need += dn_cfg.schk_count * + (sizeof(struct dn_fs) + profile_size) / 2; + need += dn_cfg.fsk_count * sizeof(uint32_t); + break; + case DN_SCH: /* sched show */ + need += dn_cfg.schk_count * + (sizeof(struct dn_fs) + profile_size) / 2; + need += dn_cfg.fsk_count * sizeof(uint32_t); + x = DN_C_SCH | DN_C_LINK | DN_C_FLOW; + break; + case DN_FS: /* queue show */ + x = DN_C_FS | DN_C_QUEUE; + break; + case DN_GET_COMPAT: /* compatibility mode */ + need = dn_compat_calc_size(dn_cfg); + break; + } + a->flags = x; + if (x & DN_C_SCH) { + need += dn_cfg.schk_count * sizeof(struct dn_sch) / 2; + /* NOT also, each fs might be attached to a sched */ + need += dn_cfg.schk_count * sizeof(struct dn_id) / 2; + } + if (x & DN_C_FS) + need += dn_cfg.fsk_count * sizeof(struct dn_fs); + if (x & DN_C_LINK) { + need += dn_cfg.schk_count * sizeof(struct dn_link) / 2; + } + /* + * When exporting a queue to userland, only pass up the + * struct dn_flow, which is the only visible part. + */ + + if (x & DN_C_QUEUE) + need += dn_cfg.queue_count * sizeof(struct dn_flow); + if (x & DN_C_FLOW) + need += dn_cfg.si_count * (sizeof(struct dn_flow)); + return need; } /* - * drain all queues. Called in case of severe mbuf shortage. + * If compat != NULL dummynet_get is called in compatibility mode. + * *compat will be the pointer to the buffer to pass to ipfw */ -void -dummynet_drain(void) -{ - struct dn_flow_set *fs; - struct dn_pipe *pipe; - struct mbuf *m, *mnext; - int i; - - DUMMYNET_LOCK_ASSERT(); - - heap_free(&ready_heap); - heap_free(&wfq_ready_heap); - heap_free(&extract_heap); - /* remove all references to this pipe from flow_sets */ - for (i = 0; i < HASHSIZE; i++) - SLIST_FOREACH(fs, &flowsethash[i], next) - purge_flow_set(fs, 0); - - for (i = 0; i < HASHSIZE; i++) { - SLIST_FOREACH(pipe, &pipehash[i], next) { - purge_flow_set(&(pipe->fs), 0); - - mnext = pipe->head; - while ((m = mnext) != NULL) { - mnext = m->m_nextpkt; - DN_FREE_PKT(m); +int +dummynet_get(struct sockopt *sopt, void **compat) +{ + int have, i, need, error; + char *start = NULL, *buf; + size_t sopt_valsize; + struct dn_id *cmd; + struct copy_args a; + struct copy_range r; + int l = sizeof(struct dn_id); + + bzero(&a, sizeof(a)); + bzero(&r, sizeof(r)); + + /* save and restore original sopt_valsize around copyin */ + sopt_valsize = sopt->sopt_valsize; + + cmd = &r.o; + + if (!compat) { + /* copy at least an oid, and possibly a full object */ + error = sooptcopyin(sopt, cmd, sizeof(r), sizeof(*cmd)); + sopt->sopt_valsize = sopt_valsize; + if (error) + goto done; + l = cmd->len; +#ifdef EMULATE_SYSCTL + /* sysctl emulation. */ + if (cmd->type == DN_SYSCTL_GET) + return kesysctl_emu_get(sopt); +#endif + if (l > sizeof(r)) { + /* request larger than default, allocate buffer */ + cmd = malloc(l, M_DUMMYNET, M_WAIT); + if (cmd == NULL) + return ENOMEM; //XXX + error = sooptcopyin(sopt, cmd, l, l); + sopt->sopt_valsize = sopt_valsize; + if (error) + goto done; } - pipe->head = pipe->tail = NULL; + } else { /* compatibility */ + error = 0; + cmd->type = DN_CMD_GET; + cmd->len = sizeof(struct dn_id); + cmd->subtype = DN_GET_COMPAT; + // cmd->id = sopt_valsize; + D("compatibility mode"); } - } + a.extra = (struct copy_range *)cmd; + if (cmd->len == sizeof(*cmd)) { /* no range, create a default */ + uint32_t *rp = (uint32_t *)(cmd + 1); + cmd->len += 2* sizeof(uint32_t); + rp[0] = 1; + rp[1] = DN_MAX_ID - 1; + if (cmd->subtype == DN_LINK) { + rp[0] += DN_MAX_ID; + rp[1] += DN_MAX_ID; + } + } + /* Count space (under lock) and allocate (outside lock). + * Exit with lock held if we manage to get enough buffer. + * Try a few times then give up. + */ + for (have = 0, i = 0; i < 10; i++) { + DN_BH_WLOCK(); + need = compute_space(cmd, &a); + + /* if there is a range, ignore value from compute_space() */ + if (l > sizeof(*cmd)) + need = sopt_valsize - sizeof(*cmd); + + if (need < 0) { + DN_BH_WUNLOCK(); + error = EINVAL; + goto done; + } + need += sizeof(*cmd); + cmd->id = need; + if (have >= need) + break; + + DN_BH_WUNLOCK(); + if (start) + free(start, M_DUMMYNET); + start = NULL; + if (need > sopt_valsize) + break; + + have = need; + start = malloc(have, M_DUMMYNET, M_WAITOK | M_ZERO); + if (start == NULL) { + error = ENOMEM; + goto done; + } + } + + if (start == NULL) { + if (compat) { + *compat = NULL; + error = 1; // XXX + } else { + error = sooptcopyout(sopt, cmd, sizeof(*cmd)); + } + goto done; + } + ND("have %d:%d sched %d, %d:%d links %d, %d:%d flowsets %d, " + "%d:%d si %d, %d:%d queues %d", + dn_cfg.schk_count, sizeof(struct dn_sch), DN_SCH, + dn_cfg.schk_count, sizeof(struct dn_link), DN_LINK, + dn_cfg.fsk_count, sizeof(struct dn_fs), DN_FS, + dn_cfg.si_count, sizeof(struct dn_flow), DN_SCH_I, + dn_cfg.queue_count, sizeof(struct dn_queue), DN_QUEUE); + sopt->sopt_valsize = sopt_valsize; + a.type = cmd->subtype; + + if (compat == NULL) { + bcopy(cmd, start, sizeof(*cmd)); + ((struct dn_id*)(start))->len = sizeof(struct dn_id); + buf = start + sizeof(*cmd); + } else + buf = start; + a.start = &buf; + a.end = start + have; + /* start copying other objects */ + if (compat) { + a.type = DN_COMPAT_PIPE; + dn_ht_scan(dn_cfg.schedhash, copy_data_helper_compat, &a); + a.type = DN_COMPAT_QUEUE; + dn_ht_scan(dn_cfg.fshash, copy_data_helper_compat, &a); + } else if (a.type == DN_FS) { + dn_ht_scan(dn_cfg.fshash, copy_data_helper, &a); + } else { + dn_ht_scan(dn_cfg.schedhash, copy_data_helper, &a); + } + DN_BH_WUNLOCK(); + + if (compat) { + *compat = start; + sopt->sopt_valsize = buf - start; + /* free() is done by ip_dummynet_compat() */ + start = NULL; //XXX hack + } else { + error = sooptcopyout(sopt, start, buf - start); + } +done: + if (cmd && cmd != &r.o) + free(cmd, M_DUMMYNET); + if (start) + free(start, M_DUMMYNET); + return error; } -/* - * Fully delete a pipe or a queue, cleaning up associated info. - */ +/* Callback called on scheduler instance to delete it if idle */ static int -delete_pipe(struct dn_pipe *p) +drain_scheduler_cb(void *_si, void *arg) { + struct dn_sch_inst *si = _si; - if (p->pipe_nr == 0 && p->fs.fs_nr == 0) - return EINVAL ; - if (p->pipe_nr != 0 && p->fs.fs_nr != 0) - return EINVAL ; - if (p->pipe_nr != 0) { /* this is an old-style pipe */ - struct dn_pipe *pipe; - struct dn_flow_set *fs; - int i; - - DUMMYNET_LOCK(); - pipe = locate_pipe(p->pipe_nr); /* locate pipe */ + if ((si->kflags & DN_ACTIVE) || si->dline.mq.head != NULL) + return 0; - if (pipe == NULL) { - DUMMYNET_UNLOCK(); - return (ENOENT); /* not found */ + if (si->sched->fp->flags & DN_MULTIQUEUE) { + if (si->q_count == 0) + return si_destroy(si, NULL); + else + return 0; + } else { /* !DN_MULTIQUEUE */ + if ((si+1)->ni.length == 0) + return si_destroy(si, NULL); + else + return 0; } + return 0; /* unreachable */ +} - /* Unlink from list of pipes. */ - SLIST_REMOVE(&pipehash[HASH(pipe->pipe_nr)], pipe, dn_pipe, next); +/* Callback called on scheduler to check if it has instances */ +static int +drain_scheduler_sch_cb(void *_s, void *arg) +{ + struct dn_schk *s = _s; - /* Remove all references to this pipe from flow_sets. */ - for (i = 0; i < HASHSIZE; i++) - SLIST_FOREACH(fs, &flowsethash[i], next) - if (fs->pipe == pipe) { - printf("dummynet: ++ ref to pipe %d from fs %d\n", - p->pipe_nr, fs->fs_nr); - fs->pipe = NULL ; - purge_flow_set(fs, 0); + if (s->sch.flags & DN_HAVE_MASK) { + dn_ht_scan_bucket(s->siht, &s->drain_bucket, + drain_scheduler_cb, NULL); + s->drain_bucket++; + } else { + if (s->siht) { + if (drain_scheduler_cb(s->siht, NULL) == DNHT_SCAN_DEL) + s->siht = NULL; } - fs_remove_from_heap(&ready_heap, &(pipe->fs)); - purge_pipe(pipe); /* remove all data associated to this pipe */ - /* remove reference to here from extract_heap and wfq_ready_heap */ - pipe_remove_from_heap(&extract_heap, pipe); - pipe_remove_from_heap(&wfq_ready_heap, pipe); - DUMMYNET_UNLOCK(); - - free_pipe(pipe); - } else { /* this is a WF2Q queue (dn_flow_set) */ - struct dn_flow_set *fs; - - DUMMYNET_LOCK(); - fs = locate_flowset(p->fs.fs_nr); /* locate set */ - - if (fs == NULL) { - DUMMYNET_UNLOCK(); - return (ENOENT); /* not found */ - } - - /* Unlink from list of flowsets. */ - SLIST_REMOVE( &flowsethash[HASH(fs->fs_nr)], fs, dn_flow_set, next); - - if (fs->pipe != NULL) { - /* Update total weight on parent pipe and cleanup parent heaps. */ - fs->pipe->sum -= fs->weight * fs->backlogged ; - fs_remove_from_heap(&(fs->pipe->not_eligible_heap), fs); - fs_remove_from_heap(&(fs->pipe->scheduler_heap), fs); -#if 1 /* XXX should i remove from idle_heap as well ? */ - fs_remove_from_heap(&(fs->pipe->idle_heap), fs); -#endif } - purge_flow_set(fs, 1); - DUMMYNET_UNLOCK(); - } - return 0 ; + return 0; } -/* - * helper function used to copy data from kernel in DUMMYNET_GET - */ -static char * -dn_copy_set(struct dn_flow_set *set, char *bp) -{ - int i, copied = 0 ; - struct dn_flow_queue *q, *qp = (struct dn_flow_queue *)bp; - - DUMMYNET_LOCK_ASSERT(); - - for (i = 0 ; i <= set->rq_size ; i++) - for (q = set->rq[i] ; q ; q = q->next, qp++ ) { - if (q->hash_slot != i) - printf("dummynet: ++ at %d: wrong slot (have %d, " - "should be %d)\n", copied, q->hash_slot, i); - if (q->fs != set) - printf("dummynet: ++ at %d: wrong fs ptr (have %p, should be %p)\n", - i, q->fs, set); - copied++ ; - bcopy(q, qp, sizeof( *q ) ); - /* cleanup pointers */ - qp->next = NULL ; - qp->head = qp->tail = NULL ; - qp->fs = NULL ; - } - if (copied != set->rq_elements) - printf("dummynet: ++ wrong count, have %d should be %d\n", - copied, set->rq_elements); - return (char *)qp ; -} - -static size_t -dn_calc_size(void) -{ - struct dn_flow_set *fs; - struct dn_pipe *pipe; - size_t size = 0; - int i; - - DUMMYNET_LOCK_ASSERT(); - /* - * Compute size of data structures: list of pipes and flow_sets. - */ - for (i = 0; i < HASHSIZE; i++) { - SLIST_FOREACH(pipe, &pipehash[i], next) - size += sizeof(*pipe) + - pipe->fs.rq_elements * sizeof(struct dn_flow_queue); - SLIST_FOREACH(fs, &flowsethash[i], next) - size += sizeof (*fs) + - fs->rq_elements * sizeof(struct dn_flow_queue); - } - return size; +/* Called every tick, try to delete a 'bucket' of scheduler */ +void +dn_drain_scheduler(void) +{ + dn_ht_scan_bucket(dn_cfg.schedhash, &dn_cfg.drain_sch, + drain_scheduler_sch_cb, NULL); + dn_cfg.drain_sch++; } +/* Callback called on queue to delete if it is idle */ static int -dummynet_get(struct sockopt *sopt) -{ - char *buf, *bp ; /* bp is the "copy-pointer" */ - size_t size ; - struct dn_flow_set *fs; - struct dn_pipe *pipe; - int error=0, i ; - - /* XXX lock held too long */ - DUMMYNET_LOCK(); - /* - * XXX: Ugly, but we need to allocate memory with M_WAITOK flag and we - * cannot use this flag while holding a mutex. - */ - for (i = 0; i < 10; i++) { - size = dn_calc_size(); - DUMMYNET_UNLOCK(); - buf = malloc(size, M_TEMP, M_WAITOK); - DUMMYNET_LOCK(); - if (size == dn_calc_size()) - break; - free(buf, M_TEMP); - buf = NULL; - } - if (buf == NULL) { - DUMMYNET_UNLOCK(); - return ENOBUFS ; - } - bp = buf; - for (i = 0; i < HASHSIZE; i++) - SLIST_FOREACH(pipe, &pipehash[i], next) { - struct dn_pipe *pipe_bp = (struct dn_pipe *)bp; - - /* - * Copy pipe descriptor into *bp, convert delay back to ms, - * then copy the flow_set descriptor(s) one at a time. - * After each flow_set, copy the queue descriptor it owns. - */ - bcopy(pipe, bp, sizeof(*pipe)); - pipe_bp->delay = (pipe_bp->delay * 1000) / hz; - pipe_bp->burst /= 8 * hz; - /* - * XXX the following is a hack based on ->next being the - * first field in dn_pipe and dn_flow_set. The correct - * solution would be to move the dn_flow_set to the beginning - * of struct dn_pipe. - */ - pipe_bp->next.sle_next = (struct dn_pipe *)DN_IS_PIPE; - /* Clean pointers. */ - pipe_bp->head = pipe_bp->tail = NULL; - pipe_bp->fs.next.sle_next = NULL; - pipe_bp->fs.pipe = NULL; - pipe_bp->fs.rq = NULL; - pipe_bp->samples = NULL; +drain_queue_cb(void *_q, void *arg) +{ + struct dn_queue *q = _q; - bp += sizeof(*pipe) ; - bp = dn_copy_set(&(pipe->fs), bp); + if (q->ni.length == 0) { + dn_delete_queue(q, DN_DESTROY); + return DNHT_SCAN_DEL; /* queue is deleted */ } - for (i = 0; i < HASHSIZE; i++) - SLIST_FOREACH(fs, &flowsethash[i], next) { - struct dn_flow_set *fs_bp = (struct dn_flow_set *)bp; + return 0; /* queue isn't deleted */ +} - bcopy(fs, bp, sizeof(*fs)); - /* XXX same hack as above */ - fs_bp->next.sle_next = (struct dn_flow_set *)DN_IS_QUEUE; - fs_bp->pipe = NULL; - fs_bp->rq = NULL; - bp += sizeof(*fs); - bp = dn_copy_set(fs, bp); - } +/* Callback called on flowset used to check if it has queues */ +static int +drain_queue_fs_cb(void *_fs, void *arg) +{ + struct dn_fsk *fs = _fs; - DUMMYNET_UNLOCK(); + if (fs->fs.flags & DN_QHT_HASH) { + /* Flowset has a hash table for queues */ + dn_ht_scan_bucket(fs->qht, &fs->drain_bucket, + drain_queue_cb, NULL); + fs->drain_bucket++; + } else { + /* No hash table for this flowset, null the pointer + * if the queue is deleted + */ + if (fs->qht) { + if (drain_queue_cb(fs->qht, NULL) == DNHT_SCAN_DEL) + fs->qht = NULL; + } + } + return 0; +} - error = sooptcopyout(sopt, buf, size); - free(buf, M_TEMP); - return error ; +/* Called every tick, try to delete a 'bucket' of queue */ +void +dn_drain_queue(void) +{ + /* scan a bucket of flowset */ + dn_ht_scan_bucket(dn_cfg.fshash, &dn_cfg.drain_fs, + drain_queue_fs_cb, NULL); + dn_cfg.drain_fs++; } /* - * Handler for the various dummynet socket options (get, flush, config, del) + * Handler for the various dummynet socket options */ static int ip_dn_ctl(struct sockopt *sopt) { - int error; - struct dn_pipe *p = NULL; + void *p = NULL; + int error, l; - error = priv_check(sopt->sopt_td, PRIV_NETINET_DUMMYNET); - if (error) - return (error); - - /* Disallow sets in really-really secure mode. */ - if (sopt->sopt_dir == SOPT_SET) { -#if __FreeBSD_version >= 500034 - error = securelevel_ge(sopt->sopt_td->td_ucred, 3); + error = priv_check(sopt->sopt_td, PRIV_NETINET_DUMMYNET); if (error) - return (error); -#else - if (securelevel >= 3) - return (EPERM); -#endif - } - - switch (sopt->sopt_name) { - default : - printf("dummynet: -- unknown option %d", sopt->sopt_name); - error = EINVAL ; - break; + return (error); - case IP_DUMMYNET_GET : - error = dummynet_get(sopt); - break ; + /* Disallow sets in really-really secure mode. */ + if (sopt->sopt_dir == SOPT_SET) { + error = securelevel_ge(sopt->sopt_td->td_ucred, 3); + if (error) + return (error); + } - case IP_DUMMYNET_FLUSH : - dummynet_flush() ; - break ; + switch (sopt->sopt_name) { + default : + D("dummynet: unknown option %d", sopt->sopt_name); + error = EINVAL; + break; - case IP_DUMMYNET_CONFIGURE : - p = malloc(sizeof(struct dn_pipe_max), M_TEMP, M_WAITOK); - error = sooptcopyin(sopt, p, sizeof(struct dn_pipe_max), sizeof *p); - if (error) - break ; - if (p->samples_no > 0) - p->samples = &(((struct dn_pipe_max *)p)->samples[0]); + case IP_DUMMYNET_FLUSH: + case IP_DUMMYNET_CONFIGURE: + case IP_DUMMYNET_DEL: /* remove a pipe or queue */ + case IP_DUMMYNET_GET: + D("dummynet: compat option %d", sopt->sopt_name); + error = ip_dummynet_compat(sopt); + break; - error = config_pipe(p); - break ; + case IP_DUMMYNET3 : + if (sopt->sopt_dir == SOPT_GET) { + error = dummynet_get(sopt, NULL); + break; + } + l = sopt->sopt_valsize; + if (l < sizeof(struct dn_id) || l > 12000) { + D("argument len %d invalid", l); + break; + } + p = malloc(l, M_TEMP, M_WAITOK); // XXX can it fail ? + error = sooptcopyin(sopt, p, l, l); + if (error) + break ; + error = do_config(p, l); + break; + } - case IP_DUMMYNET_DEL : /* remove a pipe or queue */ - p = malloc(sizeof(struct dn_pipe), M_TEMP, M_WAITOK); - error = sooptcopyin(sopt, p, sizeof(struct dn_pipe), sizeof *p); - if (error) - break ; + if (p != NULL) + free(p, M_TEMP); - error = delete_pipe(p); - break ; - } - if (p != NULL) - free(p, M_TEMP); - return error ; + return error ; } + static void ip_dn_init(void) { - int i; + static int init_done = 0; + if (init_done) + return; + init_done = 1; if (bootverbose) - printf("DUMMYNET with IPv6 initialized (040826)\n"); - - DUMMYNET_LOCK_INIT(); - - for (i = 0; i < HASHSIZE; i++) { - SLIST_INIT(&pipehash[i]); - SLIST_INIT(&flowsethash[i]); - } - ready_heap.size = ready_heap.elements = 0; - ready_heap.offset = 0; - - wfq_ready_heap.size = wfq_ready_heap.elements = 0; - wfq_ready_heap.offset = 0; - - extract_heap.size = extract_heap.elements = 0; - extract_heap.offset = 0; + printf("DUMMYNET with IPv6 initialized (100131)\n"); + /* Set defaults here. MSVC does not accept initializers, + * and this is also useful for vimages + */ + /* queue limits */ + dn_cfg.slot_limit = 100; /* Foot shooting limit for queues. */ + dn_cfg.byte_limit = 1024 * 1024; + dn_cfg.expire = 1; + + /* RED parameters */ + dn_cfg.red_lookup_depth = 256; /* default lookup table depth */ + dn_cfg.red_avg_pkt_size = 512; /* default medium packet size */ + dn_cfg.red_max_pkt_size = 1500; /* default max packet size */ + + /* hash tables */ + dn_cfg.max_hash_size = 1024; /* max in the hash tables */ + dn_cfg.hash_size = 64; /* default hash size */ + + /* create hash tables for schedulers and flowsets. + * In both we search by key and by pointer. + */ + dn_cfg.schedhash = dn_ht_init(NULL, dn_cfg.hash_size, + offsetof(struct dn_schk, schk_next), + schk_hash, schk_match, schk_new); + dn_cfg.fshash = dn_ht_init(NULL, dn_cfg.hash_size, + offsetof(struct dn_fsk, fsk_next), + fsk_hash, fsk_match, fsk_new); + + /* bucket index to drain object */ + dn_cfg.drain_fs = 0; + dn_cfg.drain_sch = 0; + + heap_init(&dn_cfg.evheap, 16, offsetof(struct dn_id, id)); + SLIST_INIT(&dn_cfg.fsu); + SLIST_INIT(&dn_cfg.schedlist); + + DN_LOCK_INIT(); ip_dn_ctl_ptr = ip_dn_ctl; ip_dn_io_ptr = dummynet_io; @@ -2285,25 +2168,29 @@ ip_dn_init(void) callout_reset(&dn_timeout, 1, dummynet, NULL); /* Initialize curr_time adjustment mechanics. */ - getmicrouptime(&prev_t); + getmicrouptime(&dn_cfg.prev_t); } #ifdef KLD_MODULE static void ip_dn_destroy(void) { + callout_drain(&dn_timeout); + + DN_BH_WLOCK(); ip_dn_ctl_ptr = NULL; ip_dn_io_ptr = NULL; - DUMMYNET_LOCK(); - callout_stop(&dn_timeout); - DUMMYNET_UNLOCK(); + dummynet_flush(); + DN_BH_WUNLOCK(); taskqueue_drain(dn_tq, &dn_task); taskqueue_free(dn_tq); - dummynet_flush(); + dn_ht_free(dn_cfg.schedhash, 0); + dn_ht_free(dn_cfg.fshash, 0); + heap_free(&dn_cfg.evheap); - DUMMYNET_LOCK_DESTROY(); + DN_LOCK_DESTROY(); } #endif /* KLD_MODULE */ @@ -2311,35 +2198,98 @@ static int dummynet_modevent(module_t mod, int type, void *data) { - switch (type) { - case MOD_LOAD: + if (type == MOD_LOAD) { if (ip_dn_io_ptr) { - printf("DUMMYNET already loaded\n"); - return EEXIST ; + printf("DUMMYNET already loaded\n"); + return EEXIST ; } ip_dn_init(); - break; - - case MOD_UNLOAD: + return 0; + } else if (type == MOD_UNLOAD) { #if !defined(KLD_MODULE) printf("dummynet statically compiled, cannot unload\n"); return EINVAL ; #else ip_dn_destroy(); + return 0; #endif - break ; - default: + } else return EOPNOTSUPP; - break ; +} + +/* modevent helpers for the modules */ +static int +load_dn_sched(struct dn_alg *d) +{ + struct dn_alg *s; + + if (d == NULL) + return 1; /* error */ + ip_dn_init(); /* just in case, we need the lock */ + + /* Check that mandatory funcs exists */ + if (d->enqueue == NULL || d->dequeue == NULL) { + D("missing enqueue or dequeue for %s", d->name); + return 1; + } + + /* Search if scheduler already exists */ + DN_BH_WLOCK(); + SLIST_FOREACH(s, &dn_cfg.schedlist, next) { + if (strcmp(s->name, d->name) == 0) { + D("%s already loaded", d->name); + break; /* scheduler already exists */ + } + } + if (s == NULL) + SLIST_INSERT_HEAD(&dn_cfg.schedlist, d, next); + DN_BH_WUNLOCK(); + D("dn_sched %s %sloaded", d->name, s ? "not ":""); + return s ? 1 : 0; +} + +static int +unload_dn_sched(struct dn_alg *s) +{ + struct dn_alg *tmp, *r; + int err = EINVAL; + + D("called for %s", s->name); + + DN_BH_WLOCK(); + SLIST_FOREACH_SAFE(r, &dn_cfg.schedlist, next, tmp) { + if (strcmp(s->name, r->name) != 0) + continue; + D("ref_count = %d", r->ref_count); + err = (r->ref_count != 0) ? EBUSY : 0; + if (err == 0) + SLIST_REMOVE(&dn_cfg.schedlist, r, dn_alg, next); + break; } - return 0 ; + DN_BH_WUNLOCK(); + D("dn_sched %s %sunloaded", s->name, err ? "not ":""); + return err; +} + +int +dn_sched_modevent(module_t mod, int cmd, void *arg) +{ + struct dn_alg *sch = arg; + + if (cmd == MOD_LOAD) + return load_dn_sched(sch); + else if (cmd == MOD_UNLOAD) + return unload_dn_sched(sch); + else + return EINVAL; } static moduledata_t dummynet_mod = { - "dummynet", - dummynet_modevent, - NULL + "dummynet", dummynet_modevent, NULL }; -DECLARE_MODULE(dummynet, dummynet_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); + +DECLARE_MODULE(dummynet, dummynet_mod, + SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY-1); MODULE_DEPEND(dummynet, ipfw, 2, 2, 2); MODULE_VERSION(dummynet, 1); +/* end of file */ diff --git a/sys/netinet/ipfw/ip_fw2.c b/sys/netinet/ipfw/ip_fw2.c index 1ccbf2b..959ad8e 100644 --- a/sys/netinet/ipfw/ip_fw2.c +++ b/sys/netinet/ipfw/ip_fw2.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa + * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -26,11 +26,8 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); -#define DEB(x) -#define DDB(x) x - /* - * Implement IP packet firewall (new version) + * The FreeBSD IP packet firewall, main file */ #if !defined(KLD_MODULE) @@ -65,13 +62,10 @@ __FBSDID("$FreeBSD$"); #include <sys/ucred.h> #include <net/ethernet.h> /* for ETHERTYPE_IP */ #include <net/if.h> -#include <net/radix.h> #include <net/route.h> #include <net/pf_mtag.h> #include <net/vnet.h> -#define IPFW_INTERNAL /* Access to protected data structures in ip_fw.h. */ - #include <netinet/in.h> #include <netinet/in_var.h> #include <netinet/in_pcb.h> @@ -79,8 +73,7 @@ __FBSDID("$FreeBSD$"); #include <netinet/ip_var.h> #include <netinet/ip_icmp.h> #include <netinet/ip_fw.h> -#include <netinet/ip_divert.h> -#include <netinet/ip_dummynet.h> +#include <netinet/ipfw/ip_fw_private.h> #include <netinet/ip_carp.h> #include <netinet/pim.h> #include <netinet/tcp_var.h> @@ -88,8 +81,6 @@ __FBSDID("$FreeBSD$"); #include <netinet/udp_var.h> #include <netinet/sctp.h> -#include <netgraph/ng_ipfw.h> - #include <netinet/ip6.h> #include <netinet/icmp6.h> #ifdef INET6 @@ -103,73 +94,66 @@ __FBSDID("$FreeBSD$"); #include <security/mac/mac_framework.h> #endif -static VNET_DEFINE(int, ipfw_vnet_ready) = 0; -#define V_ipfw_vnet_ready VNET(ipfw_vnet_ready) /* - * set_disable contains one bit per set value (0..31). - * If the bit is set, all rules with the corresponding set - * are disabled. Set RESVD_SET(31) is reserved for the default rule - * and rules that are not deleted by the flush command, - * and CANNOT be disabled. - * Rules in set RESVD_SET can only be deleted explicitly. + * static variables followed by global ones. + * All ipfw global variables are here. */ -static VNET_DEFINE(u_int32_t, set_disable); -static VNET_DEFINE(int, fw_verbose); -static VNET_DEFINE(struct callout, ipfw_timeout); -static VNET_DEFINE(int, verbose_limit); -#define V_set_disable VNET(set_disable) -#define V_fw_verbose VNET(fw_verbose) -#define V_ipfw_timeout VNET(ipfw_timeout) -#define V_verbose_limit VNET(verbose_limit) +/* ipfw_vnet_ready controls when we are open for business */ +static VNET_DEFINE(int, ipfw_vnet_ready) = 0; +#define V_ipfw_vnet_ready VNET(ipfw_vnet_ready) + +static VNET_DEFINE(int, fw_deny_unknown_exthdrs); +#define V_fw_deny_unknown_exthdrs VNET(fw_deny_unknown_exthdrs) #ifdef IPFIREWALL_DEFAULT_TO_ACCEPT static int default_to_accept = 1; #else static int default_to_accept; #endif -static uma_zone_t ipfw_dyn_rule_zone; -struct ip_fw *ip_fw_default_rule; +VNET_DEFINE(int, autoinc_step); /* - * list of rules for layer 3 + * Each rule belongs to one of 32 different sets (0..31). + * The variable set_disable contains one bit per set. + * If the bit is set, all rules in the corresponding set + * are disabled. Set RESVD_SET(31) is reserved for the default rule + * and rules that are not deleted by the flush command, + * and CANNOT be disabled. + * Rules in set RESVD_SET can only be deleted individually. */ +VNET_DEFINE(u_int32_t, set_disable); +#define V_set_disable VNET(set_disable) + +VNET_DEFINE(int, fw_verbose); +/* counter for ipfw_log(NULL...) */ +VNET_DEFINE(u_int64_t, norule_counter); +VNET_DEFINE(int, verbose_limit); + +/* layer3_chain contains the list of rules for layer 3 */ VNET_DEFINE(struct ip_fw_chain, layer3_chain); -MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's"); -MALLOC_DEFINE(M_IPFW_TBL, "ipfw_tbl", "IpFw tables"); -#define IPFW_NAT_LOADED (ipfw_nat_ptr != NULL) ipfw_nat_t *ipfw_nat_ptr = NULL; +struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int); ipfw_nat_cfg_t *ipfw_nat_cfg_ptr; ipfw_nat_cfg_t *ipfw_nat_del_ptr; ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr; ipfw_nat_cfg_t *ipfw_nat_get_log_ptr; -struct table_entry { - struct radix_node rn[2]; - struct sockaddr_in addr, mask; - u_int32_t value; -}; - -static VNET_DEFINE(int, autoinc_step); -#define V_autoinc_step VNET(autoinc_step) -static VNET_DEFINE(int, fw_deny_unknown_exthdrs); -#define V_fw_deny_unknown_exthdrs VNET(fw_deny_unknown_exthdrs) +#ifdef SYSCTL_NODE +uint32_t dummy_def = IPFW_DEFAULT_RULE; +uint32_t dummy_tables_max = IPFW_TABLES_MAX; -extern int ipfw_chg_hook(SYSCTL_HANDLER_ARGS); +SYSBEGIN(f3) -#ifdef SYSCTL_NODE SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); -SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, enable, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_enable), 0, - ipfw_chg_hook, "I", "Enable ipfw"); -SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step, - CTLFLAG_RW, &VNET_NAME(autoinc_step), 0, - "Rule number auto-increment step"); SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, one_pass, CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_one_pass), 0, "Only do a single pass through ipfw when using dummynet(4)"); +SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step, + CTLFLAG_RW, &VNET_NAME(autoinc_step), 0, + "Rule number auto-increment step"); SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose, CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_verbose), 0, "Log matches to ipfw rules"); @@ -177,168 +161,34 @@ SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW, &VNET_NAME(verbose_limit), 0, "Set upper limit of matches of ipfw rules logged"); SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, default_rule, CTLFLAG_RD, - NULL, IPFW_DEFAULT_RULE, + &dummy_def, 0, "The default/max possible rule number."); SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, tables_max, CTLFLAG_RD, - NULL, IPFW_TABLES_MAX, + &dummy_tables_max, 0, "The maximum number of tables."); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, default_to_accept, CTLFLAG_RDTUN, &default_to_accept, 0, "Make the default rule accept all packets."); TUNABLE_INT("net.inet.ip.fw.default_to_accept", &default_to_accept); +SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, static_count, + CTLFLAG_RD, &VNET_NAME(layer3_chain.n_rules), 0, + "Number of static rules"); #ifdef INET6 SYSCTL_DECL(_net_inet6_ip6); SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); -SYSCTL_VNET_PROC(_net_inet6_ip6_fw, OID_AUTO, enable, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw6_enable), 0, - ipfw_chg_hook, "I", "Enable ipfw+6"); SYSCTL_VNET_INT(_net_inet6_ip6_fw, OID_AUTO, deny_unknown_exthdrs, CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(fw_deny_unknown_exthdrs), 0, "Deny packets with unknown IPv6 Extension Headers"); #endif /* INET6 */ -#endif /* SYSCTL_NODE */ - -/* - * Description of dynamic rules. - * - * Dynamic rules are stored in lists accessed through a hash table - * (ipfw_dyn_v) whose size is curr_dyn_buckets. This value can - * be modified through the sysctl variable dyn_buckets which is - * updated when the table becomes empty. - * - * XXX currently there is only one list, ipfw_dyn. - * - * When a packet is received, its address fields are first masked - * with the mask defined for the rule, then hashed, then matched - * against the entries in the corresponding list. - * Dynamic rules can be used for different purposes: - * + stateful rules; - * + enforcing limits on the number of sessions; - * + in-kernel NAT (not implemented yet) - * - * The lifetime of dynamic rules is regulated by dyn_*_lifetime, - * measured in seconds and depending on the flags. - * - * The total number of dynamic rules is stored in dyn_count. - * The max number of dynamic rules is dyn_max. When we reach - * the maximum number of rules we do not create anymore. This is - * done to avoid consuming too much memory, but also too much - * time when searching on each packet (ideally, we should try instead - * to put a limit on the length of the list on each bucket...). - * - * Each dynamic rule holds a pointer to the parent ipfw rule so - * we know what action to perform. Dynamic rules are removed when - * the parent rule is deleted. XXX we should make them survive. - * - * There are some limitations with dynamic rules -- we do not - * obey the 'randomized match', and we do not do multiple - * passes through the firewall. XXX check the latter!!! - */ -static VNET_DEFINE(ipfw_dyn_rule **, ipfw_dyn_v); -static VNET_DEFINE(u_int32_t, dyn_buckets); -static VNET_DEFINE(u_int32_t, curr_dyn_buckets); - -#define V_ipfw_dyn_v VNET(ipfw_dyn_v) -#define V_dyn_buckets VNET(dyn_buckets) -#define V_curr_dyn_buckets VNET(curr_dyn_buckets) - -static struct mtx ipfw_dyn_mtx; /* mutex guarding dynamic rules */ -#define IPFW_DYN_LOCK_INIT() \ - mtx_init(&ipfw_dyn_mtx, "IPFW dynamic rules", NULL, MTX_DEF) -#define IPFW_DYN_LOCK_DESTROY() mtx_destroy(&ipfw_dyn_mtx) -#define IPFW_DYN_LOCK() mtx_lock(&ipfw_dyn_mtx) -#define IPFW_DYN_UNLOCK() mtx_unlock(&ipfw_dyn_mtx) -#define IPFW_DYN_LOCK_ASSERT() mtx_assert(&ipfw_dyn_mtx, MA_OWNED) +SYSEND -static struct mbuf *send_pkt(struct mbuf *, struct ipfw_flow_id *, - u_int32_t, u_int32_t, int); - - -/* - * Timeouts for various events in handing dynamic rules. - */ -static VNET_DEFINE(u_int32_t, dyn_ack_lifetime); -static VNET_DEFINE(u_int32_t, dyn_syn_lifetime); -static VNET_DEFINE(u_int32_t, dyn_fin_lifetime); -static VNET_DEFINE(u_int32_t, dyn_rst_lifetime); -static VNET_DEFINE(u_int32_t, dyn_udp_lifetime); -static VNET_DEFINE(u_int32_t, dyn_short_lifetime); - -#define V_dyn_ack_lifetime VNET(dyn_ack_lifetime) -#define V_dyn_syn_lifetime VNET(dyn_syn_lifetime) -#define V_dyn_fin_lifetime VNET(dyn_fin_lifetime) -#define V_dyn_rst_lifetime VNET(dyn_rst_lifetime) -#define V_dyn_udp_lifetime VNET(dyn_udp_lifetime) -#define V_dyn_short_lifetime VNET(dyn_short_lifetime) - -/* - * Keepalives are sent if dyn_keepalive is set. They are sent every - * dyn_keepalive_period seconds, in the last dyn_keepalive_interval - * seconds of lifetime of a rule. - * dyn_rst_lifetime and dyn_fin_lifetime should be strictly lower - * than dyn_keepalive_period. - */ - -static VNET_DEFINE(u_int32_t, dyn_keepalive_interval); -static VNET_DEFINE(u_int32_t, dyn_keepalive_period); -static VNET_DEFINE(u_int32_t, dyn_keepalive); - -#define V_dyn_keepalive_interval VNET(dyn_keepalive_interval) -#define V_dyn_keepalive_period VNET(dyn_keepalive_period) -#define V_dyn_keepalive VNET(dyn_keepalive) - -static VNET_DEFINE(u_int32_t, static_count); /* # of static rules */ -static VNET_DEFINE(u_int32_t, static_len); /* bytes of static rules */ -static VNET_DEFINE(u_int32_t, dyn_count); /* # of dynamic rules */ -static VNET_DEFINE(u_int32_t, dyn_max); /* max # of dynamic rules */ - -#define V_static_count VNET(static_count) -#define V_static_len VNET(static_len) -#define V_dyn_count VNET(dyn_count) -#define V_dyn_max VNET(dyn_max) - -#ifdef SYSCTL_NODE -SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_buckets, - CTLFLAG_RW, &VNET_NAME(dyn_buckets), 0, - "Number of dyn. buckets"); -SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets, - CTLFLAG_RD, &VNET_NAME(curr_dyn_buckets), 0, - "Current Number of dyn. buckets"); -SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_count, - CTLFLAG_RD, &VNET_NAME(dyn_count), 0, - "Number of dyn. rules"); -SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_max, - CTLFLAG_RW, &VNET_NAME(dyn_max), 0, - "Max number of dyn. rules"); -SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, static_count, - CTLFLAG_RD, &VNET_NAME(static_count), 0, - "Number of static rules"); -SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, - CTLFLAG_RW, &VNET_NAME(dyn_ack_lifetime), 0, - "Lifetime of dyn. rules for acks"); -SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, - CTLFLAG_RW, &VNET_NAME(dyn_syn_lifetime), 0, - "Lifetime of dyn. rules for syn"); -SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, - CTLFLAG_RW, &VNET_NAME(dyn_fin_lifetime), 0, - "Lifetime of dyn. rules for fin"); -SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, - CTLFLAG_RW, &VNET_NAME(dyn_rst_lifetime), 0, - "Lifetime of dyn. rules for rst"); -SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, - CTLFLAG_RW, &VNET_NAME(dyn_udp_lifetime), 0, - "Lifetime of dyn. rules for UDP"); -SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, - CTLFLAG_RW, &VNET_NAME(dyn_short_lifetime), 0, - "Lifetime of dyn. rules for other situations"); -SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, - CTLFLAG_RW, &VNET_NAME(dyn_keepalive), 0, - "Enable keepalives for dyn. rules"); #endif /* SYSCTL_NODE */ + /* + * Some macros used in the various matching options. * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T * Other macros just cast void * into the appropriate type */ @@ -501,6 +351,7 @@ iface_match(struct ifnet *ifp, ipfw_insn_if *cmd) return(1); } } else { +#ifdef __FreeBSD__ /* and OSX too ? */ struct ifaddr *ia; if_addr_rlock(ifp); @@ -514,6 +365,7 @@ iface_match(struct ifnet *ifp, ipfw_insn_if *cmd) } } if_addr_runlock(ifp); +#endif /* __FreeBSD__ */ } return(0); /* no match, fail ... */ } @@ -524,23 +376,27 @@ iface_match(struct ifnet *ifp, ipfw_insn_if *cmd) * * The 'verrevpath' option checks that the interface that an IP packet * arrives on is the same interface that traffic destined for the - * packet's source address would be routed out of. The 'versrcreach' - * option just checks that the source address is reachable via any route - * (except default) in the routing table. These two are a measure to block - * forged packets. This is also commonly known as "anti-spoofing" or Unicast - * Reverse Path Forwarding (Unicast RFP) in Cisco-ese. The name of the knobs + * packet's source address would be routed out of. + * The 'versrcreach' option just checks that the source address is + * reachable via any route (except default) in the routing table. + * These two are a measure to block forged packets. This is also + * commonly known as "anti-spoofing" or Unicast Reverse Path + * Forwarding (Unicast RFP) in Cisco-ese. The name of the knobs * is purposely reminiscent of the Cisco IOS command, * * ip verify unicast reverse-path * ip verify unicast source reachable-via any * - * which implements the same functionality. But note that syntax is - * misleading. The check may be performed on all IP packets whether unicast, - * multicast, or broadcast. + * which implements the same functionality. But note that the syntax + * is misleading, and the check may be performed on all IP packets + * whether unicast, multicast, or broadcast. */ static int verify_path(struct in_addr src, struct ifnet *ifp, u_int fib) { +#ifndef __FreeBSD__ + return 0; +#else struct route ro; struct sockaddr_in *dst; @@ -583,6 +439,7 @@ verify_path(struct in_addr src, struct ifnet *ifp, u_int fib) /* found valid route */ RTFREE(ro.ro_rt); return 1; +#endif /* __FreeBSD__ */ } #ifdef INET6 @@ -681,17 +538,6 @@ verify_path6(struct in6_addr *src, struct ifnet *ifp) return 1; } -static __inline int -hash_packet6(struct ipfw_flow_id *id) -{ - u_int32_t i; - i = (id->dst_ip6.__u6_addr.__u6_addr32[2]) ^ - (id->dst_ip6.__u6_addr.__u6_addr32[3]) ^ - (id->src_ip6.__u6_addr.__u6_addr32[2]) ^ - (id->src_ip6.__u6_addr.__u6_addr32[3]) ^ - (id->dst_port) ^ (id->src_port); - return i; -} static int is_icmp6_query(int icmp6_type) @@ -719,14 +565,14 @@ send_reject6(struct ip_fw_args *args, int code, u_int hlen, struct ip6_hdr *ip6) if ((tcp->th_flags & TH_RST) == 0) { struct mbuf *m0; - m0 = send_pkt(args->m, &(args->f_id), + m0 = ipfw_send_pkt(args->m, &(args->f_id), ntohl(tcp->th_seq), ntohl(tcp->th_ack), tcp->th_flags | TH_RST); if (m0 != NULL) ip6_output(m0, NULL, NULL, 0, NULL, NULL, NULL); } - m_freem(m); + FREE_PKT(m); } else if (code != ICMP6_UNREACH_RST) { /* Send an ICMPv6 unreach. */ #if 0 /* @@ -742,1085 +588,19 @@ send_reject6(struct ip_fw_args *args, int code, u_int hlen, struct ip6_hdr *ip6) #endif icmp6_error(m, ICMP6_DST_UNREACH, code, 0); } else - m_freem(m); + FREE_PKT(m); args->m = NULL; } #endif /* INET6 */ -/* counter for ipfw_log(NULL...) */ -static VNET_DEFINE(u_int64_t, norule_counter); -#define V_norule_counter VNET(norule_counter) - -#define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0 -#define SNP(buf) buf, sizeof(buf) - -/* - * We enter here when we have a rule with O_LOG. - * XXX this function alone takes about 2Kbytes of code! - */ -static void -ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args, - struct mbuf *m, struct ifnet *oif, u_short offset, uint32_t tablearg, - struct ip *ip) -{ - struct ether_header *eh = args->eh; - char *action; - int limit_reached = 0; - char action2[40], proto[128], fragment[32]; - - fragment[0] = '\0'; - proto[0] = '\0'; - - if (f == NULL) { /* bogus pkt */ - if (V_verbose_limit != 0 && V_norule_counter >= V_verbose_limit) - return; - V_norule_counter++; - if (V_norule_counter == V_verbose_limit) - limit_reached = V_verbose_limit; - action = "Refuse"; - } else { /* O_LOG is the first action, find the real one */ - ipfw_insn *cmd = ACTION_PTR(f); - ipfw_insn_log *l = (ipfw_insn_log *)cmd; - - if (l->max_log != 0 && l->log_left == 0) - return; - l->log_left--; - if (l->log_left == 0) - limit_reached = l->max_log; - cmd += F_LEN(cmd); /* point to first action */ - if (cmd->opcode == O_ALTQ) { - ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd; - - snprintf(SNPARGS(action2, 0), "Altq %d", - altq->qid); - cmd += F_LEN(cmd); - } - if (cmd->opcode == O_PROB) - cmd += F_LEN(cmd); - - if (cmd->opcode == O_TAG) - cmd += F_LEN(cmd); - - action = action2; - switch (cmd->opcode) { - case O_DENY: - action = "Deny"; - break; - - case O_REJECT: - if (cmd->arg1==ICMP_REJECT_RST) - action = "Reset"; - else if (cmd->arg1==ICMP_UNREACH_HOST) - action = "Reject"; - else - snprintf(SNPARGS(action2, 0), "Unreach %d", - cmd->arg1); - break; - - case O_UNREACH6: - if (cmd->arg1==ICMP6_UNREACH_RST) - action = "Reset"; - else - snprintf(SNPARGS(action2, 0), "Unreach %d", - cmd->arg1); - break; - - case O_ACCEPT: - action = "Accept"; - break; - case O_COUNT: - action = "Count"; - break; - case O_DIVERT: - snprintf(SNPARGS(action2, 0), "Divert %d", - cmd->arg1); - break; - case O_TEE: - snprintf(SNPARGS(action2, 0), "Tee %d", - cmd->arg1); - break; - case O_SETFIB: - snprintf(SNPARGS(action2, 0), "SetFib %d", - cmd->arg1); - break; - case O_SKIPTO: - snprintf(SNPARGS(action2, 0), "SkipTo %d", - cmd->arg1); - break; - case O_PIPE: - snprintf(SNPARGS(action2, 0), "Pipe %d", - cmd->arg1); - break; - case O_QUEUE: - snprintf(SNPARGS(action2, 0), "Queue %d", - cmd->arg1); - break; - case O_FORWARD_IP: { - ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd; - int len; - struct in_addr dummyaddr; - if (sa->sa.sin_addr.s_addr == INADDR_ANY) - dummyaddr.s_addr = htonl(tablearg); - else - dummyaddr.s_addr = sa->sa.sin_addr.s_addr; - - len = snprintf(SNPARGS(action2, 0), "Forward to %s", - inet_ntoa(dummyaddr)); - - if (sa->sa.sin_port) - snprintf(SNPARGS(action2, len), ":%d", - sa->sa.sin_port); - } - break; - case O_NETGRAPH: - snprintf(SNPARGS(action2, 0), "Netgraph %d", - cmd->arg1); - break; - case O_NGTEE: - snprintf(SNPARGS(action2, 0), "Ngtee %d", - cmd->arg1); - break; - case O_NAT: - action = "Nat"; - break; - case O_REASS: - action = "Reass"; - break; - default: - action = "UNKNOWN"; - break; - } - } - - if (hlen == 0) { /* non-ip */ - snprintf(SNPARGS(proto, 0), "MAC"); - - } else { - int len; -#ifdef INET6 - char src[INET6_ADDRSTRLEN + 2], dst[INET6_ADDRSTRLEN + 2]; -#else - char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN]; -#endif - struct icmphdr *icmp; - struct tcphdr *tcp; - struct udphdr *udp; -#ifdef INET6 - struct ip6_hdr *ip6 = NULL; - struct icmp6_hdr *icmp6; -#endif - src[0] = '\0'; - dst[0] = '\0'; -#ifdef INET6 - if (IS_IP6_FLOW_ID(&(args->f_id))) { - char ip6buf[INET6_ADDRSTRLEN]; - snprintf(src, sizeof(src), "[%s]", - ip6_sprintf(ip6buf, &args->f_id.src_ip6)); - snprintf(dst, sizeof(dst), "[%s]", - ip6_sprintf(ip6buf, &args->f_id.dst_ip6)); - - ip6 = (struct ip6_hdr *)ip; - tcp = (struct tcphdr *)(((char *)ip) + hlen); - udp = (struct udphdr *)(((char *)ip) + hlen); - } else -#endif - { - tcp = L3HDR(struct tcphdr, ip); - udp = L3HDR(struct udphdr, ip); - - inet_ntoa_r(ip->ip_src, src); - inet_ntoa_r(ip->ip_dst, dst); - } - - switch (args->f_id.proto) { - case IPPROTO_TCP: - len = snprintf(SNPARGS(proto, 0), "TCP %s", src); - if (offset == 0) - snprintf(SNPARGS(proto, len), ":%d %s:%d", - ntohs(tcp->th_sport), - dst, - ntohs(tcp->th_dport)); - else - snprintf(SNPARGS(proto, len), " %s", dst); - break; - - case IPPROTO_UDP: - len = snprintf(SNPARGS(proto, 0), "UDP %s", src); - if (offset == 0) - snprintf(SNPARGS(proto, len), ":%d %s:%d", - ntohs(udp->uh_sport), - dst, - ntohs(udp->uh_dport)); - else - snprintf(SNPARGS(proto, len), " %s", dst); - break; - - case IPPROTO_ICMP: - icmp = L3HDR(struct icmphdr, ip); - if (offset == 0) - len = snprintf(SNPARGS(proto, 0), - "ICMP:%u.%u ", - icmp->icmp_type, icmp->icmp_code); - else - len = snprintf(SNPARGS(proto, 0), "ICMP "); - len += snprintf(SNPARGS(proto, len), "%s", src); - snprintf(SNPARGS(proto, len), " %s", dst); - break; -#ifdef INET6 - case IPPROTO_ICMPV6: - icmp6 = (struct icmp6_hdr *)(((char *)ip) + hlen); - if (offset == 0) - len = snprintf(SNPARGS(proto, 0), - "ICMPv6:%u.%u ", - icmp6->icmp6_type, icmp6->icmp6_code); - else - len = snprintf(SNPARGS(proto, 0), "ICMPv6 "); - len += snprintf(SNPARGS(proto, len), "%s", src); - snprintf(SNPARGS(proto, len), " %s", dst); - break; -#endif - default: - len = snprintf(SNPARGS(proto, 0), "P:%d %s", - args->f_id.proto, src); - snprintf(SNPARGS(proto, len), " %s", dst); - break; - } - -#ifdef INET6 - if (IS_IP6_FLOW_ID(&(args->f_id))) { - if (offset & (IP6F_OFF_MASK | IP6F_MORE_FRAG)) - snprintf(SNPARGS(fragment, 0), - " (frag %08x:%d@%d%s)", - args->f_id.frag_id6, - ntohs(ip6->ip6_plen) - hlen, - ntohs(offset & IP6F_OFF_MASK) << 3, - (offset & IP6F_MORE_FRAG) ? "+" : ""); - } else -#endif - { - int ip_off, ip_len; - if (eh != NULL) { /* layer 2 packets are as on the wire */ - ip_off = ntohs(ip->ip_off); - ip_len = ntohs(ip->ip_len); - } else { - ip_off = ip->ip_off; - ip_len = ip->ip_len; - } - if (ip_off & (IP_MF | IP_OFFMASK)) - snprintf(SNPARGS(fragment, 0), - " (frag %d:%d@%d%s)", - ntohs(ip->ip_id), ip_len - (ip->ip_hl << 2), - offset << 3, - (ip_off & IP_MF) ? "+" : ""); - } - } - if (oif || m->m_pkthdr.rcvif) - log(LOG_SECURITY | LOG_INFO, - "ipfw: %d %s %s %s via %s%s\n", - f ? f->rulenum : -1, - action, proto, oif ? "out" : "in", - oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname, - fragment); - else - log(LOG_SECURITY | LOG_INFO, - "ipfw: %d %s %s [no if info]%s\n", - f ? f->rulenum : -1, - action, proto, fragment); - if (limit_reached) - log(LOG_SECURITY | LOG_NOTICE, - "ipfw: limit %d reached on entry %d\n", - limit_reached, f ? f->rulenum : -1); -} - -/* - * IMPORTANT: the hash function for dynamic rules must be commutative - * in source and destination (ip,port), because rules are bidirectional - * and we want to find both in the same bucket. - */ -static __inline int -hash_packet(struct ipfw_flow_id *id) -{ - u_int32_t i; - -#ifdef INET6 - if (IS_IP6_FLOW_ID(id)) - i = hash_packet6(id); - else -#endif /* INET6 */ - i = (id->dst_ip) ^ (id->src_ip) ^ (id->dst_port) ^ (id->src_port); - i &= (V_curr_dyn_buckets - 1); - return i; -} - -static __inline void -unlink_dyn_rule_print(struct ipfw_flow_id *id) -{ - struct in_addr da; -#ifdef INET6 - char src[INET6_ADDRSTRLEN], dst[INET6_ADDRSTRLEN]; -#else - char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN]; -#endif - -#ifdef INET6 - if (IS_IP6_FLOW_ID(id)) { - ip6_sprintf(src, &id->src_ip6); - ip6_sprintf(dst, &id->dst_ip6); - } else -#endif - { - da.s_addr = htonl(id->src_ip); - inet_ntoa_r(da, src); - da.s_addr = htonl(id->dst_ip); - inet_ntoa_r(da, dst); - } - printf("ipfw: unlink entry %s %d -> %s %d, %d left\n", - src, id->src_port, dst, id->dst_port, V_dyn_count - 1); -} - -/** - * unlink a dynamic rule from a chain. prev is a pointer to - * the previous one, q is a pointer to the rule to delete, - * head is a pointer to the head of the queue. - * Modifies q and potentially also head. - */ -#define UNLINK_DYN_RULE(prev, head, q) { \ - ipfw_dyn_rule *old_q = q; \ - \ - /* remove a refcount to the parent */ \ - if (q->dyn_type == O_LIMIT) \ - q->parent->count--; \ - DEB(unlink_dyn_rule_print(&q->id);) \ - if (prev != NULL) \ - prev->next = q = q->next; \ - else \ - head = q = q->next; \ - V_dyn_count--; \ - uma_zfree(ipfw_dyn_rule_zone, old_q); } - -#define TIME_LEQ(a,b) ((int)((a)-(b)) <= 0) - -/** - * Remove dynamic rules pointing to "rule", or all of them if rule == NULL. - * - * If keep_me == NULL, rules are deleted even if not expired, - * otherwise only expired rules are removed. - * - * The value of the second parameter is also used to point to identify - * a rule we absolutely do not want to remove (e.g. because we are - * holding a reference to it -- this is the case with O_LIMIT_PARENT - * rules). The pointer is only used for comparison, so any non-null - * value will do. - */ -static void -remove_dyn_rule(struct ip_fw *rule, ipfw_dyn_rule *keep_me) -{ - static u_int32_t last_remove = 0; - -#define FORCE (keep_me == NULL) - - ipfw_dyn_rule *prev, *q; - int i, pass = 0, max_pass = 0; - - IPFW_DYN_LOCK_ASSERT(); - - if (V_ipfw_dyn_v == NULL || V_dyn_count == 0) - return; - /* do not expire more than once per second, it is useless */ - if (!FORCE && last_remove == time_uptime) - return; - last_remove = time_uptime; - - /* - * because O_LIMIT refer to parent rules, during the first pass only - * remove child and mark any pending LIMIT_PARENT, and remove - * them in a second pass. - */ -next_pass: - for (i = 0 ; i < V_curr_dyn_buckets ; i++) { - for (prev=NULL, q = V_ipfw_dyn_v[i] ; q ; ) { - /* - * Logic can become complex here, so we split tests. - */ - if (q == keep_me) - goto next; - if (rule != NULL && rule != q->rule) - goto next; /* not the one we are looking for */ - if (q->dyn_type == O_LIMIT_PARENT) { - /* - * handle parent in the second pass, - * record we need one. - */ - max_pass = 1; - if (pass == 0) - goto next; - if (FORCE && q->count != 0 ) { - /* XXX should not happen! */ - printf("ipfw: OUCH! cannot remove rule," - " count %d\n", q->count); - } - } else { - if (!FORCE && - !TIME_LEQ( q->expire, time_uptime )) - goto next; - } - if (q->dyn_type != O_LIMIT_PARENT || !q->count) { - UNLINK_DYN_RULE(prev, V_ipfw_dyn_v[i], q); - continue; - } -next: - prev=q; - q=q->next; - } - } - if (pass++ < max_pass) - goto next_pass; -} - - -/** - * lookup a dynamic rule. - */ -static ipfw_dyn_rule * -lookup_dyn_rule_locked(struct ipfw_flow_id *pkt, int *match_direction, - struct tcphdr *tcp) -{ - /* - * stateful ipfw extensions. - * Lookup into dynamic session queue - */ -#define MATCH_REVERSE 0 -#define MATCH_FORWARD 1 -#define MATCH_NONE 2 -#define MATCH_UNKNOWN 3 - int i, dir = MATCH_NONE; - ipfw_dyn_rule *prev, *q=NULL; - - IPFW_DYN_LOCK_ASSERT(); - - if (V_ipfw_dyn_v == NULL) - goto done; /* not found */ - i = hash_packet( pkt ); - for (prev=NULL, q = V_ipfw_dyn_v[i] ; q != NULL ; ) { - if (q->dyn_type == O_LIMIT_PARENT && q->count) - goto next; - if (TIME_LEQ( q->expire, time_uptime)) { /* expire entry */ - UNLINK_DYN_RULE(prev, V_ipfw_dyn_v[i], q); - continue; - } - if (pkt->proto == q->id.proto && - q->dyn_type != O_LIMIT_PARENT) { - if (IS_IP6_FLOW_ID(pkt)) { - if (IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6), - &(q->id.src_ip6)) && - IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6), - &(q->id.dst_ip6)) && - pkt->src_port == q->id.src_port && - pkt->dst_port == q->id.dst_port ) { - dir = MATCH_FORWARD; - break; - } - if (IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6), - &(q->id.dst_ip6)) && - IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6), - &(q->id.src_ip6)) && - pkt->src_port == q->id.dst_port && - pkt->dst_port == q->id.src_port ) { - dir = MATCH_REVERSE; - break; - } - } else { - if (pkt->src_ip == q->id.src_ip && - pkt->dst_ip == q->id.dst_ip && - pkt->src_port == q->id.src_port && - pkt->dst_port == q->id.dst_port ) { - dir = MATCH_FORWARD; - break; - } - if (pkt->src_ip == q->id.dst_ip && - pkt->dst_ip == q->id.src_ip && - pkt->src_port == q->id.dst_port && - pkt->dst_port == q->id.src_port ) { - dir = MATCH_REVERSE; - break; - } - } - } -next: - prev = q; - q = q->next; - } - if (q == NULL) - goto done; /* q = NULL, not found */ - - if ( prev != NULL) { /* found and not in front */ - prev->next = q->next; - q->next = V_ipfw_dyn_v[i]; - V_ipfw_dyn_v[i] = q; - } - if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */ - u_char flags = pkt->flags & (TH_FIN|TH_SYN|TH_RST); - -#define BOTH_SYN (TH_SYN | (TH_SYN << 8)) -#define BOTH_FIN (TH_FIN | (TH_FIN << 8)) - q->state |= (dir == MATCH_FORWARD ) ? flags : (flags << 8); - switch (q->state) { - case TH_SYN: /* opening */ - q->expire = time_uptime + V_dyn_syn_lifetime; - break; - - case BOTH_SYN: /* move to established */ - case BOTH_SYN | TH_FIN : /* one side tries to close */ - case BOTH_SYN | (TH_FIN << 8) : - if (tcp) { -#define _SEQ_GE(a,b) ((int)(a) - (int)(b) >= 0) - u_int32_t ack = ntohl(tcp->th_ack); - if (dir == MATCH_FORWARD) { - if (q->ack_fwd == 0 || _SEQ_GE(ack, q->ack_fwd)) - q->ack_fwd = ack; - else { /* ignore out-of-sequence */ - break; - } - } else { - if (q->ack_rev == 0 || _SEQ_GE(ack, q->ack_rev)) - q->ack_rev = ack; - else { /* ignore out-of-sequence */ - break; - } - } - } - q->expire = time_uptime + V_dyn_ack_lifetime; - break; - - case BOTH_SYN | BOTH_FIN: /* both sides closed */ - if (V_dyn_fin_lifetime >= V_dyn_keepalive_period) - V_dyn_fin_lifetime = V_dyn_keepalive_period - 1; - q->expire = time_uptime + V_dyn_fin_lifetime; - break; - - default: -#if 0 - /* - * reset or some invalid combination, but can also - * occur if we use keep-state the wrong way. - */ - if ( (q->state & ((TH_RST << 8)|TH_RST)) == 0) - printf("invalid state: 0x%x\n", q->state); -#endif - if (V_dyn_rst_lifetime >= V_dyn_keepalive_period) - V_dyn_rst_lifetime = V_dyn_keepalive_period - 1; - q->expire = time_uptime + V_dyn_rst_lifetime; - break; - } - } else if (pkt->proto == IPPROTO_UDP) { - q->expire = time_uptime + V_dyn_udp_lifetime; - } else { - /* other protocols */ - q->expire = time_uptime + V_dyn_short_lifetime; - } -done: - if (match_direction) - *match_direction = dir; - return q; -} - -static ipfw_dyn_rule * -lookup_dyn_rule(struct ipfw_flow_id *pkt, int *match_direction, - struct tcphdr *tcp) -{ - ipfw_dyn_rule *q; - - IPFW_DYN_LOCK(); - q = lookup_dyn_rule_locked(pkt, match_direction, tcp); - if (q == NULL) - IPFW_DYN_UNLOCK(); - /* NB: return table locked when q is not NULL */ - return q; -} - -static void -realloc_dynamic_table(void) -{ - IPFW_DYN_LOCK_ASSERT(); - - /* - * Try reallocation, make sure we have a power of 2 and do - * not allow more than 64k entries. In case of overflow, - * default to 1024. - */ - - if (V_dyn_buckets > 65536) - V_dyn_buckets = 1024; - if ((V_dyn_buckets & (V_dyn_buckets-1)) != 0) { /* not a power of 2 */ - V_dyn_buckets = V_curr_dyn_buckets; /* reset */ - return; - } - V_curr_dyn_buckets = V_dyn_buckets; - if (V_ipfw_dyn_v != NULL) - free(V_ipfw_dyn_v, M_IPFW); - for (;;) { - V_ipfw_dyn_v = malloc(V_curr_dyn_buckets * sizeof(ipfw_dyn_rule *), - M_IPFW, M_NOWAIT | M_ZERO); - if (V_ipfw_dyn_v != NULL || V_curr_dyn_buckets <= 2) - break; - V_curr_dyn_buckets /= 2; - } -} - -/** - * Install state of type 'type' for a dynamic session. - * The hash table contains two type of rules: - * - regular rules (O_KEEP_STATE) - * - rules for sessions with limited number of sess per user - * (O_LIMIT). When they are created, the parent is - * increased by 1, and decreased on delete. In this case, - * the third parameter is the parent rule and not the chain. - * - "parent" rules for the above (O_LIMIT_PARENT). - */ -static ipfw_dyn_rule * -add_dyn_rule(struct ipfw_flow_id *id, u_int8_t dyn_type, struct ip_fw *rule) -{ - ipfw_dyn_rule *r; - int i; - - IPFW_DYN_LOCK_ASSERT(); - - if (V_ipfw_dyn_v == NULL || - (V_dyn_count == 0 && V_dyn_buckets != V_curr_dyn_buckets)) { - realloc_dynamic_table(); - if (V_ipfw_dyn_v == NULL) - return NULL; /* failed ! */ - } - i = hash_packet(id); - - r = uma_zalloc(ipfw_dyn_rule_zone, M_NOWAIT | M_ZERO); - if (r == NULL) { - printf ("ipfw: sorry cannot allocate state\n"); - return NULL; - } - - /* increase refcount on parent, and set pointer */ - if (dyn_type == O_LIMIT) { - ipfw_dyn_rule *parent = (ipfw_dyn_rule *)rule; - if ( parent->dyn_type != O_LIMIT_PARENT) - panic("invalid parent"); - parent->count++; - r->parent = parent; - rule = parent->rule; - } - - r->id = *id; - r->expire = time_uptime + V_dyn_syn_lifetime; - r->rule = rule; - r->dyn_type = dyn_type; - r->pcnt = r->bcnt = 0; - r->count = 0; - - r->bucket = i; - r->next = V_ipfw_dyn_v[i]; - V_ipfw_dyn_v[i] = r; - V_dyn_count++; - DEB({ - struct in_addr da; -#ifdef INET6 - char src[INET6_ADDRSTRLEN]; - char dst[INET6_ADDRSTRLEN]; -#else - char src[INET_ADDRSTRLEN]; - char dst[INET_ADDRSTRLEN]; -#endif - -#ifdef INET6 - if (IS_IP6_FLOW_ID(&(r->id))) { - ip6_sprintf(src, &r->id.src_ip6); - ip6_sprintf(dst, &r->id.dst_ip6); - } else -#endif - { - da.s_addr = htonl(r->id.src_ip); - inet_ntoa_r(da, src); - da.s_addr = htonl(r->id.dst_ip); - inet_ntoa_r(da, dst); - } - printf("ipfw: add dyn entry ty %d %s %d -> %s %d, total %d\n", - dyn_type, src, r->id.src_port, dst, r->id.dst_port, - V_dyn_count); - }) - return r; -} - -/** - * lookup dynamic parent rule using pkt and rule as search keys. - * If the lookup fails, then install one. - */ -static ipfw_dyn_rule * -lookup_dyn_parent(struct ipfw_flow_id *pkt, struct ip_fw *rule) -{ - ipfw_dyn_rule *q; - int i; - - IPFW_DYN_LOCK_ASSERT(); - - if (V_ipfw_dyn_v) { - int is_v6 = IS_IP6_FLOW_ID(pkt); - i = hash_packet( pkt ); - for (q = V_ipfw_dyn_v[i] ; q != NULL ; q=q->next) - if (q->dyn_type == O_LIMIT_PARENT && - rule== q->rule && - pkt->proto == q->id.proto && - pkt->src_port == q->id.src_port && - pkt->dst_port == q->id.dst_port && - ( - (is_v6 && - IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6), - &(q->id.src_ip6)) && - IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6), - &(q->id.dst_ip6))) || - (!is_v6 && - pkt->src_ip == q->id.src_ip && - pkt->dst_ip == q->id.dst_ip) - ) - ) { - q->expire = time_uptime + V_dyn_short_lifetime; - DEB(printf("ipfw: lookup_dyn_parent found 0x%p\n",q);) - return q; - } - } - return add_dyn_rule(pkt, O_LIMIT_PARENT, rule); -} - -/** - * Install dynamic state for rule type cmd->o.opcode - * - * Returns 1 (failure) if state is not installed because of errors or because - * session limitations are enforced. - */ -static int -install_state(struct ip_fw *rule, ipfw_insn_limit *cmd, - struct ip_fw_args *args, uint32_t tablearg) -{ - static int last_log; - ipfw_dyn_rule *q; - struct in_addr da; -#ifdef INET6 - char src[INET6_ADDRSTRLEN + 2], dst[INET6_ADDRSTRLEN + 2]; -#else - char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN]; -#endif - - src[0] = '\0'; - dst[0] = '\0'; - - IPFW_DYN_LOCK(); - - DEB( -#ifdef INET6 - if (IS_IP6_FLOW_ID(&(args->f_id))) { - ip6_sprintf(src, &args->f_id.src_ip6); - ip6_sprintf(dst, &args->f_id.dst_ip6); - } else -#endif - { - da.s_addr = htonl(args->f_id.src_ip); - inet_ntoa_r(da, src); - da.s_addr = htonl(args->f_id.dst_ip); - inet_ntoa_r(da, dst); - } - printf("ipfw: %s: type %d %s %u -> %s %u\n", - __func__, cmd->o.opcode, src, args->f_id.src_port, - dst, args->f_id.dst_port); - src[0] = '\0'; - dst[0] = '\0'; - ) - - q = lookup_dyn_rule_locked(&args->f_id, NULL, NULL); - - if (q != NULL) { /* should never occur */ - if (last_log != time_uptime) { - last_log = time_uptime; - printf("ipfw: %s: entry already present, done\n", - __func__); - } - IPFW_DYN_UNLOCK(); - return (0); - } - - if (V_dyn_count >= V_dyn_max) - /* Run out of slots, try to remove any expired rule. */ - remove_dyn_rule(NULL, (ipfw_dyn_rule *)1); - - if (V_dyn_count >= V_dyn_max) { - if (last_log != time_uptime) { - last_log = time_uptime; - printf("ipfw: %s: Too many dynamic rules\n", __func__); - } - IPFW_DYN_UNLOCK(); - return (1); /* cannot install, notify caller */ - } - - switch (cmd->o.opcode) { - case O_KEEP_STATE: /* bidir rule */ - add_dyn_rule(&args->f_id, O_KEEP_STATE, rule); - break; - - case O_LIMIT: { /* limit number of sessions */ - struct ipfw_flow_id id; - ipfw_dyn_rule *parent; - uint32_t conn_limit; - uint16_t limit_mask = cmd->limit_mask; - - conn_limit = (cmd->conn_limit == IP_FW_TABLEARG) ? - tablearg : cmd->conn_limit; - - DEB( - if (cmd->conn_limit == IP_FW_TABLEARG) - printf("ipfw: %s: O_LIMIT rule, conn_limit: %u " - "(tablearg)\n", __func__, conn_limit); - else - printf("ipfw: %s: O_LIMIT rule, conn_limit: %u\n", - __func__, conn_limit); - ) - - id.dst_ip = id.src_ip = id.dst_port = id.src_port = 0; - id.proto = args->f_id.proto; - id.addr_type = args->f_id.addr_type; - id.fib = M_GETFIB(args->m); - - if (IS_IP6_FLOW_ID (&(args->f_id))) { - if (limit_mask & DYN_SRC_ADDR) - id.src_ip6 = args->f_id.src_ip6; - if (limit_mask & DYN_DST_ADDR) - id.dst_ip6 = args->f_id.dst_ip6; - } else { - if (limit_mask & DYN_SRC_ADDR) - id.src_ip = args->f_id.src_ip; - if (limit_mask & DYN_DST_ADDR) - id.dst_ip = args->f_id.dst_ip; - } - if (limit_mask & DYN_SRC_PORT) - id.src_port = args->f_id.src_port; - if (limit_mask & DYN_DST_PORT) - id.dst_port = args->f_id.dst_port; - if ((parent = lookup_dyn_parent(&id, rule)) == NULL) { - printf("ipfw: %s: add parent failed\n", __func__); - IPFW_DYN_UNLOCK(); - return (1); - } - - if (parent->count >= conn_limit) { - /* See if we can remove some expired rule. */ - remove_dyn_rule(rule, parent); - if (parent->count >= conn_limit) { - if (V_fw_verbose && last_log != time_uptime) { - last_log = time_uptime; -#ifdef INET6 - /* - * XXX IPv6 flows are not - * supported yet. - */ - if (IS_IP6_FLOW_ID(&(args->f_id))) { - char ip6buf[INET6_ADDRSTRLEN]; - snprintf(src, sizeof(src), - "[%s]", ip6_sprintf(ip6buf, - &args->f_id.src_ip6)); - snprintf(dst, sizeof(dst), - "[%s]", ip6_sprintf(ip6buf, - &args->f_id.dst_ip6)); - } else -#endif - { - da.s_addr = - htonl(args->f_id.src_ip); - inet_ntoa_r(da, src); - da.s_addr = - htonl(args->f_id.dst_ip); - inet_ntoa_r(da, dst); - } - log(LOG_SECURITY | LOG_DEBUG, - "ipfw: %d %s %s:%u -> %s:%u, %s\n", - parent->rule->rulenum, - "drop session", - src, (args->f_id.src_port), - dst, (args->f_id.dst_port), - "too many entries"); - } - IPFW_DYN_UNLOCK(); - return (1); - } - } - add_dyn_rule(&args->f_id, O_LIMIT, (struct ip_fw *)parent); - break; - } - default: - printf("ipfw: %s: unknown dynamic rule type %u\n", - __func__, cmd->o.opcode); - IPFW_DYN_UNLOCK(); - return (1); - } - - /* XXX just set lifetime */ - lookup_dyn_rule_locked(&args->f_id, NULL, NULL); - - IPFW_DYN_UNLOCK(); - return (0); -} - -/* - * Generate a TCP packet, containing either a RST or a keepalive. - * When flags & TH_RST, we are sending a RST packet, because of a - * "reset" action matched the packet. - * Otherwise we are sending a keepalive, and flags & TH_ - * The 'replyto' mbuf is the mbuf being replied to, if any, and is required - * so that MAC can label the reply appropriately. - */ -static struct mbuf * -send_pkt(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t seq, - u_int32_t ack, int flags) -{ - struct mbuf *m; - int len, dir; - struct ip *h = NULL; /* stupid compiler */ -#ifdef INET6 - struct ip6_hdr *h6 = NULL; -#endif - struct tcphdr *th = NULL; - - MGETHDR(m, M_DONTWAIT, MT_DATA); - if (m == NULL) - return (NULL); - - M_SETFIB(m, id->fib); -#ifdef MAC - if (replyto != NULL) - mac_netinet_firewall_reply(replyto, m); - else - mac_netinet_firewall_send(m); -#else - (void)replyto; /* don't warn about unused arg */ -#endif - - switch (id->addr_type) { - case 4: - len = sizeof(struct ip) + sizeof(struct tcphdr); - break; -#ifdef INET6 - case 6: - len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); - break; -#endif - default: - /* XXX: log me?!? */ - m_freem(m); - return (NULL); - } - dir = ((flags & (TH_SYN | TH_RST)) == TH_SYN); - - m->m_data += max_linkhdr; - m->m_flags |= M_SKIP_FIREWALL; - m->m_pkthdr.len = m->m_len = len; - m->m_pkthdr.rcvif = NULL; - bzero(m->m_data, len); - - switch (id->addr_type) { - case 4: - h = mtod(m, struct ip *); - - /* prepare for checksum */ - h->ip_p = IPPROTO_TCP; - h->ip_len = htons(sizeof(struct tcphdr)); - if (dir) { - h->ip_src.s_addr = htonl(id->src_ip); - h->ip_dst.s_addr = htonl(id->dst_ip); - } else { - h->ip_src.s_addr = htonl(id->dst_ip); - h->ip_dst.s_addr = htonl(id->src_ip); - } - - th = (struct tcphdr *)(h + 1); - break; -#ifdef INET6 - case 6: - h6 = mtod(m, struct ip6_hdr *); - - /* prepare for checksum */ - h6->ip6_nxt = IPPROTO_TCP; - h6->ip6_plen = htons(sizeof(struct tcphdr)); - if (dir) { - h6->ip6_src = id->src_ip6; - h6->ip6_dst = id->dst_ip6; - } else { - h6->ip6_src = id->dst_ip6; - h6->ip6_dst = id->src_ip6; - } - - th = (struct tcphdr *)(h6 + 1); - break; -#endif - } - - if (dir) { - th->th_sport = htons(id->src_port); - th->th_dport = htons(id->dst_port); - } else { - th->th_sport = htons(id->dst_port); - th->th_dport = htons(id->src_port); - } - th->th_off = sizeof(struct tcphdr) >> 2; - - if (flags & TH_RST) { - if (flags & TH_ACK) { - th->th_seq = htonl(ack); - th->th_flags = TH_RST; - } else { - if (flags & TH_SYN) - seq++; - th->th_ack = htonl(seq); - th->th_flags = TH_RST | TH_ACK; - } - } else { - /* - * Keepalive - use caller provided sequence numbers - */ - th->th_seq = htonl(seq); - th->th_ack = htonl(ack); - th->th_flags = TH_ACK; - } - - switch (id->addr_type) { - case 4: - th->th_sum = in_cksum(m, len); - - /* finish the ip header */ - h->ip_v = 4; - h->ip_hl = sizeof(*h) >> 2; - h->ip_tos = IPTOS_LOWDELAY; - h->ip_off = 0; - h->ip_len = len; - h->ip_ttl = V_ip_defttl; - h->ip_sum = 0; - break; -#ifdef INET6 - case 6: - th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(*h6), - sizeof(struct tcphdr)); - - /* finish the ip6 header */ - h6->ip6_vfc |= IPV6_VERSION; - h6->ip6_hlim = IPV6_DEFHLIM; - break; -#endif - } - - return (m); -} /* * sends a reject message, consuming the mbuf passed as an argument. */ static void -send_reject(struct ip_fw_args *args, int code, int ip_len, struct ip *ip) +send_reject(struct ip_fw_args *args, int code, int iplen, struct ip *ip) { #if 0 @@ -1835,269 +615,46 @@ send_reject(struct ip_fw_args *args, int code, int ip_len, struct ip *ip) #endif if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */ /* We need the IP header in host order for icmp_error(). */ - if (args->eh != NULL) { - ip->ip_len = ntohs(ip->ip_len); - ip->ip_off = ntohs(ip->ip_off); - } + SET_HOST_IPLEN(ip); icmp_error(args->m, ICMP_UNREACH, code, 0L, 0); } else if (args->f_id.proto == IPPROTO_TCP) { struct tcphdr *const tcp = L3HDR(struct tcphdr, mtod(args->m, struct ip *)); if ( (tcp->th_flags & TH_RST) == 0) { struct mbuf *m; - m = send_pkt(args->m, &(args->f_id), + m = ipfw_send_pkt(args->m, &(args->f_id), ntohl(tcp->th_seq), ntohl(tcp->th_ack), tcp->th_flags | TH_RST); if (m != NULL) ip_output(m, NULL, NULL, 0, NULL, NULL); } - m_freem(args->m); + FREE_PKT(args->m); } else - m_freem(args->m); + FREE_PKT(args->m); args->m = NULL; } -/** - * - * Given an ip_fw *, lookup_next_rule will return a pointer - * to the next rule, which can be either the jump - * target (for skipto instructions) or the next one in the list (in - * all other cases including a missing jump target). - * The result is also written in the "next_rule" field of the rule. - * Backward jumps are not allowed, so start looking from the next - * rule... - * - * This never returns NULL -- in case we do not have an exact match, - * the next rule is returned. When the ruleset is changed, - * pointers are flushed so we are always correct. +/* + * Support for uid/gid/jail lookup. These tests are expensive + * (because we may need to look into the list of active sockets) + * so we cache the results. ugid_lookupp is 0 if we have not + * yet done a lookup, 1 if we succeeded, and -1 if we tried + * and failed. The function always returns the match value. + * We could actually spare the variable and use *uc, setting + * it to '(void *)check_uidgid if we have no info, NULL if + * we tried and failed, or any other value if successful. */ - -static struct ip_fw * -lookup_next_rule(struct ip_fw *me, u_int32_t tablearg) -{ - struct ip_fw *rule = NULL; - ipfw_insn *cmd; - u_int16_t rulenum; - - /* look for action, in case it is a skipto */ - cmd = ACTION_PTR(me); - if (cmd->opcode == O_LOG) - cmd += F_LEN(cmd); - if (cmd->opcode == O_ALTQ) - cmd += F_LEN(cmd); - if (cmd->opcode == O_TAG) - cmd += F_LEN(cmd); - if (cmd->opcode == O_SKIPTO ) { - if (tablearg != 0) { - rulenum = (u_int16_t)tablearg; - } else { - rulenum = cmd->arg1; - } - for (rule = me->next; rule ; rule = rule->next) { - if (rule->rulenum >= rulenum) { - break; - } - } - } - if (rule == NULL) /* failure or not a skipto */ - rule = me->next; - me->next_rule = rule; - return rule; -} - -static int -add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, - uint8_t mlen, uint32_t value) -{ - struct radix_node_head *rnh; - struct table_entry *ent; - struct radix_node *rn; - - if (tbl >= IPFW_TABLES_MAX) - return (EINVAL); - rnh = ch->tables[tbl]; - ent = malloc(sizeof(*ent), M_IPFW_TBL, M_NOWAIT | M_ZERO); - if (ent == NULL) - return (ENOMEM); - ent->value = value; - ent->addr.sin_len = ent->mask.sin_len = 8; - ent->mask.sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0); - ent->addr.sin_addr.s_addr = addr & ent->mask.sin_addr.s_addr; - IPFW_WLOCK(ch); - rn = rnh->rnh_addaddr(&ent->addr, &ent->mask, rnh, (void *)ent); - if (rn == NULL) { - IPFW_WUNLOCK(ch); - free(ent, M_IPFW_TBL); - return (EEXIST); - } - IPFW_WUNLOCK(ch); - return (0); -} - -static int -del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, - uint8_t mlen) -{ - struct radix_node_head *rnh; - struct table_entry *ent; - struct sockaddr_in sa, mask; - - if (tbl >= IPFW_TABLES_MAX) - return (EINVAL); - rnh = ch->tables[tbl]; - sa.sin_len = mask.sin_len = 8; - mask.sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0); - sa.sin_addr.s_addr = addr & mask.sin_addr.s_addr; - IPFW_WLOCK(ch); - ent = (struct table_entry *)rnh->rnh_deladdr(&sa, &mask, rnh); - if (ent == NULL) { - IPFW_WUNLOCK(ch); - return (ESRCH); - } - IPFW_WUNLOCK(ch); - free(ent, M_IPFW_TBL); - return (0); -} - -static int -flush_table_entry(struct radix_node *rn, void *arg) -{ - struct radix_node_head * const rnh = arg; - struct table_entry *ent; - - ent = (struct table_entry *) - rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, rnh); - if (ent != NULL) - free(ent, M_IPFW_TBL); - return (0); -} - -static int -flush_table(struct ip_fw_chain *ch, uint16_t tbl) -{ - struct radix_node_head *rnh; - - IPFW_WLOCK_ASSERT(ch); - - if (tbl >= IPFW_TABLES_MAX) - return (EINVAL); - rnh = ch->tables[tbl]; - KASSERT(rnh != NULL, ("NULL IPFW table")); - rnh->rnh_walktree(rnh, flush_table_entry, rnh); - return (0); -} - -static void -flush_tables(struct ip_fw_chain *ch) -{ - uint16_t tbl; - - IPFW_WLOCK_ASSERT(ch); - - for (tbl = 0; tbl < IPFW_TABLES_MAX; tbl++) - flush_table(ch, tbl); -} - -static int -init_tables(struct ip_fw_chain *ch) -{ - int i; - uint16_t j; - - for (i = 0; i < IPFW_TABLES_MAX; i++) { - if (!rn_inithead((void **)&ch->tables[i], 32)) { - for (j = 0; j < i; j++) { - (void) flush_table(ch, j); - } - return (ENOMEM); - } - } - return (0); -} - -static int -lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, - uint32_t *val) -{ - struct radix_node_head *rnh; - struct table_entry *ent; - struct sockaddr_in sa; - - if (tbl >= IPFW_TABLES_MAX) - return (0); - rnh = ch->tables[tbl]; - sa.sin_len = 8; - sa.sin_addr.s_addr = addr; - ent = (struct table_entry *)(rnh->rnh_lookup(&sa, NULL, rnh)); - if (ent != NULL) { - *val = ent->value; - return (1); - } - return (0); -} - -static int -count_table_entry(struct radix_node *rn, void *arg) -{ - u_int32_t * const cnt = arg; - - (*cnt)++; - return (0); -} - -static int -count_table(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt) -{ - struct radix_node_head *rnh; - - if (tbl >= IPFW_TABLES_MAX) - return (EINVAL); - rnh = ch->tables[tbl]; - *cnt = 0; - rnh->rnh_walktree(rnh, count_table_entry, cnt); - return (0); -} - -static int -dump_table_entry(struct radix_node *rn, void *arg) -{ - struct table_entry * const n = (struct table_entry *)rn; - ipfw_table * const tbl = arg; - ipfw_table_entry *ent; - - if (tbl->cnt == tbl->size) - return (1); - ent = &tbl->ent[tbl->cnt]; - ent->tbl = tbl->tbl; - if (in_nullhost(n->mask.sin_addr)) - ent->masklen = 0; - else - ent->masklen = 33 - ffs(ntohl(n->mask.sin_addr.s_addr)); - ent->addr = n->addr.sin_addr.s_addr; - ent->value = n->value; - tbl->cnt++; - return (0); -} - -static int -dump_table(struct ip_fw_chain *ch, ipfw_table *tbl) -{ - struct radix_node_head *rnh; - - if (tbl->tbl >= IPFW_TABLES_MAX) - return (EINVAL); - rnh = ch->tables[tbl->tbl]; - tbl->cnt = 0; - rnh->rnh_walktree(rnh, dump_table_entry, tbl); - return (0); -} - static int check_uidgid(ipfw_insn_u32 *insn, int proto, struct ifnet *oif, struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip, - u_int16_t src_port, struct ucred **uc, int *ugid_lookupp, - struct inpcb *inp) + u_int16_t src_port, int *ugid_lookupp, + struct ucred **uc, struct inpcb *inp) { +#ifndef __FreeBSD__ + return cred_check(insn, proto, oif, + dst_ip, dst_port, src_ip, src_port, + (struct bsd_ucred *)uc, ugid_lookupp, ((struct mbuf *)inp)->m_skb); +#else /* FreeBSD */ struct inpcbinfo *pi; int wildcard; struct inpcb *pcb; @@ -2150,10 +707,8 @@ check_uidgid(ipfw_insn_u32 *insn, int proto, struct ifnet *oif, INP_INFO_RUNLOCK(pi); if (*ugid_lookupp == 0) { /* - * If the lookup did not yield any results, there - * is no sense in coming back and trying again. So - * we can set lookup to -1 and ensure that we wont - * bother the pcb system again. + * We tried and failed, set the variable to -1 + * so we will not try again on this packet. */ *ugid_lookupp = -1; return (0); @@ -2166,6 +721,22 @@ check_uidgid(ipfw_insn_u32 *insn, int proto, struct ifnet *oif, else if (insn->o.opcode == O_JAIL) match = ((*uc)->cr_prison->pr_id == (int)insn->d[0]); return match; +#endif /* __FreeBSD__ */ +} + +/* + * Helper function to set args with info on the rule after the matching + * one. slot is precise, whereas we guess rule_id as they are + * assigned sequentially. + */ +static inline void +set_match(struct ip_fw_args *args, int slot, + struct ip_fw_chain *chain) +{ + args->rule.chain_id = chain->id; + args->rule.slot = slot + 1; /* we use 0 as a marker */ + args->rule.rule_id = 1 + chain->map[slot]->id; + args->rule.rulenum = chain->map[slot]->rulenum; } /* @@ -2178,10 +749,10 @@ check_uidgid(ipfw_insn_u32 *insn, int proto, struct ifnet *oif, * * args->m (in/out) The packet; we set to NULL when/if we nuke it. * Starts with the IP header. - * args->eh (in) Mac header if present, or NULL for layer3 packet. + * args->eh (in) Mac header if present, NULL for layer3 packet. * args->L3offset Number of bytes bypassed if we came from L2. * e.g. often sizeof(eh) ** NOTYET ** - * args->oif Outgoing interface, or NULL if packet is incoming. + * args->oif Outgoing interface, NULL if packet is incoming. * The incoming interface is in the mbuf. (in) * args->divert_rule (in/out) * Skip up to the first rule past this rule number; @@ -2190,7 +761,7 @@ check_uidgid(ipfw_insn_u32 *insn, int proto, struct ifnet *oif, * args->rule Pointer to the last matching rule (in/out) * args->next_hop Socket we are forwarding to (out). * args->f_id Addresses grabbed from the packet (out) - * args->cookie a cookie depending on rule action + * args->rule.info a cookie depending on rule action * * Return value: * @@ -2200,6 +771,8 @@ check_uidgid(ipfw_insn_u32 *insn, int proto, struct ifnet *oif, * IP_FW_TEE tee packet, port in m_tag * IP_FW_DUMMYNET to dummynet, pipe in args->cookie * IP_FW_NETGRAPH into netgraph, cookie args->cookie + * args->rule contains the matching rule, + * args->rule.info has additional information. * */ int @@ -2207,7 +780,7 @@ ipfw_chk(struct ip_fw_args *args) { /* - * Local variables holding state during the processing of a packet: + * Local variables holding state while processing a packet: * * IMPORTANT NOTE: to speed up the processing of rules, there * are some assumption on the values of the variables, which @@ -2240,18 +813,14 @@ ipfw_chk(struct ip_fw_args *args) * these types of constraints, as well as decrease contention * on pcb related locks. */ +#ifndef __FreeBSD__ + struct bsd_ucred ucred_cache; +#else struct ucred *ucred_cache = NULL; +#endif int ucred_lookup = 0; /* - * divinput_flags If non-zero, set to the IP_FW_DIVERT_*_FLAG - * associated with a packet input on a divert socket. This - * will allow to distinguish traffic and its direction when - * it originates from a divert socket. - */ - u_int divinput_flags = 0; - - /* * oif | args->oif If NULL, ipfw_chk has been called on the * inbound path (ether_input, ip_input). * If non-NULL, ipfw_chk has been called on the outbound path @@ -2259,7 +828,7 @@ ipfw_chk(struct ip_fw_args *args) */ struct ifnet *oif = args->oif; - struct ip_fw *f = NULL; /* matching rule */ + int f_pos = 0; /* index of current rule in the array */ int retval = 0; /* @@ -2294,12 +863,12 @@ ipfw_chk(struct ip_fw_args *args) * src_ip, dst_ip ip addresses, in NETWORK format. * Only valid for IPv4 packets. */ - u_int8_t proto; - u_int16_t src_port = 0, dst_port = 0; /* NOTE: host format */ + uint8_t proto; + uint16_t src_port = 0, dst_port = 0; /* NOTE: host format */ struct in_addr src_ip, dst_ip; /* NOTE: network format */ - u_int16_t ip_len=0; + uint16_t iplen=0; int pktlen; - u_int16_t etype = 0; /* Host order stored ether type */ + uint16_t etype = 0; /* Host order stored ether type */ /* * dyn_dir = MATCH_UNKNOWN when rules unchecked, @@ -2309,7 +878,6 @@ ipfw_chk(struct ip_fw_args *args) int dyn_dir = MATCH_UNKNOWN; ipfw_dyn_rule *q = NULL; struct ip_fw_chain *chain = &V_layer3_chain; - struct m_tag *mtag; /* * We store in ulp a pointer to the upper layer protocol header. @@ -2318,12 +886,17 @@ ipfw_chk(struct ip_fw_args *args) * ulp is NULL if not found. */ void *ulp = NULL; /* upper layer protocol pointer. */ + /* XXX ipv6 variables */ int is_ipv6 = 0; - u_int16_t ext_hd = 0; /* bits vector for extension header filtering */ + uint8_t icmp6_type = 0; + uint16_t ext_hd = 0; /* bits vector for extension header filtering */ /* end of ipv6 variables */ + int is_ipv4 = 0; + int done = 0; /* flag to exit the outer loop */ + if (m->m_flags & M_SKIP_FIREWALL || (! V_ipfw_vnet_ready)) return (IP_FW_PASS); /* accept */ @@ -2340,15 +913,15 @@ ipfw_chk(struct ip_fw_args *args) * pointer might become stale after other pullups (but we never use it * this way). */ -#define PULLUP_TO(_len, p, T) \ -do { \ - int x = (_len) + sizeof(T); \ - if ((m)->m_len < x) { \ - args->m = m = m_pullup(m, x); \ - if (m == NULL) \ - goto pullup_failed; \ - } \ - p = (mtod(m, char *) + (_len)); \ +#define PULLUP_TO(_len, p, T) \ +do { \ + int x = (_len) + sizeof(T); \ + if ((m)->m_len < x) { \ + args->m = m = m_pullup(m, x); \ + if (m == NULL) \ + goto pullup_failed; \ + } \ + p = (mtod(m, char *) + (_len)); \ } while (0) /* @@ -2371,14 +944,15 @@ do { \ switch (proto) { case IPPROTO_ICMPV6: PULLUP_TO(hlen, ulp, struct icmp6_hdr); - args->f_id.flags = ICMP6(ulp)->icmp6_type; + icmp6_type = ICMP6(ulp)->icmp6_type; break; case IPPROTO_TCP: PULLUP_TO(hlen, ulp, struct tcphdr); dst_port = TCP(ulp)->th_dport; src_port = TCP(ulp)->th_sport; - args->f_id.flags = TCP(ulp)->th_flags; + /* save flags for dynamic rules */ + args->f_id._flags = TCP(ulp)->th_flags; break; case IPPROTO_SCTP: @@ -2442,7 +1016,7 @@ do { \ return (IP_FW_DENY); break; } - args->f_id.frag_id6 = + args->f_id.extra = ntohl(((struct ip6_frag *)ulp)->ip6f_ident); ulp = NULL; break; @@ -2535,14 +1109,9 @@ do { \ proto = ip->ip_p; src_ip = ip->ip_src; dst_ip = ip->ip_dst; - if (args->eh != NULL) { /* layer 2 packets are as on the wire */ - offset = ntohs(ip->ip_off) & IP_OFFMASK; - ip_len = ntohs(ip->ip_len); - } else { - offset = ip->ip_off & IP_OFFMASK; - ip_len = ip->ip_len; - } - pktlen = ip_len < pktlen ? ip_len : pktlen; + offset = ntohs(ip->ip_off) & IP_OFFMASK; + iplen = ntohs(ip->ip_len); + pktlen = iplen < pktlen ? iplen : pktlen; if (offset == 0) { switch (proto) { @@ -2550,7 +1119,8 @@ do { \ PULLUP_TO(hlen, ulp, struct tcphdr); dst_port = TCP(ulp)->th_dport; src_port = TCP(ulp)->th_sport; - args->f_id.flags = TCP(ulp)->th_flags; + /* save flags for dynamic rules */ + args->f_id._flags = TCP(ulp)->th_flags; break; case IPPROTO_UDP: @@ -2561,7 +1131,7 @@ do { \ case IPPROTO_ICMP: PULLUP_TO(hlen, ulp, struct icmphdr); - args->f_id.flags = ICMP(ulp)->icmp_type; + //args->f_id.flags = ICMP(ulp)->icmp_type; break; default: @@ -2585,64 +1155,47 @@ do { \ IPFW_RUNLOCK(chain); return (IP_FW_PASS); /* accept */ } - mtag = m_tag_find(m, PACKET_TAG_DIVERT, NULL); - if (args->rule) { + if (args->rule.slot) { /* - * Packet has already been tagged. Look for the next rule - * to restart processing. Make sure that args->rule still - * exists and not changed. + * Packet has already been tagged as a result of a previous + * match on rule args->rule aka args->rule_id (PIPE, QUEUE, + * REASS, NETGRAPH, DIVERT/TEE...) + * Validate the slot and continue from the next one + * if still present, otherwise do a lookup. */ - if (chain->id != args->chain_id) { - for (f = chain->rules; f != NULL; f = f->next) - if (f == args->rule && f->id == args->rule_id) - break; - - if (f != NULL) - f = f->next_rule; - else - f = ip_fw_default_rule; - } else - f = args->rule->next_rule; - - if (f == NULL) - f = lookup_next_rule(args->rule, 0); + f_pos = (args->rule.chain_id == chain->id) ? + args->rule.slot : + ipfw_find_rule(chain, args->rule.rulenum, + args->rule.rule_id); } else { - /* - * Find the starting rule. It can be either the first - * one, or the one after divert_rule if asked so. - */ - int skipto = mtag ? divert_cookie(mtag) : 0; - - f = chain->rules; - if (args->eh == NULL && skipto != 0) { - if (skipto >= IPFW_DEFAULT_RULE) { - IPFW_RUNLOCK(chain); - return (IP_FW_DENY); /* invalid */ - } - while (f && f->rulenum <= skipto) - f = f->next; - if (f == NULL) { /* drop packet */ - IPFW_RUNLOCK(chain); - return (IP_FW_DENY); - } - } - } - /* reset divert rule to avoid confusion later */ - if (mtag) { - divinput_flags = divert_info(mtag) & - (IP_FW_DIVERT_OUTPUT_FLAG | IP_FW_DIVERT_LOOPBACK_FLAG); - m_tag_delete(m, mtag); + f_pos = 0; } /* * Now scan the rules, and parse microinstructions for each rule. + * We have two nested loops and an inner switch. Sometimes we + * need to break out of one or both loops, or re-enter one of + * the loops with updated variables. Loop variables are: + * + * f_pos (outer loop) points to the current rule. + * On output it points to the matching rule. + * done (outer loop) is used as a flag to break the loop. + * l (inner loop) residual length of current rule. + * cmd points to the current microinstruction. + * + * We break the inner loop by setting l=0 and possibly + * cmdlen=0 if we don't want to advance cmd. + * We break the outer loop by setting done=1 + * We can restart the inner loop by setting l>0 and f_pos, f, cmd + * as needed. */ - for (; f; f = f->next) { + for (; f_pos < chain->n_rules; f_pos++) { ipfw_insn *cmd; uint32_t tablearg = 0; int l, cmdlen, skip_or; /* skip rest of OR block */ + struct ip_fw *f; -again: + f = chain->map[f_pos]; if (V_set_disable & (1 << f->set) ) continue; @@ -2657,7 +1210,7 @@ again: * the target rule. */ -check_body: +/* check_body: */ cmdlen = F_LEN(cmd); /* * An OR block (insn_1 || .. || insn_n) has the @@ -2708,8 +1261,13 @@ check_body: (ipfw_insn_u32 *)cmd, proto, oif, dst_ip, dst_port, - src_ip, src_port, &ucred_cache, - &ucred_lookup, args->inp); + src_ip, src_port, &ucred_lookup, +#ifdef __FreeBSD__ + &ucred_cache, args->inp); +#else + (void *)&ucred_cache, + (struct inpcb *)args->m); +#endif break; case O_RECV: @@ -2767,10 +1325,15 @@ check_body: break; case O_DIVERTED: - match = (cmd->arg1 & 1 && divinput_flags & - IP_FW_DIVERT_LOOPBACK_FLAG) || - (cmd->arg1 & 2 && divinput_flags & - IP_FW_DIVERT_OUTPUT_FLAG); + { + /* For diverted packets, args->rule.info + * contains the divert port (in host format) + * reason and direction. + */ + uint32_t i = args->rule.info; + match = (i&IPFW_IS_MASK) == IPFW_IS_DIVERT && + cmd->arg1 & ((i & IPFW_INFO_IN) ? 1 : 2); + } break; case O_PROTO: @@ -2790,13 +1353,57 @@ check_body: case O_IP_SRC_LOOKUP: case O_IP_DST_LOOKUP: if (is_ipv4) { - uint32_t a = + uint32_t key = (cmd->opcode == O_IP_DST_LOOKUP) ? dst_ip.s_addr : src_ip.s_addr; uint32_t v = 0; - match = lookup_table(chain, cmd->arg1, a, - &v); + if (cmdlen > F_INSN_SIZE(ipfw_insn_u32)) { + /* generic lookup. The key must be + * in 32bit big-endian format. + */ + v = ((ipfw_insn_u32 *)cmd)->d[1]; + if (v == 0) + key = dst_ip.s_addr; + else if (v == 1) + key = src_ip.s_addr; + else if (v == 6) /* dscp */ + key = (ip->ip_tos >> 2) & 0x3f; + else if (offset != 0) + break; + else if (proto != IPPROTO_TCP && + proto != IPPROTO_UDP) + break; + else if (v == 2) + key = htonl(dst_port); + else if (v == 3) + key = htonl(src_port); + else if (v == 4 || v == 5) { + check_uidgid( + (ipfw_insn_u32 *)cmd, + proto, oif, + dst_ip, dst_port, + src_ip, src_port, &ucred_lookup, +#ifdef __FreeBSD__ + &ucred_cache, args->inp); + if (v == 4 /* O_UID */) + key = ucred_cache->cr_uid; + else if (v == 5 /* O_JAIL */) + key = ucred_cache->cr_prison->pr_id; +#else /* !__FreeBSD__ */ + (void *)&ucred_cache, + (struct inpcb *)args->m); + if (v ==4 /* O_UID */) + key = ucred_cache.uid; + else if (v == 5 /* O_JAIL */) + key = ucred_cache.xid; +#endif /* !__FreeBSD__ */ + key = htonl(key); + } else + break; + } + match = ipfw_lookup_table(chain, + cmd->arg1, key, &v); if (!match) break; if (cmdlen == F_INSN_SIZE(ipfw_insn_u32)) @@ -2827,7 +1434,13 @@ check_body: INADDR_TO_IFP(src_ip, tif); match = (tif != NULL); + break; } +#ifdef INET6 + /* FALLTHROUGH */ + case O_IP6_SRC_ME: + match= is_ipv6 && search_ip6_addr_net(&args->f_id.src_ip6); +#endif break; case O_IP_DST_SET: @@ -2860,9 +1473,16 @@ check_body: INADDR_TO_IFP(dst_ip, tif); match = (tif != NULL); + break; } +#ifdef INET6 + /* FALLTHROUGH */ + case O_IP6_DST_ME: + match= is_ipv6 && search_ip6_addr_net(&args->f_id.dst_ip6); +#endif break; + case O_IP_SRCPORT: case O_IP_DSTPORT: /* @@ -2919,7 +1539,7 @@ check_body: int i; if (cmd->opcode == O_IPLEN) - x = ip_len; + x = iplen; else if (cmd->opcode == O_IPTTL) x = ip->ip_ttl; else /* must be IPID */ @@ -2954,7 +1574,7 @@ check_body: int i; tcp = TCP(ulp); - x = ip_len - + x = iplen - ((ip->ip_hl + tcp->th_off) << 2); if (cmdlen == 1) { match = (cmd->arg1 == x); @@ -3029,8 +1649,7 @@ check_body: } case O_LOG: - if (V_fw_verbose) - ipfw_log(f, hlen, args, m, + ipfw_log(f, hlen, args, m, oif, offset, tablearg, ip); match = 1; break; @@ -3129,14 +1748,6 @@ check_body: } break; - case O_IP6_SRC_ME: - match= is_ipv6 && search_ip6_addr_net(&args->f_id.src_ip6); - break; - - case O_IP6_DST_ME: - match= is_ipv6 && search_ip6_addr_net(&args->f_id.dst_ip6); - break; - case O_FLOW6ID: match = is_ipv6 && flow6id_match(args->f_id.flow_id6, @@ -3158,6 +1769,7 @@ check_body: break; case O_TAG: { + struct m_tag *mtag; uint32_t tag = (cmd->arg1 == IP_FW_TABLEARG) ? tablearg : cmd->arg1; @@ -3174,12 +1786,13 @@ check_body: if (cmd->len & F_NOT) { /* `untag' action */ if (mtag != NULL) m_tag_delete(m, mtag); + match = 0; } else if (mtag == NULL) { if ((mtag = m_tag_alloc(MTAG_IPFW, tag, 0, M_NOWAIT)) != NULL) m_tag_prepend(m, mtag); + match = 1; } - match = (cmd->len & F_NOT) ? 0: 1; break; } @@ -3189,6 +1802,7 @@ check_body: break; case O_TAGGED: { + struct m_tag *mtag; uint32_t tag = (cmd->arg1 == IP_FW_TABLEARG) ? tablearg : cmd->arg1; @@ -3228,14 +1842,13 @@ check_body: * * In general, here we set retval and terminate the * outer loop (would be a 'break 3' in some language, - * but we need to do a 'goto done'). + * but we need to set l=0, done=1) * * Exceptions: * O_COUNT and O_SKIPTO actions: * instead of terminating, we jump to the next rule - * ('goto next_rule', equivalent to a 'break 2'), - * or to the SKIPTO target ('goto again' after - * having set f, cmd and l), respectively. + * (setting l=0), or to the SKIPTO target (setting + * f/f_len, cmd and l as needed), respectively. * * O_TAG, O_LOG and O_ALTQ action parameters: * perform some action and set match = 1; @@ -3246,25 +1859,28 @@ check_body: * These opcodes try to install an entry in the * state tables; if successful, we continue with * the next opcode (match=1; break;), otherwise - * the packet * must be dropped - * ('goto done' after setting retval); + * the packet must be dropped (set retval, + * break loops with l=0, done=1) * * O_PROBE_STATE and O_CHECK_STATE: these opcodes * cause a lookup of the state table, and a jump * to the 'action' part of the parent rule - * ('goto check_body') if an entry is found, or + * if an entry is found, or * (CHECK_STATE only) a jump to the next rule if - * the entry is not found ('goto next_rule'). - * The result of the lookup is cached to make - * further instances of these opcodes are - * effectively NOPs. + * the entry is not found. + * The result of the lookup is cached so that + * further instances of these opcodes become NOPs. + * The jump to the next rule is done by setting + * l=0, cmdlen=0. */ case O_LIMIT: case O_KEEP_STATE: - if (install_state(f, + if (ipfw_install_state(f, (ipfw_insn_limit *)cmd, args, tablearg)) { + /* error or limit violation */ retval = IP_FW_DENY; - goto done; /* error/limit violation */ + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ } match = 1; break; @@ -3281,22 +1897,32 @@ check_body: * to be run first). */ if (dyn_dir == MATCH_UNKNOWN && - (q = lookup_dyn_rule(&args->f_id, + (q = ipfw_lookup_dyn_rule(&args->f_id, &dyn_dir, proto == IPPROTO_TCP ? TCP(ulp) : NULL)) != NULL) { /* * Found dynamic entry, update stats * and jump to the 'action' part of - * the parent rule. + * the parent rule by setting + * f, cmd, l and clearing cmdlen. */ q->pcnt++; q->bcnt += pktlen; + /* XXX we would like to have f_pos + * readily accessible in the dynamic + * rule, instead of having to + * lookup q->rule. + */ f = q->rule; + f_pos = ipfw_find_rule(chain, + f->rulenum, f->id); cmd = ACTION_PTR(f); l = f->cmd_len - f->act_ofs; - IPFW_DYN_UNLOCK(); - goto check_body; + ipfw_dyn_unlock(); + cmdlen = 0; + match = 1; + break; } /* * Dynamic entry not found. If CHECK_STATE, @@ -3304,70 +1930,96 @@ check_body: * ignore and continue with next opcode. */ if (cmd->opcode == O_CHECK_STATE) - goto next_rule; + l = 0; /* exit inner loop */ match = 1; break; case O_ACCEPT: retval = 0; /* accept */ - goto done; + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + break; case O_PIPE: case O_QUEUE: - args->rule = f; /* report matching rule */ - args->rule_id = f->id; - args->chain_id = chain->id; - if (cmd->arg1 == IP_FW_TABLEARG) - args->cookie = tablearg; - else - args->cookie = cmd->arg1; + set_match(args, f_pos, chain); + args->rule.info = (cmd->arg1 == IP_FW_TABLEARG) ? + tablearg : cmd->arg1; + if (cmd->opcode == O_PIPE) + args->rule.info |= IPFW_IS_PIPE; + if (V_fw_one_pass) + args->rule.info |= IPFW_ONEPASS; retval = IP_FW_DUMMYNET; - goto done; + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + break; case O_DIVERT: - case O_TEE: { - struct divert_tag *dt; - + case O_TEE: if (args->eh) /* not on layer 2 */ - break; - mtag = m_tag_get(PACKET_TAG_DIVERT, - sizeof(struct divert_tag), - M_NOWAIT); - if (mtag == NULL) { - /* XXX statistic */ - /* drop packet */ - IPFW_RUNLOCK(chain); - if (ucred_cache != NULL) - crfree(ucred_cache); - return (IP_FW_DENY); - } - dt = (struct divert_tag *)(mtag+1); - dt->cookie = f->rulenum; - if (cmd->arg1 == IP_FW_TABLEARG) - dt->info = tablearg; - else - dt->info = cmd->arg1; - m_tag_prepend(m, mtag); + break; + /* otherwise this is terminal */ + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ retval = (cmd->opcode == O_DIVERT) ? - IP_FW_DIVERT : IP_FW_TEE; - goto done; - } + IP_FW_DIVERT : IP_FW_TEE; + set_match(args, f_pos, chain); + args->rule.info = (cmd->arg1 == IP_FW_TABLEARG) ? + tablearg : cmd->arg1; + break; + case O_COUNT: - case O_SKIPTO: f->pcnt++; /* update stats */ f->bcnt += pktlen; f->timestamp = time_uptime; - if (cmd->opcode == O_COUNT) - goto next_rule; - /* handle skipto */ - if (cmd->arg1 == IP_FW_TABLEARG) { - f = lookup_next_rule(f, tablearg); - } else { - if (f->next_rule == NULL) - lookup_next_rule(f, 0); - f = f->next_rule; + l = 0; /* exit inner loop */ + break; + + case O_SKIPTO: + f->pcnt++; /* update stats */ + f->bcnt += pktlen; + f->timestamp = time_uptime; + /* If possible use cached f_pos (in f->next_rule), + * whose version is written in f->next_rule + * (horrible hacks to avoid changing the ABI). + */ + if (cmd->arg1 != IP_FW_TABLEARG && + (uintptr_t)f->x_next == chain->id) { + f_pos = (uintptr_t)f->next_rule; + } else { + int i = (cmd->arg1 == IP_FW_TABLEARG) ? + tablearg : cmd->arg1; + /* make sure we do not jump backward */ + if (i <= f->rulenum) + i = f->rulenum + 1; + f_pos = ipfw_find_rule(chain, i, 0); + /* update the cache */ + if (cmd->arg1 != IP_FW_TABLEARG) { + f->next_rule = + (void *)(uintptr_t)f_pos; + f->x_next = + (void *)(uintptr_t)chain->id; } - goto again; + } + /* + * Skip disabled rules, and re-enter + * the inner loop with the correct + * f_pos, f, l and cmd. + * Also clear cmdlen and skip_or + */ + for (; f_pos < chain->n_rules - 1 && + (V_set_disable & + (1 << chain->map[f_pos]->set)); + f_pos++) + ; + /* prepare to enter the inner loop */ + f = chain->map[f_pos]; + l = f->cmd_len; + cmd = f->cmd; + match = 1; + cmdlen = 0; + skip_or = 0; + break; case O_REJECT: /* @@ -3380,7 +2032,7 @@ check_body: is_icmp_query(ICMP(ulp))) && !(m->m_flags & (M_BCAST|M_MCAST)) && !IN_MULTICAST(ntohl(dst_ip.s_addr))) { - send_reject(args, cmd->arg1, ip_len, ip); + send_reject(args, cmd->arg1, iplen, ip); m = args->m; } /* FALLTHROUGH */ @@ -3389,7 +2041,7 @@ check_body: if (hlen > 0 && is_ipv6 && ((offset & IP6F_OFF_MASK) == 0) && (proto != IPPROTO_ICMPV6 || - (is_icmp6_query(args->f_id.flags) == 1)) && + (is_icmp6_query(icmp6_type) == 1)) && !(m->m_flags & (M_BCAST|M_MCAST)) && !IN6_IS_ADDR_MULTICAST(&args->f_id.dst_ip6)) { send_reject6( @@ -3401,41 +2053,41 @@ check_body: #endif case O_DENY: retval = IP_FW_DENY; - goto done; + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + break; - case O_FORWARD_IP: { - struct sockaddr_in *sa; - sa = &(((ipfw_insn_sa *)cmd)->sa); + case O_FORWARD_IP: if (args->eh) /* not valid on layer2 pkts */ break; if (!q || dyn_dir == MATCH_FORWARD) { - if (sa->sin_addr.s_addr == INADDR_ANY) { - bcopy(sa, &args->hopstore, + struct sockaddr_in *sa; + sa = &(((ipfw_insn_sa *)cmd)->sa); + if (sa->sin_addr.s_addr == INADDR_ANY) { + bcopy(sa, &args->hopstore, sizeof(*sa)); - args->hopstore.sin_addr.s_addr = + args->hopstore.sin_addr.s_addr = htonl(tablearg); - args->next_hop = - &args->hopstore; - } else { - args->next_hop = sa; - } + args->next_hop = &args->hopstore; + } else { + args->next_hop = sa; + } } retval = IP_FW_PASS; - } - goto done; + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + break; case O_NETGRAPH: case O_NGTEE: - args->rule = f; /* report matching rule */ - args->rule_id = f->id; - args->chain_id = chain->id; - if (cmd->arg1 == IP_FW_TABLEARG) - args->cookie = tablearg; - else - args->cookie = cmd->arg1; + set_match(args, f_pos, chain); + args->rule.info = (cmd->arg1 == IP_FW_TABLEARG) ? + tablearg : cmd->arg1; retval = (cmd->opcode == O_NETGRAPH) ? IP_FW_NETGRAPH : IP_FW_NGTEE; - goto done; + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + break; case O_SETFIB: f->pcnt++; /* update stats */ @@ -3443,88 +2095,86 @@ check_body: f->timestamp = time_uptime; M_SETFIB(m, cmd->arg1); args->f_id.fib = cmd->arg1; - goto next_rule; + l = 0; /* exit inner loop */ + break; + + case O_NAT: + if (!IPFW_NAT_LOADED) { + retval = IP_FW_DENY; + } else { + struct cfg_nat *t; + int nat_id; - case O_NAT: { - struct cfg_nat *t; - int nat_id; + set_match(args, f_pos, chain); + t = ((ipfw_insn_nat *)cmd)->nat; + if (t == NULL) { + nat_id = (cmd->arg1 == IP_FW_TABLEARG) ? + tablearg : cmd->arg1; + t = (*lookup_nat_ptr)(&chain->nat, nat_id); - if (IPFW_NAT_LOADED) { - args->rule = f; /* Report matching rule. */ - args->rule_id = f->id; - args->chain_id = chain->id; - t = ((ipfw_insn_nat *)cmd)->nat; if (t == NULL) { - nat_id = (cmd->arg1 == IP_FW_TABLEARG) ? - tablearg : cmd->arg1; - LOOKUP_NAT(V_layer3_chain, nat_id, t); - if (t == NULL) { - retval = IP_FW_DENY; - goto done; - } - if (cmd->arg1 != IP_FW_TABLEARG) - ((ipfw_insn_nat *)cmd)->nat = t; + retval = IP_FW_DENY; + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + break; } - retval = ipfw_nat_ptr(args, t, m); - } else - retval = IP_FW_DENY; - goto done; - } + if (cmd->arg1 != IP_FW_TABLEARG) + ((ipfw_insn_nat *)cmd)->nat = t; + } + retval = ipfw_nat_ptr(args, t, m); + } + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + break; case O_REASS: { int ip_off; f->pcnt++; f->bcnt += pktlen; - ip_off = (args->eh != NULL) ? ntohs(ip->ip_off) : ip->ip_off; - if (ip_off & (IP_MF | IP_OFFMASK)) { - /* - * ip_reass() expects len & off in host - * byte order: fix them in case we come - * from layer2. - */ - if (args->eh != NULL) { - ip->ip_len = ntohs(ip->ip_len); - ip->ip_off = ntohs(ip->ip_off); - } + l = 0; /* in any case exit inner loop */ + ip_off = ntohs(ip->ip_off); - m = ip_reass(m); - args->m = m; - - /* - * IP header checksum fixup after - * reassembly and leave header - * in network byte order. - */ - if (m != NULL) { - int hlen; - - ip = mtod(m, struct ip *); - hlen = ip->ip_hl << 2; - /* revert len & off for layer2 pkts */ - if (args->eh != NULL) - ip->ip_len = htons(ip->ip_len); - ip->ip_sum = 0; - if (hlen == sizeof(struct ip)) - ip->ip_sum = in_cksum_hdr(ip); - else - ip->ip_sum = in_cksum(m, hlen); - retval = IP_FW_REASS; - args->rule = f; - args->rule_id = f->id; - args->chain_id = chain->id; - goto done; - } else { - retval = IP_FW_DENY; - goto done; - } + /* if not fragmented, go to next rule */ + if ((ip_off & (IP_MF | IP_OFFMASK)) == 0) + break; + /* + * ip_reass() expects len & off in host + * byte order. + */ + SET_HOST_IPLEN(ip); + + args->m = m = ip_reass(m); + + /* + * do IP header checksum fixup. + */ + if (m == NULL) { /* fragment got swallowed */ + retval = IP_FW_DENY; + } else { /* good, packet complete */ + int hlen; + + ip = mtod(m, struct ip *); + hlen = ip->ip_hl << 2; + SET_NET_IPLEN(ip); + ip->ip_sum = 0; + if (hlen == sizeof(struct ip)) + ip->ip_sum = in_cksum_hdr(ip); + else + ip->ip_sum = in_cksum(m, hlen); + retval = IP_FW_REASS; + set_match(args, f_pos, chain); } - goto next_rule; + done = 1; /* exit outer loop */ + break; } default: panic("-- unknown opcode %d\n", cmd->opcode); } /* end of switch() on opcodes */ + /* + * if we get here with l=0, then match is irrelevant. + */ if (cmd->len & F_NOT) match = !match; @@ -3537,25 +2187,30 @@ check_body: break; /* try next rule */ } - } /* end of inner for, scan opcodes */ + } /* end of inner loop, scan opcodes */ + + if (done) + break; -next_rule:; /* try next rule */ +/* next_rule:; */ /* try next rule */ } /* end of outer for, scan rules */ - printf("ipfw: ouch!, skip past end of rules, denying packet\n"); - IPFW_RUNLOCK(chain); - if (ucred_cache != NULL) - crfree(ucred_cache); - return (IP_FW_DENY); -done: - /* Update statistics */ - f->pcnt++; - f->bcnt += pktlen; - f->timestamp = time_uptime; + if (done) { + struct ip_fw *rule = chain->map[f_pos]; + /* Update statistics */ + rule->pcnt++; + rule->bcnt += pktlen; + rule->timestamp = time_uptime; + } else { + retval = IP_FW_DENY; + printf("ipfw: ouch!, skip past end of rules, denying packet\n"); + } IPFW_RUNLOCK(chain); +#ifdef __FreeBSD__ if (ucred_cache != NULL) crfree(ucred_cache); +#endif return (retval); pullup_failed: @@ -3565,1150 +2220,10 @@ pullup_failed: } /* - * When a rule is added/deleted, clear the next_rule pointers in all rules. - * These will be reconstructed on the fly as packets are matched. - */ -static void -flush_rule_ptrs(struct ip_fw_chain *chain) -{ - struct ip_fw *rule; - - IPFW_WLOCK_ASSERT(chain); - - chain->id++; - - for (rule = chain->rules; rule; rule = rule->next) - rule->next_rule = NULL; -} - -/* - * Add a new rule to the list. Copy the rule into a malloc'ed area, then - * possibly create a rule number and add the rule to the list. - * Update the rule_number in the input struct so the caller knows it as well. - */ -static int -add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule) -{ - struct ip_fw *rule, *f, *prev; - int l = RULESIZE(input_rule); - - if (chain->rules == NULL && input_rule->rulenum != IPFW_DEFAULT_RULE) - return (EINVAL); - - rule = malloc(l, M_IPFW, M_NOWAIT | M_ZERO); - if (rule == NULL) - return (ENOSPC); - - bcopy(input_rule, rule, l); - - rule->next = NULL; - rule->next_rule = NULL; - - rule->pcnt = 0; - rule->bcnt = 0; - rule->timestamp = 0; - - IPFW_WLOCK(chain); - - if (chain->rules == NULL) { /* default rule */ - chain->rules = rule; - rule->id = ++chain->id; - goto done; - } - - /* - * If rulenum is 0, find highest numbered rule before the - * default rule, and add autoinc_step - */ - if (V_autoinc_step < 1) - V_autoinc_step = 1; - else if (V_autoinc_step > 1000) - V_autoinc_step = 1000; - if (rule->rulenum == 0) { - /* - * locate the highest numbered rule before default - */ - for (f = chain->rules; f; f = f->next) { - if (f->rulenum == IPFW_DEFAULT_RULE) - break; - rule->rulenum = f->rulenum; - } - if (rule->rulenum < IPFW_DEFAULT_RULE - V_autoinc_step) - rule->rulenum += V_autoinc_step; - input_rule->rulenum = rule->rulenum; - } - - /* - * Now insert the new rule in the right place in the sorted list. - */ - for (prev = NULL, f = chain->rules; f; prev = f, f = f->next) { - if (f->rulenum > rule->rulenum) { /* found the location */ - if (prev) { - rule->next = f; - prev->next = rule; - } else { /* head insert */ - rule->next = chain->rules; - chain->rules = rule; - } - break; - } - } - flush_rule_ptrs(chain); - /* chain->id incremented inside flush_rule_ptrs() */ - rule->id = chain->id; -done: - V_static_count++; - V_static_len += l; - IPFW_WUNLOCK(chain); - DEB(printf("ipfw: installed rule %d, static count now %d\n", - rule->rulenum, V_static_count);) - return (0); -} - -/** - * Remove a static rule (including derived * dynamic rules) - * and place it on the ``reap list'' for later reclamation. - * The caller is in charge of clearing rule pointers to avoid - * dangling pointers. - * @return a pointer to the next entry. - * Arguments are not checked, so they better be correct. - */ -static struct ip_fw * -remove_rule(struct ip_fw_chain *chain, struct ip_fw *rule, - struct ip_fw *prev) -{ - struct ip_fw *n; - int l = RULESIZE(rule); - - IPFW_WLOCK_ASSERT(chain); - - n = rule->next; - IPFW_DYN_LOCK(); - remove_dyn_rule(rule, NULL /* force removal */); - IPFW_DYN_UNLOCK(); - if (prev == NULL) - chain->rules = n; - else - prev->next = n; - V_static_count--; - V_static_len -= l; - - rule->next = chain->reap; - chain->reap = rule; - - return n; -} - -/* - * Reclaim storage associated with a list of rules. This is - * typically the list created using remove_rule. - * A NULL pointer on input is handled correctly. - */ -static void -reap_rules(struct ip_fw *head) -{ - struct ip_fw *rule; - - while ((rule = head) != NULL) { - head = head->next; - free(rule, M_IPFW); - } -} - -/* - * Remove all rules from a chain (except rules in set RESVD_SET - * unless kill_default = 1). The caller is responsible for - * reclaiming storage for the rules left in chain->reap. - */ -static void -free_chain(struct ip_fw_chain *chain, int kill_default) -{ - struct ip_fw *prev, *rule; - - IPFW_WLOCK_ASSERT(chain); - - chain->reap = NULL; - flush_rule_ptrs(chain); /* more efficient to do outside the loop */ - for (prev = NULL, rule = chain->rules; rule ; ) - if (kill_default || rule->set != RESVD_SET) - rule = remove_rule(chain, rule, prev); - else { - prev = rule; - rule = rule->next; - } -} - -/** - * Remove all rules with given number, and also do set manipulation. - * Assumes chain != NULL && *chain != NULL. - * - * The argument is an u_int32_t. The low 16 bit are the rule or set number, - * the next 8 bits are the new set, the top 8 bits are the command: - * - * 0 delete rules with given number - * 1 delete rules with given set number - * 2 move rules with given number to new set - * 3 move rules with given set number to new set - * 4 swap sets with given numbers - * 5 delete rules with given number and with given set number - */ -static int -del_entry(struct ip_fw_chain *chain, u_int32_t arg) -{ - struct ip_fw *prev = NULL, *rule; - u_int16_t rulenum; /* rule or old_set */ - u_int8_t cmd, new_set; - - rulenum = arg & 0xffff; - cmd = (arg >> 24) & 0xff; - new_set = (arg >> 16) & 0xff; - - if (cmd > 5 || new_set > RESVD_SET) - return EINVAL; - if (cmd == 0 || cmd == 2 || cmd == 5) { - if (rulenum >= IPFW_DEFAULT_RULE) - return EINVAL; - } else { - if (rulenum > RESVD_SET) /* old_set */ - return EINVAL; - } - - IPFW_WLOCK(chain); - rule = chain->rules; /* common starting point */ - chain->reap = NULL; /* prepare for deletions */ - switch (cmd) { - case 0: /* delete rules with given number */ - /* - * locate first rule to delete - */ - for (; rule->rulenum < rulenum; prev = rule, rule = rule->next) - ; - if (rule->rulenum != rulenum) { - IPFW_WUNLOCK(chain); - return EINVAL; - } - - /* - * flush pointers outside the loop, then delete all matching - * rules. prev remains the same throughout the cycle. - */ - flush_rule_ptrs(chain); - while (rule->rulenum == rulenum) - rule = remove_rule(chain, rule, prev); - break; - - case 1: /* delete all rules with given set number */ - flush_rule_ptrs(chain); - while (rule->rulenum < IPFW_DEFAULT_RULE) { - if (rule->set == rulenum) - rule = remove_rule(chain, rule, prev); - else { - prev = rule; - rule = rule->next; - } - } - break; - - case 2: /* move rules with given number to new set */ - for (; rule->rulenum < IPFW_DEFAULT_RULE; rule = rule->next) - if (rule->rulenum == rulenum) - rule->set = new_set; - break; - - case 3: /* move rules with given set number to new set */ - for (; rule->rulenum < IPFW_DEFAULT_RULE; rule = rule->next) - if (rule->set == rulenum) - rule->set = new_set; - break; - - case 4: /* swap two sets */ - for (; rule->rulenum < IPFW_DEFAULT_RULE; rule = rule->next) - if (rule->set == rulenum) - rule->set = new_set; - else if (rule->set == new_set) - rule->set = rulenum; - break; - - case 5: /* delete rules with given number and with given set number. - * rulenum - given rule number; - * new_set - given set number. - */ - for (; rule->rulenum < rulenum; prev = rule, rule = rule->next) - ; - if (rule->rulenum != rulenum) { - IPFW_WUNLOCK(chain); - return (EINVAL); - } - flush_rule_ptrs(chain); - while (rule->rulenum == rulenum) { - if (rule->set == new_set) - rule = remove_rule(chain, rule, prev); - else { - prev = rule; - rule = rule->next; - } - } - } - /* - * Look for rules to reclaim. We grab the list before - * releasing the lock then reclaim them w/o the lock to - * avoid a LOR with dummynet. - */ - rule = chain->reap; - IPFW_WUNLOCK(chain); - reap_rules(rule); - return 0; -} - -/* - * Clear counters for a specific rule. - * The enclosing "table" is assumed locked. - */ -static void -clear_counters(struct ip_fw *rule, int log_only) -{ - ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule); - - if (log_only == 0) { - rule->bcnt = rule->pcnt = 0; - rule->timestamp = 0; - } - if (l->o.opcode == O_LOG) - l->log_left = l->max_log; -} - -/** - * Reset some or all counters on firewall rules. - * The argument `arg' is an u_int32_t. The low 16 bit are the rule number, - * the next 8 bits are the set number, the top 8 bits are the command: - * 0 work with rules from all set's; - * 1 work with rules only from specified set. - * Specified rule number is zero if we want to clear all entries. - * log_only is 1 if we only want to reset logs, zero otherwise. - */ -static int -zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int log_only) -{ - struct ip_fw *rule; - char *msg; - - uint16_t rulenum = arg & 0xffff; - uint8_t set = (arg >> 16) & 0xff; - uint8_t cmd = (arg >> 24) & 0xff; - - if (cmd > 1) - return (EINVAL); - if (cmd == 1 && set > RESVD_SET) - return (EINVAL); - - IPFW_WLOCK(chain); - if (rulenum == 0) { - V_norule_counter = 0; - for (rule = chain->rules; rule; rule = rule->next) { - /* Skip rules from another set. */ - if (cmd == 1 && rule->set != set) - continue; - clear_counters(rule, log_only); - } - msg = log_only ? "All logging counts reset" : - "Accounting cleared"; - } else { - int cleared = 0; - /* - * We can have multiple rules with the same number, so we - * need to clear them all. - */ - for (rule = chain->rules; rule; rule = rule->next) - if (rule->rulenum == rulenum) { - while (rule && rule->rulenum == rulenum) { - if (cmd == 0 || rule->set == set) - clear_counters(rule, log_only); - rule = rule->next; - } - cleared = 1; - break; - } - if (!cleared) { /* we did not find any matching rules */ - IPFW_WUNLOCK(chain); - return (EINVAL); - } - msg = log_only ? "logging count reset" : "cleared"; - } - IPFW_WUNLOCK(chain); - - if (V_fw_verbose) { - int lev = LOG_SECURITY | LOG_NOTICE; - - if (rulenum) - log(lev, "ipfw: Entry %d %s.\n", rulenum, msg); - else - log(lev, "ipfw: %s.\n", msg); - } - return (0); -} - -/* - * Check validity of the structure before insert. - * Fortunately rules are simple, so this mostly need to check rule sizes. - */ -static int -check_ipfw_struct(struct ip_fw *rule, int size) -{ - int l, cmdlen = 0; - int have_action=0; - ipfw_insn *cmd; - - if (size < sizeof(*rule)) { - printf("ipfw: rule too short\n"); - return (EINVAL); - } - /* first, check for valid size */ - l = RULESIZE(rule); - if (l != size) { - printf("ipfw: size mismatch (have %d want %d)\n", size, l); - return (EINVAL); - } - if (rule->act_ofs >= rule->cmd_len) { - printf("ipfw: bogus action offset (%u > %u)\n", - rule->act_ofs, rule->cmd_len - 1); - return (EINVAL); - } - /* - * Now go for the individual checks. Very simple ones, basically only - * instruction sizes. - */ - for (l = rule->cmd_len, cmd = rule->cmd ; - l > 0 ; l -= cmdlen, cmd += cmdlen) { - cmdlen = F_LEN(cmd); - if (cmdlen > l) { - printf("ipfw: opcode %d size truncated\n", - cmd->opcode); - return EINVAL; - } - DEB(printf("ipfw: opcode %d\n", cmd->opcode);) - switch (cmd->opcode) { - case O_PROBE_STATE: - case O_KEEP_STATE: - case O_PROTO: - case O_IP_SRC_ME: - case O_IP_DST_ME: - case O_LAYER2: - case O_IN: - case O_FRAG: - case O_DIVERTED: - case O_IPOPT: - case O_IPTOS: - case O_IPPRECEDENCE: - case O_IPVER: - case O_TCPWIN: - case O_TCPFLAGS: - case O_TCPOPTS: - case O_ESTAB: - case O_VERREVPATH: - case O_VERSRCREACH: - case O_ANTISPOOF: - case O_IPSEC: -#ifdef INET6 - case O_IP6_SRC_ME: - case O_IP6_DST_ME: - case O_EXT_HDR: - case O_IP6: -#endif - case O_IP4: - case O_TAG: - if (cmdlen != F_INSN_SIZE(ipfw_insn)) - goto bad_size; - break; - - case O_FIB: - if (cmdlen != F_INSN_SIZE(ipfw_insn)) - goto bad_size; - if (cmd->arg1 >= rt_numfibs) { - printf("ipfw: invalid fib number %d\n", - cmd->arg1); - return EINVAL; - } - break; - - case O_SETFIB: - if (cmdlen != F_INSN_SIZE(ipfw_insn)) - goto bad_size; - if (cmd->arg1 >= rt_numfibs) { - printf("ipfw: invalid fib number %d\n", - cmd->arg1); - return EINVAL; - } - goto check_action; - - case O_UID: - case O_GID: - case O_JAIL: - case O_IP_SRC: - case O_IP_DST: - case O_TCPSEQ: - case O_TCPACK: - case O_PROB: - case O_ICMPTYPE: - if (cmdlen != F_INSN_SIZE(ipfw_insn_u32)) - goto bad_size; - break; - - case O_LIMIT: - if (cmdlen != F_INSN_SIZE(ipfw_insn_limit)) - goto bad_size; - break; - - case O_LOG: - if (cmdlen != F_INSN_SIZE(ipfw_insn_log)) - goto bad_size; - - ((ipfw_insn_log *)cmd)->log_left = - ((ipfw_insn_log *)cmd)->max_log; - - break; - - case O_IP_SRC_MASK: - case O_IP_DST_MASK: - /* only odd command lengths */ - if ( !(cmdlen & 1) || cmdlen > 31) - goto bad_size; - break; - - case O_IP_SRC_SET: - case O_IP_DST_SET: - if (cmd->arg1 == 0 || cmd->arg1 > 256) { - printf("ipfw: invalid set size %d\n", - cmd->arg1); - return EINVAL; - } - if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + - (cmd->arg1+31)/32 ) - goto bad_size; - break; - - case O_IP_SRC_LOOKUP: - case O_IP_DST_LOOKUP: - if (cmd->arg1 >= IPFW_TABLES_MAX) { - printf("ipfw: invalid table number %d\n", - cmd->arg1); - return (EINVAL); - } - if (cmdlen != F_INSN_SIZE(ipfw_insn) && - cmdlen != F_INSN_SIZE(ipfw_insn_u32)) - goto bad_size; - break; - - case O_MACADDR2: - if (cmdlen != F_INSN_SIZE(ipfw_insn_mac)) - goto bad_size; - break; - - case O_NOP: - case O_IPID: - case O_IPTTL: - case O_IPLEN: - case O_TCPDATALEN: - case O_TAGGED: - if (cmdlen < 1 || cmdlen > 31) - goto bad_size; - break; - - case O_MAC_TYPE: - case O_IP_SRCPORT: - case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */ - if (cmdlen < 2 || cmdlen > 31) - goto bad_size; - break; - - case O_RECV: - case O_XMIT: - case O_VIA: - if (cmdlen != F_INSN_SIZE(ipfw_insn_if)) - goto bad_size; - break; - - case O_ALTQ: - if (cmdlen != F_INSN_SIZE(ipfw_insn_altq)) - goto bad_size; - break; - - case O_PIPE: - case O_QUEUE: - if (cmdlen != F_INSN_SIZE(ipfw_insn)) - goto bad_size; - goto check_action; - - case O_FORWARD_IP: -#ifdef IPFIREWALL_FORWARD - if (cmdlen != F_INSN_SIZE(ipfw_insn_sa)) - goto bad_size; - goto check_action; -#else - return EINVAL; -#endif - - case O_DIVERT: - case O_TEE: - if (ip_divert_ptr == NULL) - return EINVAL; - else - goto check_size; - case O_NETGRAPH: - case O_NGTEE: - if (!NG_IPFW_LOADED) - return EINVAL; - else - goto check_size; - case O_NAT: - if (!IPFW_NAT_LOADED) - return EINVAL; - if (cmdlen != F_INSN_SIZE(ipfw_insn_nat)) - goto bad_size; - goto check_action; - case O_FORWARD_MAC: /* XXX not implemented yet */ - case O_CHECK_STATE: - case O_COUNT: - case O_ACCEPT: - case O_DENY: - case O_REJECT: -#ifdef INET6 - case O_UNREACH6: -#endif - case O_SKIPTO: - case O_REASS: -check_size: - if (cmdlen != F_INSN_SIZE(ipfw_insn)) - goto bad_size; -check_action: - if (have_action) { - printf("ipfw: opcode %d, multiple actions" - " not allowed\n", - cmd->opcode); - return EINVAL; - } - have_action = 1; - if (l != cmdlen) { - printf("ipfw: opcode %d, action must be" - " last opcode\n", - cmd->opcode); - return EINVAL; - } - break; -#ifdef INET6 - case O_IP6_SRC: - case O_IP6_DST: - if (cmdlen != F_INSN_SIZE(struct in6_addr) + - F_INSN_SIZE(ipfw_insn)) - goto bad_size; - break; - - case O_FLOW6ID: - if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + - ((ipfw_insn_u32 *)cmd)->o.arg1) - goto bad_size; - break; - - case O_IP6_SRC_MASK: - case O_IP6_DST_MASK: - if ( !(cmdlen & 1) || cmdlen > 127) - goto bad_size; - break; - case O_ICMP6TYPE: - if( cmdlen != F_INSN_SIZE( ipfw_insn_icmp6 ) ) - goto bad_size; - break; -#endif - - default: - switch (cmd->opcode) { -#ifndef INET6 - case O_IP6_SRC_ME: - case O_IP6_DST_ME: - case O_EXT_HDR: - case O_IP6: - case O_UNREACH6: - case O_IP6_SRC: - case O_IP6_DST: - case O_FLOW6ID: - case O_IP6_SRC_MASK: - case O_IP6_DST_MASK: - case O_ICMP6TYPE: - printf("ipfw: no IPv6 support in kernel\n"); - return EPROTONOSUPPORT; -#endif - default: - printf("ipfw: opcode %d, unknown opcode\n", - cmd->opcode); - return EINVAL; - } - } - } - if (have_action == 0) { - printf("ipfw: missing action\n"); - return EINVAL; - } - return 0; - -bad_size: - printf("ipfw: opcode %d size %d wrong\n", - cmd->opcode, cmdlen); - return EINVAL; -} - -/* - * Copy the static and dynamic rules to the supplied buffer - * and return the amount of space actually used. + * Module and VNET glue */ -static size_t -ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space) -{ - char *bp = buf; - char *ep = bp + space; - struct ip_fw *rule; - int i; - time_t boot_seconds; - - boot_seconds = boottime.tv_sec; - /* XXX this can take a long time and locking will block packet flow */ - IPFW_RLOCK(chain); - for (rule = chain->rules; rule ; rule = rule->next) { - /* - * Verify the entry fits in the buffer in case the - * rules changed between calculating buffer space and - * now. This would be better done using a generation - * number but should suffice for now. - */ - i = RULESIZE(rule); - if (bp + i <= ep) { - bcopy(rule, bp, i); - /* - * XXX HACK. Store the disable mask in the "next" - * pointer in a wild attempt to keep the ABI the same. - * Why do we do this on EVERY rule? - */ - bcopy(&V_set_disable, - &(((struct ip_fw *)bp)->next_rule), - sizeof(V_set_disable)); - if (((struct ip_fw *)bp)->timestamp) - ((struct ip_fw *)bp)->timestamp += boot_seconds; - bp += i; - } - } - IPFW_RUNLOCK(chain); - if (V_ipfw_dyn_v) { - ipfw_dyn_rule *p, *last = NULL; - - IPFW_DYN_LOCK(); - for (i = 0 ; i < V_curr_dyn_buckets; i++) - for (p = V_ipfw_dyn_v[i] ; p != NULL; p = p->next) { - if (bp + sizeof *p <= ep) { - ipfw_dyn_rule *dst = - (ipfw_dyn_rule *)bp; - bcopy(p, dst, sizeof *p); - bcopy(&(p->rule->rulenum), &(dst->rule), - sizeof(p->rule->rulenum)); - /* - * store set number into high word of - * dst->rule pointer. - */ - bcopy(&(p->rule->set), - (char *)&dst->rule + - sizeof(p->rule->rulenum), - sizeof(p->rule->set)); - /* - * store a non-null value in "next". - * The userland code will interpret a - * NULL here as a marker - * for the last dynamic rule. - */ - bcopy(&dst, &dst->next, sizeof(dst)); - last = dst; - dst->expire = - TIME_LEQ(dst->expire, time_uptime) ? - 0 : dst->expire - time_uptime ; - bp += sizeof(ipfw_dyn_rule); - } - } - IPFW_DYN_UNLOCK(); - if (last != NULL) /* mark last dynamic rule */ - bzero(&last->next, sizeof(last)); - } - return (bp - (char *)buf); -} - - -/** - * {set|get}sockopt parser. - */ -static int -ipfw_ctl(struct sockopt *sopt) -{ -#define RULE_MAXSIZE (256*sizeof(u_int32_t)) - int error; - size_t size; - struct ip_fw *buf, *rule; - u_int32_t rulenum[2]; - - error = priv_check(sopt->sopt_td, PRIV_NETINET_IPFW); - if (error) - return (error); - - /* - * Disallow modifications in really-really secure mode, but still allow - * the logging counters to be reset. - */ - if (sopt->sopt_name == IP_FW_ADD || - (sopt->sopt_dir == SOPT_SET && sopt->sopt_name != IP_FW_RESETLOG)) { - error = securelevel_ge(sopt->sopt_td->td_ucred, 3); - if (error) - return (error); - } - - error = 0; - - switch (sopt->sopt_name) { - case IP_FW_GET: - /* - * pass up a copy of the current rules. Static rules - * come first (the last of which has number IPFW_DEFAULT_RULE), - * followed by a possibly empty list of dynamic rule. - * The last dynamic rule has NULL in the "next" field. - * - * Note that the calculated size is used to bound the - * amount of data returned to the user. The rule set may - * change between calculating the size and returning the - * data in which case we'll just return what fits. - */ - size = V_static_len; /* size of static rules */ - if (V_ipfw_dyn_v) /* add size of dyn.rules */ - size += (V_dyn_count * sizeof(ipfw_dyn_rule)); - - if (size >= sopt->sopt_valsize) - break; - /* - * XXX todo: if the user passes a short length just to know - * how much room is needed, do not bother filling up the - * buffer, just jump to the sooptcopyout. - */ - buf = malloc(size, M_TEMP, M_WAITOK); - error = sooptcopyout(sopt, buf, - ipfw_getrules(&V_layer3_chain, buf, size)); - free(buf, M_TEMP); - break; - - case IP_FW_FLUSH: - /* - * Normally we cannot release the lock on each iteration. - * We could do it here only because we start from the head all - * the times so there is no risk of missing some entries. - * On the other hand, the risk is that we end up with - * a very inconsistent ruleset, so better keep the lock - * around the whole cycle. - * - * XXX this code can be improved by resetting the head of - * the list to point to the default rule, and then freeing - * the old list without the need for a lock. - */ - - IPFW_WLOCK(&V_layer3_chain); - free_chain(&V_layer3_chain, 0 /* keep default rule */); - rule = V_layer3_chain.reap; - IPFW_WUNLOCK(&V_layer3_chain); - reap_rules(rule); - break; - - case IP_FW_ADD: - rule = malloc(RULE_MAXSIZE, M_TEMP, M_WAITOK); - error = sooptcopyin(sopt, rule, RULE_MAXSIZE, - sizeof(struct ip_fw) ); - if (error == 0) - error = check_ipfw_struct(rule, sopt->sopt_valsize); - if (error == 0) { - error = add_rule(&V_layer3_chain, rule); - size = RULESIZE(rule); - if (!error && sopt->sopt_dir == SOPT_GET) - error = sooptcopyout(sopt, rule, size); - } - free(rule, M_TEMP); - break; - - case IP_FW_DEL: - /* - * IP_FW_DEL is used for deleting single rules or sets, - * and (ab)used to atomically manipulate sets. Argument size - * is used to distinguish between the two: - * sizeof(u_int32_t) - * delete single rule or set of rules, - * or reassign rules (or sets) to a different set. - * 2*sizeof(u_int32_t) - * atomic disable/enable sets. - * first u_int32_t contains sets to be disabled, - * second u_int32_t contains sets to be enabled. - */ - error = sooptcopyin(sopt, rulenum, - 2*sizeof(u_int32_t), sizeof(u_int32_t)); - if (error) - break; - size = sopt->sopt_valsize; - if (size == sizeof(u_int32_t)) /* delete or reassign */ - error = del_entry(&V_layer3_chain, rulenum[0]); - else if (size == 2*sizeof(u_int32_t)) /* set enable/disable */ - V_set_disable = - (V_set_disable | rulenum[0]) & ~rulenum[1] & - ~(1<<RESVD_SET); /* set RESVD_SET always enabled */ - else - error = EINVAL; - break; - - case IP_FW_ZERO: - case IP_FW_RESETLOG: /* argument is an u_int_32, the rule number */ - rulenum[0] = 0; - if (sopt->sopt_val != 0) { - error = sooptcopyin(sopt, rulenum, - sizeof(u_int32_t), sizeof(u_int32_t)); - if (error) - break; - } - error = zero_entry(&V_layer3_chain, rulenum[0], - sopt->sopt_name == IP_FW_RESETLOG); - break; - - case IP_FW_TABLE_ADD: - { - ipfw_table_entry ent; - - error = sooptcopyin(sopt, &ent, - sizeof(ent), sizeof(ent)); - if (error) - break; - error = add_table_entry(&V_layer3_chain, ent.tbl, - ent.addr, ent.masklen, ent.value); - } - break; - - case IP_FW_TABLE_DEL: - { - ipfw_table_entry ent; - - error = sooptcopyin(sopt, &ent, - sizeof(ent), sizeof(ent)); - if (error) - break; - error = del_table_entry(&V_layer3_chain, ent.tbl, - ent.addr, ent.masklen); - } - break; - - case IP_FW_TABLE_FLUSH: - { - u_int16_t tbl; - - error = sooptcopyin(sopt, &tbl, - sizeof(tbl), sizeof(tbl)); - if (error) - break; - IPFW_WLOCK(&V_layer3_chain); - error = flush_table(&V_layer3_chain, tbl); - IPFW_WUNLOCK(&V_layer3_chain); - } - break; - - case IP_FW_TABLE_GETSIZE: - { - u_int32_t tbl, cnt; - - if ((error = sooptcopyin(sopt, &tbl, sizeof(tbl), - sizeof(tbl)))) - break; - IPFW_RLOCK(&V_layer3_chain); - error = count_table(&V_layer3_chain, tbl, &cnt); - IPFW_RUNLOCK(&V_layer3_chain); - if (error) - break; - error = sooptcopyout(sopt, &cnt, sizeof(cnt)); - } - break; - - case IP_FW_TABLE_LIST: - { - ipfw_table *tbl; - - if (sopt->sopt_valsize < sizeof(*tbl)) { - error = EINVAL; - break; - } - size = sopt->sopt_valsize; - tbl = malloc(size, M_TEMP, M_WAITOK); - error = sooptcopyin(sopt, tbl, size, sizeof(*tbl)); - if (error) { - free(tbl, M_TEMP); - break; - } - tbl->size = (size - sizeof(*tbl)) / - sizeof(ipfw_table_entry); - IPFW_RLOCK(&V_layer3_chain); - error = dump_table(&V_layer3_chain, tbl); - IPFW_RUNLOCK(&V_layer3_chain); - if (error) { - free(tbl, M_TEMP); - break; - } - error = sooptcopyout(sopt, tbl, size); - free(tbl, M_TEMP); - } - break; - - case IP_FW_NAT_CFG: - if (IPFW_NAT_LOADED) - error = ipfw_nat_cfg_ptr(sopt); - else { - printf("IP_FW_NAT_CFG: %s\n", - "ipfw_nat not present, please load it"); - error = EINVAL; - } - break; - - case IP_FW_NAT_DEL: - if (IPFW_NAT_LOADED) - error = ipfw_nat_del_ptr(sopt); - else { - printf("IP_FW_NAT_DEL: %s\n", - "ipfw_nat not present, please load it"); - error = EINVAL; - } - break; - - case IP_FW_NAT_GET_CONFIG: - if (IPFW_NAT_LOADED) - error = ipfw_nat_get_cfg_ptr(sopt); - else { - printf("IP_FW_NAT_GET_CFG: %s\n", - "ipfw_nat not present, please load it"); - error = EINVAL; - } - break; - - case IP_FW_NAT_GET_LOG: - if (IPFW_NAT_LOADED) - error = ipfw_nat_get_log_ptr(sopt); - else { - printf("IP_FW_NAT_GET_LOG: %s\n", - "ipfw_nat not present, please load it"); - error = EINVAL; - } - break; - - default: - printf("ipfw: ipfw_ctl invalid option %d\n", sopt->sopt_name); - error = EINVAL; - } - - return (error); -#undef RULE_MAXSIZE -} - /* - * This procedure is only used to handle keepalives. It is invoked - * every dyn_keepalive_period - */ -static void -ipfw_tick(void * vnetx) -{ - struct mbuf *m0, *m, *mnext, **mtailp; -#ifdef INET6 - struct mbuf *m6, **m6_tailp; -#endif - int i; - ipfw_dyn_rule *q; -#ifdef VIMAGE - struct vnet *vp = vnetx; -#endif - - CURVNET_SET(vp); - if (V_dyn_keepalive == 0 || V_ipfw_dyn_v == NULL || V_dyn_count == 0) - goto done; - - /* - * We make a chain of packets to go out here -- not deferring - * until after we drop the IPFW dynamic rule lock would result - * in a lock order reversal with the normal packet input -> ipfw - * call stack. - */ - m0 = NULL; - mtailp = &m0; -#ifdef INET6 - m6 = NULL; - m6_tailp = &m6; -#endif - IPFW_DYN_LOCK(); - for (i = 0 ; i < V_curr_dyn_buckets ; i++) { - for (q = V_ipfw_dyn_v[i] ; q ; q = q->next ) { - if (q->dyn_type == O_LIMIT_PARENT) - continue; - if (q->id.proto != IPPROTO_TCP) - continue; - if ( (q->state & BOTH_SYN) != BOTH_SYN) - continue; - if (TIME_LEQ(time_uptime + V_dyn_keepalive_interval, - q->expire)) - continue; /* too early */ - if (TIME_LEQ(q->expire, time_uptime)) - continue; /* too late, rule expired */ - - m = send_pkt(NULL, &(q->id), q->ack_rev - 1, - q->ack_fwd, TH_SYN); - mnext = send_pkt(NULL, &(q->id), q->ack_fwd - 1, - q->ack_rev, 0); - - switch (q->id.addr_type) { - case 4: - if (m != NULL) { - *mtailp = m; - mtailp = &(*mtailp)->m_nextpkt; - } - if (mnext != NULL) { - *mtailp = mnext; - mtailp = &(*mtailp)->m_nextpkt; - } - break; -#ifdef INET6 - case 6: - if (m != NULL) { - *m6_tailp = m; - m6_tailp = &(*m6_tailp)->m_nextpkt; - } - if (mnext != NULL) { - *m6_tailp = mnext; - m6_tailp = &(*m6_tailp)->m_nextpkt; - } - break; -#endif - } - - m = mnext = NULL; - } - } - IPFW_DYN_UNLOCK(); - for (m = mnext = m0; m != NULL; m = mnext) { - mnext = m->m_nextpkt; - m->m_nextpkt = NULL; - ip_output(m, NULL, NULL, 0, NULL, NULL); - } -#ifdef INET6 - for (m = mnext = m6; m != NULL; m = mnext) { - mnext = m->m_nextpkt; - m->m_nextpkt = NULL; - ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL); - } -#endif -done: - callout_reset(&V_ipfw_timeout, V_dyn_keepalive_period * hz, - ipfw_tick, vnetx); - CURVNET_RESTORE(); -} - -/**************** * Stuff that must be initialised only on boot or module load */ static int @@ -4716,11 +2231,7 @@ ipfw_init(void) { int error = 0; - ipfw_dyn_rule_zone = uma_zcreate("IPFW dynamic rule", - sizeof(ipfw_dyn_rule), NULL, NULL, NULL, NULL, - UMA_ALIGN_PTR, 0); - - IPFW_DYN_LOCK_INIT(); + ipfw_dyn_attach(); /* * Only print out this stuff the first time around, * when called from the sysinit code. @@ -4764,22 +2275,23 @@ ipfw_init(void) printf("limited to %d packets/entry by default\n", V_verbose_limit); + ipfw_log_bpf(1); /* init */ return (error); } -/********************** +/* * Called for the removal of the last instance only on module unload. */ static void ipfw_destroy(void) { - uma_zdestroy(ipfw_dyn_rule_zone); - IPFW_DYN_LOCK_DESTROY(); + ipfw_log_bpf(0); /* uninit */ + ipfw_dyn_detach(); printf("IP firewall unloaded\n"); } -/**************** +/* * Stuff that must be initialized for every instance * (including the first of course). */ @@ -4787,139 +2299,121 @@ static int vnet_ipfw_init(const void *unused) { int error; - struct ip_fw default_rule; + struct ip_fw *rule = NULL; + struct ip_fw_chain *chain; + + chain = &V_layer3_chain; /* First set up some values that are compile time options */ + V_autoinc_step = 100; /* bounded to 1..1000 in add_rule() */ + V_fw_deny_unknown_exthdrs = 1; #ifdef IPFIREWALL_VERBOSE V_fw_verbose = 1; #endif #ifdef IPFIREWALL_VERBOSE_LIMIT V_verbose_limit = IPFIREWALL_VERBOSE_LIMIT; #endif - - error = init_tables(&V_layer3_chain); - if (error) { - panic("init_tables"); /* XXX Marko fix this ! */ - } #ifdef IPFIREWALL_NAT - LIST_INIT(&V_layer3_chain.nat); + LIST_INIT(&chain->nat); #endif - V_autoinc_step = 100; /* bounded to 1..1000 in add_rule() */ - - V_ipfw_dyn_v = NULL; - V_dyn_buckets = 256; /* must be power of 2 */ - V_curr_dyn_buckets = 256; /* must be power of 2 */ - - V_dyn_ack_lifetime = 300; - V_dyn_syn_lifetime = 20; - V_dyn_fin_lifetime = 1; - V_dyn_rst_lifetime = 1; - V_dyn_udp_lifetime = 10; - V_dyn_short_lifetime = 5; - - V_dyn_keepalive_interval = 20; - V_dyn_keepalive_period = 5; - V_dyn_keepalive = 1; /* do send keepalives */ - - V_dyn_max = 4096; /* max # of dynamic rules */ - - V_fw_deny_unknown_exthdrs = 1; - - V_layer3_chain.rules = NULL; - IPFW_LOCK_INIT(&V_layer3_chain); - callout_init(&V_ipfw_timeout, CALLOUT_MPSAFE); - - bzero(&default_rule, sizeof default_rule); - default_rule.act_ofs = 0; - default_rule.rulenum = IPFW_DEFAULT_RULE; - default_rule.cmd_len = 1; - default_rule.set = RESVD_SET; - default_rule.cmd[0].len = 1; - default_rule.cmd[0].opcode = default_to_accept ? O_ACCEPT : O_DENY; - error = add_rule(&V_layer3_chain, &default_rule); - - if (error != 0) { - printf("ipfw2: error %u initializing default rule " - "(support disabled)\n", error); - IPFW_LOCK_DESTROY(&V_layer3_chain); - printf("leaving ipfw_iattach (1) with error %d\n", error); - return (error); + /* insert the default rule and create the initial map */ + chain->n_rules = 1; + chain->static_len = sizeof(struct ip_fw); + chain->map = malloc(sizeof(struct ip_fw *), M_IPFW, M_NOWAIT | M_ZERO); + if (chain->map) + rule = malloc(chain->static_len, M_IPFW, M_NOWAIT | M_ZERO); + if (rule == NULL) { + if (chain->map) + free(chain->map, M_IPFW); + printf("ipfw2: ENOSPC initializing default rule " + "(support disabled)\n"); + return (ENOSPC); } - - ip_fw_default_rule = V_layer3_chain.rules; - + error = ipfw_init_tables(chain); if (error) { - IPFW_LOCK_DESTROY(&V_layer3_chain); - printf("leaving ipfw_iattach (2) with error %d\n", error); - return (error); + panic("init_tables"); /* XXX Marko fix this ! */ } -#ifdef VIMAGE /* want a better way to do this */ - callout_reset(&V_ipfw_timeout, hz, ipfw_tick, curvnet); -#else - callout_reset(&V_ipfw_timeout, hz, ipfw_tick, NULL); -#endif + + /* fill and insert the default rule */ + rule->act_ofs = 0; + rule->rulenum = IPFW_DEFAULT_RULE; + rule->cmd_len = 1; + rule->set = RESVD_SET; + rule->cmd[0].len = 1; + rule->cmd[0].opcode = default_to_accept ? O_ACCEPT : O_DENY; + chain->rules = chain->default_rule = chain->map[0] = rule; + chain->id = rule->id = 1; + + IPFW_LOCK_INIT(chain); + ipfw_dyn_init(); /* First set up some values that are compile time options */ V_ipfw_vnet_ready = 1; /* Open for business */ - /* Hook up the raw inputs */ - V_ip_fw_ctl_ptr = ipfw_ctl; - V_ip_fw_chk_ptr = ipfw_chk; - /* - * Hook us up to pfil. + * Hook the sockopt handler, and the layer2 (V_ip_fw_chk_ptr) + * and pfil hooks for ipv4 and ipv6. Even if the latter two fail + * we still keep the module alive because the sockopt and + * layer2 paths are still useful. + * ipfw[6]_hook return 0 on success, ENOENT on failure, + * so we can ignore the exact return value and just set a flag. + * + * Note that V_fw[6]_enable are manipulated by a SYSCTL_PROC so + * changes in the underlying (per-vnet) variables trigger + * immediate hook()/unhook() calls. + * In layer2 we have the same behaviour, except that V_ether_ipfw + * is checked on each packet because there are no pfil hooks. */ - if (V_fw_enable) { - if ((error = ipfw_hook()) != 0) { - printf("ipfw_hook() error\n"); - return (error); - } - } -#ifdef INET6 - if (V_fw6_enable) { - if ((error = ipfw6_hook()) != 0) { - printf("ipfw6_hook() error\n"); - /* XXX should we unhook everything else? */ - return (error); - } - } -#endif - return (0); + V_ip_fw_ctl_ptr = ipfw_ctl; + V_ip_fw_chk_ptr = ipfw_chk; + error = ipfw_attach_hooks(1); + return (error); } -/*********************** +/* * Called for the removal of each instance. */ static int vnet_ipfw_uninit(const void *unused) { - struct ip_fw *reap; + struct ip_fw *reap, *rule; + struct ip_fw_chain *chain = &V_layer3_chain; + int i; V_ipfw_vnet_ready = 0; /* tell new callers to go away */ - ipfw_unhook(); -#ifdef INET6 - ipfw6_unhook(); -#endif - /* layer2 and other entrypoints still come in this way. */ + /* + * disconnect from ipv4, ipv6, layer2 and sockopt. + * Then grab, release and grab again the WLOCK so we make + * sure the update is propagated and nobody will be in. + */ + (void)ipfw_attach_hooks(0 /* detach */); V_ip_fw_chk_ptr = NULL; V_ip_fw_ctl_ptr = NULL; - IPFW_WLOCK(&V_layer3_chain); - /* We wait on the wlock here until the last user leaves */ - IPFW_WUNLOCK(&V_layer3_chain); - IPFW_WLOCK(&V_layer3_chain); - callout_drain(&V_ipfw_timeout); - flush_tables(&V_layer3_chain); - V_layer3_chain.reap = NULL; - free_chain(&V_layer3_chain, 1 /* kill default rule */); - reap = V_layer3_chain.reap; - V_layer3_chain.reap = NULL; - IPFW_WUNLOCK(&V_layer3_chain); + IPFW_UH_WLOCK(chain); + IPFW_UH_WUNLOCK(chain); + IPFW_UH_WLOCK(chain); + + IPFW_WLOCK(chain); + IPFW_WUNLOCK(chain); + IPFW_WLOCK(chain); + + ipfw_dyn_uninit(0); /* run the callout_drain */ + ipfw_destroy_tables(chain); + reap = NULL; + for (i = 0; i < chain->n_rules; i++) { + rule = chain->map[i]; + rule->x_next = reap; + reap = rule; + } + if (chain->map) + free(chain->map, M_IPFW); + IPFW_WUNLOCK(chain); + IPFW_UH_WUNLOCK(chain); if (reap != NULL) - reap_rules(reap); - IPFW_LOCK_DESTROY(&V_layer3_chain); - if (V_ipfw_dyn_v != NULL) - free(V_ipfw_dyn_v, M_IPFW); + ipfw_reap_rules(reap); + IPFW_LOCK_DESTROY(chain); + ipfw_dyn_uninit(1); /* free the remaining parts */ return 0; } @@ -4993,4 +2487,4 @@ SYSUNINIT(ipfw_destroy, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER, ipfw_destroy, NULL); VNET_SYSUNINIT(vnet_ipfw_uninit, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER, vnet_ipfw_uninit, NULL); - +/* end of file */ diff --git a/sys/netinet/ipfw/ip_fw_dynamic.c b/sys/netinet/ipfw/ip_fw_dynamic.c new file mode 100644 index 0000000..6947582 --- /dev/null +++ b/sys/netinet/ipfw/ip_fw_dynamic.c @@ -0,0 +1,1244 @@ +/*- + * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#define DEB(x) +#define DDB(x) x + +/* + * Dynamic rule support for ipfw + */ + +#if !defined(KLD_MODULE) +#include "opt_ipfw.h" +#include "opt_ipdivert.h" +#include "opt_ipdn.h" +#include "opt_inet.h" +#ifndef INET +#error IPFIREWALL requires INET. +#endif /* INET */ +#endif +#include "opt_inet6.h" +#include "opt_ipsec.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/socket.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <net/ethernet.h> /* for ETHERTYPE_IP */ +#include <net/if.h> +#include <net/vnet.h> + +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> /* ip_defttl */ +#include <netinet/ip_fw.h> +#include <netinet/ipfw/ip_fw_private.h> +#include <netinet/tcp_var.h> +#include <netinet/udp.h> + +#include <netinet/ip6.h> /* IN6_ARE_ADDR_EQUAL */ +#ifdef INET6 +#include <netinet6/in6_var.h> +#include <netinet6/ip6_var.h> +#endif + +#include <machine/in_cksum.h> /* XXX for in_cksum */ + +#ifdef MAC +#include <security/mac/mac_framework.h> +#endif + +/* + * Description of dynamic rules. + * + * Dynamic rules are stored in lists accessed through a hash table + * (ipfw_dyn_v) whose size is curr_dyn_buckets. This value can + * be modified through the sysctl variable dyn_buckets which is + * updated when the table becomes empty. + * + * XXX currently there is only one list, ipfw_dyn. + * + * When a packet is received, its address fields are first masked + * with the mask defined for the rule, then hashed, then matched + * against the entries in the corresponding list. + * Dynamic rules can be used for different purposes: + * + stateful rules; + * + enforcing limits on the number of sessions; + * + in-kernel NAT (not implemented yet) + * + * The lifetime of dynamic rules is regulated by dyn_*_lifetime, + * measured in seconds and depending on the flags. + * + * The total number of dynamic rules is stored in dyn_count. + * The max number of dynamic rules is dyn_max. When we reach + * the maximum number of rules we do not create anymore. This is + * done to avoid consuming too much memory, but also too much + * time when searching on each packet (ideally, we should try instead + * to put a limit on the length of the list on each bucket...). + * + * Each dynamic rule holds a pointer to the parent ipfw rule so + * we know what action to perform. Dynamic rules are removed when + * the parent rule is deleted. XXX we should make them survive. + * + * There are some limitations with dynamic rules -- we do not + * obey the 'randomized match', and we do not do multiple + * passes through the firewall. XXX check the latter!!! + */ + +/* + * Static variables followed by global ones + */ +static VNET_DEFINE(ipfw_dyn_rule **, ipfw_dyn_v); +static VNET_DEFINE(u_int32_t, dyn_buckets); +static VNET_DEFINE(u_int32_t, curr_dyn_buckets); +static VNET_DEFINE(struct callout, ipfw_timeout); +#define V_ipfw_dyn_v VNET(ipfw_dyn_v) +#define V_dyn_buckets VNET(dyn_buckets) +#define V_curr_dyn_buckets VNET(curr_dyn_buckets) +#define V_ipfw_timeout VNET(ipfw_timeout) + +static uma_zone_t ipfw_dyn_rule_zone; +#ifndef __FreeBSD__ +DEFINE_SPINLOCK(ipfw_dyn_mtx); +#else +static struct mtx ipfw_dyn_mtx; /* mutex guarding dynamic rules */ +#endif + +#define IPFW_DYN_LOCK_INIT() \ + mtx_init(&ipfw_dyn_mtx, "IPFW dynamic rules", NULL, MTX_DEF) +#define IPFW_DYN_LOCK_DESTROY() mtx_destroy(&ipfw_dyn_mtx) +#define IPFW_DYN_LOCK() mtx_lock(&ipfw_dyn_mtx) +#define IPFW_DYN_UNLOCK() mtx_unlock(&ipfw_dyn_mtx) +#define IPFW_DYN_LOCK_ASSERT() mtx_assert(&ipfw_dyn_mtx, MA_OWNED) + +void +ipfw_dyn_unlock(void) +{ + IPFW_DYN_UNLOCK(); +} + +/* + * Timeouts for various events in handing dynamic rules. + */ +static VNET_DEFINE(u_int32_t, dyn_ack_lifetime); +static VNET_DEFINE(u_int32_t, dyn_syn_lifetime); +static VNET_DEFINE(u_int32_t, dyn_fin_lifetime); +static VNET_DEFINE(u_int32_t, dyn_rst_lifetime); +static VNET_DEFINE(u_int32_t, dyn_udp_lifetime); +static VNET_DEFINE(u_int32_t, dyn_short_lifetime); + +#define V_dyn_ack_lifetime VNET(dyn_ack_lifetime) +#define V_dyn_syn_lifetime VNET(dyn_syn_lifetime) +#define V_dyn_fin_lifetime VNET(dyn_fin_lifetime) +#define V_dyn_rst_lifetime VNET(dyn_rst_lifetime) +#define V_dyn_udp_lifetime VNET(dyn_udp_lifetime) +#define V_dyn_short_lifetime VNET(dyn_short_lifetime) + +/* + * Keepalives are sent if dyn_keepalive is set. They are sent every + * dyn_keepalive_period seconds, in the last dyn_keepalive_interval + * seconds of lifetime of a rule. + * dyn_rst_lifetime and dyn_fin_lifetime should be strictly lower + * than dyn_keepalive_period. + */ + +static VNET_DEFINE(u_int32_t, dyn_keepalive_interval); +static VNET_DEFINE(u_int32_t, dyn_keepalive_period); +static VNET_DEFINE(u_int32_t, dyn_keepalive); + +#define V_dyn_keepalive_interval VNET(dyn_keepalive_interval) +#define V_dyn_keepalive_period VNET(dyn_keepalive_period) +#define V_dyn_keepalive VNET(dyn_keepalive) + +static VNET_DEFINE(u_int32_t, dyn_count); /* # of dynamic rules */ +static VNET_DEFINE(u_int32_t, dyn_max); /* max # of dynamic rules */ + +#define V_dyn_count VNET(dyn_count) +#define V_dyn_max VNET(dyn_max) + +#ifdef SYSCTL_NODE + +SYSBEGIN(f2) + +SYSCTL_DECL(_net_inet_ip_fw); +SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_buckets, + CTLFLAG_RW, &VNET_NAME(dyn_buckets), 0, + "Number of dyn. buckets"); +SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets, + CTLFLAG_RD, &VNET_NAME(curr_dyn_buckets), 0, + "Current Number of dyn. buckets"); +SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_count, + CTLFLAG_RD, &VNET_NAME(dyn_count), 0, + "Number of dyn. rules"); +SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_max, + CTLFLAG_RW, &VNET_NAME(dyn_max), 0, + "Max number of dyn. rules"); +SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, + CTLFLAG_RW, &VNET_NAME(dyn_ack_lifetime), 0, + "Lifetime of dyn. rules for acks"); +SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, + CTLFLAG_RW, &VNET_NAME(dyn_syn_lifetime), 0, + "Lifetime of dyn. rules for syn"); +SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, + CTLFLAG_RW, &VNET_NAME(dyn_fin_lifetime), 0, + "Lifetime of dyn. rules for fin"); +SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, + CTLFLAG_RW, &VNET_NAME(dyn_rst_lifetime), 0, + "Lifetime of dyn. rules for rst"); +SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, + CTLFLAG_RW, &VNET_NAME(dyn_udp_lifetime), 0, + "Lifetime of dyn. rules for UDP"); +SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, + CTLFLAG_RW, &VNET_NAME(dyn_short_lifetime), 0, + "Lifetime of dyn. rules for other situations"); +SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, + CTLFLAG_RW, &VNET_NAME(dyn_keepalive), 0, + "Enable keepalives for dyn. rules"); + +SYSEND + +#endif /* SYSCTL_NODE */ + + +static __inline int +hash_packet6(struct ipfw_flow_id *id) +{ + u_int32_t i; + i = (id->dst_ip6.__u6_addr.__u6_addr32[2]) ^ + (id->dst_ip6.__u6_addr.__u6_addr32[3]) ^ + (id->src_ip6.__u6_addr.__u6_addr32[2]) ^ + (id->src_ip6.__u6_addr.__u6_addr32[3]) ^ + (id->dst_port) ^ (id->src_port); + return i; +} + +/* + * IMPORTANT: the hash function for dynamic rules must be commutative + * in source and destination (ip,port), because rules are bidirectional + * and we want to find both in the same bucket. + */ +static __inline int +hash_packet(struct ipfw_flow_id *id) +{ + u_int32_t i; + +#ifdef INET6 + if (IS_IP6_FLOW_ID(id)) + i = hash_packet6(id); + else +#endif /* INET6 */ + i = (id->dst_ip) ^ (id->src_ip) ^ (id->dst_port) ^ (id->src_port); + i &= (V_curr_dyn_buckets - 1); + return i; +} + +static __inline void +unlink_dyn_rule_print(struct ipfw_flow_id *id) +{ + struct in_addr da; +#ifdef INET6 + char src[INET6_ADDRSTRLEN], dst[INET6_ADDRSTRLEN]; +#else + char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN]; +#endif + +#ifdef INET6 + if (IS_IP6_FLOW_ID(id)) { + ip6_sprintf(src, &id->src_ip6); + ip6_sprintf(dst, &id->dst_ip6); + } else +#endif + { + da.s_addr = htonl(id->src_ip); + inet_ntoa_r(da, src); + da.s_addr = htonl(id->dst_ip); + inet_ntoa_r(da, dst); + } + printf("ipfw: unlink entry %s %d -> %s %d, %d left\n", + src, id->src_port, dst, id->dst_port, V_dyn_count - 1); +} + +/** + * unlink a dynamic rule from a chain. prev is a pointer to + * the previous one, q is a pointer to the rule to delete, + * head is a pointer to the head of the queue. + * Modifies q and potentially also head. + */ +#define UNLINK_DYN_RULE(prev, head, q) { \ + ipfw_dyn_rule *old_q = q; \ + \ + /* remove a refcount to the parent */ \ + if (q->dyn_type == O_LIMIT) \ + q->parent->count--; \ + DEB(unlink_dyn_rule_print(&q->id);) \ + if (prev != NULL) \ + prev->next = q = q->next; \ + else \ + head = q = q->next; \ + V_dyn_count--; \ + uma_zfree(ipfw_dyn_rule_zone, old_q); } + +#define TIME_LEQ(a,b) ((int)((a)-(b)) <= 0) + +/** + * Remove dynamic rules pointing to "rule", or all of them if rule == NULL. + * + * If keep_me == NULL, rules are deleted even if not expired, + * otherwise only expired rules are removed. + * + * The value of the second parameter is also used to point to identify + * a rule we absolutely do not want to remove (e.g. because we are + * holding a reference to it -- this is the case with O_LIMIT_PARENT + * rules). The pointer is only used for comparison, so any non-null + * value will do. + */ +static void +remove_dyn_rule(struct ip_fw *rule, ipfw_dyn_rule *keep_me) +{ + static u_int32_t last_remove = 0; + +#define FORCE (keep_me == NULL) + + ipfw_dyn_rule *prev, *q; + int i, pass = 0, max_pass = 0; + + IPFW_DYN_LOCK_ASSERT(); + + if (V_ipfw_dyn_v == NULL || V_dyn_count == 0) + return; + /* do not expire more than once per second, it is useless */ + if (!FORCE && last_remove == time_uptime) + return; + last_remove = time_uptime; + + /* + * because O_LIMIT refer to parent rules, during the first pass only + * remove child and mark any pending LIMIT_PARENT, and remove + * them in a second pass. + */ +next_pass: + for (i = 0 ; i < V_curr_dyn_buckets ; i++) { + for (prev=NULL, q = V_ipfw_dyn_v[i] ; q ; ) { + /* + * Logic can become complex here, so we split tests. + */ + if (q == keep_me) + goto next; + if (rule != NULL && rule != q->rule) + goto next; /* not the one we are looking for */ + if (q->dyn_type == O_LIMIT_PARENT) { + /* + * handle parent in the second pass, + * record we need one. + */ + max_pass = 1; + if (pass == 0) + goto next; + if (FORCE && q->count != 0 ) { + /* XXX should not happen! */ + printf("ipfw: OUCH! cannot remove rule," + " count %d\n", q->count); + } + } else { + if (!FORCE && + !TIME_LEQ( q->expire, time_uptime )) + goto next; + } + if (q->dyn_type != O_LIMIT_PARENT || !q->count) { + UNLINK_DYN_RULE(prev, V_ipfw_dyn_v[i], q); + continue; + } +next: + prev=q; + q=q->next; + } + } + if (pass++ < max_pass) + goto next_pass; +} + +void +ipfw_remove_dyn_children(struct ip_fw *rule) +{ + IPFW_DYN_LOCK(); + remove_dyn_rule(rule, NULL /* force removal */); + IPFW_DYN_UNLOCK(); +} + +/** + * lookup a dynamic rule, locked version + */ +static ipfw_dyn_rule * +lookup_dyn_rule_locked(struct ipfw_flow_id *pkt, int *match_direction, + struct tcphdr *tcp) +{ + /* + * stateful ipfw extensions. + * Lookup into dynamic session queue + */ +#define MATCH_REVERSE 0 +#define MATCH_FORWARD 1 +#define MATCH_NONE 2 +#define MATCH_UNKNOWN 3 + int i, dir = MATCH_NONE; + ipfw_dyn_rule *prev, *q=NULL; + + IPFW_DYN_LOCK_ASSERT(); + + if (V_ipfw_dyn_v == NULL) + goto done; /* not found */ + i = hash_packet( pkt ); + for (prev=NULL, q = V_ipfw_dyn_v[i] ; q != NULL ; ) { + if (q->dyn_type == O_LIMIT_PARENT && q->count) + goto next; + if (TIME_LEQ( q->expire, time_uptime)) { /* expire entry */ + UNLINK_DYN_RULE(prev, V_ipfw_dyn_v[i], q); + continue; + } + if (pkt->proto == q->id.proto && + q->dyn_type != O_LIMIT_PARENT) { + if (IS_IP6_FLOW_ID(pkt)) { + if (IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6), + &(q->id.src_ip6)) && + IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6), + &(q->id.dst_ip6)) && + pkt->src_port == q->id.src_port && + pkt->dst_port == q->id.dst_port ) { + dir = MATCH_FORWARD; + break; + } + if (IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6), + &(q->id.dst_ip6)) && + IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6), + &(q->id.src_ip6)) && + pkt->src_port == q->id.dst_port && + pkt->dst_port == q->id.src_port ) { + dir = MATCH_REVERSE; + break; + } + } else { + if (pkt->src_ip == q->id.src_ip && + pkt->dst_ip == q->id.dst_ip && + pkt->src_port == q->id.src_port && + pkt->dst_port == q->id.dst_port ) { + dir = MATCH_FORWARD; + break; + } + if (pkt->src_ip == q->id.dst_ip && + pkt->dst_ip == q->id.src_ip && + pkt->src_port == q->id.dst_port && + pkt->dst_port == q->id.src_port ) { + dir = MATCH_REVERSE; + break; + } + } + } +next: + prev = q; + q = q->next; + } + if (q == NULL) + goto done; /* q = NULL, not found */ + + if ( prev != NULL) { /* found and not in front */ + prev->next = q->next; + q->next = V_ipfw_dyn_v[i]; + V_ipfw_dyn_v[i] = q; + } + if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */ + u_char flags = pkt->_flags & (TH_FIN|TH_SYN|TH_RST); + +#define BOTH_SYN (TH_SYN | (TH_SYN << 8)) +#define BOTH_FIN (TH_FIN | (TH_FIN << 8)) + q->state |= (dir == MATCH_FORWARD ) ? flags : (flags << 8); + switch (q->state) { + case TH_SYN: /* opening */ + q->expire = time_uptime + V_dyn_syn_lifetime; + break; + + case BOTH_SYN: /* move to established */ + case BOTH_SYN | TH_FIN : /* one side tries to close */ + case BOTH_SYN | (TH_FIN << 8) : + if (tcp) { +#define _SEQ_GE(a,b) ((int)(a) - (int)(b) >= 0) + u_int32_t ack = ntohl(tcp->th_ack); + if (dir == MATCH_FORWARD) { + if (q->ack_fwd == 0 || _SEQ_GE(ack, q->ack_fwd)) + q->ack_fwd = ack; + else { /* ignore out-of-sequence */ + break; + } + } else { + if (q->ack_rev == 0 || _SEQ_GE(ack, q->ack_rev)) + q->ack_rev = ack; + else { /* ignore out-of-sequence */ + break; + } + } + } + q->expire = time_uptime + V_dyn_ack_lifetime; + break; + + case BOTH_SYN | BOTH_FIN: /* both sides closed */ + if (V_dyn_fin_lifetime >= V_dyn_keepalive_period) + V_dyn_fin_lifetime = V_dyn_keepalive_period - 1; + q->expire = time_uptime + V_dyn_fin_lifetime; + break; + + default: +#if 0 + /* + * reset or some invalid combination, but can also + * occur if we use keep-state the wrong way. + */ + if ( (q->state & ((TH_RST << 8)|TH_RST)) == 0) + printf("invalid state: 0x%x\n", q->state); +#endif + if (V_dyn_rst_lifetime >= V_dyn_keepalive_period) + V_dyn_rst_lifetime = V_dyn_keepalive_period - 1; + q->expire = time_uptime + V_dyn_rst_lifetime; + break; + } + } else if (pkt->proto == IPPROTO_UDP) { + q->expire = time_uptime + V_dyn_udp_lifetime; + } else { + /* other protocols */ + q->expire = time_uptime + V_dyn_short_lifetime; + } +done: + if (match_direction) + *match_direction = dir; + return q; +} + +ipfw_dyn_rule * +ipfw_lookup_dyn_rule(struct ipfw_flow_id *pkt, int *match_direction, + struct tcphdr *tcp) +{ + ipfw_dyn_rule *q; + + IPFW_DYN_LOCK(); + q = lookup_dyn_rule_locked(pkt, match_direction, tcp); + if (q == NULL) + IPFW_DYN_UNLOCK(); + /* NB: return table locked when q is not NULL */ + return q; +} + +static void +realloc_dynamic_table(void) +{ + IPFW_DYN_LOCK_ASSERT(); + + /* + * Try reallocation, make sure we have a power of 2 and do + * not allow more than 64k entries. In case of overflow, + * default to 1024. + */ + + if (V_dyn_buckets > 65536) + V_dyn_buckets = 1024; + if ((V_dyn_buckets & (V_dyn_buckets-1)) != 0) { /* not a power of 2 */ + V_dyn_buckets = V_curr_dyn_buckets; /* reset */ + return; + } + V_curr_dyn_buckets = V_dyn_buckets; + if (V_ipfw_dyn_v != NULL) + free(V_ipfw_dyn_v, M_IPFW); + for (;;) { + V_ipfw_dyn_v = malloc(V_curr_dyn_buckets * sizeof(ipfw_dyn_rule *), + M_IPFW, M_NOWAIT | M_ZERO); + if (V_ipfw_dyn_v != NULL || V_curr_dyn_buckets <= 2) + break; + V_curr_dyn_buckets /= 2; + } +} + +/** + * Install state of type 'type' for a dynamic session. + * The hash table contains two type of rules: + * - regular rules (O_KEEP_STATE) + * - rules for sessions with limited number of sess per user + * (O_LIMIT). When they are created, the parent is + * increased by 1, and decreased on delete. In this case, + * the third parameter is the parent rule and not the chain. + * - "parent" rules for the above (O_LIMIT_PARENT). + */ +static ipfw_dyn_rule * +add_dyn_rule(struct ipfw_flow_id *id, u_int8_t dyn_type, struct ip_fw *rule) +{ + ipfw_dyn_rule *r; + int i; + + IPFW_DYN_LOCK_ASSERT(); + + if (V_ipfw_dyn_v == NULL || + (V_dyn_count == 0 && V_dyn_buckets != V_curr_dyn_buckets)) { + realloc_dynamic_table(); + if (V_ipfw_dyn_v == NULL) + return NULL; /* failed ! */ + } + i = hash_packet(id); + + r = uma_zalloc(ipfw_dyn_rule_zone, M_NOWAIT | M_ZERO); + if (r == NULL) { + printf ("ipfw: sorry cannot allocate state\n"); + return NULL; + } + + /* increase refcount on parent, and set pointer */ + if (dyn_type == O_LIMIT) { + ipfw_dyn_rule *parent = (ipfw_dyn_rule *)rule; + if ( parent->dyn_type != O_LIMIT_PARENT) + panic("invalid parent"); + parent->count++; + r->parent = parent; + rule = parent->rule; + } + + r->id = *id; + r->expire = time_uptime + V_dyn_syn_lifetime; + r->rule = rule; + r->dyn_type = dyn_type; + r->pcnt = r->bcnt = 0; + r->count = 0; + + r->bucket = i; + r->next = V_ipfw_dyn_v[i]; + V_ipfw_dyn_v[i] = r; + V_dyn_count++; + DEB({ + struct in_addr da; +#ifdef INET6 + char src[INET6_ADDRSTRLEN]; + char dst[INET6_ADDRSTRLEN]; +#else + char src[INET_ADDRSTRLEN]; + char dst[INET_ADDRSTRLEN]; +#endif + +#ifdef INET6 + if (IS_IP6_FLOW_ID(&(r->id))) { + ip6_sprintf(src, &r->id.src_ip6); + ip6_sprintf(dst, &r->id.dst_ip6); + } else +#endif + { + da.s_addr = htonl(r->id.src_ip); + inet_ntoa_r(da, src); + da.s_addr = htonl(r->id.dst_ip); + inet_ntoa_r(da, dst); + } + printf("ipfw: add dyn entry ty %d %s %d -> %s %d, total %d\n", + dyn_type, src, r->id.src_port, dst, r->id.dst_port, + V_dyn_count); + }) + return r; +} + +/** + * lookup dynamic parent rule using pkt and rule as search keys. + * If the lookup fails, then install one. + */ +static ipfw_dyn_rule * +lookup_dyn_parent(struct ipfw_flow_id *pkt, struct ip_fw *rule) +{ + ipfw_dyn_rule *q; + int i; + + IPFW_DYN_LOCK_ASSERT(); + + if (V_ipfw_dyn_v) { + int is_v6 = IS_IP6_FLOW_ID(pkt); + i = hash_packet( pkt ); + for (q = V_ipfw_dyn_v[i] ; q != NULL ; q=q->next) + if (q->dyn_type == O_LIMIT_PARENT && + rule== q->rule && + pkt->proto == q->id.proto && + pkt->src_port == q->id.src_port && + pkt->dst_port == q->id.dst_port && + ( + (is_v6 && + IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6), + &(q->id.src_ip6)) && + IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6), + &(q->id.dst_ip6))) || + (!is_v6 && + pkt->src_ip == q->id.src_ip && + pkt->dst_ip == q->id.dst_ip) + ) + ) { + q->expire = time_uptime + V_dyn_short_lifetime; + DEB(printf("ipfw: lookup_dyn_parent found 0x%p\n",q);) + return q; + } + } + return add_dyn_rule(pkt, O_LIMIT_PARENT, rule); +} + +/** + * Install dynamic state for rule type cmd->o.opcode + * + * Returns 1 (failure) if state is not installed because of errors or because + * session limitations are enforced. + */ +int +ipfw_install_state(struct ip_fw *rule, ipfw_insn_limit *cmd, + struct ip_fw_args *args, uint32_t tablearg) +{ + static int last_log; + ipfw_dyn_rule *q; + struct in_addr da; +#ifdef INET6 + char src[INET6_ADDRSTRLEN + 2], dst[INET6_ADDRSTRLEN + 2]; +#else + char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN]; +#endif + + src[0] = '\0'; + dst[0] = '\0'; + + IPFW_DYN_LOCK(); + + DEB( +#ifdef INET6 + if (IS_IP6_FLOW_ID(&(args->f_id))) { + ip6_sprintf(src, &args->f_id.src_ip6); + ip6_sprintf(dst, &args->f_id.dst_ip6); + } else +#endif + { + da.s_addr = htonl(args->f_id.src_ip); + inet_ntoa_r(da, src); + da.s_addr = htonl(args->f_id.dst_ip); + inet_ntoa_r(da, dst); + } + printf("ipfw: %s: type %d %s %u -> %s %u\n", + __func__, cmd->o.opcode, src, args->f_id.src_port, + dst, args->f_id.dst_port); + src[0] = '\0'; + dst[0] = '\0'; + ) + + q = lookup_dyn_rule_locked(&args->f_id, NULL, NULL); + + if (q != NULL) { /* should never occur */ + if (last_log != time_uptime) { + last_log = time_uptime; + printf("ipfw: %s: entry already present, done\n", + __func__); + } + IPFW_DYN_UNLOCK(); + return (0); + } + + if (V_dyn_count >= V_dyn_max) + /* Run out of slots, try to remove any expired rule. */ + remove_dyn_rule(NULL, (ipfw_dyn_rule *)1); + + if (V_dyn_count >= V_dyn_max) { + if (last_log != time_uptime) { + last_log = time_uptime; + printf("ipfw: %s: Too many dynamic rules\n", __func__); + } + IPFW_DYN_UNLOCK(); + return (1); /* cannot install, notify caller */ + } + + switch (cmd->o.opcode) { + case O_KEEP_STATE: /* bidir rule */ + add_dyn_rule(&args->f_id, O_KEEP_STATE, rule); + break; + + case O_LIMIT: { /* limit number of sessions */ + struct ipfw_flow_id id; + ipfw_dyn_rule *parent; + uint32_t conn_limit; + uint16_t limit_mask = cmd->limit_mask; + + conn_limit = (cmd->conn_limit == IP_FW_TABLEARG) ? + tablearg : cmd->conn_limit; + + DEB( + if (cmd->conn_limit == IP_FW_TABLEARG) + printf("ipfw: %s: O_LIMIT rule, conn_limit: %u " + "(tablearg)\n", __func__, conn_limit); + else + printf("ipfw: %s: O_LIMIT rule, conn_limit: %u\n", + __func__, conn_limit); + ) + + id.dst_ip = id.src_ip = id.dst_port = id.src_port = 0; + id.proto = args->f_id.proto; + id.addr_type = args->f_id.addr_type; + id.fib = M_GETFIB(args->m); + + if (IS_IP6_FLOW_ID (&(args->f_id))) { + if (limit_mask & DYN_SRC_ADDR) + id.src_ip6 = args->f_id.src_ip6; + if (limit_mask & DYN_DST_ADDR) + id.dst_ip6 = args->f_id.dst_ip6; + } else { + if (limit_mask & DYN_SRC_ADDR) + id.src_ip = args->f_id.src_ip; + if (limit_mask & DYN_DST_ADDR) + id.dst_ip = args->f_id.dst_ip; + } + if (limit_mask & DYN_SRC_PORT) + id.src_port = args->f_id.src_port; + if (limit_mask & DYN_DST_PORT) + id.dst_port = args->f_id.dst_port; + if ((parent = lookup_dyn_parent(&id, rule)) == NULL) { + printf("ipfw: %s: add parent failed\n", __func__); + IPFW_DYN_UNLOCK(); + return (1); + } + + if (parent->count >= conn_limit) { + /* See if we can remove some expired rule. */ + remove_dyn_rule(rule, parent); + if (parent->count >= conn_limit) { + if (V_fw_verbose && last_log != time_uptime) { + last_log = time_uptime; +#ifdef INET6 + /* + * XXX IPv6 flows are not + * supported yet. + */ + if (IS_IP6_FLOW_ID(&(args->f_id))) { + char ip6buf[INET6_ADDRSTRLEN]; + snprintf(src, sizeof(src), + "[%s]", ip6_sprintf(ip6buf, + &args->f_id.src_ip6)); + snprintf(dst, sizeof(dst), + "[%s]", ip6_sprintf(ip6buf, + &args->f_id.dst_ip6)); + } else +#endif + { + da.s_addr = + htonl(args->f_id.src_ip); + inet_ntoa_r(da, src); + da.s_addr = + htonl(args->f_id.dst_ip); + inet_ntoa_r(da, dst); + } + log(LOG_SECURITY | LOG_DEBUG, + "ipfw: %d %s %s:%u -> %s:%u, %s\n", + parent->rule->rulenum, + "drop session", + src, (args->f_id.src_port), + dst, (args->f_id.dst_port), + "too many entries"); + } + IPFW_DYN_UNLOCK(); + return (1); + } + } + add_dyn_rule(&args->f_id, O_LIMIT, (struct ip_fw *)parent); + break; + } + default: + printf("ipfw: %s: unknown dynamic rule type %u\n", + __func__, cmd->o.opcode); + IPFW_DYN_UNLOCK(); + return (1); + } + + /* XXX just set lifetime */ + lookup_dyn_rule_locked(&args->f_id, NULL, NULL); + + IPFW_DYN_UNLOCK(); + return (0); +} + +/* + * Generate a TCP packet, containing either a RST or a keepalive. + * When flags & TH_RST, we are sending a RST packet, because of a + * "reset" action matched the packet. + * Otherwise we are sending a keepalive, and flags & TH_ + * The 'replyto' mbuf is the mbuf being replied to, if any, and is required + * so that MAC can label the reply appropriately. + */ +struct mbuf * +ipfw_send_pkt(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t seq, + u_int32_t ack, int flags) +{ +#ifndef __FreeBSD__ + return NULL; +#else + struct mbuf *m; + int len, dir; + struct ip *h = NULL; /* stupid compiler */ +#ifdef INET6 + struct ip6_hdr *h6 = NULL; +#endif + struct tcphdr *th = NULL; + + MGETHDR(m, M_DONTWAIT, MT_DATA); + if (m == NULL) + return (NULL); + + M_SETFIB(m, id->fib); +#ifdef MAC + if (replyto != NULL) + mac_netinet_firewall_reply(replyto, m); + else + mac_netinet_firewall_send(m); +#else + (void)replyto; /* don't warn about unused arg */ +#endif + + switch (id->addr_type) { + case 4: + len = sizeof(struct ip) + sizeof(struct tcphdr); + break; +#ifdef INET6 + case 6: + len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); + break; +#endif + default: + /* XXX: log me?!? */ + FREE_PKT(m); + return (NULL); + } + dir = ((flags & (TH_SYN | TH_RST)) == TH_SYN); + + m->m_data += max_linkhdr; + m->m_flags |= M_SKIP_FIREWALL; + m->m_pkthdr.len = m->m_len = len; + m->m_pkthdr.rcvif = NULL; + bzero(m->m_data, len); + + switch (id->addr_type) { + case 4: + h = mtod(m, struct ip *); + + /* prepare for checksum */ + h->ip_p = IPPROTO_TCP; + h->ip_len = htons(sizeof(struct tcphdr)); + if (dir) { + h->ip_src.s_addr = htonl(id->src_ip); + h->ip_dst.s_addr = htonl(id->dst_ip); + } else { + h->ip_src.s_addr = htonl(id->dst_ip); + h->ip_dst.s_addr = htonl(id->src_ip); + } + + th = (struct tcphdr *)(h + 1); + break; +#ifdef INET6 + case 6: + h6 = mtod(m, struct ip6_hdr *); + + /* prepare for checksum */ + h6->ip6_nxt = IPPROTO_TCP; + h6->ip6_plen = htons(sizeof(struct tcphdr)); + if (dir) { + h6->ip6_src = id->src_ip6; + h6->ip6_dst = id->dst_ip6; + } else { + h6->ip6_src = id->dst_ip6; + h6->ip6_dst = id->src_ip6; + } + + th = (struct tcphdr *)(h6 + 1); + break; +#endif + } + + if (dir) { + th->th_sport = htons(id->src_port); + th->th_dport = htons(id->dst_port); + } else { + th->th_sport = htons(id->dst_port); + th->th_dport = htons(id->src_port); + } + th->th_off = sizeof(struct tcphdr) >> 2; + + if (flags & TH_RST) { + if (flags & TH_ACK) { + th->th_seq = htonl(ack); + th->th_flags = TH_RST; + } else { + if (flags & TH_SYN) + seq++; + th->th_ack = htonl(seq); + th->th_flags = TH_RST | TH_ACK; + } + } else { + /* + * Keepalive - use caller provided sequence numbers + */ + th->th_seq = htonl(seq); + th->th_ack = htonl(ack); + th->th_flags = TH_ACK; + } + + switch (id->addr_type) { + case 4: + th->th_sum = in_cksum(m, len); + + /* finish the ip header */ + h->ip_v = 4; + h->ip_hl = sizeof(*h) >> 2; + h->ip_tos = IPTOS_LOWDELAY; + h->ip_off = 0; + /* ip_len must be in host format for ip_output */ + h->ip_len = len; + h->ip_ttl = V_ip_defttl; + h->ip_sum = 0; + break; +#ifdef INET6 + case 6: + th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(*h6), + sizeof(struct tcphdr)); + + /* finish the ip6 header */ + h6->ip6_vfc |= IPV6_VERSION; + h6->ip6_hlim = IPV6_DEFHLIM; + break; +#endif + } + + return (m); +#endif /* __FreeBSD__ */ +} + +/* + * This procedure is only used to handle keepalives. It is invoked + * every dyn_keepalive_period + */ +static void +ipfw_tick(void * vnetx) +{ + struct mbuf *m0, *m, *mnext, **mtailp; +#ifdef INET6 + struct mbuf *m6, **m6_tailp; +#endif + int i; + ipfw_dyn_rule *q; +#ifdef VIMAGE + struct vnet *vp = vnetx; +#endif + + CURVNET_SET(vp); + if (V_dyn_keepalive == 0 || V_ipfw_dyn_v == NULL || V_dyn_count == 0) + goto done; + + /* + * We make a chain of packets to go out here -- not deferring + * until after we drop the IPFW dynamic rule lock would result + * in a lock order reversal with the normal packet input -> ipfw + * call stack. + */ + m0 = NULL; + mtailp = &m0; +#ifdef INET6 + m6 = NULL; + m6_tailp = &m6; +#endif + IPFW_DYN_LOCK(); + for (i = 0 ; i < V_curr_dyn_buckets ; i++) { + for (q = V_ipfw_dyn_v[i] ; q ; q = q->next ) { + if (q->dyn_type == O_LIMIT_PARENT) + continue; + if (q->id.proto != IPPROTO_TCP) + continue; + if ( (q->state & BOTH_SYN) != BOTH_SYN) + continue; + if (TIME_LEQ(time_uptime + V_dyn_keepalive_interval, + q->expire)) + continue; /* too early */ + if (TIME_LEQ(q->expire, time_uptime)) + continue; /* too late, rule expired */ + + m = ipfw_send_pkt(NULL, &(q->id), q->ack_rev - 1, + q->ack_fwd, TH_SYN); + mnext = ipfw_send_pkt(NULL, &(q->id), q->ack_fwd - 1, + q->ack_rev, 0); + + switch (q->id.addr_type) { + case 4: + if (m != NULL) { + *mtailp = m; + mtailp = &(*mtailp)->m_nextpkt; + } + if (mnext != NULL) { + *mtailp = mnext; + mtailp = &(*mtailp)->m_nextpkt; + } + break; +#ifdef INET6 + case 6: + if (m != NULL) { + *m6_tailp = m; + m6_tailp = &(*m6_tailp)->m_nextpkt; + } + if (mnext != NULL) { + *m6_tailp = mnext; + m6_tailp = &(*m6_tailp)->m_nextpkt; + } + break; +#endif + } + + m = mnext = NULL; + } + } + IPFW_DYN_UNLOCK(); + for (m = mnext = m0; m != NULL; m = mnext) { + mnext = m->m_nextpkt; + m->m_nextpkt = NULL; + ip_output(m, NULL, NULL, 0, NULL, NULL); + } +#ifdef INET6 + for (m = mnext = m6; m != NULL; m = mnext) { + mnext = m->m_nextpkt; + m->m_nextpkt = NULL; + ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL); + } +#endif +done: + callout_reset(&V_ipfw_timeout, V_dyn_keepalive_period * hz, + ipfw_tick, vnetx); + CURVNET_RESTORE(); +} + +void +ipfw_dyn_attach(void) +{ + ipfw_dyn_rule_zone = uma_zcreate("IPFW dynamic rule", + sizeof(ipfw_dyn_rule), NULL, NULL, NULL, NULL, + UMA_ALIGN_PTR, 0); + + IPFW_DYN_LOCK_INIT(); +} + +void +ipfw_dyn_detach(void) +{ + uma_zdestroy(ipfw_dyn_rule_zone); + IPFW_DYN_LOCK_DESTROY(); +} + +void +ipfw_dyn_init(void) +{ + V_ipfw_dyn_v = NULL; + V_dyn_buckets = 256; /* must be power of 2 */ + V_curr_dyn_buckets = 256; /* must be power of 2 */ + + V_dyn_ack_lifetime = 300; + V_dyn_syn_lifetime = 20; + V_dyn_fin_lifetime = 1; + V_dyn_rst_lifetime = 1; + V_dyn_udp_lifetime = 10; + V_dyn_short_lifetime = 5; + + V_dyn_keepalive_interval = 20; + V_dyn_keepalive_period = 5; + V_dyn_keepalive = 1; /* do send keepalives */ + + V_dyn_max = 4096; /* max # of dynamic rules */ + callout_init(&V_ipfw_timeout, CALLOUT_MPSAFE); + callout_reset(&V_ipfw_timeout, hz, ipfw_tick, curvnet); +} + +void +ipfw_dyn_uninit(int pass) +{ + if (pass == 0) + callout_drain(&V_ipfw_timeout); + else { + if (V_ipfw_dyn_v != NULL) + free(V_ipfw_dyn_v, M_IPFW); + } +} + +int +ipfw_dyn_len(void) +{ + return (V_ipfw_dyn_v == NULL) ? 0 : + (V_dyn_count * sizeof(ipfw_dyn_rule)); +} + +void +ipfw_get_dynamic(char **pbp, const char *ep) +{ + ipfw_dyn_rule *p, *last = NULL; + char *bp; + int i; + + if (V_ipfw_dyn_v == NULL) + return; + bp = *pbp; + + IPFW_DYN_LOCK(); + for (i = 0 ; i < V_curr_dyn_buckets; i++) + for (p = V_ipfw_dyn_v[i] ; p != NULL; p = p->next) { + if (bp + sizeof *p <= ep) { + ipfw_dyn_rule *dst = + (ipfw_dyn_rule *)bp; + bcopy(p, dst, sizeof *p); + bcopy(&(p->rule->rulenum), &(dst->rule), + sizeof(p->rule->rulenum)); + /* + * store set number into high word of + * dst->rule pointer. + */ + bcopy(&(p->rule->set), + (char *)&dst->rule + + sizeof(p->rule->rulenum), + sizeof(p->rule->set)); + /* + * store a non-null value in "next". + * The userland code will interpret a + * NULL here as a marker + * for the last dynamic rule. + */ + bcopy(&dst, &dst->next, sizeof(dst)); + last = dst; + dst->expire = + TIME_LEQ(dst->expire, time_uptime) ? + 0 : dst->expire - time_uptime ; + bp += sizeof(ipfw_dyn_rule); + } + } + IPFW_DYN_UNLOCK(); + if (last != NULL) /* mark last dynamic rule */ + bzero(&last->next, sizeof(last)); + *pbp = bp; +} +/* end of file */ diff --git a/sys/netinet/ipfw/ip_fw_log.c b/sys/netinet/ipfw/ip_fw_log.c new file mode 100644 index 0000000..93bd19b --- /dev/null +++ b/sys/netinet/ipfw/ip_fw_log.c @@ -0,0 +1,435 @@ +/*- + * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +/* + * Logging support for ipfw + */ + +#if !defined(KLD_MODULE) +#include "opt_ipfw.h" +#include "opt_ipdivert.h" +#include "opt_ipdn.h" +#include "opt_inet.h" +#ifndef INET +#error IPFIREWALL requires INET. +#endif /* INET */ +#endif +#include "opt_inet6.h" +#include "opt_ipsec.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/mbuf.h> +#include <sys/kernel.h> +#include <sys/socket.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <net/ethernet.h> /* for ETHERTYPE_IP */ +#include <net/if.h> +#include <net/vnet.h> +#include <net/if_types.h> /* for IFT_ETHER */ +#include <net/bpf.h> /* for BPF */ + +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/ip_icmp.h> +#include <netinet/ip_var.h> +#include <netinet/ip_fw.h> +#include <netinet/ipfw/ip_fw_private.h> +#include <netinet/tcp_var.h> +#include <netinet/udp.h> + +#include <netinet/ip6.h> +#include <netinet/icmp6.h> +#ifdef INET6 +#include <netinet6/in6_var.h> /* ip6_sprintf() */ +#endif + +#ifdef MAC +#include <security/mac/mac_framework.h> +#endif + +/* + * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T + * Other macros just cast void * into the appropriate type + */ +#define L3HDR(T, ip) ((T *)((u_int32_t *)(ip) + (ip)->ip_hl)) +#define TCP(p) ((struct tcphdr *)(p)) +#define SCTP(p) ((struct sctphdr *)(p)) +#define UDP(p) ((struct udphdr *)(p)) +#define ICMP(p) ((struct icmphdr *)(p)) +#define ICMP6(p) ((struct icmp6_hdr *)(p)) + +#define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0 +#define SNP(buf) buf, sizeof(buf) + +#ifdef WITHOUT_BPF +void +ipfw_log_bpf(int onoff) +{ +} +#else /* !WITHOUT_BPF */ +static struct ifnet *log_if; /* hook to attach to bpf */ + +/* we use this dummy function for all ifnet callbacks */ +static int +log_dummy(struct ifnet *ifp, u_long cmd, caddr_t addr) +{ + return EINVAL; +} + +void +ipfw_log_bpf(int onoff) +{ + struct ifnet *ifp; + + if (onoff) { + if (log_if) + return; + ifp = if_alloc(IFT_ETHER); + if (ifp == NULL) + return; + if_initname(ifp, "ipfw", 0); + ifp->if_mtu = 65536; + ifp->if_flags = IFF_UP | IFF_SIMPLEX | IFF_MULTICAST; + ifp->if_init = (void *)log_dummy; + ifp->if_ioctl = log_dummy; + ifp->if_start = (void *)log_dummy; + ifp->if_output = (void *)log_dummy; + ifp->if_addrlen = 6; + ifp->if_hdrlen = 14; + if_attach(ifp); + ifp->if_baudrate = IF_Mbps(10); + bpfattach(ifp, DLT_EN10MB, 14); + log_if = ifp; + } else { + if (log_if) { + ether_ifdetach(log_if); + if_free(log_if); + } + log_if = NULL; + } +} +#endif /* !WITHOUT_BPF */ + +/* + * We enter here when we have a rule with O_LOG. + * XXX this function alone takes about 2Kbytes of code! + */ +void +ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args, + struct mbuf *m, struct ifnet *oif, u_short offset, uint32_t tablearg, + struct ip *ip) +{ + char *action; + int limit_reached = 0; + char action2[40], proto[128], fragment[32]; + + if (V_fw_verbose == 0) { +#ifndef WITHOUT_BPF + struct m_hdr mh; + + if (log_if == NULL || log_if->if_bpf == NULL) + return; + /* BPF treats the "mbuf" as read-only */ + mh.mh_next = m; + mh.mh_len = ETHER_HDR_LEN; + if (args->eh) { /* layer2, use orig hdr */ + mh.mh_data = (char *)args->eh; + } else { + /* add fake header. Later we will store + * more info in the header + */ + mh.mh_data = "DDDDDDSSSSSS\x08\x00"; + } + BPF_MTAP(log_if, (struct mbuf *)&mh); +#endif /* !WITHOUT_BPF */ + return; + } + /* the old 'log' function */ + fragment[0] = '\0'; + proto[0] = '\0'; + + if (f == NULL) { /* bogus pkt */ + if (V_verbose_limit != 0 && V_norule_counter >= V_verbose_limit) + return; + V_norule_counter++; + if (V_norule_counter == V_verbose_limit) + limit_reached = V_verbose_limit; + action = "Refuse"; + } else { /* O_LOG is the first action, find the real one */ + ipfw_insn *cmd = ACTION_PTR(f); + ipfw_insn_log *l = (ipfw_insn_log *)cmd; + + if (l->max_log != 0 && l->log_left == 0) + return; + l->log_left--; + if (l->log_left == 0) + limit_reached = l->max_log; + cmd += F_LEN(cmd); /* point to first action */ + if (cmd->opcode == O_ALTQ) { + ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd; + + snprintf(SNPARGS(action2, 0), "Altq %d", + altq->qid); + cmd += F_LEN(cmd); + } + if (cmd->opcode == O_PROB) + cmd += F_LEN(cmd); + + if (cmd->opcode == O_TAG) + cmd += F_LEN(cmd); + + action = action2; + switch (cmd->opcode) { + case O_DENY: + action = "Deny"; + break; + + case O_REJECT: + if (cmd->arg1==ICMP_REJECT_RST) + action = "Reset"; + else if (cmd->arg1==ICMP_UNREACH_HOST) + action = "Reject"; + else + snprintf(SNPARGS(action2, 0), "Unreach %d", + cmd->arg1); + break; + + case O_UNREACH6: + if (cmd->arg1==ICMP6_UNREACH_RST) + action = "Reset"; + else + snprintf(SNPARGS(action2, 0), "Unreach %d", + cmd->arg1); + break; + + case O_ACCEPT: + action = "Accept"; + break; + case O_COUNT: + action = "Count"; + break; + case O_DIVERT: + snprintf(SNPARGS(action2, 0), "Divert %d", + cmd->arg1); + break; + case O_TEE: + snprintf(SNPARGS(action2, 0), "Tee %d", + cmd->arg1); + break; + case O_SETFIB: + snprintf(SNPARGS(action2, 0), "SetFib %d", + cmd->arg1); + break; + case O_SKIPTO: + snprintf(SNPARGS(action2, 0), "SkipTo %d", + cmd->arg1); + break; + case O_PIPE: + snprintf(SNPARGS(action2, 0), "Pipe %d", + cmd->arg1); + break; + case O_QUEUE: + snprintf(SNPARGS(action2, 0), "Queue %d", + cmd->arg1); + break; + case O_FORWARD_IP: { + ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd; + int len; + struct in_addr dummyaddr; + if (sa->sa.sin_addr.s_addr == INADDR_ANY) + dummyaddr.s_addr = htonl(tablearg); + else + dummyaddr.s_addr = sa->sa.sin_addr.s_addr; + + len = snprintf(SNPARGS(action2, 0), "Forward to %s", + inet_ntoa(dummyaddr)); + + if (sa->sa.sin_port) + snprintf(SNPARGS(action2, len), ":%d", + sa->sa.sin_port); + } + break; + case O_NETGRAPH: + snprintf(SNPARGS(action2, 0), "Netgraph %d", + cmd->arg1); + break; + case O_NGTEE: + snprintf(SNPARGS(action2, 0), "Ngtee %d", + cmd->arg1); + break; + case O_NAT: + action = "Nat"; + break; + case O_REASS: + action = "Reass"; + break; + default: + action = "UNKNOWN"; + break; + } + } + + if (hlen == 0) { /* non-ip */ + snprintf(SNPARGS(proto, 0), "MAC"); + + } else { + int len; +#ifdef INET6 + char src[INET6_ADDRSTRLEN + 2], dst[INET6_ADDRSTRLEN + 2]; +#else + char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN]; +#endif + struct icmphdr *icmp; + struct tcphdr *tcp; + struct udphdr *udp; +#ifdef INET6 + struct ip6_hdr *ip6 = NULL; + struct icmp6_hdr *icmp6; +#endif + src[0] = '\0'; + dst[0] = '\0'; +#ifdef INET6 + if (IS_IP6_FLOW_ID(&(args->f_id))) { + char ip6buf[INET6_ADDRSTRLEN]; + snprintf(src, sizeof(src), "[%s]", + ip6_sprintf(ip6buf, &args->f_id.src_ip6)); + snprintf(dst, sizeof(dst), "[%s]", + ip6_sprintf(ip6buf, &args->f_id.dst_ip6)); + + ip6 = (struct ip6_hdr *)ip; + tcp = (struct tcphdr *)(((char *)ip) + hlen); + udp = (struct udphdr *)(((char *)ip) + hlen); + } else +#endif + { + tcp = L3HDR(struct tcphdr, ip); + udp = L3HDR(struct udphdr, ip); + + inet_ntoa_r(ip->ip_src, src); + inet_ntoa_r(ip->ip_dst, dst); + } + + switch (args->f_id.proto) { + case IPPROTO_TCP: + len = snprintf(SNPARGS(proto, 0), "TCP %s", src); + if (offset == 0) + snprintf(SNPARGS(proto, len), ":%d %s:%d", + ntohs(tcp->th_sport), + dst, + ntohs(tcp->th_dport)); + else + snprintf(SNPARGS(proto, len), " %s", dst); + break; + + case IPPROTO_UDP: + len = snprintf(SNPARGS(proto, 0), "UDP %s", src); + if (offset == 0) + snprintf(SNPARGS(proto, len), ":%d %s:%d", + ntohs(udp->uh_sport), + dst, + ntohs(udp->uh_dport)); + else + snprintf(SNPARGS(proto, len), " %s", dst); + break; + + case IPPROTO_ICMP: + icmp = L3HDR(struct icmphdr, ip); + if (offset == 0) + len = snprintf(SNPARGS(proto, 0), + "ICMP:%u.%u ", + icmp->icmp_type, icmp->icmp_code); + else + len = snprintf(SNPARGS(proto, 0), "ICMP "); + len += snprintf(SNPARGS(proto, len), "%s", src); + snprintf(SNPARGS(proto, len), " %s", dst); + break; +#ifdef INET6 + case IPPROTO_ICMPV6: + icmp6 = (struct icmp6_hdr *)(((char *)ip) + hlen); + if (offset == 0) + len = snprintf(SNPARGS(proto, 0), + "ICMPv6:%u.%u ", + icmp6->icmp6_type, icmp6->icmp6_code); + else + len = snprintf(SNPARGS(proto, 0), "ICMPv6 "); + len += snprintf(SNPARGS(proto, len), "%s", src); + snprintf(SNPARGS(proto, len), " %s", dst); + break; +#endif + default: + len = snprintf(SNPARGS(proto, 0), "P:%d %s", + args->f_id.proto, src); + snprintf(SNPARGS(proto, len), " %s", dst); + break; + } + +#ifdef INET6 + if (IS_IP6_FLOW_ID(&(args->f_id))) { + if (offset & (IP6F_OFF_MASK | IP6F_MORE_FRAG)) + snprintf(SNPARGS(fragment, 0), + " (frag %08x:%d@%d%s)", + args->f_id.extra, + ntohs(ip6->ip6_plen) - hlen, + ntohs(offset & IP6F_OFF_MASK) << 3, + (offset & IP6F_MORE_FRAG) ? "+" : ""); + } else +#endif + { + int ipoff, iplen; + ipoff = ntohs(ip->ip_off); + iplen = ntohs(ip->ip_len); + if (ipoff & (IP_MF | IP_OFFMASK)) + snprintf(SNPARGS(fragment, 0), + " (frag %d:%d@%d%s)", + ntohs(ip->ip_id), iplen - (ip->ip_hl << 2), + offset << 3, + (ipoff & IP_MF) ? "+" : ""); + } + } +#ifdef __FreeBSD__ + if (oif || m->m_pkthdr.rcvif) + log(LOG_SECURITY | LOG_INFO, + "ipfw: %d %s %s %s via %s%s\n", + f ? f->rulenum : -1, + action, proto, oif ? "out" : "in", + oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname, + fragment); + else +#endif + log(LOG_SECURITY | LOG_INFO, + "ipfw: %d %s %s [no if info]%s\n", + f ? f->rulenum : -1, + action, proto, fragment); + if (limit_reached) + log(LOG_SECURITY | LOG_NOTICE, + "ipfw: limit %d reached on entry %d\n", + limit_reached, f ? f->rulenum : -1); +} +/* end of file */ diff --git a/sys/netinet/ipfw/ip_fw_nat.c b/sys/netinet/ipfw/ip_fw_nat.c index cd6a1cf..f30b754 100644 --- a/sys/netinet/ipfw/ip_fw_nat.c +++ b/sys/netinet/ipfw/ip_fw_nat.c @@ -29,114 +29,80 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> -#include <sys/condvar.h> #include <sys/eventhandler.h> #include <sys/malloc.h> -#include <sys/mbuf.h> #include <sys/kernel.h> #include <sys/lock.h> -#include <sys/jail.h> #include <sys/module.h> -#include <sys/priv.h> -#include <sys/proc.h> #include <sys/rwlock.h> -#include <sys/socket.h> -#include <sys/socketvar.h> -#include <sys/sysctl.h> -#include <sys/syslog.h> -#include <sys/ucred.h> + +#define IPFW_INTERNAL /* Access to protected data structures in ip_fw.h. */ #include <netinet/libalias/alias.h> #include <netinet/libalias/alias_local.h> -#define IPFW_INTERNAL /* Access to protected data structures in ip_fw.h. */ - #include <net/if.h> #include <netinet/in.h> #include <netinet/ip.h> #include <netinet/ip_var.h> -#include <netinet/ip_icmp.h> #include <netinet/ip_fw.h> +#include <netinet/ipfw/ip_fw_private.h> #include <netinet/tcp.h> -#include <netinet/tcp_timer.h> -#include <netinet/tcp_var.h> -#include <netinet/tcpip.h> #include <netinet/udp.h> -#include <netinet/udp_var.h> #include <machine/in_cksum.h> /* XXX for in_cksum */ -MALLOC_DECLARE(M_IPFW); - static VNET_DEFINE(eventhandler_tag, ifaddr_event_tag); #define V_ifaddr_event_tag VNET(ifaddr_event_tag) -extern ipfw_nat_t *ipfw_nat_ptr; -extern ipfw_nat_cfg_t *ipfw_nat_cfg_ptr; -extern ipfw_nat_cfg_t *ipfw_nat_del_ptr; -extern ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr; -extern ipfw_nat_cfg_t *ipfw_nat_get_log_ptr; - -static void +static void ifaddr_change(void *arg __unused, struct ifnet *ifp) { struct cfg_nat *ptr; struct ifaddr *ifa; + struct ip_fw_chain *chain; - IPFW_WLOCK(&V_layer3_chain); + chain = &V_layer3_chain; + IPFW_WLOCK(chain); /* Check every nat entry... */ - LIST_FOREACH(ptr, &V_layer3_chain.nat, _next) { + LIST_FOREACH(ptr, &chain->nat, _next) { /* ...using nic 'ifp->if_xname' as dynamic alias address. */ - if (strncmp(ptr->if_name, ifp->if_xname, IF_NAMESIZE) == 0) { - if_addr_rlock(ifp); - TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { - if (ifa->ifa_addr == NULL) - continue; - if (ifa->ifa_addr->sa_family != AF_INET) - continue; - ptr->ip = ((struct sockaddr_in *) - (ifa->ifa_addr))->sin_addr; - LibAliasSetAddress(ptr->lib, ptr->ip); - } - if_addr_runlock(ifp); + if (strncmp(ptr->if_name, ifp->if_xname, IF_NAMESIZE) != 0) + continue; + if_addr_rlock(ifp); + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + if (ifa->ifa_addr == NULL) + continue; + if (ifa->ifa_addr->sa_family != AF_INET) + continue; + ptr->ip = ((struct sockaddr_in *) + (ifa->ifa_addr))->sin_addr; + LibAliasSetAddress(ptr->lib, ptr->ip); } + if_addr_runlock(ifp); } - IPFW_WUNLOCK(&V_layer3_chain); + IPFW_WUNLOCK(chain); } +/* + * delete the pointers for nat entry ix, or all of them if ix < 0 + */ static void -flush_nat_ptrs(const int i) +flush_nat_ptrs(struct ip_fw_chain *chain, const int ix) { - struct ip_fw *rule; - - IPFW_WLOCK_ASSERT(&V_layer3_chain); - for (rule = V_layer3_chain.rules; rule; rule = rule->next) { - ipfw_insn_nat *cmd = (ipfw_insn_nat *)ACTION_PTR(rule); - if (cmd->o.opcode != O_NAT) - continue; - if (cmd->nat != NULL && cmd->nat->id == i) + int i; + ipfw_insn_nat *cmd; + + IPFW_WLOCK_ASSERT(chain); + for (i = 0; i < chain->n_rules; i++) { + cmd = (ipfw_insn_nat *)ACTION_PTR(chain->map[i]); + /* XXX skip log and the like ? */ + if (cmd->o.opcode == O_NAT && cmd->nat != NULL && + (ix < 0 || cmd->nat->id == ix)) cmd->nat = NULL; } } -#define HOOK_NAT(b, p) do { \ - IPFW_WLOCK_ASSERT(&V_layer3_chain); \ - LIST_INSERT_HEAD(b, p, _next); \ - } while (0) - -#define UNHOOK_NAT(p) do { \ - IPFW_WLOCK_ASSERT(&V_layer3_chain); \ - LIST_REMOVE(p, _next); \ - } while (0) - -#define HOOK_REDIR(b, p) do { \ - LIST_INSERT_HEAD(b, p, _next); \ - } while (0) - -#define HOOK_SPOOL(b, p) do { \ - LIST_INSERT_HEAD(b, p, _next); \ - } while (0) - static void del_redir_spool_cfg(struct cfg_nat *n, struct redir_chain *head) { @@ -165,9 +131,9 @@ del_redir_spool_cfg(struct cfg_nat *n, struct redir_chain *head) free(r, M_IPFW); break; default: - printf("unknown redirect mode: %u\n", r->mode); + printf("unknown redirect mode: %u\n", r->mode); /* XXX - panic?!?!? */ - break; + break; } } } @@ -178,7 +144,6 @@ add_redir_spool_cfg(char *buf, struct cfg_nat *ptr) struct cfg_redir *r, *ser_r; struct cfg_spool *s, *ser_s; int cnt, off, i; - char *panic_err; for (cnt = 0, off = 0; cnt < ptr->redir_cnt; cnt++) { ser_r = (struct cfg_redir *)&buf[off]; @@ -201,7 +166,7 @@ add_redir_spool_cfg(char *buf, struct cfg_nat *ptr) remotePortCopy = 0; r->alink[i] = LibAliasRedirectPort(ptr->lib, r->laddr, htons(r->lport + i), r->raddr, - htons(remotePortCopy), r->paddr, + htons(remotePortCopy), r->paddr, htons(r->pport + i), r->proto); if (r->alink[i] == NULL) { r->alink[0] = NULL; @@ -215,30 +180,26 @@ add_redir_spool_cfg(char *buf, struct cfg_nat *ptr) break; default: printf("unknown redirect mode: %u\n", r->mode); - break; + break; + } + /* XXX perhaps return an error instead of panic ? */ + if (r->alink[0] == NULL) + panic("LibAliasRedirect* returned NULL"); + /* LSNAT handling. */ + for (i = 0; i < r->spool_cnt; i++) { + ser_s = (struct cfg_spool *)&buf[off]; + s = malloc(SOF_REDIR, M_IPFW, M_WAITOK | M_ZERO); + memcpy(s, ser_s, SOF_SPOOL); + LibAliasAddServer(ptr->lib, r->alink[0], + s->addr, htons(s->port)); + off += SOF_SPOOL; + /* Hook spool entry. */ + LIST_INSERT_HEAD(&r->spool_chain, s, _next); } - if (r->alink[0] == NULL) { - panic_err = "LibAliasRedirect* returned NULL"; - goto bad; - } else /* LSNAT handling. */ - for (i = 0; i < r->spool_cnt; i++) { - ser_s = (struct cfg_spool *)&buf[off]; - s = malloc(SOF_REDIR, M_IPFW, - M_WAITOK | M_ZERO); - memcpy(s, ser_s, SOF_SPOOL); - LibAliasAddServer(ptr->lib, r->alink[0], - s->addr, htons(s->port)); - off += SOF_SPOOL; - /* Hook spool entry. */ - HOOK_SPOOL(&r->spool_chain, s); - } /* And finally hook this redir entry. */ - HOOK_REDIR(&ptr->redir_chain, r); + LIST_INSERT_HEAD(&ptr->redir_chain, r, _next); } return (1); -bad: - /* something really bad happened: panic! */ - panic("%s\n", panic_err); } static int @@ -252,101 +213,80 @@ ipfw_nat(struct ip_fw_args *args, struct cfg_nat *t, struct mbuf *m) ldt = 0; retval = 0; - if ((mcl = m_megapullup(m, m->m_pkthdr.len)) == - NULL) - goto badnat; - ip = mtod(mcl, struct ip *); - if (args->eh == NULL) { - ip->ip_len = htons(ip->ip_len); - ip->ip_off = htons(ip->ip_off); + mcl = m_megapullup(m, m->m_pkthdr.len); + if (mcl == NULL) { + args->m = NULL; + return (IP_FW_DENY); } + ip = mtod(mcl, struct ip *); - /* + /* * XXX - Libalias checksum offload 'duct tape': - * - * locally generated packets have only - * pseudo-header checksum calculated - * and libalias will screw it[1], so - * mark them for later fix. Moreover - * there are cases when libalias - * modify tcp packet data[2], mark it - * for later fix too. * - * [1] libalias was never meant to run - * in kernel, so it doesn't have any - * knowledge about checksum - * offloading, and it expects a packet - * with a full internet - * checksum. Unfortunately, packets - * generated locally will have just the - * pseudo header calculated, and when - * libalias tries to adjust the - * checksum it will actually screw it. + * locally generated packets have only pseudo-header checksum + * calculated and libalias will break it[1], so mark them for + * later fix. Moreover there are cases when libalias modifies + * tcp packet data[2], mark them for later fix too. + * + * [1] libalias was never meant to run in kernel, so it does + * not have any knowledge about checksum offloading, and + * expects a packet with a full internet checksum. + * Unfortunately, packets generated locally will have just the + * pseudo header calculated, and when libalias tries to adjust + * the checksum it will actually compute a wrong value. * - * [2] when libalias modify tcp's data - * content, full TCP checksum has to - * be recomputed: the problem is that - * libalias doesn't have any idea - * about checksum offloading To - * workaround this, we do not do - * checksumming in LibAlias, but only - * mark the packets in th_x2 field. If - * we receive a marked packet, we - * calculate correct checksum for it - * aware of offloading. Why such a - * terrible hack instead of - * recalculating checksum for each - * packet? Because the previous - * checksum was not checked! - * Recalculating checksums for EVERY - * packet will hide ALL transmission - * errors. Yes, marked packets still - * suffer from this problem. But, - * sigh, natd(8) has this problem, - * too. + * [2] when libalias modifies tcp's data content, full TCP + * checksum has to be recomputed: the problem is that + * libalias does not have any idea about checksum offloading. + * To work around this, we do not do checksumming in LibAlias, + * but only mark the packets in th_x2 field. If we receive a + * marked packet, we calculate correct checksum for it + * aware of offloading. Why such a terrible hack instead of + * recalculating checksum for each packet? + * Because the previous checksum was not checked! + * Recalculating checksums for EVERY packet will hide ALL + * transmission errors. Yes, marked packets still suffer from + * this problem. But, sigh, natd(8) has this problem, too. * * TODO: -make libalias mbuf aware (so * it can handle delayed checksum and tso) */ - if (mcl->m_pkthdr.rcvif == NULL && - mcl->m_pkthdr.csum_flags & - CSUM_DELAY_DATA) + if (mcl->m_pkthdr.rcvif == NULL && + mcl->m_pkthdr.csum_flags & CSUM_DELAY_DATA) ldt = 1; c = mtod(mcl, char *); if (args->oif == NULL) - retval = LibAliasIn(t->lib, c, + retval = LibAliasIn(t->lib, c, mcl->m_len + M_TRAILINGSPACE(mcl)); else - retval = LibAliasOut(t->lib, c, + retval = LibAliasOut(t->lib, c, mcl->m_len + M_TRAILINGSPACE(mcl)); if (retval == PKT_ALIAS_RESPOND) { - m->m_flags |= M_SKIP_FIREWALL; - retval = PKT_ALIAS_OK; + m->m_flags |= M_SKIP_FIREWALL; + retval = PKT_ALIAS_OK; } if (retval != PKT_ALIAS_OK && retval != PKT_ALIAS_FOUND_HEADER_FRAGMENT) { /* XXX - should i add some logging? */ m_free(mcl); - badnat: args->m = NULL; return (IP_FW_DENY); } - mcl->m_pkthdr.len = mcl->m_len = - ntohs(ip->ip_len); + mcl->m_pkthdr.len = mcl->m_len = ntohs(ip->ip_len); - /* - * XXX - libalias checksum offload - * 'duct tape' (see above) + /* + * XXX - libalias checksum offload + * 'duct tape' (see above) */ - if ((ip->ip_off & htons(IP_OFFMASK)) == 0 && + if ((ip->ip_off & htons(IP_OFFMASK)) == 0 && ip->ip_p == IPPROTO_TCP) { - struct tcphdr *th; + struct tcphdr *th; th = (struct tcphdr *)(ip + 1); - if (th->th_x2) + if (th->th_x2) ldt = 1; } @@ -355,82 +295,83 @@ ipfw_nat(struct ip_fw_args *args, struct cfg_nat *t, struct mbuf *m) struct udphdr *uh; u_short cksum; - ip->ip_len = ntohs(ip->ip_len); + /* XXX check if ip_len can stay in net format */ cksum = in_pseudo( ip->ip_src.s_addr, - ip->ip_dst.s_addr, - htons(ip->ip_p + ip->ip_len - (ip->ip_hl << 2)) + ip->ip_dst.s_addr, + htons(ip->ip_p + ntohs(ip->ip_len) - (ip->ip_hl << 2)) ); - + switch (ip->ip_p) { case IPPROTO_TCP: th = (struct tcphdr *)(ip + 1); - /* - * Maybe it was set in - * libalias... + /* + * Maybe it was set in + * libalias... */ th->th_x2 = 0; th->th_sum = cksum; - mcl->m_pkthdr.csum_data = + mcl->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); break; case IPPROTO_UDP: uh = (struct udphdr *)(ip + 1); uh->uh_sum = cksum; - mcl->m_pkthdr.csum_data = + mcl->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); - break; + break; } - /* - * No hw checksum offloading: do it - * by ourself. - */ - if ((mcl->m_pkthdr.csum_flags & - CSUM_DELAY_DATA) == 0) { + /* No hw checksum offloading: do it ourselves */ + if ((mcl->m_pkthdr.csum_flags & CSUM_DELAY_DATA) == 0) { in_delayed_cksum(mcl); - mcl->m_pkthdr.csum_flags &= - ~CSUM_DELAY_DATA; + mcl->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } - ip->ip_len = htons(ip->ip_len); - } - - if (args->eh == NULL) { - ip->ip_len = ntohs(ip->ip_len); - ip->ip_off = ntohs(ip->ip_off); } - args->m = mcl; return (IP_FW_NAT); } -static int +static struct cfg_nat * +lookup_nat(struct nat_list *l, int nat_id) +{ + struct cfg_nat *res; + + LIST_FOREACH(res, l, _next) { + if (res->id == nat_id) + break; + } + return res; +} + +static int ipfw_nat_cfg(struct sockopt *sopt) { struct cfg_nat *ptr, *ser_n; char *buf; + struct ip_fw_chain *chain = &V_layer3_chain; buf = malloc(NAT_BUF_LEN, M_IPFW, M_WAITOK | M_ZERO); - sooptcopyin(sopt, buf, NAT_BUF_LEN, - sizeof(struct cfg_nat)); + sooptcopyin(sopt, buf, NAT_BUF_LEN, sizeof(struct cfg_nat)); ser_n = (struct cfg_nat *)buf; - /* + /* check valid parameter ser_n->id > 0 ? */ + /* * Find/create nat rule. */ - IPFW_WLOCK(&V_layer3_chain); - LOOKUP_NAT(V_layer3_chain, ser_n->id, ptr); + IPFW_WLOCK(chain); + ptr = lookup_nat(&chain->nat, ser_n->id); if (ptr == NULL) { /* New rule: allocate and init new instance. */ - ptr = malloc(sizeof(struct cfg_nat), + ptr = malloc(sizeof(struct cfg_nat), M_IPFW, M_NOWAIT | M_ZERO); if (ptr == NULL) { - IPFW_WUNLOCK(&V_layer3_chain); + IPFW_WUNLOCK(chain); free(buf, M_IPFW); return (ENOSPC); } ptr->lib = LibAliasInit(NULL); if (ptr->lib == NULL) { - IPFW_WUNLOCK(&V_layer3_chain); + IPFW_WUNLOCK(chain); free(ptr, M_IPFW); free(buf, M_IPFW); return (EINVAL); @@ -438,18 +379,18 @@ ipfw_nat_cfg(struct sockopt *sopt) LIST_INIT(&ptr->redir_chain); } else { /* Entry already present: temporarly unhook it. */ - UNHOOK_NAT(ptr); - flush_nat_ptrs(ser_n->id); + LIST_REMOVE(ptr, _next); + flush_nat_ptrs(chain, ser_n->id); } - IPFW_WUNLOCK(&V_layer3_chain); + IPFW_WUNLOCK(chain); - /* + /* * Basic nat configuration. */ ptr->id = ser_n->id; - /* - * XXX - what if this rule doesn't nat any ip and just - * redirect? + /* + * XXX - what if this rule doesn't nat any ip and just + * redirect? * do we set aliasaddress to 0.0.0.0? */ ptr->ip = ser_n->ip; @@ -459,7 +400,7 @@ ipfw_nat_cfg(struct sockopt *sopt) LibAliasSetAddress(ptr->lib, ptr->ip); memcpy(ptr->if_name, ser_n->if_name, IF_NAMESIZE); - /* + /* * Redir and LSNAT configuration. */ /* Delete old cfgs. */ @@ -467,9 +408,9 @@ ipfw_nat_cfg(struct sockopt *sopt) /* Add new entries. */ add_redir_spool_cfg(&buf[(sizeof(struct cfg_nat))], ptr); free(buf, M_IPFW); - IPFW_WLOCK(&V_layer3_chain); - HOOK_NAT(&V_layer3_chain.nat, ptr); - IPFW_WUNLOCK(&V_layer3_chain); + IPFW_WLOCK(chain); + LIST_INSERT_HEAD(&chain->nat, ptr, _next); + IPFW_WUNLOCK(chain); return (0); } @@ -477,18 +418,20 @@ static int ipfw_nat_del(struct sockopt *sopt) { struct cfg_nat *ptr; + struct ip_fw_chain *chain = &V_layer3_chain; int i; - + sooptcopyin(sopt, &i, sizeof i, sizeof i); - IPFW_WLOCK(&V_layer3_chain); - LOOKUP_NAT(V_layer3_chain, i, ptr); + /* XXX validate i */ + IPFW_WLOCK(chain); + ptr = lookup_nat(&chain->nat, i); if (ptr == NULL) { - IPFW_WUNLOCK(&V_layer3_chain); + IPFW_WUNLOCK(chain); return (EINVAL); } - UNHOOK_NAT(ptr); - flush_nat_ptrs(i); - IPFW_WUNLOCK(&V_layer3_chain); + LIST_REMOVE(ptr, _next); + flush_nat_ptrs(chain, i); + IPFW_WUNLOCK(chain); del_redir_spool_cfg(ptr, &ptr->redir_chain); LibAliasUninit(ptr->lib); free(ptr, M_IPFW); @@ -497,56 +440,53 @@ ipfw_nat_del(struct sockopt *sopt) static int ipfw_nat_get_cfg(struct sockopt *sopt) -{ +{ uint8_t *data; struct cfg_nat *n; struct cfg_redir *r; struct cfg_spool *s; int nat_cnt, off; - + struct ip_fw_chain *chain; + int err = ENOSPC; + + chain = &V_layer3_chain; nat_cnt = 0; off = sizeof(nat_cnt); data = malloc(NAT_BUF_LEN, M_IPFW, M_WAITOK | M_ZERO); - IPFW_RLOCK(&V_layer3_chain); + IPFW_RLOCK(chain); /* Serialize all the data. */ - LIST_FOREACH(n, &V_layer3_chain.nat, _next) { + LIST_FOREACH(n, &chain->nat, _next) { nat_cnt++; - if (off + SOF_NAT < NAT_BUF_LEN) { - bcopy(n, &data[off], SOF_NAT); - off += SOF_NAT; - LIST_FOREACH(r, &n->redir_chain, _next) { - if (off + SOF_REDIR < NAT_BUF_LEN) { - bcopy(r, &data[off], - SOF_REDIR); - off += SOF_REDIR; - LIST_FOREACH(s, &r->spool_chain, - _next) { - if (off + SOF_SPOOL < - NAT_BUF_LEN) { - bcopy(s, &data[off], - SOF_SPOOL); - off += SOF_SPOOL; - } else - goto nospace; - } - } else + if (off + SOF_NAT >= NAT_BUF_LEN) + goto nospace; + bcopy(n, &data[off], SOF_NAT); + off += SOF_NAT; + LIST_FOREACH(r, &n->redir_chain, _next) { + if (off + SOF_REDIR >= NAT_BUF_LEN) + goto nospace; + bcopy(r, &data[off], SOF_REDIR); + off += SOF_REDIR; + LIST_FOREACH(s, &r->spool_chain, _next) { + if (off + SOF_SPOOL >= NAT_BUF_LEN) goto nospace; + bcopy(s, &data[off], SOF_SPOOL); + off += SOF_SPOOL; } - } else - goto nospace; + } } - bcopy(&nat_cnt, data, sizeof(nat_cnt)); - IPFW_RUNLOCK(&V_layer3_chain); - sooptcopyout(sopt, data, NAT_BUF_LEN); - free(data, M_IPFW); - return (0); + err = 0; /* all good */ nospace: - IPFW_RUNLOCK(&V_layer3_chain); - printf("serialized data buffer not big enough:" - "please increase NAT_BUF_LEN\n"); + IPFW_RUNLOCK(chain); + if (err == 0) { + bcopy(&nat_cnt, data, sizeof(nat_cnt)); + sooptcopyout(sopt, data, NAT_BUF_LEN); + } else { + printf("serialized data buffer not big enough:" + "please increase NAT_BUF_LEN\n"); + } free(data, M_IPFW); - return (ENOSPC); + return (err); } static int @@ -554,30 +494,35 @@ ipfw_nat_get_log(struct sockopt *sopt) { uint8_t *data; struct cfg_nat *ptr; - int i, size, cnt, sof; + int i, size; + struct ip_fw_chain *chain; - data = NULL; - sof = LIBALIAS_BUF_SIZE; - cnt = 0; + chain = &V_layer3_chain; - IPFW_RLOCK(&V_layer3_chain); - size = i = 0; - LIST_FOREACH(ptr, &V_layer3_chain.nat, _next) { - if (ptr->lib->logDesc == NULL) + IPFW_RLOCK(chain); + /* one pass to count, one to copy the data */ + i = 0; + LIST_FOREACH(ptr, &chain->nat, _next) { + if (ptr->lib->logDesc == NULL) + continue; + i++; + } + size = i * (LIBALIAS_BUF_SIZE + sizeof(int)); + data = malloc(size, M_IPFW, M_NOWAIT | M_ZERO); + if (data == NULL) { + IPFW_RUNLOCK(chain); + return (ENOSPC); + } + i = 0; + LIST_FOREACH(ptr, &chain->nat, _next) { + if (ptr->lib->logDesc == NULL) continue; - cnt++; - size = cnt * (sof + sizeof(int)); - data = realloc(data, size, M_IPFW, M_NOWAIT | M_ZERO); - if (data == NULL) { - IPFW_RUNLOCK(&V_layer3_chain); - return (ENOSPC); - } bcopy(&ptr->id, &data[i], sizeof(int)); i += sizeof(int); - bcopy(ptr->lib->logDesc, &data[i], sof); - i += sof; + bcopy(ptr->lib->logDesc, &data[i], LIBALIAS_BUF_SIZE); + i += LIBALIAS_BUF_SIZE; } - IPFW_RUNLOCK(&V_layer3_chain); + IPFW_RUNLOCK(chain); sooptcopyout(sopt, data, size); free(data, M_IPFW); return(0); @@ -590,38 +535,41 @@ ipfw_nat_init(void) IPFW_WLOCK(&V_layer3_chain); /* init ipfw hooks */ ipfw_nat_ptr = ipfw_nat; + lookup_nat_ptr = lookup_nat; ipfw_nat_cfg_ptr = ipfw_nat_cfg; ipfw_nat_del_ptr = ipfw_nat_del; ipfw_nat_get_cfg_ptr = ipfw_nat_get_cfg; ipfw_nat_get_log_ptr = ipfw_nat_get_log; IPFW_WUNLOCK(&V_layer3_chain); - V_ifaddr_event_tag = EVENTHANDLER_REGISTER(ifaddr_event, ifaddr_change, + V_ifaddr_event_tag = EVENTHANDLER_REGISTER( + ifaddr_event, ifaddr_change, NULL, EVENTHANDLER_PRI_ANY); } static void ipfw_nat_destroy(void) { - struct ip_fw *rule; struct cfg_nat *ptr, *ptr_temp; - - IPFW_WLOCK(&V_layer3_chain); - LIST_FOREACH_SAFE(ptr, &V_layer3_chain.nat, _next, ptr_temp) { + struct ip_fw_chain *chain; + + chain = &V_layer3_chain; + IPFW_WLOCK(chain); + LIST_FOREACH_SAFE(ptr, &chain->nat, _next, ptr_temp) { LIST_REMOVE(ptr, _next); del_redir_spool_cfg(ptr, &ptr->redir_chain); LibAliasUninit(ptr->lib); free(ptr, M_IPFW); } EVENTHANDLER_DEREGISTER(ifaddr_event, V_ifaddr_event_tag); - /* flush all nat ptrs */ - for (rule = V_layer3_chain.rules; rule; rule = rule->next) { - ipfw_insn_nat *cmd = (ipfw_insn_nat *)ACTION_PTR(rule); - if (cmd->o.opcode == O_NAT) - cmd->nat = NULL; - } + flush_nat_ptrs(chain, -1 /* flush all */); /* deregister ipfw_nat */ ipfw_nat_ptr = NULL; - IPFW_WUNLOCK(&V_layer3_chain); + lookup_nat_ptr = NULL; + ipfw_nat_cfg_ptr = NULL; + ipfw_nat_del_ptr = NULL; + ipfw_nat_get_cfg_ptr = NULL; + ipfw_nat_get_log_ptr = NULL; + IPFW_WUNLOCK(chain); } static int @@ -655,3 +603,4 @@ DECLARE_MODULE(ipfw_nat, ipfw_nat_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY MODULE_DEPEND(ipfw_nat, libalias, 1, 1, 1); MODULE_DEPEND(ipfw_nat, ipfw, 2, 2, 2); MODULE_VERSION(ipfw_nat, 1); +/* end of file */ diff --git a/sys/netinet/ipfw/ip_fw_pfil.c b/sys/netinet/ipfw/ip_fw_pfil.c index db73084..b4e31d4 100644 --- a/sys/netinet/ipfw/ip_fw_pfil.c +++ b/sys/netinet/ipfw/ip_fw_pfil.c @@ -46,9 +46,7 @@ __FBSDID("$FreeBSD$"); #include <sys/lock.h> #include <sys/rwlock.h> #include <sys/socket.h> -#include <sys/socketvar.h> #include <sys/sysctl.h> -#include <sys/ucred.h> #include <net/if.h> #include <net/route.h> @@ -60,457 +58,309 @@ __FBSDID("$FreeBSD$"); #include <netinet/ip.h> #include <netinet/ip_var.h> #include <netinet/ip_fw.h> -#include <netinet/ip_divert.h> -#include <netinet/ip_dummynet.h> - +#include <netinet/ipfw/ip_fw_private.h> #include <netgraph/ng_ipfw.h> #include <machine/in_cksum.h> -VNET_DEFINE(int, fw_enable) = 1; +static VNET_DEFINE(int, fw_enable) = 1; +#define V_fw_enable VNET(fw_enable) + #ifdef INET6 -VNET_DEFINE(int, fw6_enable) = 1; +static VNET_DEFINE(int, fw6_enable) = 1; +#define V_fw6_enable VNET(fw6_enable) #endif int ipfw_chg_hook(SYSCTL_HANDLER_ARGS); -/* Divert hooks. */ -ip_divert_packet_t *ip_divert_ptr = NULL; - -/* ng_ipfw hooks. */ -ng_ipfw_input_t *ng_ipfw_input_p = NULL; - /* Forward declarations. */ -static int ipfw_divert(struct mbuf **, int, int); -#define DIV_DIR_IN 1 -#define DIV_DIR_OUT 0 - -int -ipfw_check_in(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir, - struct inpcb *inp) -{ - struct ip_fw_args args; - struct ng_ipfw_tag *ng_tag; - struct m_tag *dn_tag; - int ipfw = 0; - int divert; - int tee; -#ifdef IPFIREWALL_FORWARD - struct m_tag *fwd_tag; -#endif - - KASSERT(dir == PFIL_IN, ("ipfw_check_in wrong direction!")); - - bzero(&args, sizeof(args)); - - ng_tag = (struct ng_ipfw_tag *)m_tag_locate(*m0, NGM_IPFW_COOKIE, 0, - NULL); - if (ng_tag != NULL) { - KASSERT(ng_tag->dir == NG_IPFW_IN, - ("ng_ipfw tag with wrong direction")); - args.rule = ng_tag->rule; - args.rule_id = ng_tag->rule_id; - args.chain_id = ng_tag->chain_id; - m_tag_delete(*m0, (struct m_tag *)ng_tag); - } - -again: - dn_tag = m_tag_find(*m0, PACKET_TAG_DUMMYNET, NULL); - if (dn_tag != NULL){ - struct dn_pkt_tag *dt; - - dt = (struct dn_pkt_tag *)(dn_tag+1); - args.rule = dt->rule; - args.rule_id = dt->rule_id; - args.chain_id = dt->chain_id; - - m_tag_delete(*m0, dn_tag); - } +static int ipfw_divert(struct mbuf **, int, struct ipfw_rule_ref *, int); - args.m = *m0; - args.inp = inp; - tee = 0; - - if (V_fw_one_pass == 0 || args.rule == NULL) { - ipfw = ipfw_chk(&args); - *m0 = args.m; - } else - ipfw = IP_FW_PASS; - - KASSERT(*m0 != NULL || ipfw == IP_FW_DENY, ("%s: m0 is NULL", - __func__)); +#ifdef SYSCTL_NODE - switch (ipfw) { - case IP_FW_PASS: - if (args.next_hop == NULL) - goto pass; +SYSBEGIN(f1) -#ifdef IPFIREWALL_FORWARD - fwd_tag = m_tag_get(PACKET_TAG_IPFORWARD, - sizeof(struct sockaddr_in), M_NOWAIT); - if (fwd_tag == NULL) - goto drop; - bcopy(args.next_hop, (fwd_tag+1), sizeof(struct sockaddr_in)); - m_tag_prepend(*m0, fwd_tag); - - if (in_localip(args.next_hop->sin_addr)) - (*m0)->m_flags |= M_FASTFWD_OURS; - goto pass; -#endif - break; /* not reached */ - - case IP_FW_DENY: - goto drop; - break; /* not reached */ - - case IP_FW_DUMMYNET: - if (ip_dn_io_ptr == NULL) - goto drop; - if (mtod(*m0, struct ip *)->ip_v == 4) - ip_dn_io_ptr(m0, DN_TO_IP_IN, &args); - else if (mtod(*m0, struct ip *)->ip_v == 6) - ip_dn_io_ptr(m0, DN_TO_IP6_IN, &args); - if (*m0 != NULL) - goto again; - return 0; /* packet consumed */ - - case IP_FW_TEE: - tee = 1; - /* fall through */ - - case IP_FW_DIVERT: - divert = ipfw_divert(m0, DIV_DIR_IN, tee); - if (divert) { - *m0 = NULL; - return 0; /* packet consumed */ - } else { - args.rule = NULL; - goto again; /* continue with packet */ - } - - case IP_FW_NGTEE: - if (!NG_IPFW_LOADED) - goto drop; - (void)ng_ipfw_input_p(m0, NG_IPFW_IN, &args, 1); - goto again; /* continue with packet */ - - case IP_FW_NETGRAPH: - if (!NG_IPFW_LOADED) - goto drop; - return ng_ipfw_input_p(m0, NG_IPFW_IN, &args, 0); - - case IP_FW_NAT: - goto again; /* continue with packet */ - - case IP_FW_REASS: - goto again; +SYSCTL_DECL(_net_inet_ip_fw); +SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, enable, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_enable), 0, + ipfw_chg_hook, "I", "Enable ipfw"); +#ifdef INET6 +SYSCTL_DECL(_net_inet6_ip6_fw); +SYSCTL_VNET_PROC(_net_inet6_ip6_fw, OID_AUTO, enable, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw6_enable), 0, + ipfw_chg_hook, "I", "Enable ipfw+6"); +#endif /* INET6 */ - default: - KASSERT(0, ("%s: unknown retval", __func__)); - } +SYSEND -drop: - if (*m0) - m_freem(*m0); - *m0 = NULL; - return (EACCES); -pass: - return 0; /* not filtered */ -} +#endif /* SYSCTL_NODE */ +/* + * The pfilter hook to pass packets to ipfw_chk and then to + * dummynet, divert, netgraph or other modules. + * The packet may be consumed. + */ int -ipfw_check_out(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir, +ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir, struct inpcb *inp) { struct ip_fw_args args; - struct ng_ipfw_tag *ng_tag; - struct m_tag *dn_tag; - int ipfw = 0; - int divert; - int tee; -#ifdef IPFIREWALL_FORWARD - struct m_tag *fwd_tag; -#endif + struct m_tag *tag; + int ipfw; + int ret; - KASSERT(dir == PFIL_OUT, ("ipfw_check_out wrong direction!")); + /* all the processing now uses ip_len in net format */ + if (mtod(*m0, struct ip *)->ip_v == 4) + SET_NET_IPLEN(mtod(*m0, struct ip *)); + /* convert dir to IPFW values */ + dir = (dir == PFIL_IN) ? DIR_IN : DIR_OUT; bzero(&args, sizeof(args)); - ng_tag = (struct ng_ipfw_tag *)m_tag_locate(*m0, NGM_IPFW_COOKIE, 0, - NULL); - if (ng_tag != NULL) { - KASSERT(ng_tag->dir == NG_IPFW_OUT, - ("ng_ipfw tag with wrong direction")); - args.rule = ng_tag->rule; - args.rule_id = ng_tag->rule_id; - args.chain_id = ng_tag->chain_id; - m_tag_delete(*m0, (struct m_tag *)ng_tag); - } - again: - dn_tag = m_tag_find(*m0, PACKET_TAG_DUMMYNET, NULL); - if (dn_tag != NULL) { - struct dn_pkt_tag *dt; - - dt = (struct dn_pkt_tag *)(dn_tag+1); - args.rule = dt->rule; - args.rule_id = dt->rule_id; - args.chain_id = dt->chain_id; - - m_tag_delete(*m0, dn_tag); + /* + * extract and remove the tag if present. If we are left + * with onepass, optimize the outgoing path. + */ + tag = m_tag_locate(*m0, MTAG_IPFW_RULE, 0, NULL); + if (tag != NULL) { + args.rule = *((struct ipfw_rule_ref *)(tag+1)); + m_tag_delete(*m0, tag); + if (args.rule.info & IPFW_ONEPASS) { + SET_HOST_IPLEN(mtod(*m0, struct ip *)); + return 0; + } } args.m = *m0; - args.oif = ifp; + args.oif = dir == DIR_OUT ? ifp : NULL; args.inp = inp; - tee = 0; - if (V_fw_one_pass == 0 || args.rule == NULL) { - ipfw = ipfw_chk(&args); - *m0 = args.m; - } else - ipfw = IP_FW_PASS; + ipfw = ipfw_chk(&args); + *m0 = args.m; KASSERT(*m0 != NULL || ipfw == IP_FW_DENY, ("%s: m0 is NULL", __func__)); + /* breaking out of the switch means drop */ + ret = 0; /* default return value for pass */ switch (ipfw) { case IP_FW_PASS: + /* next_hop may be set by ipfw_chk */ if (args.next_hop == NULL) - goto pass; -#ifdef IPFIREWALL_FORWARD - /* Overwrite existing tag. */ - fwd_tag = m_tag_find(*m0, PACKET_TAG_IPFORWARD, NULL); - if (fwd_tag == NULL) { + break; /* pass */ +#ifndef IPFIREWALL_FORWARD + ret = EACCES; +#else + { + struct m_tag *fwd_tag; + + /* Incoming packets should not be tagged so we do not + * m_tag_find. Outgoing packets may be tagged, so we + * reuse the tag if present. + */ + fwd_tag = (dir == DIR_IN) ? NULL : + m_tag_find(*m0, PACKET_TAG_IPFORWARD, NULL); + if (fwd_tag != NULL) { + m_tag_unlink(*m0, fwd_tag); + } else { fwd_tag = m_tag_get(PACKET_TAG_IPFORWARD, sizeof(struct sockaddr_in), M_NOWAIT); - if (fwd_tag == NULL) - goto drop; - } else - m_tag_unlink(*m0, fwd_tag); + if (fwd_tag == NULL) { + ret = EACCES; + break; /* i.e. drop */ + } + } bcopy(args.next_hop, (fwd_tag+1), sizeof(struct sockaddr_in)); m_tag_prepend(*m0, fwd_tag); if (in_localip(args.next_hop->sin_addr)) (*m0)->m_flags |= M_FASTFWD_OURS; - goto pass; + } #endif - break; /* not reached */ + break; case IP_FW_DENY: - goto drop; - break; /* not reached */ + ret = EACCES; + break; /* i.e. drop */ case IP_FW_DUMMYNET: + ret = EACCES; if (ip_dn_io_ptr == NULL) - break; + break; /* i.e. drop */ if (mtod(*m0, struct ip *)->ip_v == 4) - ip_dn_io_ptr(m0, DN_TO_IP_OUT, &args); + ret = ip_dn_io_ptr(m0, dir, &args); else if (mtod(*m0, struct ip *)->ip_v == 6) - ip_dn_io_ptr(m0, DN_TO_IP6_OUT, &args); + ret = ip_dn_io_ptr(m0, dir | PROTO_IPV6, &args); + else + break; /* drop it */ + /* + * XXX should read the return value. + * dummynet normally eats the packet and sets *m0=NULL + * unless the packet can be sent immediately. In this + * case args is updated and we should re-run the + * check without clearing args. + */ if (*m0 != NULL) goto again; - return 0; /* packet consumed */ - break; case IP_FW_TEE: - tee = 1; - /* fall through */ - case IP_FW_DIVERT: - divert = ipfw_divert(m0, DIV_DIR_OUT, tee); - if (divert) { - *m0 = NULL; - return 0; /* packet consumed */ - } else { - args.rule = NULL; - goto again; /* continue with packet */ + if (ip_divert_ptr == NULL) { + ret = EACCES; + break; /* i.e. drop */ } + ret = ipfw_divert(m0, dir, &args.rule, + (ipfw == IP_FW_TEE) ? 1 : 0); + /* continue processing for the original packet (tee). */ + if (*m0) + goto again; + break; case IP_FW_NGTEE: - if (!NG_IPFW_LOADED) - goto drop; - (void)ng_ipfw_input_p(m0, NG_IPFW_OUT, &args, 1); - goto again; /* continue with packet */ - case IP_FW_NETGRAPH: - if (!NG_IPFW_LOADED) - goto drop; - return ng_ipfw_input_p(m0, NG_IPFW_OUT, &args, 0); + if (ng_ipfw_input_p == NULL) { + ret = EACCES; + break; /* i.e. drop */ + } + ret = ng_ipfw_input_p(m0, dir, &args, + (ipfw == IP_FW_NGTEE) ? 1 : 0); + if (ipfw == IP_FW_NGTEE) /* ignore errors for NGTEE */ + goto again; /* continue with packet */ + break; case IP_FW_NAT: - goto again; /* continue with packet */ - case IP_FW_REASS: - goto again; + goto again; /* continue with packet */ default: KASSERT(0, ("%s: unknown retval", __func__)); } -drop: - if (*m0) - m_freem(*m0); - *m0 = NULL; - return (EACCES); -pass: - return 0; /* not filtered */ + if (ret != 0) { + if (*m0) + FREE_PKT(*m0); + *m0 = NULL; + } + if (*m0 && mtod(*m0, struct ip *)->ip_v == 4) + SET_HOST_IPLEN(mtod(*m0, struct ip *)); + return ret; } +/* do the divert, return 1 on error 0 on success */ static int -ipfw_divert(struct mbuf **m, int incoming, int tee) +ipfw_divert(struct mbuf **m0, int incoming, struct ipfw_rule_ref *rule, + int tee) { /* * ipfw_chk() has already tagged the packet with the divert tag. * If tee is set, copy packet and return original. * If not tee, consume packet and send it to divert socket. */ - struct mbuf *clone, *reass; + struct mbuf *clone; struct ip *ip; - int hlen; - - reass = NULL; - - /* Is divert module loaded? */ - if (ip_divert_ptr == NULL) - goto nodivert; + struct m_tag *tag; /* Cloning needed for tee? */ - if (tee) - clone = m_dup(*m, M_DONTWAIT); - else - clone = *m; - - /* In case m_dup was unable to allocate mbufs. */ - if (clone == NULL) - goto teeout; + if (tee == 0) { + clone = *m0; /* use the original mbuf */ + *m0 = NULL; + } else { + clone = m_dup(*m0, M_DONTWAIT); + /* If we cannot duplicate the mbuf, we sacrifice the divert + * chain and continue with the tee-ed packet. + */ + if (clone == NULL) + return 1; + } /* - * Divert listeners can only handle non-fragmented packets. - * However when tee is set we will *not* de-fragment the packets; - * Doing do would put the reassembly into double-jeopardy. On top - * of that someone doing a tee will probably want to get the packet - * in its original form. + * Divert listeners can normally handle non-fragmented packets, + * but we can only reass in the non-tee case. + * This means that listeners on a tee rule may get fragments, + * and have to live with that. + * Note that we now have the 'reass' ipfw option so if we care + * we can do it before a 'tee'. */ ip = mtod(clone, struct ip *); - if (!tee && ip->ip_off & (IP_MF | IP_OFFMASK)) { - - /* Reassemble packet. */ - reass = ip_reass(clone); - + if (!tee && ntohs(ip->ip_off) & (IP_MF | IP_OFFMASK)) { + int hlen; + struct mbuf *reass; + + SET_HOST_IPLEN(ip); /* ip_reass wants host order */ + reass = ip_reass(clone); /* Reassemble packet. */ + if (reass == NULL) + return 0; /* not an error */ + /* if reass = NULL then it was consumed by ip_reass */ /* * IP header checksum fixup after reassembly and leave header * in network byte order. */ - if (reass != NULL) { - ip = mtod(reass, struct ip *); - hlen = ip->ip_hl << 2; - ip->ip_len = htons(ip->ip_len); - ip->ip_off = htons(ip->ip_off); - ip->ip_sum = 0; - if (hlen == sizeof(struct ip)) - ip->ip_sum = in_cksum_hdr(ip); - else - ip->ip_sum = in_cksum(reass, hlen); - clone = reass; - } else - clone = NULL; - } else { - /* Convert header to network byte order. */ - ip->ip_len = htons(ip->ip_len); - ip->ip_off = htons(ip->ip_off); + ip = mtod(reass, struct ip *); + hlen = ip->ip_hl << 2; + SET_NET_IPLEN(ip); + ip->ip_sum = 0; + if (hlen == sizeof(struct ip)) + ip->ip_sum = in_cksum_hdr(ip); + else + ip->ip_sum = in_cksum(reass, hlen); + clone = reass; + } + /* attach a tag to the packet with the reinject info */ + tag = m_tag_alloc(MTAG_IPFW_RULE, 0, + sizeof(struct ipfw_rule_ref), M_NOWAIT); + if (tag == NULL) { + FREE_PKT(clone); + return 1; } + *((struct ipfw_rule_ref *)(tag+1)) = *rule; + m_tag_prepend(clone, tag); /* Do the dirty job... */ - if (clone && ip_divert_ptr != NULL) - ip_divert_ptr(clone, incoming); - -teeout: - /* - * For tee we leave the divert tag attached to original packet. - * It will then continue rule evaluation after the tee rule. - */ - if (tee) - return 0; - - /* Packet diverted and consumed */ - return 1; - -nodivert: - m_freem(*m); - return 1; -} - -int -ipfw_hook(void) -{ - struct pfil_head *pfh_inet; - - pfh_inet = pfil_head_get(PFIL_TYPE_AF, AF_INET); - if (pfh_inet == NULL) - return ENOENT; - - (void)pfil_add_hook(ipfw_check_in, NULL, PFIL_IN | PFIL_WAITOK, - pfh_inet); - (void)pfil_add_hook(ipfw_check_out, NULL, PFIL_OUT | PFIL_WAITOK, - pfh_inet); - + ip_divert_ptr(clone, incoming); return 0; } -int -ipfw_unhook(void) -{ - struct pfil_head *pfh_inet; - - pfh_inet = pfil_head_get(PFIL_TYPE_AF, AF_INET); - if (pfh_inet == NULL) - return ENOENT; - - (void)pfil_remove_hook(ipfw_check_in, NULL, PFIL_IN | PFIL_WAITOK, - pfh_inet); - (void)pfil_remove_hook(ipfw_check_out, NULL, PFIL_OUT | PFIL_WAITOK, - pfh_inet); - - return 0; -} - -#ifdef INET6 -int -ipfw6_hook(void) +/* + * attach or detach hooks for a given protocol family + */ +static int +ipfw_hook(int onoff, int pf) { - struct pfil_head *pfh_inet6; + struct pfil_head *pfh; - pfh_inet6 = pfil_head_get(PFIL_TYPE_AF, AF_INET6); - if (pfh_inet6 == NULL) + pfh = pfil_head_get(PFIL_TYPE_AF, pf); + if (pfh == NULL) return ENOENT; - (void)pfil_add_hook(ipfw_check_in, NULL, PFIL_IN | PFIL_WAITOK, - pfh_inet6); - (void)pfil_add_hook(ipfw_check_out, NULL, PFIL_OUT | PFIL_WAITOK, - pfh_inet6); + (void) (onoff ? pfil_add_hook : pfil_remove_hook) + (ipfw_check_hook, NULL, PFIL_IN | PFIL_OUT | PFIL_WAITOK, pfh); return 0; } int -ipfw6_unhook(void) +ipfw_attach_hooks(int arg) { - struct pfil_head *pfh_inet6; - - pfh_inet6 = pfil_head_get(PFIL_TYPE_AF, AF_INET6); - if (pfh_inet6 == NULL) - return ENOENT; - - (void)pfil_remove_hook(ipfw_check_in, NULL, PFIL_IN | PFIL_WAITOK, - pfh_inet6); - (void)pfil_remove_hook(ipfw_check_out, NULL, PFIL_OUT | PFIL_WAITOK, - pfh_inet6); - - return 0; + int error = 0; + + if (arg == 0) /* detach */ + ipfw_hook(0, AF_INET); + else if (V_fw_enable && ipfw_hook(1, AF_INET) != 0) { + error = ENOENT; /* see ip_fw_pfil.c::ipfw_hook() */ + printf("ipfw_hook() error\n"); + } +#ifdef INET6 + if (arg == 0) /* detach */ + ipfw_hook(0, AF_INET6); + else if (V_fw6_enable && ipfw_hook(1, AF_INET6) != 0) { + error = ENOENT; + printf("ipfw6_hook() error\n"); + } +#endif + return error; } -#endif /* INET6 */ int ipfw_chg_hook(SYSCTL_HANDLER_ARGS) @@ -518,13 +368,16 @@ ipfw_chg_hook(SYSCTL_HANDLER_ARGS) int enable; int oldenable; int error; + int af; if (arg1 == &VNET_NAME(fw_enable)) { enable = V_fw_enable; + af = AF_INET; } #ifdef INET6 else if (arg1 == &VNET_NAME(fw6_enable)) { enable = V_fw6_enable; + af = AF_INET6; } #endif else @@ -542,27 +395,16 @@ ipfw_chg_hook(SYSCTL_HANDLER_ARGS) if (enable == oldenable) return (0); - if (arg1 == &VNET_NAME(fw_enable)) { - if (enable) - error = ipfw_hook(); - else - error = ipfw_unhook(); - if (error) - return (error); + error = ipfw_hook(enable, af); + if (error) + return (error); + if (af == AF_INET) V_fw_enable = enable; - } #ifdef INET6 - else if (arg1 == &VNET_NAME(fw6_enable)) { - if (enable) - error = ipfw6_hook(); - else - error = ipfw6_unhook(); - if (error) - return (error); + else if (af == AF_INET6) V_fw6_enable = enable; - } #endif return (0); } - +/* end of file */ diff --git a/sys/netinet/ipfw/ip_fw_private.h b/sys/netinet/ipfw/ip_fw_private.h new file mode 100644 index 0000000..c29ae0a --- /dev/null +++ b/sys/netinet/ipfw/ip_fw_private.h @@ -0,0 +1,301 @@ +/*- + * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _IPFW2_PRIVATE_H +#define _IPFW2_PRIVATE_H + +/* + * Internal constants and data structures used by ipfw components + * and not meant to be exported outside the kernel. + */ + +#ifdef _KERNEL + +/* + * For platforms that do not have SYSCTL support, we wrap the + * SYSCTL_* into a function (one per file) to collect the values + * into an array at module initialization. The wrapping macros, + * SYSBEGIN() and SYSEND, are empty in the default case. + */ +#ifndef SYSBEGIN +#define SYSBEGIN(x) +#endif +#ifndef SYSEND +#define SYSEND +#endif + +/* Return values from ipfw_chk() */ +enum { + IP_FW_PASS = 0, + IP_FW_DENY, + IP_FW_DIVERT, + IP_FW_TEE, + IP_FW_DUMMYNET, + IP_FW_NETGRAPH, + IP_FW_NGTEE, + IP_FW_NAT, + IP_FW_REASS, +}; + +/* + * Structure for collecting parameters to dummynet for ip6_output forwarding + */ +struct _ip6dn_args { + struct ip6_pktopts *opt_or; + struct route_in6 ro_or; + int flags_or; + struct ip6_moptions *im6o_or; + struct ifnet *origifp_or; + struct ifnet *ifp_or; + struct sockaddr_in6 dst_or; + u_long mtu_or; + struct route_in6 ro_pmtu_or; +}; + + +/* + * Arguments for calling ipfw_chk() and dummynet_io(). We put them + * all into a structure because this way it is easier and more + * efficient to pass variables around and extend the interface. + */ +struct ip_fw_args { + struct mbuf *m; /* the mbuf chain */ + struct ifnet *oif; /* output interface */ + struct sockaddr_in *next_hop; /* forward address */ + + /* + * On return, it points to the matching rule. + * On entry, rule.slot > 0 means the info is valid and + * contains the the starting rule for an ipfw search. + * If chain_id == chain->id && slot >0 then jump to that slot. + * Otherwise, we locate the first rule >= rulenum:rule_id + */ + struct ipfw_rule_ref rule; /* match/restart info */ + + struct ether_header *eh; /* for bridged packets */ + + struct ipfw_flow_id f_id; /* grabbed from IP header */ + //uint32_t cookie; /* a cookie depending on rule action */ + struct inpcb *inp; + + struct _ip6dn_args dummypar; /* dummynet->ip6_output */ + struct sockaddr_in hopstore; /* store here if cannot use a pointer */ +}; + +MALLOC_DECLARE(M_IPFW); + +/* + * Hooks sometime need to know the direction of the packet + * (divert, dummynet, netgraph, ...) + * We use a generic definition here, with bit0-1 indicating the + * direction, bit 2 indicating layer2 or 3, bit 3-4 indicating the + * specific protocol + * indicating the protocol (if necessary) + */ +enum { + DIR_MASK = 0x3, + DIR_OUT = 0, + DIR_IN = 1, + DIR_FWD = 2, + DIR_DROP = 3, + PROTO_LAYER2 = 0x4, /* set for layer 2 */ + /* PROTO_DEFAULT = 0, */ + PROTO_IPV4 = 0x08, + PROTO_IPV6 = 0x10, + PROTO_IFB = 0x0c, /* layer2 + ifbridge */ + /* PROTO_OLDBDG = 0x14, unused, old bridge */ +}; + +/* wrapper for freeing a packet, in case we need to do more work */ +#ifndef FREE_PKT +#if defined(__linux__) || defined(_WIN32) +#define FREE_PKT(m) netisr_dispatch(-1, m) +#else +#define FREE_PKT(m) m_freem(m) +#endif +#endif /* !FREE_PKT */ + +/* + * Function definitions. + */ + +/* attach (arg = 1) or detach (arg = 0) hooks */ +int ipfw_attach_hooks(int); +#ifdef NOTYET +void ipfw_nat_destroy(void); +#endif + +/* In ip_fw_log.c */ +struct ip; +void ipfw_log_bpf(int); +void ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args, + struct mbuf *m, struct ifnet *oif, u_short offset, uint32_t tablearg, + struct ip *ip); +VNET_DECLARE(u_int64_t, norule_counter); +#define V_norule_counter VNET(norule_counter) +VNET_DECLARE(int, verbose_limit); +#define V_verbose_limit VNET(verbose_limit) + +/* In ip_fw_dynamic.c */ + +enum { /* result for matching dynamic rules */ + MATCH_REVERSE = 0, + MATCH_FORWARD, + MATCH_NONE, + MATCH_UNKNOWN, +}; + +/* + * The lock for dynamic rules is only used once outside the file, + * and only to release the result of lookup_dyn_rule(). + * Eventually we may implement it with a callback on the function. + */ +void ipfw_dyn_unlock(void); + +struct tcphdr; +struct mbuf *ipfw_send_pkt(struct mbuf *, struct ipfw_flow_id *, + u_int32_t, u_int32_t, int); +int ipfw_install_state(struct ip_fw *rule, ipfw_insn_limit *cmd, + struct ip_fw_args *args, uint32_t tablearg); +ipfw_dyn_rule *ipfw_lookup_dyn_rule(struct ipfw_flow_id *pkt, + int *match_direction, struct tcphdr *tcp); +void ipfw_remove_dyn_children(struct ip_fw *rule); +void ipfw_get_dynamic(char **bp, const char *ep); + +void ipfw_dyn_attach(void); /* uma_zcreate .... */ +void ipfw_dyn_detach(void); /* uma_zdestroy ... */ +void ipfw_dyn_init(void); /* per-vnet initialization */ +void ipfw_dyn_uninit(int); /* per-vnet deinitialization */ +int ipfw_dyn_len(void); + +/* common variables */ +VNET_DECLARE(int, fw_one_pass); +#define V_fw_one_pass VNET(fw_one_pass) + +VNET_DECLARE(int, fw_verbose); +#define V_fw_verbose VNET(fw_verbose) + +VNET_DECLARE(struct ip_fw_chain, layer3_chain); +#define V_layer3_chain VNET(layer3_chain) + +VNET_DECLARE(u_int32_t, set_disable); +#define V_set_disable VNET(set_disable) + +VNET_DECLARE(int, autoinc_step); +#define V_autoinc_step VNET(autoinc_step) + +struct ip_fw_chain { + struct ip_fw *rules; /* list of rules */ + struct ip_fw *reap; /* list of rules to reap */ + struct ip_fw *default_rule; + int n_rules; /* number of static rules */ + int static_len; /* total len of static rules */ + struct ip_fw **map; /* array of rule ptrs to ease lookup */ + LIST_HEAD(nat_list, cfg_nat) nat; /* list of nat entries */ + struct radix_node_head *tables[IPFW_TABLES_MAX]; +#if defined( __linux__ ) || defined( _WIN32 ) + spinlock_t rwmtx; + spinlock_t uh_lock; +#else + struct rwlock rwmtx; + struct rwlock uh_lock; /* lock for upper half */ +#endif + uint32_t id; /* ruleset id */ +}; + +struct sockopt; /* used by tcp_var.h */ + +/* + * The lock is heavily used by ip_fw2.c (the main file) and ip_fw_nat.c + * so the variable and the macros must be here. + */ + +#define IPFW_LOCK_INIT(_chain) do { \ + rw_init(&(_chain)->rwmtx, "IPFW static rules"); \ + rw_init(&(_chain)->uh_lock, "IPFW UH lock"); \ + } while (0) + +#define IPFW_LOCK_DESTROY(_chain) do { \ + rw_destroy(&(_chain)->rwmtx); \ + rw_destroy(&(_chain)->uh_lock); \ + } while (0) + +#define IPFW_WLOCK_ASSERT(_chain) rw_assert(&(_chain)->rwmtx, RA_WLOCKED) + +#define IPFW_RLOCK(p) rw_rlock(&(p)->rwmtx) +#define IPFW_RUNLOCK(p) rw_runlock(&(p)->rwmtx) +#define IPFW_WLOCK(p) rw_wlock(&(p)->rwmtx) +#define IPFW_WUNLOCK(p) rw_wunlock(&(p)->rwmtx) + +#define IPFW_UH_RLOCK(p) rw_rlock(&(p)->uh_lock) +#define IPFW_UH_RUNLOCK(p) rw_runlock(&(p)->uh_lock) +#define IPFW_UH_WLOCK(p) rw_wlock(&(p)->uh_lock) +#define IPFW_UH_WUNLOCK(p) rw_wunlock(&(p)->uh_lock) + +/* In ip_fw_sockopt.c */ +int ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id); +int ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule); +int ipfw_ctl(struct sockopt *sopt); +int ipfw_chk(struct ip_fw_args *args); +void ipfw_reap_rules(struct ip_fw *head); + +/* In ip_fw_pfil */ +int ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir, + struct inpcb *inp); + +/* In ip_fw_table.c */ +struct radix_node; +int ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, + uint32_t *val); +int ipfw_init_tables(struct ip_fw_chain *ch); +void ipfw_destroy_tables(struct ip_fw_chain *ch); +int ipfw_flush_table(struct ip_fw_chain *ch, uint16_t tbl); +int ipfw_add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, + uint8_t mlen, uint32_t value); +int ipfw_dump_table_entry(struct radix_node *rn, void *arg); +int ipfw_del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, + uint8_t mlen); +int ipfw_count_table(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt); +int ipfw_dump_table(struct ip_fw_chain *ch, ipfw_table *tbl); + +/* In ip_fw_nat.c -- XXX to be moved to ip_var.h */ + +extern struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int); + +typedef int ipfw_nat_t(struct ip_fw_args *, struct cfg_nat *, struct mbuf *); +typedef int ipfw_nat_cfg_t(struct sockopt *); + +extern ipfw_nat_t *ipfw_nat_ptr; +#define IPFW_NAT_LOADED (ipfw_nat_ptr != NULL) + +extern ipfw_nat_cfg_t *ipfw_nat_cfg_ptr; +extern ipfw_nat_cfg_t *ipfw_nat_del_ptr; +extern ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr; +extern ipfw_nat_cfg_t *ipfw_nat_get_log_ptr; + +#endif /* _KERNEL */ +#endif /* _IPFW2_PRIVATE_H */ diff --git a/sys/netinet/ipfw/ip_fw_sockopt.c b/sys/netinet/ipfw/ip_fw_sockopt.c new file mode 100644 index 0000000..e25b960 --- /dev/null +++ b/sys/netinet/ipfw/ip_fw_sockopt.c @@ -0,0 +1,1287 @@ +/*- + * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa + * + * Supported by: Valeria Paoli + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +/* + * Sockopt support for ipfw. The routines here implement + * the upper half of the ipfw code. + */ + +#if !defined(KLD_MODULE) +#include "opt_ipfw.h" +#include "opt_ipdivert.h" +#include "opt_ipdn.h" +#include "opt_inet.h" +#ifndef INET +#error IPFIREWALL requires INET. +#endif /* INET */ +#endif +#include "opt_inet6.h" +#include "opt_ipsec.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> /* struct m_tag used by nested headers */ +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/priv.h> +#include <sys/proc.h> +#include <sys/rwlock.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <net/if.h> +#include <net/route.h> +#include <net/vnet.h> + +#include <netinet/in.h> +#include <netinet/ip_var.h> /* hooks */ +#include <netinet/ip_fw.h> +#include <netinet/ipfw/ip_fw_private.h> + +#ifdef MAC +#include <security/mac/mac_framework.h> +#endif + +MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's"); + +/* + * static variables followed by global ones (none in this file) + */ + +/* + * Find the smallest rule >= key, id. + * We could use bsearch but it is so simple that we code it directly + */ +int +ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id) +{ + int i, lo, hi; + struct ip_fw *r; + + for (lo = 0, hi = chain->n_rules - 1; lo < hi;) { + i = (lo + hi) / 2; + r = chain->map[i]; + if (r->rulenum < key) + lo = i + 1; /* continue from the next one */ + else if (r->rulenum > key) + hi = i; /* this might be good */ + else if (r->id < id) + lo = i + 1; /* continue from the next one */ + else /* r->id >= id */ + hi = i; /* this might be good */ + }; + return hi; +} + +/* + * allocate a new map, returns the chain locked. extra is the number + * of entries to add or delete. + */ +static struct ip_fw ** +get_map(struct ip_fw_chain *chain, int extra, int locked) +{ + + for (;;) { + struct ip_fw **map; + int i; + + i = chain->n_rules + extra; + map = malloc(i * sizeof(struct ip_fw *), M_IPFW, + locked ? M_NOWAIT : M_WAITOK); + if (map == NULL) { + printf("%s: cannot allocate map\n", __FUNCTION__); + return NULL; + } + if (!locked) + IPFW_UH_WLOCK(chain); + if (i >= chain->n_rules + extra) /* good */ + return map; + /* otherwise we lost the race, free and retry */ + if (!locked) + IPFW_UH_WUNLOCK(chain); + free(map, M_IPFW); + } +} + +/* + * swap the maps. It is supposed to be called with IPFW_UH_WLOCK + */ +static struct ip_fw ** +swap_map(struct ip_fw_chain *chain, struct ip_fw **new_map, int new_len) +{ + struct ip_fw **old_map; + + IPFW_WLOCK(chain); + chain->id++; + chain->n_rules = new_len; + old_map = chain->map; + chain->map = new_map; + IPFW_WUNLOCK(chain); + return old_map; +} + +/* + * Add a new rule to the list. Copy the rule into a malloc'ed area, then + * possibly create a rule number and add the rule to the list. + * Update the rule_number in the input struct so the caller knows it as well. + * XXX DO NOT USE FOR THE DEFAULT RULE. + * Must be called without IPFW_UH held + */ +int +ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule) +{ + struct ip_fw *rule; + int i, l, insert_before; + struct ip_fw **map; /* the new array of pointers */ + + if (chain->rules == NULL || input_rule->rulenum > IPFW_DEFAULT_RULE-1) + return (EINVAL); + + l = RULESIZE(input_rule); + rule = malloc(l, M_IPFW, M_WAITOK | M_ZERO); + if (rule == NULL) + return (ENOSPC); + /* get_map returns with IPFW_UH_WLOCK if successful */ + map = get_map(chain, 1, 0 /* not locked */); + if (map == NULL) { + free(rule, M_IPFW); + return ENOSPC; + } + + bcopy(input_rule, rule, l); + /* clear fields not settable from userland */ + rule->x_next = NULL; + rule->next_rule = NULL; + rule->pcnt = 0; + rule->bcnt = 0; + rule->timestamp = 0; + + if (V_autoinc_step < 1) + V_autoinc_step = 1; + else if (V_autoinc_step > 1000) + V_autoinc_step = 1000; + /* find the insertion point, we will insert before */ + insert_before = rule->rulenum ? rule->rulenum + 1 : IPFW_DEFAULT_RULE; + i = ipfw_find_rule(chain, insert_before, 0); + /* duplicate first part */ + if (i > 0) + bcopy(chain->map, map, i * sizeof(struct ip_fw *)); + map[i] = rule; + /* duplicate remaining part, we always have the default rule */ + bcopy(chain->map + i, map + i + 1, + sizeof(struct ip_fw *) *(chain->n_rules - i)); + if (rule->rulenum == 0) { + /* write back the number */ + rule->rulenum = i > 0 ? map[i-1]->rulenum : 0; + if (rule->rulenum < IPFW_DEFAULT_RULE - V_autoinc_step) + rule->rulenum += V_autoinc_step; + input_rule->rulenum = rule->rulenum; + } + + rule->id = chain->id + 1; + map = swap_map(chain, map, chain->n_rules + 1); + chain->static_len += l; + IPFW_UH_WUNLOCK(chain); + if (map) + free(map, M_IPFW); + return (0); +} + +/* + * Reclaim storage associated with a list of rules. This is + * typically the list created using remove_rule. + * A NULL pointer on input is handled correctly. + */ +void +ipfw_reap_rules(struct ip_fw *head) +{ + struct ip_fw *rule; + + while ((rule = head) != NULL) { + head = head->x_next; + free(rule, M_IPFW); + } +} + +/** + * Remove all rules with given number, and also do set manipulation. + * Assumes chain != NULL && *chain != NULL. + * + * The argument is an u_int32_t. The low 16 bit are the rule or set number, + * the next 8 bits are the new set, the top 8 bits are the command: + * + * 0 delete rules with given number + * 1 delete rules with given set number + * 2 move rules with given number to new set + * 3 move rules with given set number to new set + * 4 swap sets with given numbers + * 5 delete rules with given number and with given set number + */ +static int +del_entry(struct ip_fw_chain *chain, u_int32_t arg) +{ + struct ip_fw *rule; + uint32_t rulenum; /* rule or old_set */ + uint8_t cmd, new_set; + int start, end = 0, i, ofs, n; + struct ip_fw **map = NULL; + int error = 0; + + rulenum = arg & 0xffff; + cmd = (arg >> 24) & 0xff; + new_set = (arg >> 16) & 0xff; + + if (cmd > 5 || new_set > RESVD_SET) + return EINVAL; + if (cmd == 0 || cmd == 2 || cmd == 5) { + if (rulenum >= IPFW_DEFAULT_RULE) + return EINVAL; + } else { + if (rulenum > RESVD_SET) /* old_set */ + return EINVAL; + } + + IPFW_UH_WLOCK(chain); /* prevent conflicts among the writers */ + chain->reap = NULL; /* prepare for deletions */ + + switch (cmd) { + case 0: /* delete rules with given number (0 is special means all) */ + case 1: /* delete all rules with given set number, rule->set == rulenum */ + case 5: /* delete rules with given number and with given set number. + * rulenum - given rule number; + * new_set - given set number. + */ + /* locate first rule to delete (start), the one after the + * last one (end), and count how many rules to delete (n) + */ + n = 0; + if (cmd == 1) { /* look for a specific set, must scan all */ + for (start = -1, i = 0; i < chain->n_rules; i++) { + if (chain->map[start]->set != rulenum) + continue; + if (start < 0) + start = i; + end = i; + n++; + } + end++; /* first non-matching */ + } else { + start = ipfw_find_rule(chain, rulenum, 0); + for (end = start; end < chain->n_rules; end++) { + rule = chain->map[end]; + if (rulenum > 0 && rule->rulenum != rulenum) + break; + if (rule->set != RESVD_SET && + (cmd == 0 || rule->set == new_set) ) + n++; + } + } + if (n == 0 && arg == 0) + break; /* special case, flush on empty ruleset */ + /* allocate the map, if needed */ + if (n > 0) + map = get_map(chain, -n, 1 /* locked */); + if (n == 0 || map == NULL) { + error = EINVAL; + break; + } + /* copy the initial part of the map */ + if (start > 0) + bcopy(chain->map, map, start * sizeof(struct ip_fw *)); + /* copy active rules between start and end */ + for (i = ofs = start; i < end; i++) { + rule = chain->map[i]; + if (!(rule->set != RESVD_SET && + (cmd == 0 || rule->set == new_set) )) + map[ofs++] = chain->map[i]; + } + /* finally the tail */ + bcopy(chain->map + end, map + ofs, + (chain->n_rules - end) * sizeof(struct ip_fw *)); + map = swap_map(chain, map, chain->n_rules - n); + /* now remove the rules deleted */ + for (i = start; i < end; i++) { + rule = map[i]; + if (rule->set != RESVD_SET && + (cmd == 0 || rule->set == new_set) ) { + int l = RULESIZE(rule); + + chain->static_len -= l; + ipfw_remove_dyn_children(rule); + rule->x_next = chain->reap; + chain->reap = rule; + } + } + break; + + case 2: /* move rules with given number to new set */ + for (i = 0; i < chain->n_rules; i++) { + rule = chain->map[i]; + if (rule->rulenum == rulenum) + rule->set = new_set; + } + break; + + case 3: /* move rules with given set number to new set */ + for (i = 0; i < chain->n_rules; i++) { + rule = chain->map[i]; + if (rule->set == rulenum) + rule->set = new_set; + } + break; + + case 4: /* swap two sets */ + for (i = 0; i < chain->n_rules; i++) { + rule = chain->map[i]; + if (rule->set == rulenum) + rule->set = new_set; + else if (rule->set == new_set) + rule->set = rulenum; + } + break; + } + rule = chain->reap; + chain->reap = NULL; + IPFW_UH_WUNLOCK(chain); + ipfw_reap_rules(rule); + if (map) + free(map, M_IPFW); + return error; +} + +/* + * Clear counters for a specific rule. + * Normally run under IPFW_UH_RLOCK, but these are idempotent ops + * so we only care that rules do not disappear. + */ +static void +clear_counters(struct ip_fw *rule, int log_only) +{ + ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule); + + if (log_only == 0) { + rule->bcnt = rule->pcnt = 0; + rule->timestamp = 0; + } + if (l->o.opcode == O_LOG) + l->log_left = l->max_log; +} + +/** + * Reset some or all counters on firewall rules. + * The argument `arg' is an u_int32_t. The low 16 bit are the rule number, + * the next 8 bits are the set number, the top 8 bits are the command: + * 0 work with rules from all set's; + * 1 work with rules only from specified set. + * Specified rule number is zero if we want to clear all entries. + * log_only is 1 if we only want to reset logs, zero otherwise. + */ +static int +zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int log_only) +{ + struct ip_fw *rule; + char *msg; + int i; + + uint16_t rulenum = arg & 0xffff; + uint8_t set = (arg >> 16) & 0xff; + uint8_t cmd = (arg >> 24) & 0xff; + + if (cmd > 1) + return (EINVAL); + if (cmd == 1 && set > RESVD_SET) + return (EINVAL); + + IPFW_UH_RLOCK(chain); + if (rulenum == 0) { + V_norule_counter = 0; + for (i = 0; i < chain->n_rules; i++) { + rule = chain->map[i]; + /* Skip rules not in our set. */ + if (cmd == 1 && rule->set != set) + continue; + clear_counters(rule, log_only); + } + msg = log_only ? "All logging counts reset" : + "Accounting cleared"; + } else { + int cleared = 0; + for (i = 0; i < chain->n_rules; i++) { + rule = chain->map[i]; + if (rule->rulenum == rulenum) { + if (cmd == 0 || rule->set == set) + clear_counters(rule, log_only); + cleared = 1; + } + if (rule->rulenum > rulenum) + break; + } + if (!cleared) { /* we did not find any matching rules */ + IPFW_WUNLOCK(chain); + return (EINVAL); + } + msg = log_only ? "logging count reset" : "cleared"; + } + IPFW_UH_RUNLOCK(chain); + + if (V_fw_verbose) { + int lev = LOG_SECURITY | LOG_NOTICE; + + if (rulenum) + log(lev, "ipfw: Entry %d %s.\n", rulenum, msg); + else + log(lev, "ipfw: %s.\n", msg); + } + return (0); +} + +/* + * Check validity of the structure before insert. + * Rules are simple, so this mostly need to check rule sizes. + */ +static int +check_ipfw_struct(struct ip_fw *rule, int size) +{ + int l, cmdlen = 0; + int have_action=0; + ipfw_insn *cmd; + + if (size < sizeof(*rule)) { + printf("ipfw: rule too short\n"); + return (EINVAL); + } + /* first, check for valid size */ + l = RULESIZE(rule); + if (l != size) { + printf("ipfw: size mismatch (have %d want %d)\n", size, l); + return (EINVAL); + } + if (rule->act_ofs >= rule->cmd_len) { + printf("ipfw: bogus action offset (%u > %u)\n", + rule->act_ofs, rule->cmd_len - 1); + return (EINVAL); + } + /* + * Now go for the individual checks. Very simple ones, basically only + * instruction sizes. + */ + for (l = rule->cmd_len, cmd = rule->cmd ; + l > 0 ; l -= cmdlen, cmd += cmdlen) { + cmdlen = F_LEN(cmd); + if (cmdlen > l) { + printf("ipfw: opcode %d size truncated\n", + cmd->opcode); + return EINVAL; + } + switch (cmd->opcode) { + case O_PROBE_STATE: + case O_KEEP_STATE: + case O_PROTO: + case O_IP_SRC_ME: + case O_IP_DST_ME: + case O_LAYER2: + case O_IN: + case O_FRAG: + case O_DIVERTED: + case O_IPOPT: + case O_IPTOS: + case O_IPPRECEDENCE: + case O_IPVER: + case O_TCPWIN: + case O_TCPFLAGS: + case O_TCPOPTS: + case O_ESTAB: + case O_VERREVPATH: + case O_VERSRCREACH: + case O_ANTISPOOF: + case O_IPSEC: +#ifdef INET6 + case O_IP6_SRC_ME: + case O_IP6_DST_ME: + case O_EXT_HDR: + case O_IP6: +#endif + case O_IP4: + case O_TAG: + if (cmdlen != F_INSN_SIZE(ipfw_insn)) + goto bad_size; + break; + + case O_FIB: + if (cmdlen != F_INSN_SIZE(ipfw_insn)) + goto bad_size; + if (cmd->arg1 >= rt_numfibs) { + printf("ipfw: invalid fib number %d\n", + cmd->arg1); + return EINVAL; + } + break; + + case O_SETFIB: + if (cmdlen != F_INSN_SIZE(ipfw_insn)) + goto bad_size; + if (cmd->arg1 >= rt_numfibs) { + printf("ipfw: invalid fib number %d\n", + cmd->arg1); + return EINVAL; + } + goto check_action; + + case O_UID: + case O_GID: + case O_JAIL: + case O_IP_SRC: + case O_IP_DST: + case O_TCPSEQ: + case O_TCPACK: + case O_PROB: + case O_ICMPTYPE: + if (cmdlen != F_INSN_SIZE(ipfw_insn_u32)) + goto bad_size; + break; + + case O_LIMIT: + if (cmdlen != F_INSN_SIZE(ipfw_insn_limit)) + goto bad_size; + break; + + case O_LOG: + if (cmdlen != F_INSN_SIZE(ipfw_insn_log)) + goto bad_size; + + ((ipfw_insn_log *)cmd)->log_left = + ((ipfw_insn_log *)cmd)->max_log; + + break; + + case O_IP_SRC_MASK: + case O_IP_DST_MASK: + /* only odd command lengths */ + if ( !(cmdlen & 1) || cmdlen > 31) + goto bad_size; + break; + + case O_IP_SRC_SET: + case O_IP_DST_SET: + if (cmd->arg1 == 0 || cmd->arg1 > 256) { + printf("ipfw: invalid set size %d\n", + cmd->arg1); + return EINVAL; + } + if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + + (cmd->arg1+31)/32 ) + goto bad_size; + break; + + case O_IP_SRC_LOOKUP: + case O_IP_DST_LOOKUP: + if (cmd->arg1 >= IPFW_TABLES_MAX) { + printf("ipfw: invalid table number %d\n", + cmd->arg1); + return (EINVAL); + } + if (cmdlen != F_INSN_SIZE(ipfw_insn) && + cmdlen != F_INSN_SIZE(ipfw_insn_u32) + 1 && + cmdlen != F_INSN_SIZE(ipfw_insn_u32)) + goto bad_size; + break; + + case O_MACADDR2: + if (cmdlen != F_INSN_SIZE(ipfw_insn_mac)) + goto bad_size; + break; + + case O_NOP: + case O_IPID: + case O_IPTTL: + case O_IPLEN: + case O_TCPDATALEN: + case O_TAGGED: + if (cmdlen < 1 || cmdlen > 31) + goto bad_size; + break; + + case O_MAC_TYPE: + case O_IP_SRCPORT: + case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */ + if (cmdlen < 2 || cmdlen > 31) + goto bad_size; + break; + + case O_RECV: + case O_XMIT: + case O_VIA: + if (cmdlen != F_INSN_SIZE(ipfw_insn_if)) + goto bad_size; + break; + + case O_ALTQ: + if (cmdlen != F_INSN_SIZE(ipfw_insn_altq)) + goto bad_size; + break; + + case O_PIPE: + case O_QUEUE: + if (cmdlen != F_INSN_SIZE(ipfw_insn)) + goto bad_size; + goto check_action; + + case O_FORWARD_IP: +#ifdef IPFIREWALL_FORWARD + if (cmdlen != F_INSN_SIZE(ipfw_insn_sa)) + goto bad_size; + goto check_action; +#else + return EINVAL; +#endif + + case O_DIVERT: + case O_TEE: + if (ip_divert_ptr == NULL) + return EINVAL; + else + goto check_size; + case O_NETGRAPH: + case O_NGTEE: + if (ng_ipfw_input_p == NULL) + return EINVAL; + else + goto check_size; + case O_NAT: + if (!IPFW_NAT_LOADED) + return EINVAL; + if (cmdlen != F_INSN_SIZE(ipfw_insn_nat)) + goto bad_size; + goto check_action; + case O_FORWARD_MAC: /* XXX not implemented yet */ + case O_CHECK_STATE: + case O_COUNT: + case O_ACCEPT: + case O_DENY: + case O_REJECT: +#ifdef INET6 + case O_UNREACH6: +#endif + case O_SKIPTO: + case O_REASS: +check_size: + if (cmdlen != F_INSN_SIZE(ipfw_insn)) + goto bad_size; +check_action: + if (have_action) { + printf("ipfw: opcode %d, multiple actions" + " not allowed\n", + cmd->opcode); + return EINVAL; + } + have_action = 1; + if (l != cmdlen) { + printf("ipfw: opcode %d, action must be" + " last opcode\n", + cmd->opcode); + return EINVAL; + } + break; +#ifdef INET6 + case O_IP6_SRC: + case O_IP6_DST: + if (cmdlen != F_INSN_SIZE(struct in6_addr) + + F_INSN_SIZE(ipfw_insn)) + goto bad_size; + break; + + case O_FLOW6ID: + if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + + ((ipfw_insn_u32 *)cmd)->o.arg1) + goto bad_size; + break; + + case O_IP6_SRC_MASK: + case O_IP6_DST_MASK: + if ( !(cmdlen & 1) || cmdlen > 127) + goto bad_size; + break; + case O_ICMP6TYPE: + if( cmdlen != F_INSN_SIZE( ipfw_insn_icmp6 ) ) + goto bad_size; + break; +#endif + + default: + switch (cmd->opcode) { +#ifndef INET6 + case O_IP6_SRC_ME: + case O_IP6_DST_ME: + case O_EXT_HDR: + case O_IP6: + case O_UNREACH6: + case O_IP6_SRC: + case O_IP6_DST: + case O_FLOW6ID: + case O_IP6_SRC_MASK: + case O_IP6_DST_MASK: + case O_ICMP6TYPE: + printf("ipfw: no IPv6 support in kernel\n"); + return EPROTONOSUPPORT; +#endif + default: + printf("ipfw: opcode %d, unknown opcode\n", + cmd->opcode); + return EINVAL; + } + } + } + if (have_action == 0) { + printf("ipfw: missing action\n"); + return EINVAL; + } + return 0; + +bad_size: + printf("ipfw: opcode %d size %d wrong\n", + cmd->opcode, cmdlen); + return EINVAL; +} + + +/* + * Translation of requests for compatibility with FreeBSD 7.2/8. + * a static variable tells us if we have an old client from userland, + * and if necessary we translate requests and responses between the + * two formats. + */ +static int is7 = 0; + +struct ip_fw7 { + struct ip_fw7 *next; /* linked list of rules */ + struct ip_fw7 *next_rule; /* ptr to next [skipto] rule */ + /* 'next_rule' is used to pass up 'set_disable' status */ + + uint16_t act_ofs; /* offset of action in 32-bit units */ + uint16_t cmd_len; /* # of 32-bit words in cmd */ + uint16_t rulenum; /* rule number */ + uint8_t set; /* rule set (0..31) */ + // #define RESVD_SET 31 /* set for default and persistent rules */ + uint8_t _pad; /* padding */ + // uint32_t id; /* rule id, only in v.8 */ + /* These fields are present in all rules. */ + uint64_t pcnt; /* Packet counter */ + uint64_t bcnt; /* Byte counter */ + uint32_t timestamp; /* tv_sec of last match */ + + ipfw_insn cmd[1]; /* storage for commands */ +}; + + int convert_rule_to_7(struct ip_fw *rule); +int convert_rule_to_8(struct ip_fw *rule); + +#ifndef RULESIZE7 +#define RULESIZE7(rule) (sizeof(struct ip_fw7) + \ + ((struct ip_fw7 *)(rule))->cmd_len * 4 - 4) +#endif + + +/* + * Copy the static and dynamic rules to the supplied buffer + * and return the amount of space actually used. + * Must be run under IPFW_UH_RLOCK + */ +static size_t +ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space) +{ + char *bp = buf; + char *ep = bp + space; + struct ip_fw *rule, *dst; + int l, i; + time_t boot_seconds; + + boot_seconds = boottime.tv_sec; + for (i = 0; i < chain->n_rules; i++) { + rule = chain->map[i]; + + if (is7) { + /* Convert rule to FreeBSd 7.2 format */ + l = RULESIZE7(rule); + if (bp + l + sizeof(uint32_t) <= ep) { + int error; + bcopy(rule, bp, l + sizeof(uint32_t)); + error = convert_rule_to_7((struct ip_fw *) bp); + if (error) + return 0; /*XXX correct? */ + /* + * XXX HACK. Store the disable mask in the "next" + * pointer in a wild attempt to keep the ABI the same. + * Why do we do this on EVERY rule? + */ + bcopy(&V_set_disable, + &(((struct ip_fw7 *)bp)->next_rule), + sizeof(V_set_disable)); + if (((struct ip_fw7 *)bp)->timestamp) + ((struct ip_fw7 *)bp)->timestamp += boot_seconds; + bp += l; + } + continue; /* go to next rule */ + } + + /* normal mode, don't touch rules */ + l = RULESIZE(rule); + if (bp + l > ep) { /* should not happen */ + printf("overflow dumping static rules\n"); + break; + } + dst = (struct ip_fw *)bp; + bcopy(rule, dst, l); + /* + * XXX HACK. Store the disable mask in the "next" + * pointer in a wild attempt to keep the ABI the same. + * Why do we do this on EVERY rule? + */ + bcopy(&V_set_disable, &dst->next_rule, sizeof(V_set_disable)); + if (dst->timestamp) + dst->timestamp += boot_seconds; + bp += l; + } + ipfw_get_dynamic(&bp, ep); /* protected by the dynamic lock */ + return (bp - (char *)buf); +} + + +/** + * {set|get}sockopt parser. + */ +int +ipfw_ctl(struct sockopt *sopt) +{ +#define RULE_MAXSIZE (256*sizeof(u_int32_t)) + int error; + size_t size; + struct ip_fw *buf, *rule; + struct ip_fw_chain *chain; + u_int32_t rulenum[2]; + + error = priv_check(sopt->sopt_td, PRIV_NETINET_IPFW); + if (error) + return (error); + + /* + * Disallow modifications in really-really secure mode, but still allow + * the logging counters to be reset. + */ + if (sopt->sopt_name == IP_FW_ADD || + (sopt->sopt_dir == SOPT_SET && sopt->sopt_name != IP_FW_RESETLOG)) { + error = securelevel_ge(sopt->sopt_td->td_ucred, 3); + if (error) + return (error); + } + + chain = &V_layer3_chain; + error = 0; + + switch (sopt->sopt_name) { + case IP_FW_GET: + /* + * pass up a copy of the current rules. Static rules + * come first (the last of which has number IPFW_DEFAULT_RULE), + * followed by a possibly empty list of dynamic rule. + * The last dynamic rule has NULL in the "next" field. + * + * Note that the calculated size is used to bound the + * amount of data returned to the user. The rule set may + * change between calculating the size and returning the + * data in which case we'll just return what fits. + */ + for (;;) { + int len = 0, want; + + size = chain->static_len; + size += ipfw_dyn_len(); + if (size >= sopt->sopt_valsize) + break; + buf = malloc(size, M_TEMP, M_WAITOK); + if (buf == NULL) + break; + IPFW_UH_RLOCK(chain); + /* check again how much space we need */ + want = chain->static_len + ipfw_dyn_len(); + if (size >= want) + len = ipfw_getrules(chain, buf, size); + IPFW_UH_RUNLOCK(chain); + if (size >= want) + error = sooptcopyout(sopt, buf, len); + free(buf, M_TEMP); + if (size >= want) + break; + } + break; + + case IP_FW_FLUSH: + /* locking is done within del_entry() */ + error = del_entry(chain, 0); /* special case, rule=0, cmd=0 means all */ + break; + + case IP_FW_ADD: + rule = malloc(RULE_MAXSIZE, M_TEMP, M_WAITOK); + error = sooptcopyin(sopt, rule, RULE_MAXSIZE, + sizeof(struct ip_fw7) ); + + /* + * If the size of commands equals RULESIZE7 then we assume + * a FreeBSD7.2 binary is talking to us (set is7=1). + * is7 is persistent so the next 'ipfw list' command + * will use this format. + * NOTE: If wrong version is guessed (this can happen if + * the first ipfw command is 'ipfw [pipe] list') + * the ipfw binary may crash or loop infinitly... + */ + if (sopt->sopt_valsize == RULESIZE7(rule)) { + is7 = 1; + error = convert_rule_to_8(rule); + if (error) + return error; + if (error == 0) + error = check_ipfw_struct(rule, RULESIZE(rule)); + } else { + is7 = 0; + if (error == 0) + error = check_ipfw_struct(rule, sopt->sopt_valsize); + } + if (error == 0) { + /* locking is done within ipfw_add_rule() */ + error = ipfw_add_rule(chain, rule); + size = RULESIZE(rule); + if (!error && sopt->sopt_dir == SOPT_GET) { + if (is7) { + error = convert_rule_to_7(rule); + size = RULESIZE7(rule); + if (error) + return error; + } + error = sooptcopyout(sopt, rule, size); + } + } + free(rule, M_TEMP); + break; + + case IP_FW_DEL: + /* + * IP_FW_DEL is used for deleting single rules or sets, + * and (ab)used to atomically manipulate sets. Argument size + * is used to distinguish between the two: + * sizeof(u_int32_t) + * delete single rule or set of rules, + * or reassign rules (or sets) to a different set. + * 2*sizeof(u_int32_t) + * atomic disable/enable sets. + * first u_int32_t contains sets to be disabled, + * second u_int32_t contains sets to be enabled. + */ + error = sooptcopyin(sopt, rulenum, + 2*sizeof(u_int32_t), sizeof(u_int32_t)); + if (error) + break; + size = sopt->sopt_valsize; + if (size == sizeof(u_int32_t) && rulenum[0] != 0) { + /* delete or reassign, locking done in del_entry() */ + error = del_entry(chain, rulenum[0]); + } else if (size == 2*sizeof(u_int32_t)) { /* set enable/disable */ + IPFW_UH_WLOCK(chain); + V_set_disable = + (V_set_disable | rulenum[0]) & ~rulenum[1] & + ~(1<<RESVD_SET); /* set RESVD_SET always enabled */ + IPFW_UH_WUNLOCK(chain); + } else + error = EINVAL; + break; + + case IP_FW_ZERO: + case IP_FW_RESETLOG: /* argument is an u_int_32, the rule number */ + rulenum[0] = 0; + if (sopt->sopt_val != 0) { + error = sooptcopyin(sopt, rulenum, + sizeof(u_int32_t), sizeof(u_int32_t)); + if (error) + break; + } + error = zero_entry(chain, rulenum[0], + sopt->sopt_name == IP_FW_RESETLOG); + break; + + /*--- TABLE manipulations are protected by the IPFW_LOCK ---*/ + case IP_FW_TABLE_ADD: + { + ipfw_table_entry ent; + + error = sooptcopyin(sopt, &ent, + sizeof(ent), sizeof(ent)); + if (error) + break; + error = ipfw_add_table_entry(chain, ent.tbl, + ent.addr, ent.masklen, ent.value); + } + break; + + case IP_FW_TABLE_DEL: + { + ipfw_table_entry ent; + + error = sooptcopyin(sopt, &ent, + sizeof(ent), sizeof(ent)); + if (error) + break; + error = ipfw_del_table_entry(chain, ent.tbl, + ent.addr, ent.masklen); + } + break; + + case IP_FW_TABLE_FLUSH: + { + u_int16_t tbl; + + error = sooptcopyin(sopt, &tbl, + sizeof(tbl), sizeof(tbl)); + if (error) + break; + IPFW_WLOCK(chain); + error = ipfw_flush_table(chain, tbl); + IPFW_WUNLOCK(chain); + } + break; + + case IP_FW_TABLE_GETSIZE: + { + u_int32_t tbl, cnt; + + if ((error = sooptcopyin(sopt, &tbl, sizeof(tbl), + sizeof(tbl)))) + break; + IPFW_RLOCK(chain); + error = ipfw_count_table(chain, tbl, &cnt); + IPFW_RUNLOCK(chain); + if (error) + break; + error = sooptcopyout(sopt, &cnt, sizeof(cnt)); + } + break; + + case IP_FW_TABLE_LIST: + { + ipfw_table *tbl; + + if (sopt->sopt_valsize < sizeof(*tbl)) { + error = EINVAL; + break; + } + size = sopt->sopt_valsize; + tbl = malloc(size, M_TEMP, M_WAITOK); + error = sooptcopyin(sopt, tbl, size, sizeof(*tbl)); + if (error) { + free(tbl, M_TEMP); + break; + } + tbl->size = (size - sizeof(*tbl)) / + sizeof(ipfw_table_entry); + IPFW_RLOCK(chain); + error = ipfw_dump_table(chain, tbl); + IPFW_RUNLOCK(chain); + if (error) { + free(tbl, M_TEMP); + break; + } + error = sooptcopyout(sopt, tbl, size); + free(tbl, M_TEMP); + } + break; + + /*--- NAT operations are protected by the IPFW_LOCK ---*/ + case IP_FW_NAT_CFG: + if (IPFW_NAT_LOADED) + error = ipfw_nat_cfg_ptr(sopt); + else { + printf("IP_FW_NAT_CFG: %s\n", + "ipfw_nat not present, please load it"); + error = EINVAL; + } + break; + + case IP_FW_NAT_DEL: + if (IPFW_NAT_LOADED) + error = ipfw_nat_del_ptr(sopt); + else { + printf("IP_FW_NAT_DEL: %s\n", + "ipfw_nat not present, please load it"); + error = EINVAL; + } + break; + + case IP_FW_NAT_GET_CONFIG: + if (IPFW_NAT_LOADED) + error = ipfw_nat_get_cfg_ptr(sopt); + else { + printf("IP_FW_NAT_GET_CFG: %s\n", + "ipfw_nat not present, please load it"); + error = EINVAL; + } + break; + + case IP_FW_NAT_GET_LOG: + if (IPFW_NAT_LOADED) + error = ipfw_nat_get_log_ptr(sopt); + else { + printf("IP_FW_NAT_GET_LOG: %s\n", + "ipfw_nat not present, please load it"); + error = EINVAL; + } + break; + + default: + printf("ipfw: ipfw_ctl invalid option %d\n", sopt->sopt_name); + error = EINVAL; + } + + return (error); +#undef RULE_MAXSIZE +} + + +#define RULE_MAXSIZE (256*sizeof(u_int32_t)) + +/* Functions to convert rules 7.2 <==> 8.0 */ +int +convert_rule_to_7(struct ip_fw *rule) +{ + /* Used to modify original rule */ + struct ip_fw7 *rule7 = (struct ip_fw7 *)rule; + /* copy of original rule, version 8 */ + struct ip_fw *tmp; + + /* Used to copy commands */ + ipfw_insn *ccmd, *dst; + int ll = 0, ccmdlen = 0; + + tmp = malloc(RULE_MAXSIZE, M_TEMP, M_NOWAIT | M_ZERO); + if (tmp == NULL) { + return 1; //XXX error + } + bcopy(rule, tmp, RULE_MAXSIZE); + + /* Copy fields */ + rule7->_pad = tmp->_pad; + rule7->set = tmp->set; + rule7->rulenum = tmp->rulenum; + rule7->cmd_len = tmp->cmd_len; + rule7->act_ofs = tmp->act_ofs; + rule7->next_rule = (struct ip_fw7 *)tmp->next_rule; + rule7->next = (struct ip_fw7 *)tmp->x_next; + rule7->cmd_len = tmp->cmd_len; + rule7->pcnt = tmp->pcnt; + rule7->bcnt = tmp->bcnt; + rule7->timestamp = tmp->timestamp; + + /* Copy commands */ + for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule7->cmd ; + ll > 0 ; ll -= ccmdlen, ccmd += ccmdlen, dst += ccmdlen) { + ccmdlen = F_LEN(ccmd); + + bcopy(ccmd, dst, F_LEN(ccmd)*sizeof(uint32_t)); + + if (dst->opcode > O_NAT) + /* O_REASS doesn't exists in 7.2 version, so + * decrement opcode if it is after O_REASS + */ + dst->opcode--; + + if (ccmdlen > ll) { + printf("ipfw: opcode %d size truncated\n", + ccmd->opcode); + return EINVAL; + } + } + free(tmp, M_TEMP); + + return 0; +} + +int +convert_rule_to_8(struct ip_fw *rule) +{ + /* Used to modify original rule */ + struct ip_fw7 *rule7 = (struct ip_fw7 *) rule; + + /* Used to copy commands */ + ipfw_insn *ccmd, *dst; + int ll = 0, ccmdlen = 0; + + /* Copy of original rule */ + struct ip_fw7 *tmp = malloc(RULE_MAXSIZE, M_TEMP, M_NOWAIT | M_ZERO); + if (tmp == NULL) { + return 1; //XXX error + } + + bcopy(rule7, tmp, RULE_MAXSIZE); + + for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule->cmd ; + ll > 0 ; ll -= ccmdlen, ccmd += ccmdlen, dst += ccmdlen) { + ccmdlen = F_LEN(ccmd); + + bcopy(ccmd, dst, F_LEN(ccmd)*sizeof(uint32_t)); + + if (dst->opcode > O_NAT) + /* O_REASS doesn't exists in 7.2 version, so + * increment opcode if it is after O_REASS + */ + dst->opcode++; + + if (ccmdlen > ll) { + printf("ipfw: opcode %d size truncated\n", + ccmd->opcode); + return EINVAL; + } + } + + rule->_pad = tmp->_pad; + rule->set = tmp->set; + rule->rulenum = tmp->rulenum; + rule->cmd_len = tmp->cmd_len; + rule->act_ofs = tmp->act_ofs; + rule->next_rule = (struct ip_fw *)tmp->next_rule; + rule->x_next = (struct ip_fw *)tmp->next; + rule->cmd_len = tmp->cmd_len; + rule->id = 0; /* XXX see if is ok = 0 */ + rule->pcnt = tmp->pcnt; + rule->bcnt = tmp->bcnt; + rule->timestamp = tmp->timestamp; + + free (tmp, M_TEMP); + return 0; +} + +/* end of file */ diff --git a/sys/netinet/ipfw/ip_fw_table.c b/sys/netinet/ipfw/ip_fw_table.c new file mode 100644 index 0000000..517622f --- /dev/null +++ b/sys/netinet/ipfw/ip_fw_table.c @@ -0,0 +1,286 @@ +/*- + * Copyright (c) 2004 Ruslan Ermilov and Vsevolod Lobko. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +/* + * Lookup table support for ipfw + * + * Lookup tables are implemented (at the moment) using the radix + * tree used for routing tables. Tables store key-value entries, where + * keys are network prefixes (addr/masklen), and values are integers. + * As a degenerate case we can interpret keys as 32-bit integers + * (with a /32 mask). + * + * The table is protected by the IPFW lock even for manipulation coming + * from userland, because operations are typically fast. + */ + +#if !defined(KLD_MODULE) +#include "opt_ipfw.h" +#include "opt_ipdivert.h" +#include "opt_ipdn.h" +#include "opt_inet.h" +#ifndef INET +#error IPFIREWALL requires INET. +#endif /* INET */ +#endif +#include "opt_inet6.h" +#include "opt_ipsec.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/rwlock.h> +#include <sys/socket.h> +#include <net/if.h> /* ip_fw.h requires IFNAMSIZ */ +#include <net/radix.h> +#include <net/route.h> +#include <net/vnet.h> + +#include <netinet/in.h> +#include <netinet/ip_var.h> /* struct ipfw_rule_ref */ +#include <netinet/ip_fw.h> +#include <sys/queue.h> /* LIST_HEAD */ +#include <netinet/ipfw/ip_fw_private.h> + +#ifdef MAC +#include <security/mac/mac_framework.h> +#endif + +MALLOC_DEFINE(M_IPFW_TBL, "ipfw_tbl", "IpFw tables"); + +struct table_entry { + struct radix_node rn[2]; + struct sockaddr_in addr, mask; + u_int32_t value; +}; + +/* + * The radix code expects addr and mask to be array of bytes, + * with the first byte being the length of the array. rn_inithead + * is called with the offset in bits of the lookup key within the + * array. If we use a sockaddr_in as the underlying type, + * sin_len is conveniently located at offset 0, sin_addr is at + * offset 4 and normally aligned. + * But for portability, let's avoid assumption and make the code explicit + */ +#define KEY_LEN(v) *((uint8_t *)&(v)) +#define KEY_OFS (8*offsetof(struct sockaddr_in, sin_addr)) + +int +ipfw_add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, + uint8_t mlen, uint32_t value) +{ + struct radix_node_head *rnh; + struct table_entry *ent; + struct radix_node *rn; + + if (tbl >= IPFW_TABLES_MAX) + return (EINVAL); + rnh = ch->tables[tbl]; + ent = malloc(sizeof(*ent), M_IPFW_TBL, M_NOWAIT | M_ZERO); + if (ent == NULL) + return (ENOMEM); + ent->value = value; + KEY_LEN(ent->addr) = KEY_LEN(ent->mask) = 8; + ent->mask.sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0); + ent->addr.sin_addr.s_addr = addr & ent->mask.sin_addr.s_addr; + IPFW_WLOCK(ch); + rn = rnh->rnh_addaddr(&ent->addr, &ent->mask, rnh, (void *)ent); + if (rn == NULL) { + IPFW_WUNLOCK(ch); + free(ent, M_IPFW_TBL); + return (EEXIST); + } + IPFW_WUNLOCK(ch); + return (0); +} + +int +ipfw_del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, + uint8_t mlen) +{ + struct radix_node_head *rnh; + struct table_entry *ent; + struct sockaddr_in sa, mask; + + if (tbl >= IPFW_TABLES_MAX) + return (EINVAL); + rnh = ch->tables[tbl]; + KEY_LEN(sa) = KEY_LEN(mask) = 8; + mask.sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0); + sa.sin_addr.s_addr = addr & mask.sin_addr.s_addr; + IPFW_WLOCK(ch); + ent = (struct table_entry *)rnh->rnh_deladdr(&sa, &mask, rnh); + if (ent == NULL) { + IPFW_WUNLOCK(ch); + return (ESRCH); + } + IPFW_WUNLOCK(ch); + free(ent, M_IPFW_TBL); + return (0); +} + +static int +flush_table_entry(struct radix_node *rn, void *arg) +{ + struct radix_node_head * const rnh = arg; + struct table_entry *ent; + + ent = (struct table_entry *) + rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, rnh); + if (ent != NULL) + free(ent, M_IPFW_TBL); + return (0); +} + +int +ipfw_flush_table(struct ip_fw_chain *ch, uint16_t tbl) +{ + struct radix_node_head *rnh; + + IPFW_WLOCK_ASSERT(ch); + + if (tbl >= IPFW_TABLES_MAX) + return (EINVAL); + rnh = ch->tables[tbl]; + KASSERT(rnh != NULL, ("NULL IPFW table")); + rnh->rnh_walktree(rnh, flush_table_entry, rnh); + return (0); +} + +void +ipfw_destroy_tables(struct ip_fw_chain *ch) +{ + uint16_t tbl; + struct radix_node_head *rnh; + + IPFW_WLOCK_ASSERT(ch); + + for (tbl = 0; tbl < IPFW_TABLES_MAX; tbl++) { + ipfw_flush_table(ch, tbl); + rnh = ch->tables[tbl]; + rn_detachhead((void **)&rnh); + } +} + +int +ipfw_init_tables(struct ip_fw_chain *ch) +{ + int i; + uint16_t j; + + for (i = 0; i < IPFW_TABLES_MAX; i++) { + if (!rn_inithead((void **)&ch->tables[i], KEY_OFS)) { + for (j = 0; j < i; j++) { + (void) ipfw_flush_table(ch, j); + } + return (ENOMEM); + } + } + return (0); +} + +int +ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, + uint32_t *val) +{ + struct radix_node_head *rnh; + struct table_entry *ent; + struct sockaddr_in sa; + + if (tbl >= IPFW_TABLES_MAX) + return (0); + rnh = ch->tables[tbl]; + KEY_LEN(sa) = 8; + sa.sin_addr.s_addr = addr; + ent = (struct table_entry *)(rnh->rnh_lookup(&sa, NULL, rnh)); + if (ent != NULL) { + *val = ent->value; + return (1); + } + return (0); +} + +static int +count_table_entry(struct radix_node *rn, void *arg) +{ + u_int32_t * const cnt = arg; + + (*cnt)++; + return (0); +} + +int +ipfw_count_table(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt) +{ + struct radix_node_head *rnh; + + if (tbl >= IPFW_TABLES_MAX) + return (EINVAL); + rnh = ch->tables[tbl]; + *cnt = 0; + rnh->rnh_walktree(rnh, count_table_entry, cnt); + return (0); +} + +static int +dump_table_entry(struct radix_node *rn, void *arg) +{ + struct table_entry * const n = (struct table_entry *)rn; + ipfw_table * const tbl = arg; + ipfw_table_entry *ent; + + if (tbl->cnt == tbl->size) + return (1); + ent = &tbl->ent[tbl->cnt]; + ent->tbl = tbl->tbl; + if (in_nullhost(n->mask.sin_addr)) + ent->masklen = 0; + else + ent->masklen = 33 - ffs(ntohl(n->mask.sin_addr.s_addr)); + ent->addr = n->addr.sin_addr.s_addr; + ent->value = n->value; + tbl->cnt++; + return (0); +} + +int +ipfw_dump_table(struct ip_fw_chain *ch, ipfw_table *tbl) +{ + struct radix_node_head *rnh; + + if (tbl->tbl >= IPFW_TABLES_MAX) + return (EINVAL); + rnh = ch->tables[tbl->tbl]; + tbl->cnt = 0; + rnh->rnh_walktree(rnh, dump_table_entry, tbl); + return (0); +} +/* end of file */ diff --git a/sys/netinet/ipfw/test/Makefile b/sys/netinet/ipfw/test/Makefile new file mode 100644 index 0000000..bbeb942 --- /dev/null +++ b/sys/netinet/ipfw/test/Makefile @@ -0,0 +1,50 @@ +# +# $FreeBSD$ +# +# Makefile for building userland tests +# this is written in a form compatible with gmake + +SCHED_SRCS = test_dn_sched.c +SCHED_SRCS += dn_sched_fifo.c +SCHED_SRCS += dn_sched_wf2q.c +SCHED_SRCS += dn_sched_qfq.c +SCHED_SRCS += dn_sched_rr.c +SCHED_SRCS += dn_heap.c +SCHED_SRCS += main.c + +SCHED_OBJS=$(SCHED_SRCS:.c=.o) + +HEAP_SRCS = dn_heap.c test_dn_heap.c +HEAP_OBJS=$(HEAP_SRCS:.c=.o) + +VPATH= .:.. + +CFLAGS = -I.. -I. -Wall -Werror -O3 -DIPFW +TARGETS= test_sched # no test_heap by default + +all: $(TARGETS) + +test_heap : $(HEAP_OBJS) + $(CC) -o $@ $(HEAP_OBJS) + +test_sched : $(SCHED_OBJS) + $(CC) -o $@ $(SCHED_OBJS) + +$(SCHED_OBJS): dn_test.h +main.o: mylist.h + +clean: + - rm *.o $(TARGETS) *.core + +ALLSRCS = $(SCHED_SRCS) dn_test.h mylist.h \ + dn_sched.h dn_heap.h ip_dn_private.h Makefile +TMPBASE = /tmp/testXYZ +TMPDIR = $(TMPBASE)/test + +tgz: + -rm -rf $(TMPDIR) + mkdir -p $(TMPDIR) + -cp -p $(ALLSRCS) $(TMPDIR) + -(cd ..; cp -p $(ALLSRCS) $(TMPDIR)) + ls -la $(TMPDIR) + (cd $(TMPBASE); tar cvzf /tmp/test.tgz test) diff --git a/sys/netinet/ipfw/test/dn_test.h b/sys/netinet/ipfw/test/dn_test.h new file mode 100644 index 0000000..4e079bc --- /dev/null +++ b/sys/netinet/ipfw/test/dn_test.h @@ -0,0 +1,175 @@ +/* + * $FreeBSD$ + * + * userspace compatibility code for dummynet schedulers + */ + +#ifndef _DN_TEST_H +#define _DN_TEST_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <inttypes.h> +#include <stdio.h> +#include <stdlib.h> +#include <strings.h> /* bzero, ffs, ... */ +#include <string.h> /* strcmp */ +#include <errno.h> +#include <sys/queue.h> +#include <sys/time.h> + +extern int debug; +#define ND(fmt, args...) do {} while (0) +#define D1(fmt, args...) do {} while (0) +#define D(fmt, args...) fprintf(stderr, "%-8s " fmt "\n", \ + __FUNCTION__, ## args) +#define DX(lev, fmt, args...) do { \ + if (debug > lev) D(fmt, ## args); } while (0) + + +#ifndef offsetof +#define offsetof(t,m) (int)((&((t *)0L)->m)) +#endif + +#include <mylist.h> + +/* prevent include of other system headers */ +#define _NETINET_IP_VAR_H_ /* ip_fw_args */ +#define _IPFW2_H +#define _SYS_MBUF_H_ + +enum { + DN_QUEUE, +}; + +enum { + DN_SCHED_FIFO, + DN_SCHED_WF2QP, +}; + +struct dn_id { + int type, subtype, len, id; +}; + +struct dn_fs { + int par[4]; /* flowset parameters */ + + /* simulation entries. + * 'index' is not strictly necessary + * y is used for the inverse mapping , + */ + int index; + int y; /* inverse mapping */ + int base_y; /* inverse mapping */ + int next_y; /* inverse mapping */ + int n_flows; + int first_flow; + int next_flow; /* first_flow + n_flows */ + /* + * when generating, let 'cur' go from 0 to n_flows-1, + * then point to flow first_flow + cur + */ + int cur; +}; + +struct dn_sch { +}; + +struct dn_flow { + struct dn_id oid; + int length; + int len_bytes; + int drops; + uint64_t tot_bytes; + uint32_t flow_id; + struct list_head h; /* used by the generator */ +}; + +struct dn_link { +}; + +struct ip_fw_args { +}; + +struct mbuf { + struct { + int len; + } m_pkthdr; + struct mbuf *m_nextpkt; + int flow_id; /* for testing, index of a flow */ + //int flowset_id; /* for testing, index of a flowset */ + void *cfg; /* config args */ +}; + +#define MALLOC_DECLARE(x) +#define KASSERT(x, y) do { if (!(x)) printf y ; exit(0); } while (0) +struct ipfw_flow_id { +}; + +typedef void * module_t; + +struct _md_t { + const char *name; + int (*f)(module_t, int, void *); + void *p; +}; + +typedef struct _md_t moduledata_t; + +#define DECLARE_MODULE(name, b, c, d) \ + moduledata_t *_g_##name = & b +#define MODULE_DEPEND(a, b, c, d, e) + +#ifdef IPFW +#include <dn_heap.h> +#include <ip_dn_private.h> +#include <dn_sched.h> +#else +struct dn_queue { + struct dn_fsk *fs; /* parent flowset. */ + struct dn_sch_inst *_si; /* parent sched instance. */ +}; +struct dn_schk { +}; +struct dn_fsk { + struct dn_fs fs; + struct dn_schk *sched; +}; +struct dn_sch_inst { + struct dn_schk *sched; +}; +struct dn_alg { + int type; + const char *name; + void *enqueue, *dequeue; + int q_datalen, si_datalen, schk_datalen; + int (*config)(struct dn_schk *); + int (*new_sched)(struct dn_sch_inst *); + int (*new_fsk)(struct dn_fsk *); + int (*new_queue)(struct dn_queue *q); +}; + +#endif + +#ifndef __FreeBSD__ +int fls(int); +#endif + +static inline void +mq_append(struct mq *q, struct mbuf *m) +{ + if (q->head == NULL) + q->head = m; + else + q->tail->m_nextpkt = m; + q->tail = m; + m->m_nextpkt = NULL; +} + +#ifdef __cplusplus +} +#endif + +#endif /* _DN_TEST_H */ diff --git a/sys/netinet/ipfw/test/main.c b/sys/netinet/ipfw/test/main.c new file mode 100644 index 0000000..be9fdf5 --- /dev/null +++ b/sys/netinet/ipfw/test/main.c @@ -0,0 +1,636 @@ +/* + * $FreeBSD$ + * + * Testing program for schedulers + * + * The framework include a simple controller which, at each + * iteration, decides whether we can enqueue and/or dequeue. + * Then the mainloop runs the required number of tests, + * keeping track of statistics. + */ + +#include "dn_test.h" + +struct q_list { + struct list_head h; +}; + +struct cfg_s { + int ac; + char * const *av; + + const char *name; + int loops; + struct timeval time; + + /* running counters */ + uint32_t _enqueue; + uint32_t drop; + uint32_t pending; + uint32_t dequeue; + + /* generator parameters */ + int th_min, th_max; + int maxburst; + int lmin, lmax; /* packet len */ + int flows; /* number of flows */ + int flowsets; /* number of flowsets */ + int wsum; /* sum of weights of all flows */ + int max_y; /* max random number in the generation */ + int cur_y, cur_fs; /* used in generation, between 0 and max_y - 1 */ + const char *fs_config; /* flowset config */ + int can_dequeue; + int burst; /* count of packets sent in a burst */ + struct mbuf *tosend; /* packet to send -- also flag to enqueue */ + + struct mbuf *freelist; + + struct mbuf *head, *tail; /* a simple tailq */ + + /* scheduler hooks */ + int (*enq)(struct dn_sch_inst *, struct dn_queue *, + struct mbuf *); + struct mbuf * (*deq)(struct dn_sch_inst *); + /* size of the three fields including sched-specific areas */ + int schk_len; + int q_len; /* size of a queue including sched-fields */ + int si_len; /* size of a sch_inst including sched-fields */ + char *q; /* array of flow queues */ + /* use a char* because size is variable */ + struct dn_fsk *fs; /* array of flowsets */ + struct dn_sch_inst *si; + struct dn_schk *sched; + + /* generator state */ + int state; /* 0 = going up, 1: going down */ + + /* + * We keep lists for each backlog level, and always serve + * the one with shortest backlog. llmask contains a bitmap + * of lists, and ll are the heads of the lists. The last + * entry (BACKLOG) contains all entries considered 'full' + * XXX to optimize things, entry i could contain queues with + * 2^{i-1}+1 .. 2^i entries. + */ +#define BACKLOG 30 + uint32_t llmask; + struct list_head ll[BACKLOG + 10]; +}; + +/* FI2Q and Q2FI converts from flow_id to dn_queue and back. + * We cannot easily use pointer arithmetic because it is variable size. + */ +#define FI2Q(c, i) ((struct dn_queue *)((c)->q + (c)->q_len * (i))) +#define Q2FI(c, q) (((char *)(q) - (c)->q)/(c)->q_len) + +int debug = 0; + +struct dn_parms dn_cfg; + +static void controller(struct cfg_s *c); + +/* release a packet: put the mbuf in the freelist, and the queue in + * the bucket. + */ +int +drop(struct cfg_s *c, struct mbuf *m) +{ + struct dn_queue *q; + int i; + + c->drop++; + q = FI2Q(c, m->flow_id); + i = q->ni.length; // XXX or ffs... + + ND("q %p id %d current length %d", q, m->flow_id, i); + if (i < BACKLOG) { + struct list_head *h = &q->ni.h; + c->llmask &= ~(1<<(i+1)); + c->llmask |= (1<<(i)); + list_del(h); + list_add_tail(h, &c->ll[i]); + } + m->m_nextpkt = c->freelist; + c->freelist = m; + return 0; +} + +/* dequeue returns NON-NULL when a packet is dropped */ +static int +enqueue(struct cfg_s *c, void *_m) +{ + struct mbuf *m = _m; + if (c->enq) + return c->enq(c->si, FI2Q(c, m->flow_id), m); + if (c->head == NULL) + c->head = m; + else + c->tail->m_nextpkt = m; + c->tail = m; + return 0; /* default - success */ +} + +/* dequeue returns NON-NULL when a packet is available */ +static void * +dequeue(struct cfg_s *c) +{ + struct mbuf *m; + if (c->deq) + return c->deq(c->si); + if ((m = c->head)) { + m = c->head; + c->head = m->m_nextpkt; + m->m_nextpkt = NULL; + } + return m; +} + +static int +mainloop(struct cfg_s *c) +{ + int i; + struct mbuf *m; + + for (i=0; i < c->loops; i++) { + /* implement histeresis */ + controller(c); + DX(3, "loop %d enq %d send %p rx %d", + i, c->_enqueue, c->tosend, c->can_dequeue); + if ( (m = c->tosend) ) { + c->_enqueue++; + if (enqueue(c, m)) { + drop(c, m); + ND("loop %d enqueue fail", i ); + } else { + ND("enqueue ok"); + c->pending++; + } + } + if (c->can_dequeue) { + c->dequeue++; + if ((m = dequeue(c))) { + c->pending--; + drop(c, m); + c->drop--; /* compensate */ + } + } + } + DX(1, "mainloop ends %d", i); + return 0; +} + +int +dump(struct cfg_s *c) +{ + int i; + struct dn_queue *q; + + for (i=0; i < c->flows; i++) { + q = FI2Q(c, i); + DX(1, "queue %4d tot %10lld", i, q->ni.tot_bytes); + } + DX(1, "done %d loops\n", c->loops); + return 0; +} + +/* interpret a number in human form */ +static long +getnum(const char *s, char **next, const char *key) +{ + char *end = NULL; + long l; + + if (next) /* default */ + *next = NULL; + if (s && *s) { + DX(3, "token is <%s> %s", s, key ? key : "-"); + l = strtol(s, &end, 0); + } else { + DX(3, "empty string"); + l = -1; + } + if (l < 0) { + DX(2, "invalid %s for %s", s ? s : "NULL", (key ? key : "") ); + return 0; // invalid + } + if (!end || !*end) + return l; + if (*end == 'n') + l = -l; /* multiply by n */ + else if (*end == 'K') + l = l*1000; + else if (*end == 'M') + l = l*1000000; + else if (*end == 'k') + l = l*1024; + else if (*end == 'm') + l = l*1024*1024; + else if (*end == 'w') + ; + else {/* not recognized */ + D("suffix %s for %s, next %p", end, key, next); + end--; + } + end++; + DX(3, "suffix now %s for %s, next %p", end, key, next); + if (next && *end) { + DX(3, "setting next to %s for %s", end, key); + *next = end; + } + return l; +} + +/* + * flowsets are a comma-separated list of + * weight:maxlen:flows + * indicating how many flows are hooked to that fs. + * Both weight and range can be min-max-steps. + * In a first pass we just count the number of flowsets and flows, + * in a second pass we complete the setup. + */ +static void +parse_flowsets(struct cfg_s *c, const char *fs, int pass) +{ + char *s, *cur, *next; + int n_flows = 0, n_fs = 0, wsum = 0; + int i, j; + struct dn_fs *prev = NULL; + + DX(3, "--- pass %d flows %d flowsets %d", pass, c->flows, c->flowsets); + if (pass == 0) + c->fs_config = fs; + s = c->fs_config ? strdup(c->fs_config) : NULL; + if (s == NULL) { + if (pass == 0) + D("no fsconfig"); + return; + } + for (next = s; (cur = strsep(&next, ","));) { + char *p = NULL; + int w, w_h, w_steps, wi; + int len, len_h, l_steps, li; + int flows; + + w = getnum(strsep(&cur, ":"), &p, "weight"); + if (w <= 0) + w = 1; + w_h = p ? getnum(p+1, &p, "weight_max") : w; + w_steps = p ? getnum(p+1, &p, "w_steps") : (w_h == w ?1:2); + len = getnum(strsep(&cur, ":"), &p, "len"); + if (len <= 0) + len = 1000; + len_h = p ? getnum(p+1, &p, "len_max") : len; + l_steps = p ? getnum(p+1, &p, "l_steps") : (len_h == len ? 1 : 2); + flows = getnum(strsep(&cur, ":"), NULL, "flows"); + if (flows == 0) + flows = 1; + DX(4, "weight %d..%d (%d) len %d..%d (%d) flows %d", + w, w_h, w_steps, len, len_h, l_steps, flows); + if (w == 0 || w_h < w || len == 0 || len_h < len || + flows == 0) { + DX(4,"wrong parameters %s", fs); + return; + } + n_flows += flows * w_steps * l_steps; + for (i = 0; i < w_steps; i++) { + wi = w + ((w_h - w)* i)/(w_steps == 1 ? 1 : (w_steps-1)); + for (j = 0; j < l_steps; j++, n_fs++) { + struct dn_fs *fs = &c->fs[n_fs].fs; // tentative + int x; + + li = len + ((len_h - len)* j)/(l_steps == 1 ? 1 : (l_steps-1)); + x = (wi*2048)/li; + DX(3, "----- fs %4d weight %4d lmax %4d X %4d flows %d", + n_fs, wi, li, x, flows); + if (pass == 0) + continue; + if (c->fs == NULL || c->flowsets <= n_fs) { + D("error in number of flowsets"); + return; + } + wsum += wi * flows; + fs->par[0] = wi; + fs->par[1] = li; + fs->index = n_fs; + fs->n_flows = flows; + fs->cur = fs->first_flow = prev==NULL ? 0 : prev->next_flow; + fs->next_flow = fs->first_flow + fs->n_flows; + fs->y = x * flows; + fs->base_y = (prev == NULL) ? 0 : prev->next_y; + fs->next_y = fs->base_y + fs->y; + prev = fs; + } + } + } + c->max_y = prev ? prev->base_y + prev->y : 0; + c->flows = n_flows; + c->flowsets = n_fs; + c->wsum = wsum; + if (pass == 0) + return; + + /* now link all flows to their parent flowsets */ + DX(1,"%d flows on %d flowsets max_y %d", c->flows, c->flowsets, c->max_y); + for (i=0; i < c->flowsets; i++) { + struct dn_fs *fs = &c->fs[i].fs; + DX(1, "fs %3d w %5d l %4d flow %5d .. %5d y %6d .. %6d", + i, fs->par[0], fs->par[1], + fs->first_flow, fs->next_flow, + fs->base_y, fs->next_y); + for (j = fs->first_flow; j < fs->next_flow; j++) { + struct dn_queue *q = FI2Q(c, j); + q->fs = &c->fs[i]; + } + } +} + +static int +init(struct cfg_s *c) +{ + int i; + int ac = c->ac; + char * const *av = c->av; + + c->si_len = sizeof(struct dn_sch_inst); + c->q_len = sizeof(struct dn_queue); + moduledata_t *mod = NULL; + struct dn_alg *p = NULL; + + c->th_min = 0; + c->th_max = -20;/* 20 packets per flow */ + c->lmin = c->lmax = 1280; /* packet len */ + c->flows = 1; + c->flowsets = 1; + c->name = "null"; + ac--; av++; + while (ac > 1) { + if (!strcmp(*av, "-n")) { + c->loops = getnum(av[1], NULL, av[0]); + } else if (!strcmp(*av, "-d")) { + debug = atoi(av[1]); + } else if (!strcmp(*av, "-alg")) { + extern moduledata_t *_g_dn_fifo; + extern moduledata_t *_g_dn_wf2qp; + extern moduledata_t *_g_dn_rr; + extern moduledata_t *_g_dn_qfq; +#ifdef WITH_KPS + extern moduledata_t *_g_dn_kps; +#endif + if (!strcmp(av[1], "rr")) + mod = _g_dn_rr; + else if (!strcmp(av[1], "wf2qp")) + mod = _g_dn_wf2qp; + else if (!strcmp(av[1], "fifo")) + mod = _g_dn_fifo; + else if (!strcmp(av[1], "qfq")) + mod = _g_dn_qfq; +#ifdef WITH_KPS + else if (!strcmp(av[1], "kps")) + mod = _g_dn_kps; +#endif + else + mod = NULL; + c->name = mod ? mod->name : "NULL"; + DX(3, "using scheduler %s", c->name); + } else if (!strcmp(*av, "-len")) { + c->lmin = getnum(av[1], NULL, av[0]); + c->lmax = c->lmin; + DX(3, "setting max to %d", c->th_max); + } else if (!strcmp(*av, "-burst")) { + c->maxburst = getnum(av[1], NULL, av[0]); + DX(3, "setting max to %d", c->th_max); + } else if (!strcmp(*av, "-qmax")) { + c->th_max = getnum(av[1], NULL, av[0]); + DX(3, "setting max to %d", c->th_max); + } else if (!strcmp(*av, "-qmin")) { + c->th_min = getnum(av[1], NULL, av[0]); + DX(3, "setting min to %d", c->th_min); + } else if (!strcmp(*av, "-flows")) { + c->flows = getnum(av[1], NULL, av[0]); + DX(3, "setting flows to %d", c->flows); + } else if (!strcmp(*av, "-flowsets")) { + parse_flowsets(c, av[1], 0); + DX(3, "setting flowsets to %d", c->flowsets); + } else { + D("option %s not recognised, ignore", *av); + } + ac -= 2; av += 2; + } + if (c->maxburst <= 0) + c->maxburst = 1; + if (c->loops <= 0) + c->loops = 1; + if (c->flows <= 0) + c->flows = 1; + if (c->flowsets <= 0) + c->flowsets = 1; + if (c->lmin <= 0) + c->lmin = 1; + if (c->lmax <= 0) + c->lmax = 1; + /* multiply by N */ + if (c->th_min < 0) + c->th_min = c->flows * -c->th_min; + if (c->th_max < 0) + c->th_max = c->flows * -c->th_max; + if (c->th_max <= c->th_min) + c->th_max = c->th_min + 1; + if (mod) { + p = mod->p; + DX(3, "using module %s f %p p %p", mod->name, mod->f, mod->p); + DX(3, "modname %s ty %d", p->name, p->type); + c->enq = p->enqueue; + c->deq = p->dequeue; + c->si_len += p->si_datalen; + c->q_len += p->q_datalen; + c->schk_len += p->schk_datalen; + } + /* allocate queues, flowsets and one scheduler */ + c->q = calloc(c->flows, c->q_len); + c->fs = calloc(c->flowsets, sizeof(struct dn_fsk)); + c->si = calloc(1, c->si_len); + c->sched = calloc(c->flows, c->schk_len); + if (c->q == NULL || c->fs == NULL) { + D("error allocating memory for flows"); + exit(1); + } + c->si->sched = c->sched; + if (p) { + if (p->config) + p->config(c->sched); + if (p->new_sched) + p->new_sched(c->si); + } + /* parse_flowsets links queues to their flowsets */ + parse_flowsets(c, av[1], 1); + /* complete the work calling new_fsk */ + for (i = 0; i < c->flowsets; i++) { + if (c->fs[i].fs.par[1] == 0) + c->fs[i].fs.par[1] = 1000; /* default pkt len */ + c->fs[i].sched = c->sched; + if (p && p->new_fsk) + p->new_fsk(&c->fs[i]); + } + + /* initialize the lists for the generator, and put + * all flows in the list for backlog = 0 + */ + for (i=0; i <= BACKLOG+5; i++) + INIT_LIST_HEAD(&c->ll[i]); + + for (i = 0; i < c->flows; i++) { + struct dn_queue *q = FI2Q(c, i); + if (q->fs == NULL) + q->fs = &c->fs[0]; /* XXX */ + q->_si = c->si; + if (p && p->new_queue) + p->new_queue(q); + INIT_LIST_HEAD(&q->ni.h); + list_add_tail(&q->ni.h, &c->ll[0]); + } + c->llmask = 1; + return 0; +} + + +int +main(int ac, char *av[]) +{ + struct cfg_s c; + struct timeval end; + double ll; + int i; + char msg[40]; + + bzero(&c, sizeof(c)); + c.ac = ac; + c.av = av; + init(&c); + gettimeofday(&c.time, NULL); + mainloop(&c); + gettimeofday(&end, NULL); + end.tv_sec -= c.time.tv_sec; + end.tv_usec -= c.time.tv_usec; + if (end.tv_usec < 0) { + end.tv_usec += 1000000; + end.tv_sec--; + } + c.time = end; + ll = end.tv_sec*1000000 + end.tv_usec; + ll *= 1000; /* convert to nanoseconds */ + ll /= c._enqueue; + sprintf(msg, "1::%d", c.flows); + D("%-8s n %d %d time %d.%06d %8.3f qlen %d %d flows %s drops %d", + c.name, c._enqueue, c.loops, + (int)c.time.tv_sec, (int)c.time.tv_usec, ll, + c.th_min, c.th_max, + c.fs_config ? c.fs_config : msg, c.drop); + dump(&c); + DX(1, "done ac %d av %p", ac, av); + for (i=0; i < ac; i++) + DX(1, "arg %d %s", i, av[i]); + return 0; +} + +/* + * The controller decides whether in this iteration we should send + * (the packet is in c->tosend) and/or receive (flag c->can_dequeue) + */ +static void +controller(struct cfg_s *c) +{ + struct mbuf *m; + struct dn_fs *fs; + int flow_id; + + /* histeresis between max and min */ + if (c->state == 0 && c->pending >= c->th_max) + c->state = 1; + else if (c->state == 1 && c->pending <= c->th_min) + c->state = 0; + ND(1, "state %d pending %2d", c->state, c->pending); + c->can_dequeue = c->state; + c->tosend = NULL; + if (c->state) + return; + + if (1) { + int i; + struct dn_queue *q; + struct list_head *h; + + i = ffs(c->llmask) - 1; + if (i < 0) { + DX(2, "no candidate"); + c->can_dequeue = 1; + return; + } + h = &c->ll[i]; + ND(1, "backlog %d p %p prev %p next %p", i, h, h->prev, h->next); + q = list_first_entry(h, struct dn_queue, ni.h); + list_del(&q->ni.h); + flow_id = Q2FI(c, q); + DX(2, "extracted flow %p %d backlog %d", q, flow_id, i); + if (list_empty(h)) { + ND(2, "backlog %d empty", i); + c->llmask &= ~(1<<i); + } + ND(1, "before %d p %p prev %p next %p", i+1, h+1, h[1].prev, h[1].next); + list_add_tail(&q->ni.h, h+1); + ND(1, " after %d p %p prev %p next %p", i+1, h+1, h[1].prev, h[1].next); + if (i < BACKLOG) { + ND(2, "backlog %d full", i+1); + c->llmask |= 1<<(1+i); + } + fs = &q->fs->fs; + c->cur_fs = q->fs - c->fs; + fs->cur = flow_id; + } else { + /* XXX this does not work ? */ + /* now decide whom to send the packet, and the length */ + /* lookup in the flow table */ + if (c->cur_y >= c->max_y) { /* handle wraparound */ + c->cur_y = 0; + c->cur_fs = 0; + } + fs = &c->fs[c->cur_fs].fs; + flow_id = fs->cur++; + if (fs->cur >= fs->next_flow) + fs->cur = fs->first_flow; + c->cur_y++; + if (c->cur_y >= fs->next_y) + c->cur_fs++; + } + + /* construct a packet */ + if (c->freelist) { + m = c->tosend = c->freelist; + c->freelist = c->freelist->m_nextpkt; + } else { + m = c->tosend = calloc(1, sizeof(struct mbuf)); + } + if (m == NULL) + return; + + m->cfg = c; + m->m_nextpkt = NULL; + m->m_pkthdr.len = fs->par[1]; // XXX maxlen + m->flow_id = flow_id; + + ND(2,"y %6d flow %5d fs %3d weight %4d len %4d", + c->cur_y, m->flow_id, c->cur_fs, + fs->par[0], m->m_pkthdr.len); + +} + +/* +Packet allocation: +to achieve a distribution that matches weights, for each X=w/lmax class +we should generate a number of packets proportional to Y = X times the number +of flows in the class. +So we construct an array with the cumulative distribution of Y's, +and use it to identify the flow via inverse mapping (if the Y's are +not too many we can use an array for the lookup). In practice, +each flow will have X entries [virtually] pointing to it. + +*/ diff --git a/sys/netinet/ipfw/test/mylist.h b/sys/netinet/ipfw/test/mylist.h new file mode 100644 index 0000000..6247f32 --- /dev/null +++ b/sys/netinet/ipfw/test/mylist.h @@ -0,0 +1,49 @@ +/* + * $FreeBSD$ + * + * linux-like bidirectional lists + */ + +#ifndef _MYLIST_H +#define _MYLIST_H +struct list_head { + struct list_head *prev, *next; +}; + +#define INIT_LIST_HEAD(l) do { (l)->prev = (l)->next = (l); } while (0) +#define list_empty(l) ( (l)->next == l ) +static inline void +__list_add(struct list_head *o, struct list_head *prev, + struct list_head *next) +{ + next->prev = o; + o->next = next; + o->prev = prev; + prev->next = o; +} + +static inline void +list_add_tail(struct list_head *o, struct list_head *head) +{ + __list_add(o, head->prev, head); +} + +#define list_first_entry(pL, ty, member) \ + (ty *)((char *)((pL)->next) - offsetof(ty, member)) + +static inline void +__list_del(struct list_head *prev, struct list_head *next) +{ + next->prev = prev; + prev->next = next; +} + +static inline void +list_del(struct list_head *entry) +{ + ND("called on %p", entry); + __list_del(entry->prev, entry->next); + entry->next = entry->prev = NULL; +} + +#endif /* _MYLIST_H */ diff --git a/sys/netinet/ipfw/test/test_dn_heap.c b/sys/netinet/ipfw/test/test_dn_heap.c new file mode 100644 index 0000000..d460cf2 --- /dev/null +++ b/sys/netinet/ipfw/test/test_dn_heap.c @@ -0,0 +1,162 @@ +/*- + * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Userland code for testing binary heaps and hash tables + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +#include <sys/param.h> + +#include <stdio.h> +#include <strings.h> +#include <stdlib.h> + +#include "dn_heap.h" +#define log(x, arg...) fprintf(stderr, ## arg) +#define panic(x...) fprintf(stderr, ## x), exit(1) + +#include <string.h> + +struct x { + struct x *ht_link; + char buf[0]; +}; + +uint32_t hf(uintptr_t key, int flags, void *arg) +{ + return (flags & DNHT_KEY_IS_OBJ) ? + ((struct x *)key)->buf[0] : *(char *)key; +} + +int matchf(void *obj, uintptr_t key, int flags, void *arg) +{ + char *s = (flags & DNHT_KEY_IS_OBJ) ? + ((struct x *)key)->buf : (char *)key; + return (strcmp(((struct x *)obj)->buf, s) == 0); +} + +void *newfn(uintptr_t key, int flags, void *arg) +{ + char *s = (char *)key; + struct x *p = malloc(sizeof(*p) + 1 + strlen(s)); + if (p) + strcpy(p->buf, s); + return p; +} + +char *strings[] = { + "undici", "unico", "doppio", "devoto", + "uno", "due", "tre", "quattro", "cinque", "sei", + "uno", "due", "tre", "quattro", "cinque", "sei", + NULL, +}; + +int doprint(void *_x, void *arg) +{ + struct x *x = _x; + printf("found element <%s>\n", x->buf); + return (int)arg; +} + +static void +test_hash() +{ + char **p; + struct dn_ht *h; + uintptr_t x = 0; + uintptr_t x1 = 0; + + /* first, find and allocate */ + h = dn_ht_init(NULL, 10, 0, hf, matchf, newfn); + + for (p = strings; *p; p++) { + dn_ht_find(h, (uintptr_t)*p, DNHT_INSERT, NULL); + } + dn_ht_scan(h, doprint, 0); + printf("/* second -- find without allocate */\n"); + h = dn_ht_init(NULL, 10, 0, hf, matchf, NULL); + for (p = strings; *p; p++) { + void **y = newfn((uintptr_t)*p, 0, NULL); + if (x == 0) + x = (uintptr_t)y; + else { + if (x1 == 0) + x1 = (uintptr_t)*p; + } + dn_ht_find(h, (uintptr_t)y, DNHT_INSERT | DNHT_KEY_IS_OBJ, NULL); + } + dn_ht_scan(h, doprint, 0); + printf("remove %p gives %p\n", (void *)x, + dn_ht_find(h, x, DNHT_KEY_IS_OBJ | DNHT_REMOVE, NULL)); + printf("remove %p gives %p\n", (void *)x, + dn_ht_find(h, x, DNHT_KEY_IS_OBJ | DNHT_REMOVE, NULL)); + printf("remove %p gives %p\n", (void *)x, + dn_ht_find(h, x1, DNHT_REMOVE, NULL)); + printf("remove %p gives %p\n", (void *)x, + dn_ht_find(h, x1, DNHT_REMOVE, NULL)); + dn_ht_scan(h, doprint, 0); +} + +int +main(int argc, char *argv[]) +{ + struct dn_heap h; + int i, n, n2, n3; + + test_hash(); + return 0; + + /* n = elements, n2 = cycles */ + n = (argc > 1) ? atoi(argv[1]) : 0; + if (n <= 0 || n > 1000000) + n = 100; + n2 = (argc > 2) ? atoi(argv[2]) : 0; + if (n2 <= 0) + n = 1000000; + n3 = (argc > 3) ? atoi(argv[3]) : 0; + bzero(&h, sizeof(h)); + heap_init(&h, n, -1); + while (n2-- > 0) { + uint64_t prevk = 0; + for (i=0; i < n; i++) + heap_insert(&h, n3 ? n-i: random(), (void *)(100+i)); + + for (i=0; h.elements > 0; i++) { + uint64_t k = h.p[0].key; + if (k < prevk) + panic("wrong sequence\n"); + prevk = k; + if (0) + printf("%d key %llu, val %p\n", + i, h.p[0].key, h.p[0].object); + heap_extract(&h, NULL); + } + } + return 0; +} diff --git a/sys/netinet/ipfw/test/test_dn_sched.c b/sys/netinet/ipfw/test/test_dn_sched.c new file mode 100644 index 0000000..ee46c95 --- /dev/null +++ b/sys/netinet/ipfw/test/test_dn_sched.c @@ -0,0 +1,89 @@ +/* + * $FreeBSD$ + * + * library functions for userland testing of dummynet schedulers + */ + +#include "dn_test.h" + +void +m_freem(struct mbuf *m) +{ + printf("free %p\n", m); +} + +int +dn_sched_modevent(module_t mod, int cmd, void *arg) +{ + return 0; +} + +void +dn_free_pkts(struct mbuf *m) +{ + struct mbuf *x; + while ( (x = m) ) { + m = m->m_nextpkt; + m_freem(x); + } +} + +int +dn_delete_queue(void *_q, void *do_free) +{ + struct dn_queue *q = _q; + if (q->mq.head) + dn_free_pkts(q->mq.head); + free(q); + return 0; +} + +/* + * This is a simplified function for testing purposes, which does + * not implement statistics or random loss. + * Enqueue a packet in q, subject to space and queue management policy + * (whose parameters are in q->fs). + * Update stats for the queue and the scheduler. + * Return 0 on success, 1 on drop. The packet is consumed anyways. + */ +int +dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop) +{ + if (drop) + goto drop; + if (q->ni.length >= 200) + goto drop; + mq_append(&q->mq, m); + q->ni.length++; + q->ni.tot_bytes += m->m_pkthdr.len; + return 0; + +drop: + q->ni.drops++; + return 1; +} + +int +ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg) +{ + if (*v < lo) { + *v = dflt; + } else if (*v > hi) { + *v = hi; + } + return *v; +} + +#ifndef __FreeBSD__ +int +fls(int mask) +{ + int bit; + + if (mask == 0) + return (0); + for (bit = 1; mask != 1; bit++) + mask = (unsigned int)mask >> 1; + return (bit); +} +#endif diff --git a/sys/netinet/raw_ip.c b/sys/netinet/raw_ip.c index 3573472..9341cf2 100644 --- a/sys/netinet/raw_ip.c +++ b/sys/netinet/raw_ip.c @@ -80,14 +80,18 @@ VNET_DEFINE(struct inpcbinfo, ripcbinfo); #define V_ripcbinfo VNET(ripcbinfo) /* - * Control and data hooks for ipfw and dummynet. + * Control and data hooks for ipfw, dummynet, divert and so on. * The data hooks are not used here but it is convenient * to keep them all in one place. */ VNET_DEFINE(ip_fw_chk_ptr_t, ip_fw_chk_ptr) = NULL; VNET_DEFINE(ip_fw_ctl_ptr_t, ip_fw_ctl_ptr) = NULL; -int (*ip_dn_ctl_ptr)(struct sockopt *) = NULL; -int (*ip_dn_io_ptr)(struct mbuf **m, int dir, struct ip_fw_args *fwa) = NULL; + +int (*ip_dn_ctl_ptr)(struct sockopt *); +int (*ip_dn_io_ptr)(struct mbuf **, int, struct ip_fw_args *); +void (*ip_divert_ptr)(struct mbuf *, int); +int (*ng_ipfw_input_p)(struct mbuf **, int, + struct ip_fw_args *, int); /* * Hooks for multicast routing. They all default to NULL, so leave them not |