summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authortruckman <truckman@FreeBSD.org>2016-05-26 21:40:13 +0000
committertruckman <truckman@FreeBSD.org>2016-05-26 21:40:13 +0000
commit2a78edb668fc80fd75e386e745ea0d125834dc3c (patch)
treee8cbaadc109ca54068eb29a8721cc8352d14d72c
parent75eb020bfe01f541c363c8aa62785741a5c3d5e5 (diff)
downloadFreeBSD-src-2a78edb668fc80fd75e386e745ea0d125834dc3c.zip
FreeBSD-src-2a78edb668fc80fd75e386e745ea0d125834dc3c.tar.gz
Import Dummynet AQM version 0.2.1 (CoDel, FQ-CoDel, PIE and FQ-PIE).
Centre for Advanced Internet Architectures Implementing AQM in FreeBSD * Overview <http://caia.swin.edu.au/freebsd/aqm/index.html> * Articles, Papers and Presentations <http://caia.swin.edu.au/freebsd/aqm/papers.html> * Patches and Tools <http://caia.swin.edu.au/freebsd/aqm/downloads.html> Overview Recent years have seen a resurgence of interest in better managing the depth of bottleneck queues in routers, switches and other places that get congested. Solutions include transport protocol enhancements at the end-hosts (such as delay-based or hybrid congestion control schemes) and active queue management (AQM) schemes applied within bottleneck queues. The notion of AQM has been around since at least the late 1990s (e.g. RFC 2309). In recent years the proliferation of oversized buffers in all sorts of network devices (aka bufferbloat) has stimulated keen community interest in four new AQM schemes -- CoDel, FQ-CoDel, PIE and FQ-PIE. The IETF AQM working group is looking to document these schemes, and independent implementations are a corner-stone of the IETF's process for confirming the clarity of publicly available protocol descriptions. While significant development work on all of these schemes has occurred in the Linux kernel, there is very little in FreeBSD. Project Goals This project began in late 2015, and aims to design and implement functionally-correct versions of CoDel, FQ-CoDel, PIE and FQ_PIE in FreeBSD (with code BSD-licensed as much as practical). We have chosen to do this as extensions to FreeBSD's ipfw/dummynet firewall and traffic shaper. 
Implementation of these AQM schemes in FreeBSD will: * Demonstrate whether the publicly available documentation is sufficient to enable independent, functionally equivalent implementations * Provide a broader suite of AQM options for sections of the networking community that rely on FreeBSD platforms Program Members: * Rasool Al Saadi (developer) * Grenville Armitage (project lead) Acknowledgements: This project has been made possible in part by a gift from the Comcast Innovation Fund. Submitted by: Rasool Al-Saadi <ralsaadi@swin.edu.au> X-No objection: core MFC after: 2 weeks Differential Revision: https://reviews.freebsd.org/D6388
-rw-r--r--sbin/ipfw/dummynet.c582
-rw-r--r--sbin/ipfw/ipfw2.h25
-rw-r--r--sys/modules/dummynet/Makefile3
-rw-r--r--sys/netinet/ip_dummynet.h27
-rw-r--r--sys/netpfil/ipfw/dn_aqm.h167
-rw-r--r--sys/netpfil/ipfw/dn_aqm_codel.c444
-rw-r--r--sys/netpfil/ipfw/dn_aqm_codel.h222
-rw-r--r--sys/netpfil/ipfw/dn_aqm_pie.c793
-rw-r--r--sys/netpfil/ipfw/dn_aqm_pie.h151
-rw-r--r--sys/netpfil/ipfw/dn_sched.h9
-rw-r--r--sys/netpfil/ipfw/dn_sched_fifo.c6
-rw-r--r--sys/netpfil/ipfw/dn_sched_fq_codel.c617
-rw-r--r--sys/netpfil/ipfw/dn_sched_fq_codel.h167
-rw-r--r--sys/netpfil/ipfw/dn_sched_fq_codel_helper.h187
-rw-r--r--sys/netpfil/ipfw/dn_sched_fq_pie.c1262
-rw-r--r--sys/netpfil/ipfw/dn_sched_prio.c6
-rw-r--r--sys/netpfil/ipfw/dn_sched_qfq.c6
-rw-r--r--sys/netpfil/ipfw/dn_sched_rr.c6
-rw-r--r--sys/netpfil/ipfw/dn_sched_wf2q.c7
-rw-r--r--sys/netpfil/ipfw/ip_dn_glue.c3
-rw-r--r--sys/netpfil/ipfw/ip_dn_io.c37
-rw-r--r--sys/netpfil/ipfw/ip_dn_private.h59
-rw-r--r--sys/netpfil/ipfw/ip_dummynet.c425
23 files changed, 5203 insertions, 8 deletions
diff --git a/sbin/ipfw/dummynet.c b/sbin/ipfw/dummynet.c
index dc95a19..ca2dfc8 100644
--- a/sbin/ipfw/dummynet.c
+++ b/sbin/ipfw/dummynet.c
@@ -1,4 +1,11 @@
/*
+ * Codel/FQ_Codel and PIE/FQ_PIE Code:
+ * Copyright (C) 2016 Centre for Advanced Internet Architectures,
+ * Swinburne University of Technology, Melbourne, Australia.
+ * Portions of this code were made possible in part by a gift from
+ * The Comcast Innovation Fund.
+ * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au>
+ *
* Copyright (c) 2002-2003,2010 Luigi Rizzo
*
* Redistribution and use in source forms, with and without modification,
@@ -15,6 +22,7 @@
* dummynet support
*/
+#define NEW_AQM
#include <sys/types.h>
#include <sys/socket.h>
/* XXX there are several sysctl leftover here */
@@ -22,6 +30,10 @@
#include "ipfw2.h"
+#ifdef NEW_AQM
+#include <stdint.h>
+#endif
+
#include <ctype.h>
#include <err.h>
#include <errno.h>
@@ -59,6 +71,12 @@ static struct _s_x dummynet_params[] = {
{ "ecn", TOK_ECN },
{ "red", TOK_RED },
{ "gred", TOK_GRED },
+#ifdef NEW_AQM
+ { "codel", TOK_CODEL}, /* Codel AQM */
+ { "fq_codel", TOK_FQ_CODEL}, /* FQ-Codel */
+ { "pie", TOK_PIE}, /* PIE AQM */
+ { "fq_pie", TOK_FQ_PIE}, /* FQ-PIE */
+#endif
{ "bw", TOK_BW },
{ "bandwidth", TOK_BW },
{ "delay", TOK_DELAY },
@@ -81,6 +99,32 @@ static struct _s_x dummynet_params[] = {
{ NULL, 0 } /* terminator */
};
+#ifdef NEW_AQM
+/* AQM/extra sched parameters tokens*/
+static struct _s_x aqm_params[] = {
+ { "target", TOK_TARGET},
+ { "interval", TOK_INTERVAL},
+ { "limit", TOK_LIMIT},
+ { "flows", TOK_FLOWS},
+ { "quantum", TOK_QUANTUM},
+ { "ecn", TOK_ECN},
+ { "noecn", TOK_NO_ECN},
+ { "tupdate", TOK_TUPDATE},
+ { "max_burst", TOK_MAX_BURST},
+ { "max_ecnth", TOK_MAX_ECNTH},
+ { "alpha", TOK_ALPHA},
+ { "beta", TOK_BETA},
+ { "capdrop", TOK_CAPDROP},
+ { "nocapdrop", TOK_NO_CAPDROP},
+ { "onoff", TOK_ONOFF},
+ { "dre", TOK_DRE},
+ { "ts", TOK_TS},
+ { "derand", TOK_DERAND},
+ { "noderand", TOK_NO_DERAND},
+ { NULL, 0 } /* terminator */
+};
+#endif
+
#define O_NEXT(p, len) ((void *)((char *)p + len))
static void
@@ -102,6 +146,214 @@ o_next(struct dn_id **o, int len, int type)
return ret;
}
+#ifdef NEW_AQM
+
+/* Codel flags */
+enum {
+ CODEL_ECN_ENABLED = 1
+};
+
+/* PIE flags, from PIE kernel module */
+enum {
+ PIE_ECN_ENABLED = 1,
+ PIE_CAPDROP_ENABLED = 2,
+ PIE_ON_OFF_MODE_ENABLED = 4,
+ PIE_DEPRATEEST_ENABLED = 8,
+ PIE_DERAND_ENABLED = 16
+};
+
+#define PIE_FIX_POINT_BITS 13
+#define PIE_SCALE (1L<<PIE_FIX_POINT_BITS)
+
+/* integer to time */
+void
+us_to_time(int t,char *strt)
+{
+ if (t < 0)
+ strt[0]='\0';
+ else if ( t==0 )
+ sprintf(strt,"%d", t);
+ else if (t< 1000)
+ sprintf(strt,"%dus", t);
+ else if (t < 1000000)
+ sprintf(strt,"%gms", (float) t / 1000);
+ else
+ sprintf(strt,"%gs", (float) t / 1000000);
+}
+
+/*
+ * returns -1 if s is not a valid time, otherwise, return time in us
+ */
+static long
+time_to_us(const char *s)
+{
+ int i, dots = 0;
+ int len = strlen(s);
+ char strt[16]="", stru[16]="";
+
+ if (len>15)
+ return -1;
+ for (i = 0; i<len && (isdigit(s[i]) || s[i]=='.') ; i++)
+ if (s[i]=='.') {
+ if (dots)
+ return -1;
+ else
+ dots++;
+ }
+
+ if (!i)
+ return -1;
+ strncpy(strt, s, i);
+ if (i<len)
+ strcpy(stru, s+i);
+ else
+ strcpy(stru, "ms");
+
+ if (!strcasecmp(stru, "us"))
+ return atol(strt);
+ if (!strcasecmp(stru, "ms"))
+ return (strtod(strt, NULL) * 1000);
+ if (!strcasecmp(stru, "s"))
+ return (strtod(strt, NULL)*1000000);
+
+ return -1;
+}
+
+
+/* Get AQM or scheduler extra parameters */
+void
+get_extra_parms(uint32_t nr, char *out, int subtype)
+{
+ struct dn_extra_parms *ep;
+ int ret;
+ char strt1[15], strt2[15], strt3[15];
+ u_int l;
+
+ /* prepare the request */
+ l = sizeof(struct dn_extra_parms);
+ ep = safe_calloc(1, l);
+ memset(ep, 0, sizeof(*ep));
+ *out = '\0';
+
+ oid_fill(&ep->oid, l, DN_CMD_GET, DN_API_VERSION);
+ ep->oid.len = l;
+ ep->oid.subtype = subtype;
+ ep->nr = nr;
+
+ ret = do_cmd(-IP_DUMMYNET3, ep, (uintptr_t)&l);
+ if (ret) {
+ free(ep);
+ errx(EX_DATAERR, "Error getting extra parameters\n");
+ }
+
+ switch (subtype) {
+ case DN_AQM_PARAMS:
+ if( !strcasecmp(ep->name, "codel")) {
+ us_to_time(ep->par[0], strt1);
+ us_to_time(ep->par[1], strt2);
+ l = sprintf(out, " AQM CoDel target %s interval %s",
+ strt1, strt2);
+ if (ep->par[2] & CODEL_ECN_ENABLED)
+ l += sprintf(out + l, " ECN");
+ else
+ l += sprintf(out + l, " NoECN");
+ } else if( !strcasecmp(ep->name, "pie")) {
+ us_to_time(ep->par[0], strt1);
+ us_to_time(ep->par[1], strt2);
+ us_to_time(ep->par[2], strt3);
+ l = sprintf(out, " AQM type PIE target %s tupdate %s alpha "
+ "%g beta %g max_burst %s max_ecnth %.3g",
+ strt1,
+ strt2,
+ ep->par[4] / (float) PIE_SCALE,
+ ep->par[5] / (float) PIE_SCALE,
+ strt3,
+ ep->par[3] / (float) PIE_SCALE
+ );
+
+ if (ep->par[6] & PIE_ECN_ENABLED)
+ l += sprintf(out + l, " ECN");
+ else
+ l += sprintf(out + l, " NoECN");
+ if (ep->par[6] & PIE_CAPDROP_ENABLED)
+ l += sprintf(out + l, " CapDrop");
+ else
+ l += sprintf(out + l, " NoCapDrop");
+ if (ep->par[6] & PIE_ON_OFF_MODE_ENABLED)
+ l += sprintf(out + l, " OnOff");
+ if (ep->par[6] & PIE_DEPRATEEST_ENABLED)
+ l += sprintf(out + l, " DRE");
+ else
+ l += sprintf(out + l, " TS");
+ if (ep->par[6] & PIE_DERAND_ENABLED)
+ l += sprintf(out + l, " Derand");
+ else
+ l += sprintf(out + l, " NoDerand");
+ }
+ break;
+
+ case DN_SCH_PARAMS:
+ if (!strcasecmp(ep->name,"FQ_CODEL")) {
+ us_to_time(ep->par[0], strt1);
+ us_to_time(ep->par[1], strt2);
+ l = sprintf(out," FQ_CODEL target %s interval %s"
+ " quantum %jd limit %jd flows %jd",
+ strt1, strt2,
+ (intmax_t) ep->par[3],
+ (intmax_t) ep->par[4],
+ (intmax_t) ep->par[5]
+ );
+ if (ep->par[2] & CODEL_ECN_ENABLED)
+ l += sprintf(out + l, " ECN");
+ else
+ l += sprintf(out + l, " NoECN");
+ l += sprintf(out + l, "\n");
+ } else if (!strcasecmp(ep->name,"FQ_PIE")) {
+ us_to_time(ep->par[0], strt1);
+ us_to_time(ep->par[1], strt2);
+ us_to_time(ep->par[2], strt3);
+ l = sprintf(out, " FQ_PIE target %s tupdate %s alpha "
+ "%g beta %g max_burst %s max_ecnth %.3g"
+ " quantum %jd limit %jd flows %jd",
+ strt1,
+ strt2,
+ ep->par[4] / (float) PIE_SCALE,
+ ep->par[5] / (float) PIE_SCALE,
+ strt3,
+ ep->par[3] / (float) PIE_SCALE,
+ (intmax_t) ep->par[7],
+ (intmax_t) ep->par[8],
+ (intmax_t) ep->par[9]
+ );
+
+ if (ep->par[6] & PIE_ECN_ENABLED)
+ l += sprintf(out + l, " ECN");
+ else
+ l += sprintf(out + l, " NoECN");
+ if (ep->par[6] & PIE_CAPDROP_ENABLED)
+ l += sprintf(out + l, " CapDrop");
+ else
+ l += sprintf(out + l, " NoCapDrop");
+ if (ep->par[6] & PIE_ON_OFF_MODE_ENABLED)
+ l += sprintf(out + l, " OnOff");
+ if (ep->par[6] & PIE_DEPRATEEST_ENABLED)
+ l += sprintf(out + l, " DRE");
+ else
+ l += sprintf(out + l, " TS");
+ if (ep->par[6] & PIE_DERAND_ENABLED)
+ l += sprintf(out + l, " Derand");
+ else
+ l += sprintf(out + l, " NoDerand");
+ l += sprintf(out + l, "\n");
+ }
+ break;
+ }
+
+ free(ep);
+}
+#endif
+
+
#if 0
static int
sort_q(void *arg, const void *pa, const void *pb)
@@ -221,7 +473,7 @@ print_flowset_parms(struct dn_fs *fs, char *prefix)
int l;
char qs[30];
char plr[30];
- char red[90]; /* Display RED parameters */
+ char red[200]; /* Display RED parameters */
l = fs->qsize;
if (fs->flags & DN_QSIZE_BYTES) {
@@ -246,6 +498,11 @@ print_flowset_parms(struct dn_fs *fs, char *prefix)
1.0 * fs->max_p / (double)(1 << SCALE_RED));
if (fs->flags & DN_IS_ECN)
strncat(red, " (ecn)", 6);
+#ifdef NEW_AQM
+ /* get AQM parameters */
+ } else if (fs->flags & DN_IS_AQM) {
+ get_extra_parms(fs->fs_nr, red, DN_AQM_PARAMS);
+#endif
} else
sprintf(red, "droptail");
@@ -338,6 +595,11 @@ list_pipes(struct dn_id *oid, struct dn_id *end)
printf(" sched %d type %s flags 0x%x %d buckets %d active\n",
s->sched_nr,
s->name, s->flags, s->buckets, s->oid.id);
+#ifdef NEW_AQM
+ char parms[200];
+ get_extra_parms(s->sched_nr, parms, DN_SCH_PARAMS);
+ printf("%s",parms);
+#endif
if (s->flags & DN_HAVE_MASK)
print_mask(&s->sched_mask);
}
@@ -745,6 +1007,242 @@ load_extra_delays(const char *filename, struct dn_profile *p,
strncpy(p->name, profile_name, sizeof(p->name));
}
+#ifdef NEW_AQM
+
+/* Parse AQM/extra scheduler parameters */
+static int
+process_extra_parms(int *ac, char **av, struct dn_extra_parms *ep,
+ uint16_t type)
+{
+ int i;
+
+ /* use kernel defaults */
+ for (i=0; i<DN_MAX_EXTRA_PARM; i++)
+ ep->par[i] = -1;
+
+ switch(type) {
+ case TOK_CODEL:
+ case TOK_FQ_CODEL:
+ /* Codel
+ * 0- target, 1- interval, 2- flags,
+ * FQ_CODEL
+ * 3- quantum, 4- limit, 5- flows
+ */
+ if (type==TOK_CODEL)
+ ep->par[2] = 0;
+ else
+ ep->par[2] = CODEL_ECN_ENABLED;
+
+ while (*ac > 0) {
+ int tok = match_token(aqm_params, *av);
+ (*ac)--; av++;
+ switch(tok) {
+ case TOK_TARGET:
+ if (*ac <= 0 || time_to_us(av[0]) < 0)
+ errx(EX_DATAERR, "target needs time\n");
+
+ ep->par[0] = time_to_us(av[0]);
+ (*ac)--; av++;
+ break;
+
+ case TOK_INTERVAL:
+ if (*ac <= 0 || time_to_us(av[0]) < 0)
+ errx(EX_DATAERR, "interval needs time\n");
+
+ ep->par[1] = time_to_us(av[0]);
+ (*ac)--; av++;
+ break;
+
+ case TOK_ECN:
+ ep->par[2] = CODEL_ECN_ENABLED;
+ break;
+ case TOK_NO_ECN:
+ ep->par[2] &= ~CODEL_ECN_ENABLED;
+ break;
+ /* Config fq_codel parameters */
+ case TOK_QUANTUM:
+ if (type != TOK_FQ_CODEL)
+ errx(EX_DATAERR, "quantum is not for codel\n");
+ if (*ac <= 0 || !is_valid_number(av[0]))
+ errx(EX_DATAERR, "quantum needs number\n");
+
+ ep->par[3]= atoi(av[0]);
+ (*ac)--; av++;
+ break;
+
+ case TOK_LIMIT:
+ if (type != TOK_FQ_CODEL)
+ errx(EX_DATAERR, "limit is not for codel, use queue instead\n");
+ if (*ac <= 0 || !is_valid_number(av[0]))
+ errx(EX_DATAERR, "limit needs number\n");
+
+ ep->par[4] = atoi(av[0]);
+ (*ac)--; av++;
+ break;
+
+ case TOK_FLOWS:
+ if (type != TOK_FQ_CODEL)
+ errx(EX_DATAERR, "flows is not for codel\n");
+ if (*ac <= 0 || !is_valid_number(av[0]))
+ errx(EX_DATAERR, "flows needs number\n");
+
+ ep->par[5] = atoi(av[0]);
+ (*ac)--; av++;
+ break;
+
+ default:
+ printf("%s is Invalid parameter\n", av[-1]);
+ }
+ }
+ break;
+ case TOK_PIE:
+ case TOK_FQ_PIE:
+ /* PIE
+ * 0- target , 1- tupdate, 2- max_burst,
+ * 3- max_ecnth, 4- alpha,
+ * 5- beta, 6- flags
+ * FQ_PIE
+ * 7- quantum, 8- limit, 9- flows
+ */
+
+ if ( type == TOK_PIE)
+ ep->par[6] = PIE_CAPDROP_ENABLED | PIE_DEPRATEEST_ENABLED
+ | PIE_DERAND_ENABLED;
+ else
+ /* for FQ-PIE, use TS mode */
+ ep->par[6] = PIE_CAPDROP_ENABLED | PIE_DERAND_ENABLED
+ | PIE_ECN_ENABLED;
+
+ while (*ac > 0) {
+ int tok = match_token(aqm_params, *av);
+ (*ac)--; av++;
+ switch(tok) {
+ case TOK_TARGET:
+ if (*ac <= 0 || time_to_us(av[0]) < 0)
+ errx(EX_DATAERR, "target needs time\n");
+
+ ep->par[0] = time_to_us(av[0]);
+ (*ac)--; av++;
+ break;
+
+ case TOK_TUPDATE:
+ if (*ac <= 0 || time_to_us(av[0]) < 0)
+ errx(EX_DATAERR, "tupdate needs time\n");
+
+ ep->par[1] = time_to_us(av[0]);
+ (*ac)--; av++;
+ break;
+
+ case TOK_MAX_BURST:
+ if (*ac <= 0 || time_to_us(av[0]) < 0)
+ errx(EX_DATAERR, "max_burst needs time\n");
+
+ ep->par[2] = time_to_us(av[0]);
+ (*ac)--; av++;
+ break;
+
+ case TOK_MAX_ECNTH:
+ if (*ac <= 0 || !is_valid_number(av[0]))
+ errx(EX_DATAERR, "max_ecnth needs number\n");
+
+ ep->par[3] = atof(av[0]) * PIE_SCALE;
+ (*ac)--; av++;
+ break;
+
+ case TOK_ALPHA:
+ if (*ac <= 0 || !is_valid_number(av[0]))
+ errx(EX_DATAERR, "alpha needs number\n");
+
+ ep->par[4] = atof(av[0]) * PIE_SCALE;
+ (*ac)--; av++;
+ break;
+
+ case TOK_BETA:
+ if (*ac <= 0 || !is_valid_number(av[0]))
+ errx(EX_DATAERR, "beta needs number\n");
+
+ ep->par[5] = atof(av[0]) * PIE_SCALE;
+ (*ac)--; av++;
+ break;
+
+ case TOK_ECN:
+ ep->par[6] |= PIE_ECN_ENABLED;
+ break;
+ case TOK_NO_ECN:
+ ep->par[6] &= ~PIE_ECN_ENABLED;
+ break;
+
+ case TOK_CAPDROP:
+ ep->par[6] |= PIE_CAPDROP_ENABLED;
+ break;
+ case TOK_NO_CAPDROP:
+ ep->par[6] &= ~PIE_CAPDROP_ENABLED;
+ break;
+
+ case TOK_ONOFF:
+ ep->par[6] |= PIE_ON_OFF_MODE_ENABLED;
+ break;
+
+ case TOK_DRE:
+ ep->par[6] |= PIE_DEPRATEEST_ENABLED;
+ break;
+
+ case TOK_TS:
+ ep->par[6] &= ~PIE_DEPRATEEST_ENABLED;
+ break;
+
+ case TOK_DERAND:
+ ep->par[6] |= PIE_DERAND_ENABLED;
+ break;
+ case TOK_NO_DERAND:
+ ep->par[6] &= ~PIE_DERAND_ENABLED;
+ break;
+
+ /* Config fq_pie parameters */
+ case TOK_QUANTUM:
+ if (type != TOK_FQ_PIE)
+ errx(EX_DATAERR, "quantum is not for pie\n");
+ if (*ac <= 0 || !is_valid_number(av[0]))
+ errx(EX_DATAERR, "quantum needs number\n");
+
+ ep->par[7]= atoi(av[0]);
+ (*ac)--; av++;
+ break;
+
+ case TOK_LIMIT:
+ if (type != TOK_FQ_PIE)
+ errx(EX_DATAERR, "limit is not for pie, use queue instead\n");
+ if (*ac <= 0 || !is_valid_number(av[0]))
+ errx(EX_DATAERR, "limit needs number\n");
+
+ ep->par[8] = atoi(av[0]);
+ (*ac)--; av++;
+ break;
+
+ case TOK_FLOWS:
+ if (type != TOK_FQ_PIE)
+ errx(EX_DATAERR, "flows is not for pie\n");
+ if (*ac <= 0 || !is_valid_number(av[0]))
+ errx(EX_DATAERR, "flows needs number\n");
+
+ ep->par[9] = atoi(av[0]);
+ (*ac)--; av++;
+ break;
+
+
+ default:
+ printf("%s is invalid parameter\n", av[-1]);
+ }
+ }
+ break;
+ }
+
+ return 0;
+}
+
+#endif
+
+
/*
* configuration of pipes, schedulers, flowsets.
* When we configure a new scheduler, an empty pipe is created, so:
@@ -776,6 +1274,12 @@ ipfw_config_pipe(int ac, char **av)
struct dn_fs *fs = NULL;
struct dn_profile *pf = NULL;
struct ipfw_flow_id *mask = NULL;
+#ifdef NEW_AQM
+ struct dn_extra_parms *aqm_extra;
+ struct dn_extra_parms *sch_extra;
+ int lmax_extra;
+#endif
+
int lmax;
uint32_t _foo = 0, *flags = &_foo , *buckets = &_foo;
@@ -787,6 +1291,15 @@ ipfw_config_pipe(int ac, char **av)
lmax += sizeof(struct dn_sch) + sizeof(struct dn_link) +
sizeof(struct dn_fs) + sizeof(struct dn_profile);
+#ifdef NEW_AQM
+ /* Extra Params */
+ lmax_extra = sizeof(struct dn_extra_parms);
+ /* two lmax_extra because one for AQM params and another
+ * sch params
+ */
+ lmax += lmax_extra*2;
+#endif
+
av++; ac--;
/* Pipe number */
if (ac && isdigit(**av)) {
@@ -812,8 +1325,16 @@ ipfw_config_pipe(int ac, char **av)
* The FIFO scheduler and link are derived from the
* WF2Q+ one in the kernel.
*/
+#ifdef NEW_AQM
+ sch_extra = o_next(&buf, lmax_extra, DN_TEXT);
+ sch_extra ->oid.subtype = 0; /* don't configure scheduler */
+#endif
sch = o_next(&buf, sizeof(*sch), DN_SCH);
p = o_next(&buf, sizeof(*p), DN_LINK);
+#ifdef NEW_AQM
+ aqm_extra = o_next(&buf, lmax_extra, DN_TEXT);
+ aqm_extra ->oid.subtype = 0; /* don't configure AQM */
+#endif
fs = o_next(&buf, sizeof(*fs), DN_FS);
sch->sched_nr = i;
@@ -831,6 +1352,10 @@ ipfw_config_pipe(int ac, char **av)
break;
case 2: /* "queue N config ... " */
+#ifdef NEW_AQM
+ aqm_extra = o_next(&buf, lmax_extra, DN_TEXT);
+ aqm_extra ->oid.subtype = 0;
+#endif
fs = o_next(&buf, sizeof(*fs), DN_FS);
fs->fs_nr = i;
mask = &fs->flow_mask;
@@ -839,7 +1364,15 @@ ipfw_config_pipe(int ac, char **av)
break;
case 3: /* "sched N config ..." */
+#ifdef NEW_AQM
+ sch_extra = o_next(&buf, lmax_extra, DN_TEXT);
+ sch_extra ->oid.subtype = 0;
+#endif
sch = o_next(&buf, sizeof(*sch), DN_SCH);
+#ifdef NEW_AQM
+ aqm_extra = o_next(&buf, lmax_extra, DN_TEXT);
+ aqm_extra ->oid.subtype = 0;
+#endif
fs = o_next(&buf, sizeof(*fs), DN_FS);
sch->sched_nr = i;
mask = &sch->sched_mask;
@@ -1026,7 +1559,31 @@ ipfw_config_pipe(int ac, char **av)
} /* end while, config masks */
end_mask:
break;
+#ifdef NEW_AQM
+ case TOK_CODEL:
+ case TOK_PIE:
+ NEED(fs, "codel/pie is only for flowsets");
+
+ fs->flags &= ~(DN_IS_RED|DN_IS_GENTLE_RED);
+ fs->flags |= DN_IS_AQM;
+
+ strcpy(aqm_extra->name,av[-1]);
+ aqm_extra->oid.subtype = DN_AQM_PARAMS;
+
+ process_extra_parms(&ac, av, aqm_extra, tok);
+ break;
+ case TOK_FQ_CODEL:
+ case TOK_FQ_PIE:
+ if (!strcmp(av[-1],"type"))
+ errx(EX_DATAERR, "use type before fq_codel/fq_pie");
+
+ NEED(sch, "fq_codel/fq_pie is only for schd");
+ strcpy(sch_extra->name,av[-1]);
+ sch_extra->oid.subtype = DN_SCH_PARAMS;
+ process_extra_parms(&ac, av, sch_extra, tok);
+ break;
+#endif
case TOK_RED:
case TOK_GRED:
NEED1("red/gred needs w_q/min_th/max_th/max_p\n");
@@ -1093,7 +1650,20 @@ end_mask:
errx(1, "type %s too long\n", av[0]);
strcpy(sch->name, av[0]);
sch->oid.subtype = 0; /* use string */
- ac--; av++;
+#ifdef NEW_AQM
+ /* if fq_codel is selected, consider all tokens after it
+ * as parameters
+ */
+ if (!strcasecmp(av[0],"fq_codel") || !strcasecmp(av[0],"fq_pie")){
+ strcpy(sch_extra->name,av[0]);
+ sch_extra->oid.subtype = DN_SCH_PARAMS;
+ process_extra_parms(&ac, av, sch_extra, tok);
+ } else {
+ ac--;av++;
+ }
+#else
+ ac--;av++;
+#endif
break;
}
@@ -1187,9 +1757,17 @@ end_mask:
errx(EX_DATAERR, "2 <= queue size <= %ld", limit);
}
+#ifdef NEW_AQM
+ if ((fs->flags & DN_IS_ECN) && !((fs->flags & DN_IS_RED)||
+ (fs->flags & DN_IS_AQM)))
+ errx(EX_USAGE, "ECN can be used with red/gred/"
+ "codel/fq_codel only!");
+#else
if ((fs->flags & DN_IS_ECN) && !(fs->flags & DN_IS_RED))
errx(EX_USAGE, "enable red/gred for ECN");
+#endif
+
if (fs->flags & DN_IS_RED) {
size_t len;
int lookup_depth, avg_pkt_size;
diff --git a/sbin/ipfw/ipfw2.h b/sbin/ipfw/ipfw2.h
index d18803d..7ea2083 100644
--- a/sbin/ipfw/ipfw2.h
+++ b/sbin/ipfw/ipfw2.h
@@ -171,6 +171,31 @@ enum tokens {
TOK_ECN,
TOK_DROPTAIL,
TOK_PROTO,
+#ifdef NEW_AQM
+ /* AQM tokens*/
+ TOK_NO_ECN,
+ TOK_CODEL,
+ TOK_FQ_CODEL,
+ TOK_TARGET,
+ TOK_INTERVAL,
+ TOK_FLOWS,
+ TOK_QUANTUM,
+
+ TOK_PIE,
+ TOK_FQ_PIE,
+ TOK_TUPDATE,
+ TOK_MAX_BURST,
+ TOK_MAX_ECNTH,
+ TOK_ALPHA,
+ TOK_BETA,
+ TOK_CAPDROP,
+ TOK_NO_CAPDROP,
+ TOK_ONOFF,
+ TOK_DRE,
+ TOK_TS,
+ TOK_DERAND,
+ TOK_NO_DERAND,
+#endif
/* dummynet tokens */
TOK_WEIGHT,
TOK_LMAX,
diff --git a/sys/modules/dummynet/Makefile b/sys/modules/dummynet/Makefile
index d7d8b1b..877e01a 100644
--- a/sys/modules/dummynet/Makefile
+++ b/sys/modules/dummynet/Makefile
@@ -4,8 +4,9 @@
KMOD= dummynet
SRCS= ip_dummynet.c
SRCS+= ip_dn_glue.c ip_dn_io.c
+SRCS+= dn_aqm_codel.c dn_aqm_pie.c
SRCS+= dn_heap.c dn_sched_fifo.c dn_sched_qfq.c dn_sched_rr.c dn_sched_wf2q.c
-SRCS+= dn_sched_prio.c
+SRCS+= dn_sched_prio.c dn_sched_fq_codel.c dn_sched_fq_pie.c
SRCS+= opt_inet6.h
.include <bsd.kmod.mk>
diff --git a/sys/netinet/ip_dummynet.h b/sys/netinet/ip_dummynet.h
index 202f1e2..377b5b0 100644
--- a/sys/netinet/ip_dummynet.h
+++ b/sys/netinet/ip_dummynet.h
@@ -29,7 +29,7 @@
#ifndef _IP_DUMMYNET_H
#define _IP_DUMMYNET_H
-
+#define NEW_AQM
/*
* Definition of the kernel-userland API for dummynet.
*
@@ -85,7 +85,13 @@ enum {
/* special commands for emulation of sysctl variables */
DN_SYSCTL_GET,
DN_SYSCTL_SET,
-
+#ifdef NEW_AQM
+ /* subtypes used for setting/getting extra parameters.
+ * these subtypes used with IP_DUMMYNET3 command (get)
+ * and DN_TEXT (set). */
+ DN_AQM_PARAMS, /* AQM extra params */
+ DN_SCH_PARAMS, /* scheduler extra params */
+#endif
DN_LAST,
};
@@ -105,6 +111,9 @@ enum { /* user flags */
DN_IS_RED = 0x0020,
DN_IS_GENTLE_RED= 0x0040,
DN_IS_ECN = 0x0080,
+ #ifdef NEW_AQM
+ DN_IS_AQM = 0x0100, /* AQMs: e.g Codel & PIE */
+ #endif
DN_PIPE_CMD = 0x1000, /* pipe config... */
};
@@ -210,7 +219,19 @@ struct dn_profile {
int samples[ED_MAX_SAMPLES_NO]; /* may be shorter */
};
-
+#ifdef NEW_AQM
+/* Extra parameters for AQM and scheduler.
+ * This struct is used to pass and retrieve parameters (configurations)
+ * to/from AQM and Scheduler.
+ */
+struct dn_extra_parms {
+ struct dn_id oid;
+ char name[16];
+ uint32_t nr;
+#define DN_MAX_EXTRA_PARM 10
+ int64_t par[DN_MAX_EXTRA_PARM];
+};
+#endif
/*
* Overall structure of dummynet
diff --git a/sys/netpfil/ipfw/dn_aqm.h b/sys/netpfil/ipfw/dn_aqm.h
new file mode 100644
index 0000000..df1d187
--- /dev/null
+++ b/sys/netpfil/ipfw/dn_aqm.h
@@ -0,0 +1,167 @@
+/*-
+ * Copyright (C) 2016 Centre for Advanced Internet Architectures,
+ * Swinburne University of Technology, Melbourne, Australia.
+ * Portions of this code were made possible in part by a gift from
+ * The Comcast Innovation Fund.
+ * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * API for writing an Active Queue Management algorithm for Dummynet
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IP_DN_AQM_H
+#define _IP_DN_AQM_H
+
+
+/* NOW is the current time in milliseconds */
+#define NOW ((dn_cfg.curr_time * tick) / 1000)
+
+#define AQM_UNOW (dn_cfg.curr_time * tick)
+#define AQM_TIME_1US ((aqm_time_t)(1))
+#define AQM_TIME_1MS ((aqm_time_t)(1000))
+#define AQM_TIME_1S ((aqm_time_t)(AQM_TIME_1MS * 1000))
+
+/* aqm time allows to store up to 4294 seconds */
+typedef uint32_t aqm_time_t;
+typedef int32_t aqm_stime_t;
+
+#define DN_AQM_MTAG_TS 55345
+
+/* Macro for variable bounding */
+#define BOUND_VAR(x,l,h) ((x) < (l) ? (l) : (x) > (h) ? (h) : (x))
+
+/* sysctl variable to count number of dropped packets */
+extern unsigned long io_pkt_drop;
+
+/*
+ * Structure for holding data and function pointers that together represent a
+ * AQM algorithm.
+ */
+ struct dn_aqm {
+#define DN_AQM_NAME_MAX 50
+ char name[DN_AQM_NAME_MAX]; /* name of AQM algorithm */
+ uint32_t type; /* AQM type number */
+
+ /* Methods implemented by AQM algorithm:
+ *
+ * enqueue enqueue packet 'm' on queue 'q'.
+ * Return 0 on success, 1 on drop.
+ *
+ * dequeue dequeue a packet from queue 'q'.
+ * Return a packet, NULL if no packet available.
+ *
+ * config configure AQM algorithm
+ * If required, this function should allocate space to store
+ * the configurations and set 'fs->aqmcfg' to point to this space.
+ * 'dn_extra_parms' includes array of parameters send
+ * from ipfw userland command.
+ * Return 0 on success, non-zero otherwise.
+ *
+ * deconfig deconfigure AQM algorithm.
+ * The allocated configuration memory space should be freed here.
+ * Return 0 on success, non-zero otherwise.
+ *
+ * init initialise AQM status variables of queue 'q'
+ * This function is used to allocate space and init AQM status for a
+ * queue and q->aqm_status to point to this space.
+ * Return 0 on success, non-zero otherwise.
+ *
+ * cleanup cleanup AQM status variables of queue 'q'
+ * The allocated memory space for AQM status should be freed here.
+ * Return 0 on success, non-zero otherwise.
+ *
+ * getconfig retrieve AQM configurations
+ * This function is used to return AQM parameters to userland
+ * command. The function should fill 'dn_extra_parms' struct with
+ * the AQM configurations using 'par' array.
+ *
+ */
+
+ int (*enqueue)(struct dn_queue *, struct mbuf *);
+ struct mbuf * (*dequeue)(struct dn_queue *);
+ int (*config)(struct dn_fsk *, struct dn_extra_parms *ep, int);
+ int (*deconfig)(struct dn_fsk *);
+ int (*init)(struct dn_queue *);
+ int (*cleanup)(struct dn_queue *);
+ int (*getconfig)(struct dn_fsk *, struct dn_extra_parms *);
+
+ int ref_count; /*Number of queues instances in the system */
+ int cfg_ref_count; /*Number of AQM instances in the system */
+ SLIST_ENTRY (dn_aqm) next; /* Next AQM in the list */
+};
+
+/* Helper function to update queue and scheduler statistics.
+ * negative len + drop -> drop
+ * negative len -> dequeue
+ * positive len -> enqueue
+ * positive len + drop -> drop during enqueue
+ */
+__inline static void
+update_stats(struct dn_queue *q, int len, int drop)
+{
+ int inc = 0;
+ struct dn_flow *sni;
+ struct dn_flow *qni;
+
+ sni = &q->_si->ni;
+ qni = &q->ni;
+
+ if (len < 0)
+ inc = -1;
+ else if(len > 0)
+ inc = 1;
+
+ if (drop) {
+ qni->drops++;
+ sni->drops++;
+ io_pkt_drop++;
+ } else {
+ /*update queue stats */
+ qni->length += inc;
+ qni->len_bytes += len;
+
+ /*update scheduler instance stats */
+ sni->length += inc;
+ sni->len_bytes += len;
+ }
+ /* tot_pkts is updated in dn_enqueue function */
+}
+
+
+/* kernel module related function */
+int
+dn_aqm_modevent(module_t mod, int cmd, void *arg);
+
+#define DECLARE_DNAQM_MODULE(name, dnaqm) \
+ static moduledata_t name##_mod = { \
+ #name, dn_aqm_modevent, dnaqm \
+ }; \
+ DECLARE_MODULE(name, name##_mod, \
+ SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); \
+ MODULE_DEPEND(name, dummynet, 3, 3, 3)
+
+#endif
diff --git a/sys/netpfil/ipfw/dn_aqm_codel.c b/sys/netpfil/ipfw/dn_aqm_codel.c
new file mode 100644
index 0000000..0080170
--- /dev/null
+++ b/sys/netpfil/ipfw/dn_aqm_codel.c
@@ -0,0 +1,444 @@
+/*
+ * Codel - The Controlled-Delay Active Queue Management algorithm.
+ *
+ * $FreeBSD$
+ *
+ * Copyright (C) 2016 Centre for Advanced Internet Architectures,
+ * Swinburne University of Technology, Melbourne, Australia.
+ * Portions of this code were made possible in part by a gift from
+ * The Comcast Innovation Fund.
+ * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <sys/sysctl.h>
+
+#include <net/if.h> /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */
+#include <net/netisr.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/ip.h> /* ip_len, ip_off */
+#include <netinet/ip_var.h> /* ip_output(), IP_FORWARDING */
+#include <netinet/ip_fw.h>
+#include <netinet/ip_dummynet.h>
+#include <netinet/if_ether.h> /* various ether_* routines */
+#include <netinet/ip6.h> /* for ip6_input, ip6_output prototypes */
+#include <netinet6/ip6_var.h>
+#include <netpfil/ipfw/dn_heap.h>
+
+#ifdef NEW_AQM
+#include <netpfil/ipfw/ip_fw_private.h>
+#include <netpfil/ipfw/ip_dn_private.h>
+#include <netpfil/ipfw/dn_aqm.h>
+#include <netpfil/ipfw/dn_aqm_codel.h>
+#include <netpfil/ipfw/dn_sched.h>
+
+#define DN_AQM_CODEL 1
+
+static struct dn_aqm codel_desc;
+
+/* default codel parameters: target 5ms, interval 100ms, no flags
+ * (values are stored in AQM time units) */
+struct dn_aqm_codel_parms codel_sysctl = {5000 * AQM_TIME_1US,
+	100000 * AQM_TIME_1US, 0};
+
+/*
+ * Sysctl handler for the default CoDel 'interval' parameter.
+ * Userland reads/writes microseconds; the stored value is in AQM
+ * time units.  Accepted range is 1us..100s, EINVAL otherwise.
+ */
+static int
+codel_sysctl_interval_handler(SYSCTL_HANDLER_ARGS)
+{
+	int error;
+	long value;
+
+	value = codel_sysctl.interval;
+	value /= AQM_TIME_1US;
+	error = sysctl_handle_long(oidp, &value, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+	if (value < 1 || value > 100 * AQM_TIME_1S)
+		return (EINVAL);
+	codel_sysctl.interval = value * AQM_TIME_1US ;
+	return (0);
+}
+
+/*
+ * Sysctl handler for the default CoDel 'target' parameter.
+ * Userland reads/writes microseconds; the stored value is in AQM
+ * time units.  Accepted range is 1us..5s, EINVAL otherwise.
+ */
+static int
+codel_sysctl_target_handler(SYSCTL_HANDLER_ARGS)
+{
+	int error;
+	long value;
+
+	value = codel_sysctl.target;
+	value /= AQM_TIME_1US;
+	error = sysctl_handle_long(oidp, &value, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+	if (value < 1 || value > 5 * AQM_TIME_1S)
+		return (EINVAL);
+	codel_sysctl.target = value * AQM_TIME_1US ;
+	return (0);
+}
+
+/* defining Codel sysctl variables */
+SYSBEGIN(f4)
+
+SYSCTL_DECL(_net_inet);
+SYSCTL_DECL(_net_inet_ip);
+SYSCTL_DECL(_net_inet_ip_dummynet);
+static SYSCTL_NODE(_net_inet_ip_dummynet, OID_AUTO,
+ codel, CTLFLAG_RW, 0, "CODEL");
+
+#ifdef SYSCTL_NODE
+SYSCTL_PROC(_net_inet_ip_dummynet_codel, OID_AUTO, target,
+ CTLTYPE_LONG | CTLFLAG_RW, NULL, 0,codel_sysctl_target_handler, "L",
+ "CoDel target in microsecond");
+
+SYSCTL_PROC(_net_inet_ip_dummynet_codel, OID_AUTO, interval,
+ CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, codel_sysctl_interval_handler, "L",
+ "CoDel interval in microsecond");
+#endif
+
+/* This function computes codel_interval/sqrt(count).
+ * Newton's method of approximation is used to compute 1/sqrt(count)
+ * in fixed point (FIX_POINT_BITS fractional bits); the previous
+ * guess is cached in cst->isqrt between calls.
+ * http://betterexplained.com/articles/
+ * understanding-quakes-fast-inverse-square-root/
+ * Returns the absolute time 't + interval/sqrt(count)'.
+ */
+aqm_time_t
+control_law(struct codel_status *cst, struct dn_aqm_codel_parms *cprms,
+	aqm_time_t t)
+{
+	uint32_t count;
+	uint64_t temp;
+	count = cst->count;
+
+	/* we don't calculate isqrt(1) to get more accurate result*/
+	if (count == 1) {
+		/* prepare isqrt (old guess) for the next iteration i.e. 1/sqrt(2)*/
+		cst->isqrt = (1UL<< FIX_POINT_BITS) * 7/10;
+		/* return time + isqrt(1)*interval */
+		return t + cprms->interval;
+	}
+
+	/* newguess = g(1.5 - 0.5*c*g^2)
+	 * Multiplying both sides by 2 to make all the constants intergers
+	 * newguess * 2  = g(3 - c*g^2) g=old guess, c=count
+	 * So, newguess = newguess /2
+	 * Fixed point operations are used here.
+	 */
+
+	/* Calculate g^2 */
+	temp = (uint32_t) cst->isqrt * cst->isqrt;
+	/* Calculate (3 - c*g^2) i.e. (3 - c * temp) */
+	temp = (3ULL<< (FIX_POINT_BITS*2)) - (count * temp);
+
+	/*
+	 * Divide by 2 because we multiplied the original equation by two
+	 * Also, we shift the result by 8 bits to prevent overflow.
+	 * */
+	temp >>= (1 + 8);
+
+	/* Now, temp = (1.5 - 0.5*c*g^2)
+	 * Calculate g (1.5 - 0.5*c*g^2) i.e. g * temp
+	 */
+	temp = (cst->isqrt * temp) >> (FIX_POINT_BITS + FIX_POINT_BITS - 8);
+	/* cache the refined guess for the next iteration */
+	cst->isqrt = temp;
+
+	/* calculate codel_interval/sqrt(count) */
+	return t + ((cprms->interval * temp) >> FIX_POINT_BITS);
+}
+
+/*
+ * Extract a packet from the head of queue 'q'.
+ * Return a packet or NULL if the queue is empty.
+ * Also extract the packet's enqueue timestamp (stored as an mtag by
+ * aqm_codel_enqueue) into *pkt_ts; 0 if the tag is missing.
+ */
+struct mbuf *
+codel_extract_head(struct dn_queue *q, aqm_time_t *pkt_ts)
+{
+	struct m_tag *mtag;
+	struct mbuf *m = q->mq.head;
+
+	if (m == NULL)
+		return m;
+	q->mq.head = m->m_nextpkt;
+
+	/* Update stats (negative len => dequeue) */
+	update_stats(q, -m->m_pkthdr.len, 0);
+
+	if (q->ni.length == 0) /* queue is now idle */
+		q->q_time = dn_cfg.curr_time;
+
+	/* extract packet TS*/
+	mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL);
+	if (mtag == NULL) {
+		D("Codel timestamp mtag not found!");
+		*pkt_ts = 0;
+	} else {
+		*pkt_ts = *(aqm_time_t *)(mtag + 1);
+		/* the tag is single-use; drop it once read */
+		m_tag_delete(m,mtag);
+	}
+
+	return m;
+}
+
+/*
+ * Enqueue a packet 'm' in queue 'q'
+ */
+/*
+ * Enqueue a packet 'm' in queue 'q'.
+ * Tail-drops when the queue limit (bytes or packets, selected by
+ * DN_QSIZE_BYTES) is exceeded, and prepends an mtag carrying the
+ * enqueue timestamp that codel_extract_head() later uses to compute
+ * the sojourn time.
+ * Returns 0 on success, 1 on drop; the packet is consumed either way.
+ */
+static int
+aqm_codel_enqueue(struct dn_queue *q, struct mbuf *m)
+{
+	struct dn_fs *f;
+	uint64_t len;
+	struct codel_status *cst;	/*codel status variables */
+	struct m_tag *mtag;
+
+	f = &(q->fs->fs);
+	len = m->m_pkthdr.len;
+	cst = q->aqm_status;
+	if(!cst) {
+		D("Codel queue is not initialized\n");
+		goto drop;
+	}
+
+	/* Finding maximum packet size */
+	// XXX we can get MTU from driver instead
+	if (len > cst->maxpkt_size)
+		cst->maxpkt_size = len;
+
+	/* check for queue size and drop the tail if exceed queue limit*/
+	if (f->flags & DN_QSIZE_BYTES) {
+		if ( q->ni.len_bytes > f->qsize)
+			goto drop;
+	}
+	else {
+		if ( q->ni.length >= f->qsize)
+			goto drop;
+	}
+
+	/* Add timestamp as mtag */
+	mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL);
+	if (mtag == NULL)
+		mtag = m_tag_alloc(MTAG_ABI_COMPAT, DN_AQM_MTAG_TS,
+			sizeof(aqm_time_t), M_NOWAIT);
+	if (mtag == NULL) {
+		/* XXX: do NOT m_freem(m) here -- the 'drop' label frees
+		 * it via FREE_PKT(); freeing twice is a double free. */
+		goto drop;
+	}
+
+	*(aqm_time_t *)(mtag + 1) = AQM_UNOW;
+	m_tag_prepend(m, mtag);
+
+	mq_append(&q->mq, m);
+	update_stats(q, len, 0);
+	return (0);
+
+drop:
+	update_stats(q, 0, 1);
+	FREE_PKT(m);
+	return (1);
+}
+
+
+/* Dequeue a packet from queue 'q'. */
+static struct mbuf *
+aqm_codel_dequeue(struct dn_queue *q)
+{
+	/* all of the CoDel logic lives in the inline codel_dequeue() */
+	return (codel_dequeue(q));
+}
+
+/*
+ * Initialize Codel for queue 'q'.
+ * Allocates and zeroes the per-queue codel status; returns EINVAL if
+ * the flowset has no Codel configuration attached, ENOMEM if the
+ * status block cannot be allocated, 0 on success.
+ */
+static int
+aqm_codel_init(struct dn_queue *q)
+{
+	struct codel_status *cst;
+
+	if (!q->fs->aqmcfg) {
+		D("Codel is not configured!");
+		return EINVAL;
+	}
+
+	q->aqm_status = malloc(sizeof(struct codel_status),
+			 M_DUMMYNET, M_NOWAIT | M_ZERO);
+	if (q->aqm_status == NULL) {
+		D("Cannot allocate AQM_codel private data");
+		return ENOMEM ;
+	}
+
+	/* init codel status variables */
+	cst = q->aqm_status;
+	cst->dropping=0;
+	cst->first_above_time=0;
+	cst->drop_next_time=0;
+	cst->count=0;
+	cst->maxpkt_size = 500;	/* initial guess, updated on enqueue */
+
+	/* increase reference counters */
+	codel_desc.ref_count++;
+
+	return 0;
+}
+
+/*
+ * Clean up Codel status for queue 'q': release the per-queue status
+ * block and drop the descriptor reference count.
+ */
+static int
+aqm_codel_cleanup(struct dn_queue *q)
+{
+
+	if (q == NULL || q->aqm_status == NULL) {
+		D("Codel already cleaned up");
+		return 0;
+	}
+	free(q->aqm_status, M_DUMMYNET);
+	q->aqm_status = NULL;
+	/* decrease reference counters */
+	codel_desc.ref_count--;
+	return 0;
+}
+
+/*
+ * Configure Codel parameters for flowset 'fs' from the userland
+ * dn_extra_parms 'ep' (par[0]=target us, par[1]=interval us,
+ * par[2]=flags; negative values select the sysctl defaults).
+ * (Re)allocates fs->aqmcfg.
+ */
+static int
+aqm_codel_config(struct dn_fsk* fs, struct dn_extra_parms *ep, int len)
+{
+	struct dn_aqm_codel_parms *ccfg;
+
+	int l = sizeof(struct dn_extra_parms);
+	if (len < l) {
+		D("invalid sched parms length got %d need %d", len, l);
+		return EINVAL;
+	}
+	/* we free the old cfg because maybe the original allocation
+	 * not the same size as the new one (different AQM type).
+	 */
+	if (fs->aqmcfg) {
+		free(fs->aqmcfg, M_DUMMYNET);
+		fs->aqmcfg = NULL;
+	}
+
+	fs->aqmcfg = malloc(sizeof(struct dn_aqm_codel_parms),
+			 M_DUMMYNET, M_NOWAIT | M_ZERO);
+	if (fs->aqmcfg== NULL) {
+		D("cannot allocate AQM_codel configuration parameters");
+		return ENOMEM;
+	}
+
+	/* configure codel parameters */
+	ccfg = fs->aqmcfg;
+
+	/* negative par[] values mean "use the sysctl default" */
+	if (ep->par[0] < 0)
+		ccfg->target = codel_sysctl.target;
+	else
+		ccfg->target = ep->par[0] * AQM_TIME_1US;
+
+	if (ep->par[1] < 0)
+		ccfg->interval = codel_sysctl.interval;
+	else
+		ccfg->interval = ep->par[1] * AQM_TIME_1US;
+
+	if (ep->par[2] < 0)
+		ccfg->flags = 0;
+	else
+		ccfg->flags = ep->par[2];
+
+	/* bound codel configurations */
+	ccfg->target = BOUND_VAR(ccfg->target,1, 5 * AQM_TIME_1S);
+	ccfg->interval = BOUND_VAR(ccfg->interval,1, 5 * AQM_TIME_1S);
+	/* increase config reference counter
+	 * NOTE(review): this increments on *re*configuration too (the
+	 * old cfg freed above was already counted) -- confirm pairing
+	 * with aqm_codel_deconfig's single decrement. */
+	codel_desc.cfg_ref_count++;
+
+	return 0;
+}
+
+/*
+ * Deconfigure Codel: free the flowset's AQM configuration, detach
+ * the AQM function pointers and drop the config reference count.
+ * A no-op when no configuration is attached.
+ */
+static int
+aqm_codel_deconfig(struct dn_fsk* fs)
+{
+
+	if (fs && fs->aqmcfg) {
+		free(fs->aqmcfg, M_DUMMYNET);
+		fs->aqmcfg = NULL;
+		/* detach AQM ops from the flowset */
+		fs->aqmfp = NULL;
+		/* decrease config reference counter */
+		codel_desc.cfg_ref_count--;
+	}
+
+	return 0;
+}
+
+/*
+ * Retrieve Codel configuration parameters into 'ep'.
+ * Returns 0 on success, 1 when no configuration is attached.
+ */
+static int
+aqm_codel_getconfig(struct dn_fsk *fs, struct dn_extra_parms * ep)
+{
+	struct dn_aqm_codel_parms *ccfg;
+
+	ccfg = fs->aqmcfg;
+	if (ccfg == NULL)
+		return 1;
+
+	strcpy(ep->name, codel_desc.name);
+	/* report times back to userland in microseconds */
+	ep->par[0] = ccfg->target / AQM_TIME_1US;
+	ep->par[1] = ccfg->interval / AQM_TIME_1US;
+	ep->par[2] = ccfg->flags;
+	return 0;
+}
+
+/* CoDel AQM descriptor registered with dummynet */
+static struct dn_aqm codel_desc = {
+	_SI( .type = )  DN_AQM_CODEL,
+	_SI( .name = )  "CODEL",
+	_SI( .enqueue = )  aqm_codel_enqueue,
+	_SI( .dequeue = )  aqm_codel_dequeue,
+	_SI( .config = )  aqm_codel_config,
+	_SI( .getconfig = )  aqm_codel_getconfig,
+	_SI( .deconfig = )  aqm_codel_deconfig,
+	_SI( .init = )  aqm_codel_init,
+	_SI( .cleanup = )  aqm_codel_cleanup,
+};
+
+DECLARE_DNAQM_MODULE(dn_aqm_codel, &codel_desc);
+
+
+#endif
diff --git a/sys/netpfil/ipfw/dn_aqm_codel.h b/sys/netpfil/ipfw/dn_aqm_codel.h
new file mode 100644
index 0000000..f5618e7
--- /dev/null
+++ b/sys/netpfil/ipfw/dn_aqm_codel.h
@@ -0,0 +1,222 @@
+/*
+ * Codel - The Controlled-Delay Active Queue Management algorithm.
+ *
+ * $FreeBSD$
+ *
+ * Copyright (C) 2016 Centre for Advanced Internet Architectures,
+ * Swinburne University of Technology, Melbourne, Australia.
+ * Portions of this code were made possible in part by a gift from
+ * The Comcast Innovation Fund.
+ * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au>
+ *
+ * Copyright (C) 2011-2014 Kathleen Nichols <nichols@pollere.com>.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * o Redistributions of source code must retain the above copyright
+ * notice, this list of conditions, and the following disclaimer,
+ * without modification.
+ *
+ * o Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * o The names of the authors may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * Alternatively, provided that this notice is retained in full, this
+ * software may be distributed under the terms of the GNU General Public
+ * License ("GPL") version 2, in which case the provisions of the GPL
+ * apply INSTEAD OF those given above.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _IP_DN_AQM_CODEL_H
+#define _IP_DN_AQM_CODEL_H
+
+
+// XXX How to choose MTAG?
+/* number of fractional bits in the fixed-point isqrt computation */
+#define FIX_POINT_BITS 16
+
+enum {
+	CODEL_ECN_ENABLED = 1	/* mark with ECN instead of dropping */
+};
+
+/* Codel parameters */
+struct dn_aqm_codel_parms {
+	aqm_time_t	target;	/* sojourn-time target */
+	aqm_time_t	interval;	/* sliding-minimum window width */
+	uint32_t	flags;	/* CODEL_ECN_ENABLED */
+};
+
+/* codel status variables */
+struct codel_status {
+	uint32_t	count;	/* number of dropped pkts since entering drop state */
+	uint16_t	dropping;	/* dropping state */
+	aqm_time_t	drop_next_time;	/* time for next drop */
+	aqm_time_t	first_above_time;	/* time for first ts over target we observed */
+	uint16_t	isqrt;	/* last isqrt for control law */
+	uint16_t	maxpkt_size;	/* max packet size seen so far */
+};
+
+struct mbuf *codel_extract_head(struct dn_queue *, aqm_time_t *);
+aqm_time_t control_law(struct codel_status *,
+ struct dn_aqm_codel_parms *, aqm_time_t );
+
+/*
+ * Dequeue the packet at the head of 'q' and evaluate the CoDel
+ * "above target" state: *ok_to_drop is set when the sojourn time
+ * has stayed above 'target' for at least one 'interval'.
+ * Returns the packet, or NULL if the queue is empty.
+ */
+__inline static struct mbuf *
+codel_dodequeue(struct dn_queue *q, aqm_time_t now, uint16_t *ok_to_drop)
+{
+	struct mbuf * m;
+	struct dn_aqm_codel_parms *cprms;
+	struct codel_status *cst;
+	aqm_time_t  pkt_ts, sojourn_time;
+
+	*ok_to_drop = 0;
+	m = codel_extract_head(q, &pkt_ts);
+
+	cst = q->aqm_status;
+
+	if (m == NULL) {
+		/* queue is empty - we can't be above target */
+		cst->first_above_time= 0;
+		return m;
+	}
+
+	cprms = q->fs->aqmcfg;
+
+	/* To span a large range of bandwidths, CoDel runs two
+	 * different AQMs in parallel. One is sojourn-time-based
+	 * and takes effect when the time to send an MTU-sized
+	 * packet is less than target. The 1st term of the "if"
+	 * below does this. The other is backlog-based and takes
+	 * effect when the time to send an MTU-sized packet is >=
+	 * target. The goal here is to keep the output link
+	 * utilization high by never allowing the queue to get
+	 * smaller than the amount that arrives in a typical
+	 * interarrival time (MTU-sized packets arriving spaced
+	 * by the amount of time it takes to send such a packet on
+	 * the bottleneck). The 2nd term of the "if" does this.
+	 */
+	sojourn_time = now - pkt_ts;
+	if (sojourn_time < cprms->target || q->ni.len_bytes <= cst->maxpkt_size) {
+		/* went below - stay below for at least interval */
+		cst->first_above_time = 0;
+	} else {
+		if (cst->first_above_time == 0) {
+			/* just went above from below. if still above at
+			 * first_above_time, will say it's ok to drop. */
+			cst->first_above_time = now + cprms->interval;
+		} else if (now >= cst->first_above_time) {
+			*ok_to_drop = 1;
+		}
+	}
+	return m;
+}
+
+/*
+ * Dequeue a packet from queue 'q', applying the CoDel control law:
+ * while in dropping state, packets are dropped (or ECN-marked) at
+ * times scheduled by control_law() until the sojourn time falls
+ * below target.
+ */
+__inline static struct mbuf *
+codel_dequeue(struct dn_queue *q)
+{
+	struct mbuf *m;
+	struct dn_aqm_codel_parms *cprms;
+	struct codel_status *cst;
+	aqm_time_t now;
+	uint16_t ok_to_drop;
+
+	cst = q->aqm_status;
+	cprms = q->fs->aqmcfg;
+	now = AQM_UNOW;
+
+	m = codel_dodequeue(q, now, &ok_to_drop);
+	if (cst->dropping) {
+		if (!ok_to_drop) {
+			/* sojourn time below target - leave dropping state */
+			cst->dropping = false;
+		}
+		/*
+		 * Time for the next drop. Drop current packet and dequeue
+		 * next.  If the dequeue doesn't take us out of dropping
+		 * state, schedule the next drop. A large backlog might
+		 * result in drop rates so high that the next drop should
+		 * happen now, hence the 'while' loop.
+		 */
+		while (now >= cst->drop_next_time && cst->dropping) {
+
+			/* mark the packet */
+			if (cprms->flags & CODEL_ECN_ENABLED && ecn_mark(m)) {
+				cst->count++;
+				/* schedule the next mark. */
+				cst->drop_next_time = control_law(cst, cprms,
+					cst->drop_next_time);
+				return m;
+			}
+
+			/* drop the packet */
+			update_stats(q, 0, 1);
+			FREE_PKT(m);
+			m = codel_dodequeue(q, now, &ok_to_drop);
+
+			if (!ok_to_drop) {
+				/* leave dropping state */
+				cst->dropping = false;
+			} else {
+				cst->count++;
+				/* schedule the next drop. */
+				cst->drop_next_time = control_law(cst, cprms,
+					cst->drop_next_time);
+			}
+		}
+	/* If we get here we're not in dropping state. The 'ok_to_drop'
+	 * return from dodequeue means that the sojourn time has been
+	 * above 'target' for 'interval' so enter dropping state.
+	 */
+	} else if (ok_to_drop) {
+
+		/* if ECN option is disabled or the packet cannot be marked,
+		 * drop the packet and extract another.
+		 */
+		if (!(cprms->flags & CODEL_ECN_ENABLED) || !ecn_mark(m)) {
+			update_stats(q, 0, 1);
+			FREE_PKT(m);
+			m = codel_dodequeue(q, now, &ok_to_drop);
+		}
+
+		cst->dropping = true;
+
+		/* If min went above target close to when it last went
+		 * below, assume that the drop rate that controlled the
+		 * queue on the last cycle is a good starting point to
+		 * control it now. ('drop_next' will be at most 'interval'
+		 * later than the time of the last drop so 'now - drop_next'
+		 * is a good approximation of the time from the last drop
+		 * until now.)
+		 */
+		cst->count = (cst->count > 2 && ((aqm_stime_t)now -
+			(aqm_stime_t)cst->drop_next_time) < 8* cprms->interval)?
+			cst->count - 2 : 1;
+		/* we don't have to set initial guess for Newton's method isqrt
+		 * as we initialize isqrt in control_law when count == 1 */
+		cst->drop_next_time = control_law(cst, cprms, now);
+	}
+
+	return m;
+}
+
+#endif
diff --git a/sys/netpfil/ipfw/dn_aqm_pie.c b/sys/netpfil/ipfw/dn_aqm_pie.c
new file mode 100644
index 0000000..d994986
--- /dev/null
+++ b/sys/netpfil/ipfw/dn_aqm_pie.c
@@ -0,0 +1,793 @@
+/*
+ * PIE - Proportional Integral controller Enhanced AQM algorithm.
+ *
+ * $FreeBSD$
+ *
+ * Copyright (C) 2016 Centre for Advanced Internet Architectures,
+ * Swinburne University of Technology, Melbourne, Australia.
+ * Portions of this code were made possible in part by a gift from
+ * The Comcast Innovation Fund.
+ * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <sys/sysctl.h>
+
+#include <net/if.h> /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */
+#include <net/netisr.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/ip.h> /* ip_len, ip_off */
+#include <netinet/ip_var.h> /* ip_output(), IP_FORWARDING */
+#include <netinet/ip_fw.h>
+#include <netinet/ip_dummynet.h>
+#include <netinet/if_ether.h> /* various ether_* routines */
+#include <netinet/ip6.h> /* for ip6_input, ip6_output prototypes */
+#include <netinet6/ip6_var.h>
+#include <netpfil/ipfw/dn_heap.h>
+
+#ifdef NEW_AQM
+#include <netpfil/ipfw/ip_fw_private.h>
+#include <netpfil/ipfw/ip_dn_private.h>
+#include <netpfil/ipfw/dn_aqm.h>
+#include <netpfil/ipfw/dn_aqm_pie.h>
+#include <netpfil/ipfw/dn_sched.h>
+
+/* for debugging */
+#include <sys/syslog.h>
+
+static struct dn_aqm pie_desc;
+
+/* PIE defaults
+ * target=15ms, tupdate=15ms, max_burst=150ms,
+ * max_ecnth=0.1, alpha=0.125, beta=1.25,
+ * flags: cap-drop, departure-rate estimation and derandomization on
+ */
+struct dn_aqm_pie_parms pie_sysctl =
+	{ 15 * AQM_TIME_1MS,  15 * AQM_TIME_1MS, 150 * AQM_TIME_1MS,
+	PIE_SCALE/10 , PIE_SCALE * 0.125,  PIE_SCALE * 1.25 ,
+	PIE_CAPDROP_ENABLED | PIE_DEPRATEEST_ENABLED | PIE_DERAND_ENABLED };
+
+/*
+ * Shared sysctl handler for the PIE 'alpha' and 'beta' gains;
+ * oidp->oid_name selects which one.  Userland sees the value scaled
+ * by 1000, the stored value is scaled by PIE_SCALE.
+ */
+static int
+pie_sysctl_alpha_beta_handler(SYSCTL_HANDLER_ARGS)
+{
+	int error;
+	long  value;
+
+	if (!strcmp(oidp->oid_name,"alpha"))
+		value = pie_sysctl.alpha;
+	else
+		value = pie_sysctl.beta;
+
+	value = value * 1000 / PIE_SCALE;
+	error = sysctl_handle_long(oidp, &value, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+	/* NOTE(review): the bound compares the x1000 userland value
+	 * against 7*PIE_SCALE -- confirm the intended maximum. */
+	if (value < 1 || value > 7 * PIE_SCALE)
+		return (EINVAL);
+	value = (value * PIE_SCALE) / 1000;
+	if (!strcmp(oidp->oid_name,"alpha"))
+		pie_sysctl.alpha = value;
+	else
+		pie_sysctl.beta = value;
+	return (0);
+}
+
+/*
+ * Shared sysctl handler for the PIE 'target', 'tupdate' and
+ * 'max_burst' time parameters; oidp->oid_name selects which one.
+ * Userland sees microseconds, the stored unit is AQM time.
+ * Accepted range is 1us..10s, EINVAL otherwise.
+ */
+static int
+pie_sysctl_target_tupdate_maxb_handler(SYSCTL_HANDLER_ARGS)
+{
+	aqm_time_t *field;
+	long v;
+	int err;
+
+	/* resolve the parameter this oid refers to */
+	if (strcmp(oidp->oid_name, "target") == 0)
+		field = &pie_sysctl.qdelay_ref;
+	else if (strcmp(oidp->oid_name, "tupdate") == 0)
+		field = &pie_sysctl.tupdate;
+	else
+		field = &pie_sysctl.max_burst;
+
+	v = *field / AQM_TIME_1US;
+	err = sysctl_handle_long(oidp, &v, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+	if (v < 1 || v > 10 * AQM_TIME_1S)
+		return (EINVAL);
+	*field = v * AQM_TIME_1US;
+	return (0);
+}
+
+/*
+ * Sysctl handler for the PIE ECN safeguard threshold 'max_ecnth'.
+ * Userland sees the value scaled by 1000, the stored value is
+ * scaled by PIE_SCALE.
+ */
+static int
+pie_sysctl_max_ecnth_handler(SYSCTL_HANDLER_ARGS)
+{
+	long v;
+	int err;
+
+	v = (long)pie_sysctl.max_ecnth * 1000 / PIE_SCALE;
+	err = sysctl_handle_long(oidp, &v, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+	if (v < 1 || v > PIE_SCALE)
+		return (EINVAL);
+	pie_sysctl.max_ecnth = (v * PIE_SCALE) / 1000;
+	return (0);
+}
+
+/* define PIE sysctl variables */
+SYSBEGIN(f4)
+SYSCTL_DECL(_net_inet);
+SYSCTL_DECL(_net_inet_ip);
+SYSCTL_DECL(_net_inet_ip_dummynet);
+static SYSCTL_NODE(_net_inet_ip_dummynet, OID_AUTO,
+ pie, CTLFLAG_RW, 0, "PIE");
+
+#ifdef SYSCTL_NODE
+SYSCTL_PROC(_net_inet_ip_dummynet_pie, OID_AUTO, target,
+ CTLTYPE_LONG | CTLFLAG_RW, NULL, 0,
+ pie_sysctl_target_tupdate_maxb_handler, "L",
+ "queue target in microsecond");
+SYSCTL_PROC(_net_inet_ip_dummynet_pie, OID_AUTO, tupdate,
+ CTLTYPE_LONG | CTLFLAG_RW, NULL, 0,
+ pie_sysctl_target_tupdate_maxb_handler, "L",
+ "the frequency of drop probability calculation in microsecond");
+SYSCTL_PROC(_net_inet_ip_dummynet_pie, OID_AUTO, max_burst,
+ CTLTYPE_LONG | CTLFLAG_RW, NULL, 0,
+ pie_sysctl_target_tupdate_maxb_handler, "L",
+ "Burst allowance interval in microsecond");
+
+SYSCTL_PROC(_net_inet_ip_dummynet_pie, OID_AUTO, max_ecnth,
+ CTLTYPE_LONG | CTLFLAG_RW, NULL, 0,
+ pie_sysctl_max_ecnth_handler, "L",
+ "ECN safeguard threshold scaled by 1000");
+
+SYSCTL_PROC(_net_inet_ip_dummynet_pie, OID_AUTO, alpha,
+ CTLTYPE_LONG | CTLFLAG_RW, NULL, 0,
+ pie_sysctl_alpha_beta_handler, "L",
+ "PIE alpha scaled by 1000");
+SYSCTL_PROC(_net_inet_ip_dummynet_pie, OID_AUTO, beta,
+ CTLTYPE_LONG | CTLFLAG_RW, NULL, 0,
+ pie_sysctl_alpha_beta_handler, "L",
+ "beta scaled by 1000");
+#endif
+
+
+/*
+ * Callout function for drop probability calculation.
+ * Runs every 'tupdate' while PIE is active, with the pie_status
+ * passed as argument.  NOTE(review): it unlocks pst->lock_mtx on
+ * every return path, so it presumably runs with the mutex held
+ * (callout initialized with the mutex) -- confirm at the init site.
+ */
+static void
+calculate_drop_prob(void *x)
+{
+	int64_t p, prob, oldprob;
+	struct dn_aqm_pie_parms *pprms;
+	struct pie_status *pst = (struct pie_status *) x;
+
+	/* dealing with race condition */
+	if (callout_pending(&pst->aqm_pie_callout)) {
+		/* callout was reset */
+		mtx_unlock(&pst->lock_mtx);
+		return;
+	}
+
+	if (!callout_active(&pst->aqm_pie_callout)) {
+		/* callout was stopped: tear down the status block here */
+		mtx_unlock(&pst->lock_mtx);
+		mtx_destroy(&pst->lock_mtx);
+		free(x, M_DUMMYNET);
+		//pst->pq->aqm_status = NULL;
+		pie_desc.ref_count--;
+		return;
+	}
+	callout_deactivate(&pst->aqm_pie_callout);
+
+	pprms = pst->parms;
+	prob = pst->drop_prob;
+
+	/* calculate current qdelay from the departure-rate estimate */
+	if (pprms->flags & PIE_DEPRATEEST_ENABLED) {
+		pst->current_qdelay = ((uint64_t)pst->pq->ni.len_bytes *
+			pst->avg_dq_time) >> PIE_DQ_THRESHOLD_BITS;
+	}
+
+	/* calculate drop probability: PI controller on queue delay */
+	p = (int64_t)pprms->alpha *
+		((int64_t)pst->current_qdelay - (int64_t)pprms->qdelay_ref);
+	p +=(int64_t) pprms->beta *
+		((int64_t)pst->current_qdelay - (int64_t)pst->qdelay_old);
+
+	/* We PIE_MAX_PROB shift by 12-bits to increase the division precision */
+	p *= (PIE_MAX_PROB << 12) / AQM_TIME_1S;
+
+	/* auto-tune drop probability: shrink the adjustment step while
+	 * the current probability is small */
+	if (prob< PIE_MAX_PROB * 0.000001)
+		p >>= 11 + PIE_FIX_POINT_BITS+12;
+	else if (prob < PIE_MAX_PROB * 0.00001)
+		p >>= 9 + PIE_FIX_POINT_BITS+12;
+	else if (prob < PIE_MAX_PROB * 0.0001)
+		p >>= 7 + PIE_FIX_POINT_BITS+12;
+	else if (prob < PIE_MAX_PROB * 0.001)
+		p >>= 5 + PIE_FIX_POINT_BITS+12;
+	else if (prob < PIE_MAX_PROB * 0.01)
+		p >>= 3 + PIE_FIX_POINT_BITS+12;
+	else if (prob < PIE_MAX_PROB * 0.1)
+		p >>= 1 + PIE_FIX_POINT_BITS+12;
+	else
+		p >>= PIE_FIX_POINT_BITS+12;
+
+	oldprob = prob;
+
+	/* Cap Drop adjustment */
+	if ((pprms->flags & PIE_CAPDROP_ENABLED) && prob >= PIE_MAX_PROB / 10
+		&& p > PIE_MAX_PROB / 50 )
+		p = PIE_MAX_PROB / 50;
+
+	prob = prob + p;
+
+	/* decay the drop probability exponentially */
+	if (pst->current_qdelay == 0 && pst->qdelay_old == 0)
+		/* 0.98 ~= 1- 1/64 */
+		prob = prob - (prob >> 6);
+
+
+	/* check for multiplication overflow/underflow */
+	if (p>0) {
+		if (prob<oldprob) {
+			D("overflow");
+			prob= PIE_MAX_PROB;
+		}
+	}
+	else
+		if (prob>oldprob) {
+			prob= 0;
+			D("underflow");
+		}
+
+	/* make drop probability between 0 and PIE_MAX_PROB*/
+	if (prob < 0)
+		prob = 0;
+	else if (prob > PIE_MAX_PROB)
+		prob = PIE_MAX_PROB;
+
+	pst->drop_prob = prob;
+
+	/* store current queue delay value in old queue delay*/
+	pst->qdelay_old = pst->current_qdelay;
+
+	/* update burst allowance */
+	if ((pst->sflags & PIE_ACTIVE) && pst->burst_allowance>0) {
+
+		if (pst->burst_allowance > pprms->tupdate )
+			pst->burst_allowance -= pprms->tupdate;
+		else
+			pst->burst_allowance = 0;
+	}
+
+	/* reschedule calculate_drop_prob function */
+	if (pst->sflags & PIE_ACTIVE)
+		callout_reset_sbt(&pst->aqm_pie_callout,
+			(uint64_t)pprms->tupdate * SBT_1US, 0, calculate_drop_prob, pst, 0);
+
+	mtx_unlock(&pst->lock_mtx);
+}
+
+/*
+ * Extract a packet from the head of queue 'q'.
+ * Return a packet or NULL if the queue is empty.
+ * If 'getts' is set, also extract the packet's enqueue timestamp
+ * from its mtag into *pkt_ts (0 if the tag is missing).
+ */
+static struct mbuf *
+pie_extract_head(struct dn_queue *q, aqm_time_t *pkt_ts, int getts)
+{
+	struct m_tag *mtag;
+	struct mbuf *m = q->mq.head;
+
+	if (m == NULL)
+		return m;
+	q->mq.head = m->m_nextpkt;
+
+	/* Update stats (negative len => dequeue) */
+	update_stats(q, -m->m_pkthdr.len, 0);
+
+	if (q->ni.length == 0) /* queue is now idle */
+		q->q_time = dn_cfg.curr_time;
+
+	if (getts) {
+		/* extract packet TS*/
+		mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL);
+		if (mtag == NULL) {
+			D("PIE timestamp mtag not found!");
+			*pkt_ts = 0;
+		} else {
+			*pkt_ts = *(aqm_time_t *)(mtag + 1);
+			m_tag_delete(m,mtag);
+		}
+	}
+	return m;
+}
+
+/*
+ * (Re)initialize the PIE state variables of 'pst' and, when
+ * 'resettimer' is set, mark PIE active and (re)arm the periodic
+ * drop-probability callout.  Serialized by pst->lock_mtx.
+ */
+__inline static void
+init_activate_pie(struct pie_status *pst, int resettimer)
+{
+	struct dn_aqm_pie_parms *pprms;
+
+	mtx_lock(&pst->lock_mtx);
+	pprms = pst->parms;
+	pst->drop_prob = 0;
+	pst->qdelay_old = 0;
+	pst->burst_allowance = pprms->max_burst;
+	pst->accu_prob = 0;
+	pst->dq_count = 0;
+	pst->avg_dq_time = 0;
+	/* start a fresh departure-rate measurement cycle */
+	pst->sflags = PIE_INMEASUREMENT;
+	pst->measurement_start = AQM_UNOW;
+
+	if (resettimer) {
+		pst->sflags |= PIE_ACTIVE;
+		callout_reset_sbt(&pst->aqm_pie_callout,
+			(uint64_t)pprms->tupdate * SBT_1US,
+			0, calculate_drop_prob, pst, 0);
+	}
+	//DX(2, "PIE Activated");
+	mtx_unlock(&pst->lock_mtx);
+}
+
+/*
+ * Deactivate PIE: clear the active/measurement flags and stop the
+ * drop-probability update callout.  Serialized by pst->lock_mtx.
+ */
+__inline static void
+deactivate_pie(struct pie_status *pst)
+{
+	mtx_lock(&pst->lock_mtx);
+	pst->sflags &= ~(PIE_ACTIVE | PIE_INMEASUREMENT);
+	callout_stop(&pst->aqm_pie_callout);
+	mtx_unlock(&pst->lock_mtx);
+}
+
+/*
+ * Dequeue and return a packet from queue 'q' or NULL if 'q' is empty.
+ * Also calculate the departure rate or queue delay, using either the
+ * departure-rate estimator or the per-packet timestamp, depending on
+ * PIE_DEPRATEEST_ENABLED.
+ */
+static struct mbuf *
+aqm_pie_dequeue(struct dn_queue *q)
+{
+	struct mbuf *m;
+	struct dn_flow *ni;	/* stats for scheduler instance */
+	struct dn_aqm_pie_parms *pprms;
+	struct pie_status *pst;
+	aqm_time_t now;
+	aqm_time_t pkt_ts, dq_time;
+	int32_t w;
+
+	pst  = q->aqm_status;
+	pprms = pst->parms;
+	ni = &q->_si->ni;
+
+	/* we extract the packet ts only when Departure Rate Estimation
+	 * is not used */
+	m = pie_extract_head(q, &pkt_ts, !(pprms->flags & PIE_DEPRATEEST_ENABLED));
+
+	if (!m || !(pst->sflags & PIE_ACTIVE))
+		return m;
+
+	now = AQM_UNOW;
+	if (pprms->flags & PIE_DEPRATEEST_ENABLED) {
+		/* calculate average departure time */
+		if(pst->sflags & PIE_INMEASUREMENT) {
+			pst->dq_count += m->m_pkthdr.len;
+
+			if (pst->dq_count >= PIE_DQ_THRESHOLD) {
+				dq_time = now - pst->measurement_start;
+
+				/*
+				 * if we don't have old avg dq_time i.e PIE is (re)initialized,
+				 * don't use weight to calculate new avg_dq_time
+				 */
+				if(pst->avg_dq_time == 0)
+					pst->avg_dq_time = dq_time;
+				else {
+					/*
+					 * weight = PIE_DQ_THRESHOLD/2^6, but we scaled
+					 * weight by 2^8. Thus, scaled
+					 * weight = PIE_DQ_THRESHOLD /2^8
+					 * */
+					w = PIE_DQ_THRESHOLD >> 8;
+					pst->avg_dq_time = (dq_time* w
+						+ (pst->avg_dq_time * ((1L << 8) - w))) >> 8;
+					/* NOTE(review): the measurement cycle is
+					 * ended only here, not in the avg_dq_time==0
+					 * branch above -- confirm the asymmetry is
+					 * intended. */
+					pst->sflags &= ~PIE_INMEASUREMENT;
+				}
+			}
+		}
+
+		/*
+		 * Start new measurment cycle when the queue has
+		 *  PIE_DQ_THRESHOLD worth of bytes.
+		 */
+		if(!(pst->sflags & PIE_INMEASUREMENT) &&
+			q->ni.len_bytes >= PIE_DQ_THRESHOLD) {
+			pst->sflags |= PIE_INMEASUREMENT;
+			pst->measurement_start = now;
+			pst->dq_count = 0;
+		}
+	}
+	/* Optionally, use packet timestamp to estimate queue delay */
+	else
+		pst->current_qdelay = now - pkt_ts;
+
+	return m;
+}
+
+/*
+ * Enqueue a packet in q, subject to space and PIE queue management policy
+ * (whose parameters are in q->fs).
+ * Update stats for the queue and the scheduler.
+ * Return 0 on success, 1 on drop. The packet is consumed anyways.
+ */
+static int
+aqm_pie_enqueue(struct dn_queue *q, struct mbuf* m)
+{
+	struct dn_fs *f;
+	uint64_t len;
+	uint32_t qlen;
+	struct pie_status *pst;
+	struct dn_aqm_pie_parms *pprms;
+	int t;
+
+	len = m->m_pkthdr.len;
+	pst = q->aqm_status;
+	if (!pst) {
+		DX(2, "PIE queue is not initialized\n");
+		update_stats(q, 0, 1);
+		FREE_PKT(m);
+		return 1;
+	}
+
+	f = &(q->fs->fs);
+	pprms = pst->parms;
+	t = ENQUE;
+
+	/* get current queue length in bytes or packets */
+	qlen = (f->flags & DN_QSIZE_BYTES) ?
+		q->ni.len_bytes : q->ni.length;
+
+	/* check for queue size and drop the tail if we exceed the queue limit */
+	if (qlen >= f->qsize)
+		t = DROP;
+	/* drop/mark the packet when PIE is active and burst time elapsed */
+	else if ((pst->sflags & PIE_ACTIVE) && pst->burst_allowance == 0
+		&& drop_early(pst, q->ni.len_bytes) == DROP) {
+		/*
+		 * if drop_prob is over the ECN threshold, drop the packet;
+		 * otherwise mark and enqueue it.
+		 */
+		if ((pprms->flags & PIE_ECN_ENABLED) && pst->drop_prob <
+			(pprms->max_ecnth << (PIE_PROB_BITS - PIE_FIX_POINT_BITS))
+			&& ecn_mark(m))
+			t = ENQUE;
+		else
+			t = DROP;
+	}
+
+	/* Turn PIE on when 1/3 of the queue is full */
+	if (!(pst->sflags & PIE_ACTIVE) && qlen >= pst->one_third_q_size) {
+		init_activate_pie(pst, 1);
+	}
+
+	/* Reset burst tolerance and optionally turn PIE off */
+	if ((pst->sflags & PIE_ACTIVE) && pst->drop_prob == 0 &&
+		pst->current_qdelay < (pprms->qdelay_ref >> 1) &&
+		pst->qdelay_old < (pprms->qdelay_ref >> 1)) {
+
+		pst->burst_allowance = pprms->max_burst;
+		if ((pprms->flags & PIE_ON_OFF_MODE_ENABLED) && qlen <= 0)
+			deactivate_pie(pst);
+	}
+
+	/* Timestamp the packet if Departure Rate Estimation is disabled */
+	if (t != DROP && !(pprms->flags & PIE_DEPRATEEST_ENABLED)) {
+		/* Add TS to mbuf as a TAG */
+		struct m_tag *mtag;
+		mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL);
+		if (mtag == NULL)
+			mtag = m_tag_alloc(MTAG_ABI_COMPAT, DN_AQM_MTAG_TS,
+				sizeof(aqm_time_t), M_NOWAIT);
+		if (mtag == NULL) {
+			/*
+			 * FIX: the original code freed 'm' here and then fell
+			 * through, dereferencing the NULL 'mtag', prepending it
+			 * to the freed mbuf, and freeing 'm' a second time in
+			 * the DROP path below. Count the drop and bail out.
+			 */
+			update_stats(q, 0, 1);
+			pst->accu_prob = 0;
+			m_freem(m);
+			return 1;
+		}
+		*(aqm_time_t *)(mtag + 1) = AQM_UNOW;
+		m_tag_prepend(m, mtag);
+	}
+
+	if (t != DROP) {
+		mq_append(&q->mq, m);
+		update_stats(q, len, 0);
+		return (0);
+	} else {
+		update_stats(q, 0, 1);
+
+		/* reset accu_prob after packet drop */
+		pst->accu_prob = 0;
+		FREE_PKT(m);
+		return 1;
+	}
+	/* not reached */
+	return 0;
+}
+
+/*
+ * initialize PIE for queue 'q'
+ * First allocate memory for PIE status.
+ */
+static int
+aqm_pie_init(struct dn_queue *q)
+{
+ struct pie_status *pst;
+ struct dn_aqm_pie_parms *pprms;
+ int err = 0;
+
+ pprms = q->fs->aqmcfg;
+
+ do { /* exit with break when error occurs*/
+ if (!pprms){
+ D("AQM_PIE is not configured");
+ err = EINVAL;
+ break;
+ }
+
+ /* M_ZERO: sflags, counters and averages all start at zero */
+ q->aqm_status = malloc(sizeof(struct pie_status),
+ M_DUMMYNET, M_NOWAIT | M_ZERO);
+ if (q->aqm_status == NULL) {
+ D("cannot allocate PIE private data");
+ err = ENOMEM ;
+ break;
+ }
+
+ pst = q->aqm_status;
+ /* increase reference count for PIE module */
+ pie_desc.ref_count++;
+
+ pst->pq = q;
+ pst->parms = pprms;
+
+ /* For speed optimization, we calculate 1/3 of the queue size once here */
+ // we can use x/3 = (x >>2) + (x >>4) + (x >>7)
+ pst->one_third_q_size = q->fs->fs.qsize/3;
+
+ /* the callout runs with lock_mtx held; CALLOUT_RETURNUNLOCKED means
+ * the handler is responsible for releasing the lock before returning */
+ mtx_init(&pst->lock_mtx, "mtx_pie", NULL, MTX_DEF);
+ callout_init_mtx(&pst->aqm_pie_callout, &pst->lock_mtx,
+ CALLOUT_RETURNUNLOCKED);
+
+ pst->current_qdelay = 0;
+ /* in on/off mode PIE stays dormant until the queue builds up;
+ * otherwise activate it (and its tupdate callout) immediately */
+ init_activate_pie(pst, !(pprms->flags & PIE_ON_OFF_MODE_ENABLED));
+
+ //DX(2, "aqm_PIE_init");
+
+ } while(0);
+
+ return err;
+}
+
+/*
+ * Clean up PIE status for queue 'q'
+ * Destroy memory allocated for PIE status.
+ */
+static int
+aqm_pie_cleanup(struct dn_queue *q)
+{
+
+ if(!q) {
+ D("q is null");
+ return 0;
+ }
+ struct pie_status *pst = q->aqm_status;
+ if(!pst) {
+ //D("queue is already cleaned up");
+ return 0;
+ }
+ if(!q->fs || !q->fs->aqmcfg) {
+ D("fs is null or no cfg");
+ return 1;
+ }
+ /* sanity check: don't tear down a status block that belongs to a
+ * different AQM type */
+ if (q->fs->aqmfp && q->fs->aqmfp->type !=DN_AQM_PIE) {
+ D("Not PIE fs (%d)", q->fs->fs.fs_nr);
+ return 1;
+ }
+
+ mtx_lock(&pst->lock_mtx);
+
+ /* stop callout timer */
+ if (callout_stop(&pst->aqm_pie_callout) || !(pst->sflags & PIE_ACTIVE)) {
+ /* callout cancelled (or never armed): safe to free everything */
+ mtx_unlock(&pst->lock_mtx);
+ mtx_destroy(&pst->lock_mtx);
+ free(q->aqm_status, M_DUMMYNET);
+ q->aqm_status = NULL;
+ pie_desc.ref_count--;
+ return 0;
+ } else {
+ /* the callout is currently executing: detach the status from the
+ * queue and report EBUSY.
+ * NOTE(review): pst and its mutex are not freed on this path and
+ * ref_count is not decremented -- presumably the running callout
+ * is expected to finish teardown; confirm calculate_drop_prob
+ * handles this case, otherwise this leaks. */
+ q->aqm_status = NULL;
+ mtx_unlock(&pst->lock_mtx);
+ DX(2, "PIE callout has not been stoped from cleanup!");
+ return EBUSY;
+ }
+ return 0;
+}
+
+/*
+ * Config PIE parameters
+ * also allocate memory for PIE configurations
+ */
+static int
+aqm_pie_config(struct dn_fsk* fs, struct dn_extra_parms *ep, int len)
+{
+ struct dn_aqm_pie_parms *pcfg;
+
+ int l = sizeof(struct dn_extra_parms);
+ if (len < l) {
+ D("invalid sched parms length got %d need %d", len, l);
+ return EINVAL;
+ }
+ /* we free the old cfg because maybe the original allocation
+ * was used for a different AQM type.
+ */
+ if (fs->aqmcfg) {
+ free(fs->aqmcfg, M_DUMMYNET);
+ fs->aqmcfg = NULL;
+ }
+
+ fs->aqmcfg = malloc(sizeof(struct dn_aqm_pie_parms),
+ M_DUMMYNET, M_NOWAIT | M_ZERO);
+ if (fs->aqmcfg== NULL) {
+ D("cannot allocate PIE configuration parameters");
+ return ENOMEM;
+ }
+
+ /* par array contains pie configuration as follow
+ * 0- qdelay_ref,1- tupdate, 2- max_burst
+ * 3- max_ecnth, 4- alpha, 5- beta, 6- flags
+ * A negative par[i] means "not supplied": fall back to the sysctl
+ * default for that parameter.
+ */
+
+ /* configure PIE parameters */
+ pcfg = fs->aqmcfg;
+
+ /* NOTE(review): sysctl defaults are scaled by AQM_TIME_1US but
+ * user-supplied par[] values are stored unscaled; this is only
+ * consistent if AQM_TIME_1US == 1 (AQM time unit is 1us) -- confirm. */
+ if (ep->par[0] < 0)
+ pcfg->qdelay_ref = pie_sysctl.qdelay_ref * AQM_TIME_1US;
+ else
+ pcfg->qdelay_ref = ep->par[0];
+ if (ep->par[1] < 0)
+ pcfg->tupdate = pie_sysctl.tupdate * AQM_TIME_1US;
+ else
+ pcfg->tupdate = ep->par[1];
+ if (ep->par[2] < 0)
+ pcfg->max_burst = pie_sysctl.max_burst * AQM_TIME_1US;
+ else
+ pcfg->max_burst = ep->par[2];
+ if (ep->par[3] < 0)
+ pcfg->max_ecnth = pie_sysctl.max_ecnth;
+ else
+ pcfg->max_ecnth = ep->par[3];
+ if (ep->par[4] < 0)
+ pcfg->alpha = pie_sysctl.alpha;
+ else
+ pcfg->alpha = ep->par[4];
+ if (ep->par[5] < 0)
+ pcfg->beta = pie_sysctl.beta;
+ else
+ pcfg->beta = ep->par[5];
+ if (ep->par[6] < 0)
+ pcfg->flags = pie_sysctl.flags;
+ else
+ pcfg->flags = ep->par[6];
+
+ /* bound PIE configurations to sane ranges */
+ pcfg->qdelay_ref = BOUND_VAR(pcfg->qdelay_ref, 1, 10 * AQM_TIME_1S);
+ pcfg->tupdate = BOUND_VAR(pcfg->tupdate, 1, 10 * AQM_TIME_1S);
+ pcfg->max_burst = BOUND_VAR(pcfg->max_burst, 0, 10 * AQM_TIME_1S);
+ pcfg->max_ecnth = BOUND_VAR(pcfg->max_ecnth, 0, PIE_SCALE);
+ pcfg->alpha = BOUND_VAR(pcfg->alpha, 0, 7 * PIE_SCALE);
+ pcfg->beta = BOUND_VAR(pcfg->beta, 0 , 7 * PIE_SCALE);
+
+ pie_desc.cfg_ref_count++;
+ //D("pie cfg_ref_count=%d", pie_desc.cfg_ref_count);
+ return 0;
+}
+
+/*
+ * Deconfigure PIE and free memory allocation
+ */
+static int
+aqm_pie_deconfig(struct dn_fsk* fs)
+{
+ /* free the PIE configuration attached to this flowset, if any,
+ * and drop the module's configuration reference */
+ if (fs && fs->aqmcfg) {
+ free(fs->aqmcfg, M_DUMMYNET);
+ fs->aqmcfg = NULL;
+ pie_desc.cfg_ref_count--;
+ }
+ return 0;
+}
+
+/*
+ * Retrieve PIE configuration parameters.
+ */
+static int
+aqm_pie_getconfig (struct dn_fsk *fs, struct dn_extra_parms * ep)
+{
+ struct dn_aqm_pie_parms *pcfg;
+ if (fs->aqmcfg) {
+ strcpy(ep->name, pie_desc.name);
+ pcfg = fs->aqmcfg;
+ /* export times in microseconds; the par[] layout mirrors the one
+ * consumed by aqm_pie_config() */
+ ep->par[0] = pcfg->qdelay_ref / AQM_TIME_1US;
+ ep->par[1] = pcfg->tupdate / AQM_TIME_1US;
+ ep->par[2] = pcfg->max_burst / AQM_TIME_1US;
+ ep->par[3] = pcfg->max_ecnth;
+ ep->par[4] = pcfg->alpha;
+ ep->par[5] = pcfg->beta;
+ ep->par[6] = pcfg->flags;
+
+ return 0;
+ }
+ /* no PIE configuration attached to this flowset */
+ return 1;
+}
+
+/* PIE AQM descriptor: registers the PIE callbacks with the dummynet
+ * AQM framework (see DECLARE_DNAQM_MODULE below). */
+static struct dn_aqm pie_desc = {
+ _SI( .type = ) DN_AQM_PIE,
+ _SI( .name = ) "PIE",
+ _SI( .ref_count = ) 0,
+ _SI( .cfg_ref_count = ) 0,
+ _SI( .enqueue = ) aqm_pie_enqueue,
+ _SI( .dequeue = ) aqm_pie_dequeue,
+ _SI( .config = ) aqm_pie_config,
+ _SI( .deconfig = ) aqm_pie_deconfig,
+ _SI( .getconfig = ) aqm_pie_getconfig,
+ _SI( .init = ) aqm_pie_init,
+ _SI( .cleanup = ) aqm_pie_cleanup,
+};
+
+DECLARE_DNAQM_MODULE(dn_aqm_pie, &pie_desc);
+#endif
diff --git a/sys/netpfil/ipfw/dn_aqm_pie.h b/sys/netpfil/ipfw/dn_aqm_pie.h
new file mode 100644
index 0000000..b045077
--- /dev/null
+++ b/sys/netpfil/ipfw/dn_aqm_pie.h
@@ -0,0 +1,151 @@
+/*
+ * PIE - Proportional Integral controller Enhanced AQM algorithm.
+ *
+ * $FreeBSD$
+ *
+ * Copyright (C) 2016 Centre for Advanced Internet Architectures,
+ * Swinburne University of Technology, Melbourne, Australia.
+ * Portions of this code were made possible in part by a gift from
+ * The Comcast Innovation Fund.
+ * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _IP_DN_AQM_PIE_H
+#define _IP_DN_AQM_PIE_H
+
+#define DN_AQM_PIE 2
+#define PIE_DQ_THRESHOLD_BITS 14
+/* 2^14 =16KB */
+#define PIE_DQ_THRESHOLD (1UL << PIE_DQ_THRESHOLD_BITS)
+#define MEAN_PKTSIZE 800
+
+/* 31-bits because random() generates range from 0->(2**31)-1 */
+#define PIE_PROB_BITS 31
+#define PIE_MAX_PROB ((1ULL<<PIE_PROB_BITS) -1)
+
+/* for 16-bits, we have 3-bits for integer part and 13-bits for fraction */
+#define PIE_FIX_POINT_BITS 13
+#define PIE_SCALE (1UL<<PIE_FIX_POINT_BITS)
+
+
+/* PIE options */
+enum {
+ PIE_ECN_ENABLED =1,
+ PIE_CAPDROP_ENABLED = 2,
+ PIE_ON_OFF_MODE_ENABLED = 4,
+ PIE_DEPRATEEST_ENABLED = 8,
+ PIE_DERAND_ENABLED = 16
+};
+
+/* PIE parameters */
+struct dn_aqm_pie_parms {
+ aqm_time_t qdelay_ref; /* AQM Latency Target (default: 15ms) */
+ aqm_time_t tupdate; /* a period to calculate drop probability (default:15ms) */
+ aqm_time_t max_burst; /* AQM Max Burst Allowance (default: 150ms) */
+ uint16_t max_ecnth; /*AQM Max ECN Marking Threshold (default: 10%) */
+ uint16_t alpha; /* (default: 1/8) */
+ uint16_t beta; /* (default: 1+1/4) */
+ uint32_t flags; /* PIE options */
+};
+
+/* PIE status variables */
+struct pie_status{
+ struct callout aqm_pie_callout; /* periodic drop-probability update timer */
+ aqm_time_t burst_allowance; /* remaining burst allowance before drops resume */
+ uint32_t drop_prob; /* current drop probability (scaled, see PIE_PROB_BITS) */
+ aqm_time_t current_qdelay; /* latest estimated queue delay */
+ aqm_time_t qdelay_old; /* queue delay from the previous update cycle */
+ uint64_t accu_prob; /* accumulated probability (de-randomization option) */
+ aqm_time_t measurement_start; /* start time of departure-rate measurement */
+ aqm_time_t avg_dq_time; /* smoothed time to dequeue PIE_DQ_THRESHOLD bytes */
+ uint32_t dq_count; /* bytes dequeued in the current measurement cycle */
+ uint32_t sflags; /* state bits: PIE_ACTIVE / PIE_INMEASUREMENT */
+ struct dn_aqm_pie_parms *parms; /* pointer to PIE configurations */
+ /* pointer to parent queue of FQ-PIE sub-queues, or queue of owner fs. */
+ struct dn_queue *pq;
+ struct mtx lock_mtx; /* protects this status block and its callout */
+ uint32_t one_third_q_size; /* 1/3 of queue size, for speed optimization */
+};
+
+enum {
+ ENQUE = 1,
+ DROP,
+ MARKECN
+};
+
+/* PIE current state */
+enum {
+ PIE_ACTIVE = 1,
+ PIE_INMEASUREMENT = 2
+};
+
+/*
+ * Check if eneque should drop packet to control delay or not based on
+ * PIe algorithm.
+ * return DROP if it is time to drop or ENQUE otherwise.
+ * This function is used by PIE and FQ-PIE.
+ */
+__inline static int
+drop_early(struct pie_status *pst, uint32_t qlen)
+{
+ struct dn_aqm_pie_parms *pprms;
+
+ pprms = pst->parms;
+
+ /* queue is not congested: delay is under half the target and the
+ * drop probability is small, or fewer than two mean-sized packets
+ * worth of bytes are queued */
+
+ if ((pst->qdelay_old < (pprms->qdelay_ref >> 1)
+ && pst->drop_prob < PIE_MAX_PROB / 5 )
+ || qlen <= 2 * MEAN_PKTSIZE)
+ return ENQUE;
+
+
+ /* drop probability fell back to zero: restart accumulation */
+ if (pst->drop_prob == 0)
+ pst->accu_prob = 0;
+
+ /* increment accu_prob */
+ if (pprms->flags & PIE_DERAND_ENABLED)
+ pst->accu_prob += pst->drop_prob;
+
+ /* De-randomize option
+ * if accu_prob < 0.85 -> enqueue
+ * if accu_prob >= 8.5 -> drop
+ * between 0.85 and 8.5 || !De-randomize --> drop on prob
+ */
+ if (pprms->flags & PIE_DERAND_ENABLED) {
+ if(pst->accu_prob < (uint64_t) (PIE_MAX_PROB * 0.85))
+ return ENQUE;
+ if( pst->accu_prob >= (uint64_t) (PIE_MAX_PROB * 8.5))
+ return DROP;
+ }
+
+ /* random() yields [0, 2^31-1], the same scale as drop_prob
+ * (see PIE_PROB_BITS) */
+ if (random() < pst->drop_prob) {
+ pst->accu_prob = 0;
+ return DROP;
+ }
+
+ return ENQUE;
+}
+
+#endif
diff --git a/sys/netpfil/ipfw/dn_sched.h b/sys/netpfil/ipfw/dn_sched.h
index 4268edf..602fdf9 100644
--- a/sys/netpfil/ipfw/dn_sched.h
+++ b/sys/netpfil/ipfw/dn_sched.h
@@ -132,6 +132,10 @@ struct dn_alg {
int (*free_fsk)(struct dn_fsk *f);
int (*new_queue)(struct dn_queue *q);
int (*free_queue)(struct dn_queue *q);
+#ifdef NEW_AQM
+ /* Getting scheduler extra parameters */
+ int (*getconfig)(struct dn_schk *, struct dn_extra_parms *);
+#endif
/* run-time fields */
int ref_count; /* XXX number of instances in the system */
@@ -165,6 +169,11 @@ dn_dequeue(struct dn_queue *q)
struct mbuf *m = q->mq.head;
if (m == NULL)
return NULL;
+#ifdef NEW_AQM
+ /* Call AQM dequeue function */
+ if (q->fs->aqmfp && q->fs->aqmfp->dequeue )
+ return q->fs->aqmfp->dequeue(q);
+#endif
q->mq.head = m->m_nextpkt;
q->mq.count--;
diff --git a/sys/netpfil/ipfw/dn_sched_fifo.c b/sys/netpfil/ipfw/dn_sched_fifo.c
index e3e0cfe..be51649 100644
--- a/sys/netpfil/ipfw/dn_sched_fifo.c
+++ b/sys/netpfil/ipfw/dn_sched_fifo.c
@@ -42,6 +42,9 @@
#include <netinet/ip_dummynet.h>
#include <netpfil/ipfw/dn_heap.h>
#include <netpfil/ipfw/ip_dn_private.h>
+#ifdef NEW_AQM
+#include <netpfil/ipfw/dn_aqm.h>
+#endif
#include <netpfil/ipfw/dn_sched.h>
#else
#include <dn_test.h>
@@ -116,6 +119,9 @@ static struct dn_alg fifo_desc = {
_SI( .free_fsk = ) NULL,
_SI( .new_queue = ) NULL,
_SI( .free_queue = ) NULL,
+#ifdef NEW_AQM
+ _SI( .getconfig = ) NULL,
+#endif
};
DECLARE_DNSCHED_MODULE(dn_fifo, &fifo_desc);
diff --git a/sys/netpfil/ipfw/dn_sched_fq_codel.c b/sys/netpfil/ipfw/dn_sched_fq_codel.c
new file mode 100644
index 0000000..c783730
--- /dev/null
+++ b/sys/netpfil/ipfw/dn_sched_fq_codel.c
@@ -0,0 +1,617 @@
+/*
+ * FQ_Codel - The FlowQueue-Codel scheduler/AQM
+ *
+ * $FreeBSD$
+ *
+ * Copyright (C) 2016 Centre for Advanced Internet Architectures,
+ * Swinburne University of Technology, Melbourne, Australia.
+ * Portions of this code were made possible in part by a gift from
+ * The Comcast Innovation Fund.
+ * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifdef _KERNEL
+#include <sys/malloc.h>
+#include <sys/socket.h>
+//#include <sys/socketvar.h>
+#include <sys/kernel.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <net/if.h> /* IFNAMSIZ */
+#include <netinet/in.h>
+#include <netinet/ip_var.h> /* ipfw_rule_ref */
+#include <netinet/ip_fw.h> /* flow_id */
+#include <netinet/ip_dummynet.h>
+
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+
+#include <netpfil/ipfw/ip_fw_private.h>
+#include <sys/sysctl.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+#include <netinet/ip_icmp.h>
+#include <netinet/tcp.h>
+#include <netinet/udp.h>
+#include <sys/queue.h>
+#include <sys/hash.h>
+
+#include <netpfil/ipfw/dn_heap.h>
+#include <netpfil/ipfw/ip_dn_private.h>
+
+#include <netpfil/ipfw/dn_aqm.h>
+#include <netpfil/ipfw/dn_aqm_codel.h>
+#include <netpfil/ipfw/dn_sched.h>
+#include <netpfil/ipfw/dn_sched_fq_codel.h>
+#include <netpfil/ipfw/dn_sched_fq_codel_helper.h>
+
+#else
+#include <dn_test.h>
+#endif
+
+/* NOTE: In the fq_codel module, we reimplement the CoDel AQM functions
+ * because fq_codel uses a different flow (sub-queue) structure, and
+ * dn_queue includes many variables not needed by a flow (sub-queue),
+ * i.e. we avoid extra overhead (88 bytes vs 208 bytes).
+ * Also, these CoDel functions manage the stats of the sub-queues as well
+ * as the main queue.
+ */
+
+#define DN_SCHED_FQ_CODEL 6
+
+static struct dn_alg fq_codel_desc;
+
+/* fq_codel default parameters including codel */
+struct dn_sch_fq_codel_parms
+fq_codel_sysctl = {{5000 * AQM_TIME_1US, 100000 * AQM_TIME_1US,
+ CODEL_ECN_ENABLED}, 1024, 10240, 1514};
+
+static int
+fqcodel_sysctl_interval_handler(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ long value;
+
+ /* report the current interval converted to microseconds */
+ value = fq_codel_sysctl.ccfg.interval;
+ value /= AQM_TIME_1US;
+ error = sysctl_handle_long(oidp, &value, 0, req);
+ /* read-only access or handler error: nothing more to do */
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ /* reject values outside 1us .. 100s */
+ if (value < 1 || value > 100 * AQM_TIME_1S)
+ return (EINVAL);
+ fq_codel_sysctl.ccfg.interval = value * AQM_TIME_1US ;
+
+ return (0);
+}
+
+static int
+fqcodel_sysctl_target_handler(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ long value;
+
+ /* report the current target converted to microseconds */
+ value = fq_codel_sysctl.ccfg.target;
+ value /= AQM_TIME_1US;
+ error = sysctl_handle_long(oidp, &value, 0, req);
+ /* read-only access or handler error: nothing more to do */
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ /* reject values outside 1us .. 5s */
+ if (value < 1 || value > 5 * AQM_TIME_1S)
+ return (EINVAL);
+ fq_codel_sysctl.ccfg.target = value * AQM_TIME_1US ;
+
+ return (0);
+}
+
+
+SYSBEGIN(f4)
+
+SYSCTL_DECL(_net_inet);
+SYSCTL_DECL(_net_inet_ip);
+SYSCTL_DECL(_net_inet_ip_dummynet);
+static SYSCTL_NODE(_net_inet_ip_dummynet, OID_AUTO, fqcodel,
+ CTLFLAG_RW, 0, "FQ_CODEL");
+
+#ifdef SYSCTL_NODE
+
+SYSCTL_PROC(_net_inet_ip_dummynet_fqcodel, OID_AUTO, target,
+ CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, fqcodel_sysctl_target_handler, "L",
+ "FQ_CoDel target in microsecond");
+SYSCTL_PROC(_net_inet_ip_dummynet_fqcodel, OID_AUTO, interval,
+ CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, fqcodel_sysctl_interval_handler, "L",
+ "FQ_CoDel interval in microsecond");
+
+SYSCTL_UINT(_net_inet_ip_dummynet_fqcodel, OID_AUTO, quantum,
+ CTLFLAG_RW, &fq_codel_sysctl.quantum, 1514, "FQ_CoDel quantum");
+SYSCTL_UINT(_net_inet_ip_dummynet_fqcodel, OID_AUTO, flows,
+ CTLFLAG_RW, &fq_codel_sysctl.flows_cnt, 1024,
+ "Number of queues for FQ_CoDel");
+SYSCTL_UINT(_net_inet_ip_dummynet_fqcodel, OID_AUTO, limit,
+ CTLFLAG_RW, &fq_codel_sysctl.limit, 10240, "FQ_CoDel queues size limit");
+#endif
+
+/* Drop a packet from the head of a codel queue */
+static void
+codel_drop_head(struct fq_codel_flow *q, struct fq_codel_si *si)
+{
+ struct mbuf *m = q->mq.head;
+
+ if (m == NULL)
+ return;
+ /* unlink the head packet from the sub-queue */
+ q->mq.head = m->m_nextpkt;
+
+ /* negative length: account the removed bytes, count one drop */
+ fq_update_stats(q, si, -m->m_pkthdr.len, 1);
+
+ if (si->main_q.ni.length == 0) /* queue is now idle */
+ si->main_q.q_time = dn_cfg.curr_time;
+
+ FREE_PKT(m);
+}
+
+/* Enqueue a packet 'm' to a queue 'q' and add timestamp to that packet.
+ * Return 1 when unable to add timestamp, otherwise return 0
+ */
+static int
+codel_enqueue(struct fq_codel_flow *q, struct mbuf *m, struct fq_codel_si *si)
+{
+	uint64_t len;
+
+	len = m->m_pkthdr.len;
+	/* track the maximum packet size seen on this sub-queue */
+	if (len > q->cst.maxpkt_size)
+		q->cst.maxpkt_size = len;
+
+	/* Add timestamp to mbuf as MTAG */
+	struct m_tag *mtag;
+	mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL);
+	if (mtag == NULL)
+		mtag = m_tag_alloc(MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, sizeof(aqm_time_t),
+			M_NOWAIT);
+	if (mtag == NULL) {
+		/*
+		 * FIX: the original called m_freem(m) here and then jumped to
+		 * 'drop', which frees 'm' again -- a double free. The drop
+		 * path below is solely responsible for freeing the mbuf.
+		 */
+		goto drop;
+	}
+	*(aqm_time_t *)(mtag + 1) = AQM_UNOW;
+	m_tag_prepend(m, mtag);
+
+	mq_append(&q->mq, m);
+	fq_update_stats(q, si, len, 0);
+	return 0;
+
+drop:
+	fq_update_stats(q, si, len, 1);
+	m_freem(m);
+	return 1;
+}
+
+/*
+ * Classify a packet into a queue number using the Jenkins hash function.
+ * Return: queue number.
+ * The inputs to the hash are: protocol number, perturbation, src IP,
+ * dst IP, src port and dst port.
+ */
+static inline int
+fq_codel_classify_flow(struct mbuf *m, uint16_t fcount, struct fq_codel_si *si)
+{
+ struct ip *ip;
+ struct tcphdr *th;
+ struct udphdr *uh;
+ uint8_t tuple[41];
+ uint16_t hash=0;
+
+//#ifdef INET6
+ struct ip6_hdr *ip6;
+ int isip6;
+ /* distinguish v4/v6 by the version nibble of the first header */
+ isip6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
+
+ if(isip6) {
+ /* 41-byte v6 tuple: next-header(1) + perturbation(4) + src(16) +
+ * dst(16) + ports(4).
+ * NOTE(review): the uint32_t/uint16_t stores below write to
+ * unaligned offsets inside tuple[]; this assumes the platform
+ * tolerates unaligned access -- confirm, or use memcpy. */
+ ip6 = mtod(m, struct ip6_hdr *);
+ *((uint8_t *) &tuple[0]) = ip6->ip6_nxt;
+ *((uint32_t *) &tuple[1]) = si->perturbation;
+ memcpy(&tuple[5], ip6->ip6_src.s6_addr, 16);
+ memcpy(&tuple[21], ip6->ip6_dst.s6_addr, 16);
+
+ switch (ip6->ip6_nxt) {
+ case IPPROTO_TCP:
+ /* NOTE(review): (ip6 + 1) assumes no IPv6 extension headers */
+ th = (struct tcphdr *)(ip6 + 1);
+ *((uint16_t *) &tuple[37]) = th->th_dport;
+ *((uint16_t *) &tuple[39]) = th->th_sport;
+ break;
+
+ case IPPROTO_UDP:
+ uh = (struct udphdr *)(ip6 + 1);
+ *((uint16_t *) &tuple[37]) = uh->uh_dport;
+ *((uint16_t *) &tuple[39]) = uh->uh_sport;
+ break;
+ default:
+ /* no ports for other protocols: zero the port bytes */
+ memset(&tuple[37], 0, 4);
+
+ }
+
+ hash = jenkins_hash(tuple, 41, HASHINIT) % fcount;
+ return hash;
+ }
+//#endif
+
+ /* IPv4: 17-byte tuple: proto(1) + perturbation(4) + src(4) +
+ * dst(4) + ports(4) */
+ ip = mtod(m, struct ip *);
+ *((uint8_t *) &tuple[0]) = ip->ip_p;
+ *((uint32_t *) &tuple[1]) = si->perturbation;
+ *((uint32_t *) &tuple[5]) = ip->ip_src.s_addr;
+ *((uint32_t *) &tuple[9]) = ip->ip_dst.s_addr;
+
+ switch (ip->ip_p) {
+ case IPPROTO_TCP:
+ /* NOTE(review): (ip + 1) ignores IPv4 options (assumes ihl == 5) */
+ th = (struct tcphdr *)(ip + 1);
+ *((uint16_t *) &tuple[13]) = th->th_dport;
+ *((uint16_t *) &tuple[15]) = th->th_sport;
+ break;
+
+ case IPPROTO_UDP:
+ uh = (struct udphdr *)(ip + 1);
+ *((uint16_t *) &tuple[13]) = uh->uh_dport;
+ *((uint16_t *) &tuple[15]) = uh->uh_sport;
+ break;
+ default:
+ memset(&tuple[13], 0, 4);
+
+ }
+ hash = jenkins_hash(tuple, 17, HASHINIT) % fcount;
+
+ return hash;
+}
+
+/*
+ * Enqueue a packet into an appropriate queue according to
+ * FQ_CODEL algorithm.
+ */
+static int
+fq_codel_enqueue(struct dn_sch_inst *_si, struct dn_queue *_q,
+ struct mbuf *m)
+{
+ struct fq_codel_si *si;
+ struct fq_codel_schk *schk;
+ struct dn_sch_fq_codel_parms *param;
+ struct dn_queue *mainq;
+ int idx, drop, i, maxidx;
+
+ /* scheduler private data is laid out directly after the generic
+ * dummynet structures in memory */
+ mainq = (struct dn_queue *)(_si + 1);
+ si = (struct fq_codel_si *)_si;
+ schk = (struct fq_codel_schk *)(si->_si.sched+1);
+ param = &schk->cfg;
+
+ /* classify a packet to queue number*/
+ idx = fq_codel_classify_flow(m, param->flows_cnt, si);
+ /* enqueue packet into appropriate queue using CoDel AQM.
+ * Note: 'codel_enqueue' function returns 1 only when it unable to
+ * add timestamp to packet (no limit check)*/
+ drop = codel_enqueue(&si->flows[idx], m, si);
+
+ /* codel unable to timestamp a packet */
+ if (drop)
+ return 1;
+
+ /* If the flow (sub-queue) is not active, then add it to the tail of
+ * new flows list, initialize and activate it.
+ */
+ if (!si->flows[idx].active ) {
+ STAILQ_INSERT_TAIL(&si->newflows, &si->flows[idx], flowchain);
+ si->flows[idx].deficit = param->quantum;
+ si->flows[idx].cst.dropping = false;
+ si->flows[idx].cst.first_above_time = 0;
+ si->flows[idx].active = 1;
+ //D("activate %d",idx);
+ }
+
+ /* check the limit for all queues and remove a packet from the
+ * largest one.
+ * NOTE(review): the D() calls below look like leftover debug
+ * logging -- consider removing them before release.
+ */
+ if (mainq->ni.length > schk->cfg.limit) { D("over limit");
+ /* find first active flow */
+ for (maxidx = 0; maxidx < schk->cfg.flows_cnt; maxidx++)
+ if (si->flows[maxidx].active)
+ break;
+ if (maxidx < schk->cfg.flows_cnt) {
+ /* find the largest sub-queue and drop from its head */
+ for (i = maxidx + 1; i < schk->cfg.flows_cnt; i++)
+ if (si->flows[i].active && si->flows[i].stats.length >
+ si->flows[maxidx].stats.length)
+ maxidx = i;
+ codel_drop_head(&si->flows[maxidx], si);
+ D("maxidx = %d",maxidx);
+ drop = 1;
+ }
+ }
+
+ return drop;
+}
+
+/*
+ * Dequeue a packet from an appropriate queue according to
+ * FQ_CODEL algorithm.
+ */
+static struct mbuf *
+fq_codel_dequeue(struct dn_sch_inst *_si)
+{
+ struct fq_codel_si *si;
+ struct fq_codel_schk *schk;
+ struct dn_sch_fq_codel_parms *param;
+ struct fq_codel_flow *f;
+ struct mbuf *mbuf;
+ struct fq_codel_list *fq_codel_flowlist;
+
+ si = (struct fq_codel_si *)_si;
+ schk = (struct fq_codel_schk *)(si->_si.sched+1);
+ param = &schk->cfg;
+
+ do {
+ /* select a list to start with: new flows are served before
+ * old flows (deficit round-robin between the two lists) */
+ if (STAILQ_EMPTY(&si->newflows))
+ fq_codel_flowlist = &si->oldflows;
+ else
+ fq_codel_flowlist = &si->newflows;
+
+ /* Both new and old queue lists are empty, return NULL */
+ if (STAILQ_EMPTY(fq_codel_flowlist))
+ return NULL;
+
+ f = STAILQ_FIRST(fq_codel_flowlist);
+ while (f != NULL) {
+ /* if there is no flow(sub-queue) deficit, increase deficit
+ * by quantum, move the flow to the tail of old flows list
+ * and try another flow.
+ * Otherwise, the flow will be used for dequeue.
+ */
+ if (f->deficit < 0) {
+ f->deficit += param->quantum;
+ STAILQ_REMOVE_HEAD(fq_codel_flowlist, flowchain);
+ STAILQ_INSERT_TAIL(&si->oldflows, f, flowchain);
+ } else
+ break;
+
+ f = STAILQ_FIRST(fq_codel_flowlist);
+ }
+
+ /* the new flows list is empty, try old flows list */
+ if (STAILQ_EMPTY(fq_codel_flowlist))
+ continue;
+
+ /* Dequeue a packet from the selected flow via CoDel */
+ mbuf = fqc_codel_dequeue(f, si);
+
+ /* Codel did not return a packet */
+ if (!mbuf) {
+ /* If the selected flow belongs to new flows list, then move
+ * it to the tail of old flows list. Otherwise, deactivate it
+ * and remove it from the old list.
+ */
+ if (fq_codel_flowlist == &si->newflows) {
+ STAILQ_REMOVE_HEAD(fq_codel_flowlist, flowchain);
+ STAILQ_INSERT_TAIL(&si->oldflows, f, flowchain);
+ } else {
+ f->active = 0;
+ STAILQ_REMOVE_HEAD(fq_codel_flowlist, flowchain);
+ }
+ /* start again */
+ continue;
+ }
+
+ /* we have a packet to return,
+ * update flow deficit and return the packet*/
+ f->deficit -= mbuf->m_pkthdr.len;
+ return mbuf;
+
+ } while (1);
+
+ /* unreachable point */
+ return NULL;
+}
+
+/*
+ * Initialize fq_codel scheduler instance.
+ * also, allocate memory for flows array.
+ */
+static int
+fq_codel_new_sched(struct dn_sch_inst *_si)
+{
+ struct fq_codel_si *si;
+ struct dn_queue *q;
+ struct fq_codel_schk *schk;
+ int i;
+
+ si = (struct fq_codel_si *)_si;
+ schk = (struct fq_codel_schk *)(_si->sched+1);
+
+ /* guard against double initialization of this instance */
+ if(si->flows) {
+ D("si already configured!");
+ return 0;
+ }
+
+ /* init the main queue */
+ q = &si->main_q;
+ set_oid(&q->ni.oid, DN_QUEUE, sizeof(*q));
+ q->_si = _si;
+ q->fs = _si->sched->fs;
+
+ /* allocate memory for flows array (M_ZERO: all sub-queues start
+ * empty and inactive) */
+ si->flows = malloc(schk->cfg.flows_cnt * sizeof(struct fq_codel_flow),
+ M_DUMMYNET, M_NOWAIT | M_ZERO);
+ if (si->flows == NULL) {
+ D("cannot allocate memory for fq_codel configuration parameters");
+ return ENOMEM ;
+ }
+
+ /* init perturbation for this si (randomizes the flow hash so the
+ * bucket mapping differs per instance) */
+ si->perturbation = random();
+
+ /* init the old and new flows lists */
+ STAILQ_INIT(&si->newflows);
+ STAILQ_INIT(&si->oldflows);
+
+ /* init the flows (sub-queues) */
+ for (i = 0; i < schk->cfg.flows_cnt; i++) {
+ /* init codel; 500 is the initial maxpkt_size estimate,
+ * updated on enqueue as real packets are seen */
+ si->flows[i].cst.maxpkt_size = 500;
+ }
+
+ fq_codel_desc.ref_count++;
+ return 0;
+}
+
+/*
+ * Free fq_codel scheduler instance.
+ */
+static int
+fq_codel_free_sched(struct dn_sch_inst *_si)
+{
+ struct fq_codel_si *si = (struct fq_codel_si *)_si ;
+
+ /* free the flows array and drop the module reference.
+ * NOTE(review): assumes the framework has already drained/freed any
+ * packets still queued in the sub-queues -- confirm with caller. */
+ free(si->flows , M_DUMMYNET);
+ si->flows = NULL;
+ fq_codel_desc.ref_count--;
+
+ return 0;
+}
+
+/*
+ * Configure fq_codel scheduler.
+ * the configurations for the scheduler is passed from userland.
+ */
+static int
+fq_codel_config(struct dn_schk *_schk)
+{
+	struct fq_codel_schk *schk;
+	struct dn_extra_parms *ep;
+	struct dn_sch_fq_codel_parms *fqc_cfg;
+
+	schk = (struct fq_codel_schk *)(_schk+1);
+	ep = (struct dn_extra_parms *) _schk->cfg;
+
+	/* par array contains fq_codel configuration as follows:
+	 * CoDel: 0- target, 1- interval, 2- flags
+	 * FQ_CODEL: 3- quantum, 4- limit, 5- flows
+	 * A negative par[i] selects the sysctl default for that field.
+	 */
+	if (ep && ep->oid.len == sizeof(*ep) &&
+		ep->oid.subtype == DN_SCH_PARAMS) {
+
+		fqc_cfg = &schk->cfg;
+		if (ep->par[0] < 0)
+			fqc_cfg->ccfg.target = fq_codel_sysctl.ccfg.target;
+		else
+			fqc_cfg->ccfg.target = ep->par[0] * AQM_TIME_1US;
+
+		if (ep->par[1] < 0)
+			fqc_cfg->ccfg.interval = fq_codel_sysctl.ccfg.interval;
+		else
+			fqc_cfg->ccfg.interval = ep->par[1] * AQM_TIME_1US;
+
+		if (ep->par[2] < 0)
+			fqc_cfg->ccfg.flags = 0;
+		else
+			fqc_cfg->ccfg.flags = ep->par[2];
+
+		/* FQ configurations */
+		if (ep->par[3] < 0)
+			fqc_cfg->quantum = fq_codel_sysctl.quantum;
+		else
+			fqc_cfg->quantum = ep->par[3];
+
+		if (ep->par[4] < 0)
+			fqc_cfg->limit = fq_codel_sysctl.limit;
+		else
+			fqc_cfg->limit = ep->par[4];
+
+		if (ep->par[5] < 0)
+			fqc_cfg->flows_cnt = fq_codel_sysctl.flows_cnt;
+		else
+			fqc_cfg->flows_cnt = ep->par[5];
+
+		/* Bound the configurations to sane ranges.
+		 * (Fix: removed a stray empty statement after the first
+		 * BOUND_VAR assignment.) */
+		fqc_cfg->ccfg.target = BOUND_VAR(fqc_cfg->ccfg.target, 1,
+			5 * AQM_TIME_1S);
+		fqc_cfg->ccfg.interval = BOUND_VAR(fqc_cfg->ccfg.interval, 1,
+			100 * AQM_TIME_1S);
+
+		fqc_cfg->quantum = BOUND_VAR(fqc_cfg->quantum, 1, 9000);
+		fqc_cfg->limit = BOUND_VAR(fqc_cfg->limit, 1, 20480);
+		fqc_cfg->flows_cnt = BOUND_VAR(fqc_cfg->flows_cnt, 1, 65536);
+	}
+	else
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Return fq_codel scheduler configurations
+ * the configurations for the scheduler is passed to userland.
+ */
+static int
+fq_codel_getconfig (struct dn_schk *_schk, struct dn_extra_parms *ep) {
+
+ struct fq_codel_schk *schk = (struct fq_codel_schk *)(_schk+1);
+ struct dn_sch_fq_codel_parms *fqc_cfg;
+
+ fqc_cfg = &schk->cfg;
+
+ /* export times in microseconds; the par[] layout mirrors the one
+ * consumed by fq_codel_config() */
+ strcpy(ep->name, fq_codel_desc.name);
+ ep->par[0] = fqc_cfg->ccfg.target / AQM_TIME_1US;
+ ep->par[1] = fqc_cfg->ccfg.interval / AQM_TIME_1US;
+ ep->par[2] = fqc_cfg->ccfg.flags;
+
+ ep->par[3] = fqc_cfg->quantum;
+ ep->par[4] = fqc_cfg->limit;
+ ep->par[5] = fqc_cfg->flows_cnt;
+
+ return 0;
+}
+
+/*
+ * fq_codel scheduler descriptor
+ * contains the type of the scheduler, the name, the size of extra
+ * data structures, and function pointers.
+ */
+static struct dn_alg fq_codel_desc = {
+ _SI( .type = ) DN_SCHED_FQ_CODEL,
+ _SI( .name = ) "FQ_CODEL",
+ _SI( .flags = ) 0,
+
+ _SI( .schk_datalen = ) sizeof(struct fq_codel_schk),
+ /* per-instance extra space; the framework allocates the generic
+ * dn_sch_inst itself, hence the subtraction */
+ _SI( .si_datalen = ) sizeof(struct fq_codel_si) - sizeof(struct dn_sch_inst),
+ _SI( .q_datalen = ) 0,
+
+ _SI( .enqueue = ) fq_codel_enqueue,
+ _SI( .dequeue = ) fq_codel_dequeue,
+ _SI( .config = ) fq_codel_config, /* new sched i.e. sched X config ...*/
+ _SI( .destroy = ) NULL, /*sched x delete */
+ _SI( .new_sched = ) fq_codel_new_sched, /* new schd instance */
+ _SI( .free_sched = ) fq_codel_free_sched, /* delete schd instance */
+ _SI( .new_fsk = ) NULL,
+ _SI( .free_fsk = ) NULL,
+ _SI( .new_queue = ) NULL,
+ _SI( .free_queue = ) NULL,
+ _SI( .getconfig = ) fq_codel_getconfig,
+ _SI( .ref_count = ) 0
+};
+
+DECLARE_DNSCHED_MODULE(dn_fq_codel, &fq_codel_desc);
diff --git a/sys/netpfil/ipfw/dn_sched_fq_codel.h b/sys/netpfil/ipfw/dn_sched_fq_codel.h
new file mode 100644
index 0000000..4b65781
--- /dev/null
+++ b/sys/netpfil/ipfw/dn_sched_fq_codel.h
@@ -0,0 +1,167 @@
+/*-
+ * Copyright (C) 2016 Centre for Advanced Internet Architectures,
+ * Swinburne University of Technology, Melbourne, Australia.
+ * Portions of this code were made possible in part by a gift from
+ * The Comcast Innovation Fund.
+ * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * FQ_Codel Structures and helper functions
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IP_DN_SCHED_FQ_CODEL_H
+#define _IP_DN_SCHED_FQ_CODEL_H
+
+/* list of queues */
+STAILQ_HEAD(fq_codel_list, fq_codel_flow) ;
+
+/* fq_codel parameters including codel */
+struct dn_sch_fq_codel_parms {
+	struct dn_aqm_codel_parms ccfg; /* CoDel Parameters */
+	/* FQ_CODEL Parameters */
+	uint32_t flows_cnt; /* number of flows (sub-queues) */
+	uint32_t limit; /* hard limit of fq_codel queue size*/
+	uint32_t quantum; /* DRR quantum */
+}; /* defaults */
+
+/* flow (sub-queue) stats */
+struct flow_stats {
+	uint64_t tot_pkts; /* statistics counters */
+	uint64_t tot_bytes;
+	uint32_t length; /* Queue length, in packets */
+	uint32_t len_bytes; /* Queue length, in bytes */
+	uint32_t drops;
+};
+
+/* A flow of packets (sub-queue).*/
+struct fq_codel_flow {
+	struct mq mq; /* list of packets */
+	struct flow_stats stats; /* statistics */
+	int deficit; /* byte credit left in the current DRR round */
+	int active; /* 1: flow is active (in a list) */
+	struct codel_status cst; /* per-flow CoDel state */
+	STAILQ_ENTRY(fq_codel_flow) flowchain;
+};
+
+/* extra fq_codel scheduler configurations */
+struct fq_codel_schk {
+	struct dn_sch_fq_codel_parms cfg;
+};
+
+/* fq_codel scheduler instance */
+struct fq_codel_si {
+	struct dn_sch_inst _si; /* standard scheduler instance */
+	struct dn_queue main_q; /* main queue is after si directly */
+
+	struct fq_codel_flow *flows; /* array of flows (queues) */
+	uint32_t perturbation; /* random value for the flow-classifier hash */
+	struct fq_codel_list newflows; /* list of new queues */
+	struct fq_codel_list oldflows; /* list of old queues */
+};
+
+/*
+ * Helper function to update sub-queue, main-queue and scheduler-instance
+ * statistics after a queue event.  The sign of 'len' and the 'drop' flag
+ * encode the event:
+ *  negative len + drop -> drop of an already-queued packet
+ *  negative len        -> dequeue
+ *  positive len        -> enqueue
+ *  positive len + drop -> drop during enqueue (packet never queued)
+ */
+__inline static void
+fq_update_stats(struct fq_codel_flow *q, struct fq_codel_si *si, int len,
+	int drop)
+{
+	int inc = 0;
+
+	if (len < 0)
+		inc = -1;
+	else if (len > 0)
+		inc = 1;
+
+	if (drop) {
+		si->main_q.ni.drops ++;
+		q->stats.drops ++;
+		si->_si.ni.drops ++;
+		io_pkt_drop ++;
+	}
+
+	/* Queue lengths change on enqueue/dequeue; a drop during enqueue
+	 * (len > 0 with drop set) never entered the queue, so lengths are
+	 * left untouched.  (The original '!drop || (drop && len < 0)' is
+	 * logically identical to the simpler form below.) */
+	if (!drop || len < 0) {
+		/* Update stats for the main queue */
+		si->main_q.ni.length += inc;
+		si->main_q.ni.len_bytes += len;
+
+		/* update sub-queue stats */
+		q->stats.length += inc;
+		q->stats.len_bytes += len;
+
+		/* update scheduler instance stats */
+		si->_si.ni.length += inc;
+		si->_si.ni.len_bytes += len;
+	}
+
+	/* totals only grow on enqueue */
+	if (inc > 0) {
+		si->main_q.ni.tot_bytes += len;
+		si->main_q.ni.tot_pkts ++;
+
+		q->stats.tot_bytes += len;
+		q->stats.tot_pkts++;
+
+		si->_si.ni.tot_bytes += len;
+		si->_si.ni.tot_pkts ++;
+	}
+
+}
+
+/* Extract the head packet of fq_codel sub-queue 'q'.
+ * Returns NULL when the queue is empty.  The packet's enqueue timestamp
+ * is read from its mbuf tag into *pkt_ts (0 if the tag is missing) and
+ * the tag is removed; queue statistics are updated for the dequeue. */
+__inline static struct mbuf *
+fq_codel_extract_head(struct fq_codel_flow *q, aqm_time_t *pkt_ts, struct fq_codel_si *si)
+{
+	struct mbuf *m = q->mq.head;
+
+	if (m == NULL)
+		return m;
+	q->mq.head = m->m_nextpkt;
+
+	/* negative len: account a dequeue */
+	fq_update_stats(q, si, -m->m_pkthdr.len, 0);
+
+	if (si->main_q.ni.length == 0) /* queue is now idle */
+		si->main_q.q_time = dn_cfg.curr_time;
+
+	/* extract packet timestamp */
+	struct m_tag *mtag;
+	mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL);
+	if (mtag == NULL){
+		D("timestamp tag is not found!");
+		*pkt_ts = 0;
+	} else {
+		*pkt_ts = *(aqm_time_t *)(mtag + 1);
+		m_tag_delete(m,mtag);
+	}
+
+	return m;
+}
+
+
+#endif
diff --git a/sys/netpfil/ipfw/dn_sched_fq_codel_helper.h b/sys/netpfil/ipfw/dn_sched_fq_codel_helper.h
new file mode 100644
index 0000000..da663dc
--- /dev/null
+++ b/sys/netpfil/ipfw/dn_sched_fq_codel_helper.h
@@ -0,0 +1,187 @@
+/*
+ * Codel - The Controlled-Delay Active Queue Management algorithm.
+ *
+ * $FreeBSD$
+ *
+ * Copyright (C) 2016 Centre for Advanced Internet Architectures,
+ * Swinburne University of Technology, Melbourne, Australia.
+ * Portions of this code were made possible in part by a gift from
+ * The Comcast Innovation Fund.
+ * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au>
+ *
+ * Copyright (C) 2011-2014 Kathleen Nichols <nichols@pollere.com>.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * o Redistributions of source code must retain the above copyright
+ * notice, this list of conditions, and the following disclaimer,
+ * without modification.
+ *
+ * o Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * o The names of the authors may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * Alternatively, provided that this notice is retained in full, this
+ * software may be distributed under the terms of the GNU General Public
+ * License ("GPL") version 2, in which case the provisions of the GPL
+ * apply INSTEAD OF those given above.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _IP_DN_SCHED_FQ_CODEL_HELPER_H
+#define _IP_DN_SCHED_FQ_CODEL_HELPER_H
+
+/*
+ * Dequeue the head packet of sub-queue 'q' and apply the per-packet CoDel
+ * test: *ok_to_drop is set when the packet's sojourn time has stayed above
+ * 'target' for at least 'interval'.  Returns NULL when 'q' is empty.
+ */
+__inline static struct mbuf *
+fqc_dodequeue(struct fq_codel_flow *q, aqm_time_t now, uint16_t *ok_to_drop,
+	struct fq_codel_si *si)
+{
+	struct mbuf * m;
+	struct fq_codel_schk *schk = (struct fq_codel_schk *)(si->_si.sched+1);
+	aqm_time_t pkt_ts, sojourn_time;
+
+	*ok_to_drop = 0;
+	m = fq_codel_extract_head(q, &pkt_ts, si);
+
+	if (m == NULL) {
+		/*queue is empty - we can't be above target*/
+		q->cst.first_above_time= 0;
+		return m;
+	}
+
+	/* To span a large range of bandwidths, CoDel runs two
+	 * different AQMs in parallel. One is sojourn-time-based
+	 * and takes effect when the time to send an MTU-sized
+	 * packet is less than target. The 1st term of the "if"
+	 * below does this. The other is backlog-based and takes
+	 * effect when the time to send an MTU-sized packet is >=
+	 * target. The goal here is to keep the output link
+	 * utilization high by never allowing the queue to get
+	 * smaller than the amount that arrives in a typical
+	 * interarrival time (MTU-sized packets arriving spaced
+	 * by the amount of time it takes to send such a packet on
+	 * the bottleneck). The 2nd term of the "if" does this.
+	 */
+	sojourn_time = now - pkt_ts;
+	if (sojourn_time < schk->cfg.ccfg.target || q->stats.len_bytes <= q->cst.maxpkt_size) {
+		/* went below - stay below for at least interval */
+		q->cst.first_above_time = 0;
+	} else {
+		if (q->cst.first_above_time == 0) {
+			/* just went above from below. if still above at
+			 * first_above_time, will say it's ok to drop. */
+			q->cst.first_above_time = now + schk->cfg.ccfg.interval;
+		} else if (now >= q->cst.first_above_time) {
+			*ok_to_drop = 1;
+		}
+	}
+	return m;
+}
+
+/* CoDel dequeue function for one fq_codel sub-queue.
+ * Implements the CoDel dropping state machine: while in dropping state,
+ * packets are dropped (or ECN-marked) at intervals computed by
+ * control_law(); dropping state is entered/left based on the
+ * 'ok_to_drop' verdict from fqc_dodequeue(). */
+__inline static struct mbuf *
+fqc_codel_dequeue(struct fq_codel_flow *q, struct fq_codel_si *si)
+{
+	struct mbuf *m;
+	struct dn_aqm_codel_parms *cprms;
+	struct codel_status *cst;
+	aqm_time_t now;
+	uint16_t ok_to_drop;
+	struct fq_codel_schk *schk = (struct fq_codel_schk *)(si->_si.sched+1);
+
+	cst = &q->cst;
+	cprms = &schk->cfg.ccfg;
+
+	now = AQM_UNOW;
+	m = fqc_dodequeue(q, now, &ok_to_drop, si);
+
+	if (cst->dropping) {
+		if (!ok_to_drop) {
+			/* sojourn time below target - leave dropping state */
+			cst->dropping = false;
+		}
+
+		/* Time for the next drop. Drop current packet and dequeue
+		 * next. If the dequeue doesn't take us out of dropping
+		 * state, schedule the next drop. A large backlog might
+		 * result in drop rates so high that the next drop should
+		 * happen now, hence the 'while' loop.
+		 */
+		while (now >= cst->drop_next_time && cst->dropping) {
+
+			/* mark the packet */
+			if (cprms->flags & CODEL_ECN_ENABLED && ecn_mark(m)) {
+				cst->count++;
+				/* schedule the next mark. */
+				cst->drop_next_time = control_law(cst, cprms, cst->drop_next_time);
+				return m;
+			}
+
+			/* drop the packet */
+			fq_update_stats(q, si, 0, 1);
+			m_freem(m);
+			m = fqc_dodequeue(q, now, &ok_to_drop, si);
+
+			if (!ok_to_drop) {
+				/* leave dropping state */
+				cst->dropping = false;
+			} else {
+				cst->count++;
+				/* schedule the next drop. */
+				cst->drop_next_time = control_law(cst, cprms, cst->drop_next_time);
+			}
+		}
+	/* If we get here we're not in dropping state. The 'ok_to_drop'
+	 * return from dodequeue means that the sojourn time has been
+	 * above 'target' for 'interval' so enter dropping state.
+	 */
+	} else if (ok_to_drop) {
+
+		/* if ECN option is disabled or the packet cannot be marked,
+		 * drop the packet and extract another.
+		 */
+		if (!(cprms->flags & CODEL_ECN_ENABLED) || !ecn_mark(m)) {
+			fq_update_stats(q, si, 0, 1);
+			m_freem(m);
+			m = fqc_dodequeue(q, now, &ok_to_drop,si);
+		}
+
+		cst->dropping = true;
+
+		/* If min went above target close to when it last went
+		 * below, assume that the drop rate that controlled the
+		 * queue on the last cycle is a good starting point to
+		 * control it now. ('drop_next' will be at most 'interval'
+		 * later than the time of the last drop so 'now - drop_next'
+		 * is a good approximation of the time from the last drop
+		 * until now.)
+		 */
+		cst->count = (cst->count > 2 && ((aqm_stime_t)now -
+			(aqm_stime_t)cst->drop_next_time) < 8* cprms->interval)? cst->count - 2 : 1;
+
+		/* we don't have to set initial guess for Newton's method isqrt as
+		 * we initialize isqrt in control_law function when count == 1 */
+		cst->drop_next_time = control_law(cst, cprms, now);
+	}
+
+	return m;
+}
+
+#endif
diff --git a/sys/netpfil/ipfw/dn_sched_fq_pie.c b/sys/netpfil/ipfw/dn_sched_fq_pie.c
new file mode 100644
index 0000000..6b9628a
--- /dev/null
+++ b/sys/netpfil/ipfw/dn_sched_fq_pie.c
@@ -0,0 +1,1262 @@
+/*
+ * FQ_PIE - The FlowQueue-PIE scheduler/AQM
+ *
+ * $FreeBSD$
+ *
+ * Copyright (C) 2016 Centre for Advanced Internet Architectures,
+ * Swinburne University of Technology, Melbourne, Australia.
+ * Portions of this code were made possible in part by a gift from
+ * The Comcast Innovation Fund.
+ * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/* Important note:
+ * As there is no official document for the FQ-PIE specification, we used
+ * the FQ-CoDel algorithm with some modifications to implement FQ-PIE.
+ * This FQ-PIE implementation is a beta version and has not been tested
+ * extensively. Our FQ-PIE uses a stand-alone PIE AQM per sub-queue. By
+ * default, a timestamp is used to calculate queue delay instead of the
+ * departure rate estimation method. Although departure rate estimation is
+ * available as a testing option, the results could be incorrect. Moreover,
+ * the option to turn PIE on and off is available but does not work properly
+ * in this version.
+ */
+
+
+#ifdef _KERNEL
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/kernel.h>
+#include <sys/mbuf.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <net/if.h> /* IFNAMSIZ */
+#include <netinet/in.h>
+#include <netinet/ip_var.h> /* ipfw_rule_ref */
+#include <netinet/ip_fw.h> /* flow_id */
+#include <netinet/ip_dummynet.h>
+
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+
+#include <netpfil/ipfw/ip_fw_private.h>
+#include <sys/sysctl.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+#include <netinet/ip_icmp.h>
+#include <netinet/tcp.h>
+#include <netinet/udp.h>
+#include <sys/queue.h>
+#include <sys/hash.h>
+
+#include <netpfil/ipfw/dn_heap.h>
+#include <netpfil/ipfw/ip_dn_private.h>
+
+#include <netpfil/ipfw/dn_aqm.h>
+#include <netpfil/ipfw/dn_aqm_pie.h>
+#include <netpfil/ipfw/dn_sched.h>
+
+#else
+#include <dn_test.h>
+#endif
+
+#define DN_SCHED_FQ_PIE 7
+
+/* list of queues */
+STAILQ_HEAD(fq_pie_list, fq_pie_flow) ;
+
+/* FQ_PIE parameters including PIE */
+struct dn_sch_fq_pie_parms {
+	struct dn_aqm_pie_parms pcfg; /* PIE configuration Parameters */
+	/* FQ_PIE Parameters */
+	uint32_t flows_cnt; /* number of flows (sub-queues) */
+	uint32_t limit; /* hard limit of FQ_PIE queue size*/
+	uint32_t quantum; /* DRR quantum */
+};
+
+/* flow (sub-queue) stats */
+struct flow_stats {
+	uint64_t tot_pkts; /* statistics counters */
+	uint64_t tot_bytes;
+	uint32_t length; /* Queue length, in packets */
+	uint32_t len_bytes; /* Queue length, in bytes */
+	uint32_t drops;
+};
+
+/* A flow of packets (sub-queue)*/
+struct fq_pie_flow {
+	struct mq mq; /* list of packets */
+	struct flow_stats stats; /* statistics */
+	int deficit; /* byte credit left in the current DRR round */
+	int active; /* 1: flow is active (in a list) */
+	struct pie_status pst; /* pie status variables */
+	struct fq_pie_si *psi; /* parent scheduler instance */
+	STAILQ_ENTRY(fq_pie_flow) flowchain;
+};
+
+/* extra fq_pie scheduler configurations */
+struct fq_pie_schk {
+	struct dn_sch_fq_pie_parms cfg;
+};
+
+/* fq_pie scheduler instance */
+struct fq_pie_si {
+	struct dn_sch_inst _si; /* standard scheduler instance */
+	struct dn_queue main_q; /* main queue is after si directly */
+	uint32_t nr_active_q; /* sub-queues with PIE initialized (see pie_init) */
+	struct fq_pie_flow *flows; /* array of flows (queues) */
+	uint32_t perturbation; /* random value for the flow-classifier hash */
+	struct fq_pie_list newflows; /* list of new queues */
+	struct fq_pie_list oldflows; /* list of old queues */
+};
+
+
+/* NOTE(review): deferred-free bookkeeping; the code that uses these is
+ * not visible in this chunk -- confirm against the rest of the file. */
+struct mem_to_free {
+	void *mem_flows;
+	void *mem_callout;
+};
+static struct mtx freemem_mtx;
+static struct dn_alg fq_pie_desc;
+
+/* Default FQ-PIE parameters including PIE */
+/* PIE defaults
+ * target=15ms, max_burst=150ms, max_ecnth=0.1,
+ * alpha=0.125, beta=1.25, tupdate=15ms
+ * FQ-
+ * flows=1024, limit=10240, quantum =1514
+ * NOTE(review): the inner braces are a positional initializer for
+ * struct dn_aqm_pie_parms -- presumably qdelay_ref, tupdate, max_burst,
+ * max_ecnth, alpha, beta, flags in that order; verify against the
+ * struct declaration in dn_aqm_pie.h.
+ */
+struct dn_sch_fq_pie_parms
+ fq_pie_sysctl = {{15000 * AQM_TIME_1US, 15000 * AQM_TIME_1US,
+	150000 * AQM_TIME_1US, PIE_SCALE * 0.1, PIE_SCALE * 0.125,
+	PIE_SCALE * 1.25, PIE_CAPDROP_ENABLED | PIE_DERAND_ENABLED},
+	1024, 10240, 1514};
+
+/*
+ * Sysctl handler for PIE's alpha and beta parameters.
+ * Userland sees the value scaled by 1000; internally it is stored
+ * scaled by PIE_SCALE.  The oid name ("alpha" or "beta") selects
+ * which field is read/updated.
+ */
+static int
+fqpie_sysctl_alpha_beta_handler(SYSCTL_HANDLER_ARGS)
+{
+	int error;
+	long value;
+
+	if (!strcmp(oidp->oid_name,"alpha"))
+		value = fq_pie_sysctl.pcfg.alpha;
+	else
+		value = fq_pie_sysctl.pcfg.beta;
+
+	/* convert to the userland representation (x1000) */
+	value = value * 1000 / PIE_SCALE;
+	error = sysctl_handle_long(oidp, &value, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+	if (value < 1 || value > 7 * PIE_SCALE)
+		return (EINVAL);
+	/* convert back to the internal representation */
+	value = (value * PIE_SCALE) / 1000;
+	if (!strcmp(oidp->oid_name,"alpha"))
+		fq_pie_sysctl.pcfg.alpha = value;
+	else
+		fq_pie_sysctl.pcfg.beta = value;
+	return (0);
+}
+
+/*
+ * Sysctl handler for PIE's target delay, tupdate interval and max_burst.
+ * Userland values are in microseconds; internally they are stored in
+ * AQM time units.  The oid name selects which field is accessed.
+ */
+static int
+fqpie_sysctl_target_tupdate_maxb_handler(SYSCTL_HANDLER_ARGS)
+{
+	int error;
+	long value;
+
+	if (!strcmp(oidp->oid_name,"target"))
+		value = fq_pie_sysctl.pcfg.qdelay_ref;
+	else if (!strcmp(oidp->oid_name,"tupdate"))
+		value = fq_pie_sysctl.pcfg.tupdate;
+	else
+		value = fq_pie_sysctl.pcfg.max_burst;
+
+	/* convert to microseconds for userland */
+	value = value / AQM_TIME_1US;
+	error = sysctl_handle_long(oidp, &value, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+	if (value < 1 || value > 10 * AQM_TIME_1S)
+		return (EINVAL);
+	value = value * AQM_TIME_1US;
+
+	if (!strcmp(oidp->oid_name,"target"))
+		fq_pie_sysctl.pcfg.qdelay_ref = value;
+	else if (!strcmp(oidp->oid_name,"tupdate"))
+		fq_pie_sysctl.pcfg.tupdate = value;
+	else
+		fq_pie_sysctl.pcfg.max_burst = value;
+	return (0);
+}
+
+/*
+ * Sysctl handler for PIE's ECN safeguard threshold (max_ecnth).
+ * Userland sees the value scaled by 1000; internally it is stored
+ * scaled by PIE_SCALE.
+ */
+static int
+fqpie_sysctl_max_ecnth_handler(SYSCTL_HANDLER_ARGS)
+{
+	int error;
+	long value;
+
+	value = fq_pie_sysctl.pcfg.max_ecnth;
+	value = value * 1000 / PIE_SCALE;
+	error = sysctl_handle_long(oidp, &value, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+	if (value < 1 || value > PIE_SCALE)
+		return (EINVAL);
+	value = (value * PIE_SCALE) / 1000;
+	fq_pie_sysctl.pcfg.max_ecnth = value;
+	return (0);
+}
+
+/* define FQ- PIE sysctl variables */
+SYSBEGIN(f4)
+SYSCTL_DECL(_net_inet);
+SYSCTL_DECL(_net_inet_ip);
+SYSCTL_DECL(_net_inet_ip_dummynet);
+static SYSCTL_NODE(_net_inet_ip_dummynet, OID_AUTO, fqpie,
+ CTLFLAG_RW, 0, "FQ_PIE");
+
+#ifdef SYSCTL_NODE
+
+SYSCTL_PROC(_net_inet_ip_dummynet_fqpie, OID_AUTO, target,
+ CTLTYPE_LONG | CTLFLAG_RW, NULL, 0,
+ fqpie_sysctl_target_tupdate_maxb_handler, "L",
+ "queue target in microsecond");
+
+SYSCTL_PROC(_net_inet_ip_dummynet_fqpie, OID_AUTO, tupdate,
+ CTLTYPE_LONG | CTLFLAG_RW, NULL, 0,
+ fqpie_sysctl_target_tupdate_maxb_handler, "L",
+ "the frequency of drop probability calculation in microsecond");
+
+SYSCTL_PROC(_net_inet_ip_dummynet_fqpie, OID_AUTO, max_burst,
+ CTLTYPE_LONG | CTLFLAG_RW, NULL, 0,
+ fqpie_sysctl_target_tupdate_maxb_handler, "L",
+ "Burst allowance interval in microsecond");
+
+SYSCTL_PROC(_net_inet_ip_dummynet_fqpie, OID_AUTO, max_ecnth,
+ CTLTYPE_LONG | CTLFLAG_RW, NULL, 0,
+ fqpie_sysctl_max_ecnth_handler, "L",
+ "ECN safeguard threshold scaled by 1000");
+
+SYSCTL_PROC(_net_inet_ip_dummynet_fqpie, OID_AUTO, alpha,
+ CTLTYPE_LONG | CTLFLAG_RW, NULL, 0,
+ fqpie_sysctl_alpha_beta_handler, "L", "PIE alpha scaled by 1000");
+
+SYSCTL_PROC(_net_inet_ip_dummynet_fqpie, OID_AUTO, beta,
+ CTLTYPE_LONG | CTLFLAG_RW, NULL, 0,
+ fqpie_sysctl_alpha_beta_handler, "L", "beta scaled by 1000");
+
+SYSCTL_UINT(_net_inet_ip_dummynet_fqpie, OID_AUTO, quantum,
+ CTLFLAG_RW, &fq_pie_sysctl.quantum, 1514, "quantum for FQ_PIE");
+SYSCTL_UINT(_net_inet_ip_dummynet_fqpie, OID_AUTO, flows,
+ CTLFLAG_RW, &fq_pie_sysctl.flows_cnt, 1024, "Number of queues for FQ_PIE");
+SYSCTL_UINT(_net_inet_ip_dummynet_fqpie, OID_AUTO, limit,
+ CTLFLAG_RW, &fq_pie_sysctl.limit, 10240, "limit for FQ_PIE");
+#endif
+
+/*
+ * Helper function to update sub-queue, main-queue and scheduler-instance
+ * statistics after a queue event.  The sign of 'len' and the 'drop' flag
+ * encode the event:
+ *  negative len + drop -> drop of an already-queued packet
+ *  negative len        -> dequeue
+ *  positive len        -> enqueue
+ *  positive len + drop -> drop during enqueue (packet never queued)
+ */
+__inline static void
+fq_update_stats(struct fq_pie_flow *q, struct fq_pie_si *si, int len,
+	int drop)
+{
+	int inc = 0;
+
+	if (len < 0)
+		inc = -1;
+	else if (len > 0)
+		inc = 1;
+
+	if (drop) {
+		si->main_q.ni.drops ++;
+		q->stats.drops ++;
+		si->_si.ni.drops ++;
+		io_pkt_drop ++;
+	}
+
+	/* Queue lengths change on enqueue/dequeue; a drop during enqueue
+	 * (len > 0 with drop set) never entered the queue, so lengths are
+	 * left untouched.  (The original '!drop || (drop && len < 0)' is
+	 * logically identical to the simpler form below.) */
+	if (!drop || len < 0) {
+		/* Update stats for the main queue */
+		si->main_q.ni.length += inc;
+		si->main_q.ni.len_bytes += len;
+
+		/* update sub-queue stats */
+		q->stats.length += inc;
+		q->stats.len_bytes += len;
+
+		/* update scheduler instance stats */
+		si->_si.ni.length += inc;
+		si->_si.ni.len_bytes += len;
+	}
+
+	/* totals only grow on enqueue */
+	if (inc > 0) {
+		si->main_q.ni.tot_bytes += len;
+		si->main_q.ni.tot_pkts ++;
+
+		q->stats.tot_bytes += len;
+		q->stats.tot_pkts++;
+
+		si->_si.ni.tot_bytes += len;
+		si->_si.ni.tot_pkts ++;
+	}
+
+}
+
+/*
+ * Extract a packet from the head of sub-queue 'q'.
+ * Return a packet or NULL if the queue is empty.
+ * If getts is set, also extract the packet's timestamp from its mbuf tag
+ * into *pkt_ts (0 if the tag is missing) and delete the tag.
+ */
+__inline static struct mbuf *
+fq_pie_extract_head(struct fq_pie_flow *q, aqm_time_t *pkt_ts,
+	struct fq_pie_si *si, int getts)
+{
+	struct mbuf *m = q->mq.head;
+
+	if (m == NULL)
+		return m;
+	q->mq.head = m->m_nextpkt;
+
+	/* negative len: account a dequeue */
+	fq_update_stats(q, si, -m->m_pkthdr.len, 0);
+
+	if (si->main_q.ni.length == 0) /* queue is now idle */
+		si->main_q.q_time = dn_cfg.curr_time;
+
+	if (getts) {
+		/* extract packet timestamp */
+		struct m_tag *mtag;
+		mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL);
+		if (mtag == NULL){
+			D("PIE timestamp mtag not found!");
+			*pkt_ts = 0;
+		} else {
+			*pkt_ts = *(aqm_time_t *)(mtag + 1);
+			m_tag_delete(m,mtag);
+		}
+	}
+	return m;
+}
+
+/*
+ * Callout function for drop probability calculation.
+ * Runs every 'tupdate' and takes a pointer to the FQ-PIE flow as argument.
+ * Entered with pst->lock_mtx held (callout_init_mtx) and must release it
+ * before returning (CALLOUT_RETURNUNLOCKED).
+ */
+static void
+fq_calculate_drop_prob(void *x)
+{
+	struct fq_pie_flow *q = (struct fq_pie_flow *) x;
+	struct pie_status *pst = &q->pst;
+	struct dn_aqm_pie_parms *pprms;
+	int64_t p, prob, oldprob;
+	aqm_time_t now;
+
+	/* dealing with race condition */
+	if (callout_pending(&pst->aqm_pie_callout)) {
+		/* callout was reset */
+		mtx_unlock(&pst->lock_mtx);
+		return;
+	}
+
+	if (!callout_active(&pst->aqm_pie_callout)) {
+		/* callout was stopped: finish teardown started elsewhere
+		 * (see pie_cleanup) and release the PIE bookkeeping */
+		mtx_unlock(&pst->lock_mtx);
+		mtx_destroy(&pst->lock_mtx);
+		q->psi->nr_active_q--;
+		return;
+	}
+	callout_deactivate(&pst->aqm_pie_callout);
+
+	now = AQM_UNOW;
+	pprms = pst->parms;
+	prob = pst->drop_prob;
+
+	/* calculate current qdelay from the estimated departure rate */
+	if (pprms->flags & PIE_DEPRATEEST_ENABLED) {
+		pst->current_qdelay = ((uint64_t)q->stats.len_bytes * pst->avg_dq_time)
+			>> PIE_DQ_THRESHOLD_BITS;
+	}
+
+	/* calculate drop probability: p = alpha*(delay - target)
+	 * + beta*(delay - old_delay) */
+	p = (int64_t)pprms->alpha *
+		((int64_t)pst->current_qdelay - (int64_t)pprms->qdelay_ref);
+	p +=(int64_t) pprms->beta *
+		((int64_t)pst->current_qdelay - (int64_t)pst->qdelay_old);
+
+	/* We PIE_MAX_PROB shift by 12-bits to increase the division precision */
+	p *= (PIE_MAX_PROB << 12) / AQM_TIME_1S;
+
+	/* auto-tune drop probability: the smaller the current probability,
+	 * the smaller the step applied */
+	if (prob< PIE_MAX_PROB * 0.000001)
+		p >>= 11 + PIE_FIX_POINT_BITS+12;
+	else if (prob < PIE_MAX_PROB * 0.00001)
+		p >>= 9 + PIE_FIX_POINT_BITS+12;
+	else if (prob < PIE_MAX_PROB * 0.0001)
+		p >>= 7 + PIE_FIX_POINT_BITS+12;
+	else if (prob < PIE_MAX_PROB * 0.001)
+		p >>= 5 + PIE_FIX_POINT_BITS+12;
+	else if (prob < PIE_MAX_PROB * 0.01)
+		p >>= 3 + PIE_FIX_POINT_BITS+12;
+	else if (prob < PIE_MAX_PROB * 0.1)
+		p >>= 1 + PIE_FIX_POINT_BITS+12;
+	else
+		p >>= PIE_FIX_POINT_BITS+12;
+
+	oldprob = prob;
+
+	/* Cap Drop adjustment */
+	if ((pprms->flags & PIE_CAPDROP_ENABLED) && prob >= PIE_MAX_PROB / 10
+		&& p > PIE_MAX_PROB / 50 )
+		p = PIE_MAX_PROB / 50;
+
+	prob = prob + p;
+
+	/* decay the drop probability exponentially */
+	if (pst->current_qdelay == 0 && pst->qdelay_old == 0)
+		/* 0.98 ~= 1- 1/64 */
+		prob = prob - (prob >> 6);
+
+
+	/* check for addition over/under flow */
+	if (p>0) {
+		if (prob<oldprob) {
+			D("overflow");
+			prob= PIE_MAX_PROB;
+		}
+	}
+	else
+		if (prob>oldprob) {
+			prob= 0;
+			D("underflow");
+		}
+
+	/* make drop probability between 0 and PIE_MAX_PROB*/
+	if (prob < 0)
+		prob = 0;
+	else if (prob > PIE_MAX_PROB)
+		prob = PIE_MAX_PROB;
+
+	pst->drop_prob = prob;
+
+	/* store current delay value */
+	pst->qdelay_old = pst->current_qdelay;
+
+	/* update burst allowance */
+	if ((pst->sflags & PIE_ACTIVE) && pst->burst_allowance) {
+		if (pst->burst_allowance > pprms->tupdate)
+			pst->burst_allowance -= pprms->tupdate;
+		else
+			pst->burst_allowance = 0;
+	}
+
+	/* reschedule ourselves while PIE stays active */
+	if (pst->sflags & PIE_ACTIVE)
+		callout_reset_sbt(&pst->aqm_pie_callout,
+			(uint64_t)pprms->tupdate * SBT_1US,
+			0, fq_calculate_drop_prob, q, 0);
+
+	mtx_unlock(&pst->lock_mtx);
+}
+
+/*
+ * Reset PIE variables & activate the queue.
+ * Clears the PIE state, marks the flow active and in measurement, and
+ * schedules the first drop-probability update 'tupdate' us from now.
+ */
+__inline static void
+fq_activate_pie(struct fq_pie_flow *q)
+{
+	struct pie_status *pst = &q->pst;
+	struct dn_aqm_pie_parms *pprms;
+
+	mtx_lock(&pst->lock_mtx);
+	pprms = pst->parms;	/* was assigned twice in the original */
+
+	pst->drop_prob = 0;
+	pst->qdelay_old = 0;
+	pst->burst_allowance = pprms->max_burst;
+	pst->accu_prob = 0;
+	pst->dq_count = 0;
+	pst->avg_dq_time = 0;
+	pst->sflags = PIE_INMEASUREMENT | PIE_ACTIVE;
+	pst->measurement_start = AQM_UNOW;
+
+	callout_reset_sbt(&pst->aqm_pie_callout,
+		(uint64_t)pprms->tupdate * SBT_1US,
+		0, fq_calculate_drop_prob, q, 0);
+
+	mtx_unlock(&pst->lock_mtx);
+}
+
+
+ /*
+ * Deactivate PIE and stop the drop-probability update callout
+ */
+__inline static void
+fq_deactivate_pie(struct pie_status *pst)
+{
+	mtx_lock(&pst->lock_mtx);
+	pst->sflags &= ~(PIE_ACTIVE | PIE_INMEASUREMENT);
+	callout_stop(&pst->aqm_pie_callout);
+	//D("PIE Deactivated");
+	mtx_unlock(&pst->lock_mtx);
+}
+
+ /*
+ * Initialize PIE for sub-queue 'q'.
+ * Sets up the per-flow mutex and drop-probability callout.
+ * Returns 0 on success or EINVAL if no PIE parameters are configured.
+ */
+static int
+pie_init(struct fq_pie_flow *q)
+{
+	struct pie_status *pst=&q->pst;
+	struct dn_aqm_pie_parms *pprms = pst->parms;
+	struct fq_pie_schk *fqpie_schk;
+
+	fqpie_schk = (struct fq_pie_schk *)(q->psi->_si.sched+1);
+	int err = 0;
+
+	if (!pprms){
+		D("AQM_PIE is not configured");
+		err = EINVAL;
+	} else {
+		q->psi->nr_active_q++;
+
+		/* For speed optimization, we calculate 1/3 queue size once here */
+		// XXX limit divided by number of queues divided by 3 ???
+		pst->one_third_q_size = (fqpie_schk->cfg.limit /
+			fqpie_schk->cfg.flows_cnt) / 3;
+
+		mtx_init(&pst->lock_mtx, "mtx_pie", NULL, MTX_DEF);
+		/* callout runs with lock_mtx held and must unlock it itself */
+		callout_init_mtx(&pst->aqm_pie_callout, &pst->lock_mtx,
+			CALLOUT_RETURNUNLOCKED);
+	}
+
+	return err;
+}
+
+/*
+ * Clean up PIE status for sub-queue 'q':
+ * stop the callout timer and destroy the mutex.
+ * Returns 0 on success, or EBUSY if the callout could not be stopped;
+ * in that case the callout itself performs the teardown when it next
+ * runs (see the !callout_active branch of fq_calculate_drop_prob).
+ */
+static int
+pie_cleanup(struct fq_pie_flow *q)
+{
+	struct pie_status *pst = &q->pst;
+
+	mtx_lock(&pst->lock_mtx);
+	if (callout_stop(&pst->aqm_pie_callout) || !(pst->sflags & PIE_ACTIVE)) {
+		mtx_unlock(&pst->lock_mtx);
+		mtx_destroy(&pst->lock_mtx);
+		q->psi->nr_active_q--;
+	} else {
+		mtx_unlock(&pst->lock_mtx);
+		return EBUSY;
+	}
+	return 0;
+}
+
+/*
+ * Dequeue and return a packet from sub-queue 'q' or NULL if 'q' is empty.
+ * Also, calculate the departure rate or queue delay using the packet's
+ * timestamp, depending on the PIE_DEPRATEEST_ENABLED flag.
+ */
+ static struct mbuf *
+pie_dequeue(struct fq_pie_flow *q, struct fq_pie_si *si)
+{
+	struct mbuf *m;
+	struct dn_aqm_pie_parms *pprms;
+	struct pie_status *pst;
+	aqm_time_t now;
+	aqm_time_t pkt_ts, dq_time;
+	int32_t w;
+
+	pst  = &q->pst;
+	pprms = q->pst.parms;
+
+	/* we extract the packet ts only when Departure Rate Estimation is not used */
+	m = fq_pie_extract_head(q, &pkt_ts, si,
+		!(pprms->flags & PIE_DEPRATEEST_ENABLED));
+
+	if (!m || !(pst->sflags & PIE_ACTIVE))
+		return m;
+
+	now = AQM_UNOW;
+	if (pprms->flags & PIE_DEPRATEEST_ENABLED) {
+		/* calculate average departure time */
+		if(pst->sflags & PIE_INMEASUREMENT) {
+			pst->dq_count += m->m_pkthdr.len;
+
+			if (pst->dq_count >= PIE_DQ_THRESHOLD) {
+				dq_time = now - pst->measurement_start;
+
+				/*
+				 * if we don't have old avg dq_time i.e PIE is (re)initialized,
+				 * don't use weight to calculate new avg_dq_time
+				 */
+				if(pst->avg_dq_time == 0)
+					pst->avg_dq_time = dq_time;
+				else {
+					/*
+					 * weight = PIE_DQ_THRESHOLD/2^6, but we scaled
+					 * weight by 2^8. Thus, scaled
+					 * weight = PIE_DQ_THRESHOLD /2^8
+					 * */
+					w = PIE_DQ_THRESHOLD >> 8;
+					pst->avg_dq_time = (dq_time* w
+						+ (pst->avg_dq_time * ((1L << 8) - w))) >> 8;
+					pst->sflags &= ~PIE_INMEASUREMENT;
+				}
+			}
+		}
+
+		/*
+		 * Start a new measurement cycle when the queue has
+		 * PIE_DQ_THRESHOLD worth of bytes.
+		 */
+		if(!(pst->sflags & PIE_INMEASUREMENT) &&
+			q->stats.len_bytes >= PIE_DQ_THRESHOLD) {
+			pst->sflags |= PIE_INMEASUREMENT;
+			pst->measurement_start = now;
+			pst->dq_count = 0;
+		}
+	}
+	/* Optionally, use packet timestamp to estimate queue delay */
+	else
+		pst->current_qdelay = now - pkt_ts;
+
+	return m;
+}
+
+
+ /*
+ * Enqueue a packet in q, subject to space and FQ-PIE queue management policy
+ * (whose parameters are in q->fs).
+ * Update stats for the queue and the scheduler.
+ * Return 0 on success, 1 on drop. The packet is consumed anyways.
+ */
+static int
+pie_enqueue(struct fq_pie_flow *q, struct mbuf* m, struct fq_pie_si *si)
+{
+	uint64_t len;
+	struct pie_status *pst;
+	struct dn_aqm_pie_parms *pprms;
+	int t;
+
+	len = m->m_pkthdr.len;
+	pst = &q->pst;
+	pprms = pst->parms;
+	t = ENQUE;
+
+	/* drop/mark the packet when PIE is active and burst time elapsed */
+	if (pst->sflags & PIE_ACTIVE && pst->burst_allowance == 0
+		&& drop_early(pst, q->stats.len_bytes) == DROP) {
+		/*
+		 * if drop_prob over ECN threshold, drop the packet
+		 * otherwise mark and enqueue it.
+		 */
+		if (pprms->flags & PIE_ECN_ENABLED && pst->drop_prob <
+			(pprms->max_ecnth << (PIE_PROB_BITS - PIE_FIX_POINT_BITS))
+			&& ecn_mark(m))
+			t = ENQUE;
+		else
+			t = DROP;
+	}
+
+	/* Turn PIE on when 1/3 of the queue is full */
+	if (!(pst->sflags & PIE_ACTIVE) && q->stats.len_bytes >=
+		pst->one_third_q_size) {
+		fq_activate_pie(q);
+	}
+
+	/* reset burst tolerance and optionally turn PIE off */
+	if (pst->drop_prob == 0 && pst->current_qdelay < (pprms->qdelay_ref >> 1)
+		&& pst->qdelay_old < (pprms->qdelay_ref >> 1)) {
+
+		pst->burst_allowance = pprms->max_burst;
+		if (pprms->flags & PIE_ON_OFF_MODE_ENABLED && q->stats.len_bytes<=0)
+			fq_deactivate_pie(pst);
+	}
+
+	/* Use timestamp if Departure Rate Estimation mode is disabled */
+	if (t != DROP && !(pprms->flags & PIE_DEPRATEEST_ENABLED)) {
+		/* Add TS to mbuf as a TAG */
+		struct m_tag *mtag;
+		mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL);
+		if (mtag == NULL)
+			mtag = m_tag_alloc(MTAG_ABI_COMPAT, DN_AQM_MTAG_TS,
+				sizeof(aqm_time_t), M_NOWAIT);
+		if (mtag == NULL) {
+			/*
+			 * Tag allocation failed: drop the packet.  Do NOT
+			 * free 'm' here -- the DROP path below frees it.
+			 * The original code called m_freem(m) here and then
+			 * dereferenced the NULL mtag and FREE_PKT'd 'm' a
+			 * second time (NULL deref + double free).
+			 */
+			t = DROP;
+		} else {
+			*(aqm_time_t *)(mtag + 1) = AQM_UNOW;
+			m_tag_prepend(m, mtag);
+		}
+	}
+
+	if (t != DROP) {
+		mq_append(&q->mq, m);
+		fq_update_stats(q, si, len, 0);
+		return 0;
+	} else {
+		fq_update_stats(q, si, len, 1);
+		pst->accu_prob = 0;
+		FREE_PKT(m);
+		return 1;
+	}
+}
+
+/* Drop a packet from the head of the FQ-PIE sub-queue */
+static void
+pie_drop_head(struct fq_pie_flow *q, struct fq_pie_si *si)
+{
+ struct mbuf *m = q->mq.head;
+
+ if (m == NULL)
+ return;
+ /* unlink the head packet from the sub-queue */
+ q->mq.head = m->m_nextpkt;
+
+ /* negative length decrements the byte counter; last arg 1 counts a drop */
+ fq_update_stats(q, si, -m->m_pkthdr.len, 1);
+
+ if (si->main_q.ni.length == 0) /* queue is now idle */
+ si->main_q.q_time = dn_cfg.curr_time;
+ /* reset accu_prob after packet drop */
+ q->pst.accu_prob = 0;
+
+ FREE_PKT(m);
+}
+
+/*
+ * Classify a packet to queue number using Jenkins hash function.
+ * Return: queue number
+ * the input of the hash are protocol no, perturbation, src IP, dst IP,
+ * src port, dst port,
+ */
+static inline int
+fq_pie_classify_flow(struct mbuf *m, uint16_t fcount, struct fq_pie_si *si)
+{
+ struct ip *ip;
+ struct tcphdr *th;
+ struct udphdr *uh;
+ uint8_t tuple[41];
+ uint16_t hash=0;
+
+//#ifdef INET6
+ struct ip6_hdr *ip6;
+ int isip6;
+ isip6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
+
+ if(isip6) {
+ ip6 = mtod(m, struct ip6_hdr *);
+ /*
+ * Assemble the hash input with memcpy: the multi-byte fields
+ * land at odd offsets inside 'tuple', so the old casted
+ * stores (e.g. *(uint32_t *)&tuple[1]) were unaligned
+ * accesses -- undefined behavior, and a trap on
+ * strict-alignment machines. memcpy writes the same bytes.
+ */
+ tuple[0] = ip6->ip6_nxt;
+ memcpy(&tuple[1], &si->perturbation, 4);
+ memcpy(&tuple[5], ip6->ip6_src.s6_addr, 16);
+ memcpy(&tuple[21], ip6->ip6_dst.s6_addr, 16);
+
+ switch (ip6->ip6_nxt) {
+ case IPPROTO_TCP:
+ th = (struct tcphdr *)(ip6 + 1);
+ memcpy(&tuple[37], &th->th_dport, 2);
+ memcpy(&tuple[39], &th->th_sport, 2);
+ break;
+
+ case IPPROTO_UDP:
+ uh = (struct udphdr *)(ip6 + 1);
+ memcpy(&tuple[37], &uh->uh_dport, 2);
+ memcpy(&tuple[39], &uh->uh_sport, 2);
+ break;
+ default:
+ /* no transport ports for other protocols */
+ memset(&tuple[37], 0, 4);
+ }
+
+ hash = jenkins_hash(tuple, 41, HASHINIT) % fcount;
+ return hash;
+ }
+//#endif
+
+ /* IPv4: proto(1) + perturbation(4) + src(4) + dst(4) + ports(4) */
+ ip = mtod(m, struct ip *);
+ tuple[0] = ip->ip_p;
+ memcpy(&tuple[1], &si->perturbation, 4);
+ memcpy(&tuple[5], &ip->ip_src.s_addr, 4);
+ memcpy(&tuple[9], &ip->ip_dst.s_addr, 4);
+
+ switch (ip->ip_p) {
+ case IPPROTO_TCP:
+ th = (struct tcphdr *)(ip + 1);
+ memcpy(&tuple[13], &th->th_dport, 2);
+ memcpy(&tuple[15], &th->th_sport, 2);
+ break;
+
+ case IPPROTO_UDP:
+ uh = (struct udphdr *)(ip + 1);
+ memcpy(&tuple[13], &uh->uh_dport, 2);
+ memcpy(&tuple[15], &uh->uh_sport, 2);
+ break;
+ default:
+ memset(&tuple[13], 0, 4);
+ }
+ hash = jenkins_hash(tuple, 17, HASHINIT) % fcount;
+
+ return hash;
+}
+
+/*
+ * Enqueue a packet into the appropriate sub-queue according to the
+ * FQ-PIE algorithm.
+ */
+static int
+fq_pie_enqueue(struct dn_sch_inst *_si, struct dn_queue *_q,
+ struct mbuf *m)
+{
+ struct fq_pie_si *si;
+ struct fq_pie_schk *schk;
+ struct dn_sch_fq_pie_parms *param;
+ struct dn_queue *mainq;
+ int idx, drop, i, maxidx;
+
+ /* the main queue sits immediately after the scheduler instance */
+ mainq = (struct dn_queue *)(_si + 1);
+ si = (struct fq_pie_si *)_si;
+ schk = (struct fq_pie_schk *)(si->_si.sched+1);
+ param = &schk->cfg;
+
+ /* classify a packet to queue number*/
+ idx = fq_pie_classify_flow(m, param->flows_cnt, si);
+
+ /* enqueue packet into appropriate queue using PIE AQM.
+ * Note: 'pie_enqueue' function returns 1 only when it is unable to
+ * add a timestamp to the packet (no limit check here)*/
+ drop = pie_enqueue(&si->flows[idx], m, si);
+
+ /* pie unable to timestamp a packet */
+ if (drop)
+ return 1;
+
+ /* If the flow (sub-queue) is not active, then add it to the tail of
+ * the new flows list, initialize its deficit and activate PIE on it.
+ */
+ if (!si->flows[idx].active) {
+ STAILQ_INSERT_TAIL(&si->newflows, &si->flows[idx], flowchain);
+ si->flows[idx].deficit = param->quantum;
+ fq_activate_pie(&si->flows[idx]);
+ si->flows[idx].active = 1;
+ }
+
+ /* check the limit for all queues and remove a packet from the
+ * largest one. Note: the new packet stays enqueued; drop=1 reports
+ * to the caller that *some* packet was dropped.
+ */
+ if (mainq->ni.length > schk->cfg.limit) {
+ /* find first active flow */
+ for (maxidx = 0; maxidx < schk->cfg.flows_cnt; maxidx++)
+ if (si->flows[maxidx].active)
+ break;
+ if (maxidx < schk->cfg.flows_cnt) {
+ /* find the largest sub-queue (by packet count) */
+ for (i = maxidx + 1; i < schk->cfg.flows_cnt; i++)
+ if (si->flows[i].active && si->flows[i].stats.length >
+ si->flows[maxidx].stats.length)
+ maxidx = i;
+ pie_drop_head(&si->flows[maxidx], si);
+ drop = 1;
+ }
+ }
+
+ return drop;
+}
+
+/*
+ * Dequeue a packet from an appropriate sub-queue according to the
+ * FQ-PIE (DRR-based) algorithm.
+ */
+static struct mbuf *
+fq_pie_dequeue(struct dn_sch_inst *_si)
+{
+ struct fq_pie_si *si;
+ struct fq_pie_schk *schk;
+ struct dn_sch_fq_pie_parms *param;
+ struct fq_pie_flow *f;
+ struct mbuf *mbuf;
+ struct fq_pie_list *fq_pie_flowlist;
+
+ si = (struct fq_pie_si *)_si;
+ schk = (struct fq_pie_schk *)(si->_si.sched+1);
+ param = &schk->cfg;
+
+ do {
+ /* select a list to start with: new flows have priority */
+ if (STAILQ_EMPTY(&si->newflows))
+ fq_pie_flowlist = &si->oldflows;
+ else
+ fq_pie_flowlist = &si->newflows;
+
+ /* Both new and old queue lists are empty, return NULL */
+ if (STAILQ_EMPTY(fq_pie_flowlist))
+ return NULL;
+
+ f = STAILQ_FIRST(fq_pie_flowlist);
+ while (f != NULL) {
+ /* if the flow (sub-queue) has exhausted its deficit, top it
+ * up by one quantum, move the flow to the tail of the old
+ * flows list and try another flow.
+ * Otherwise, the flow will be used for dequeue.
+ */
+ if (f->deficit < 0) {
+ f->deficit += param->quantum;
+ STAILQ_REMOVE_HEAD(fq_pie_flowlist, flowchain);
+ STAILQ_INSERT_TAIL(&si->oldflows, f, flowchain);
+ } else
+ break;
+
+ f = STAILQ_FIRST(fq_pie_flowlist);
+ }
+
+ /* the new flows list is empty, try old flows list */
+ if (STAILQ_EMPTY(fq_pie_flowlist))
+ continue;
+
+ /* Dequeue a packet from the selected flow */
+ mbuf = pie_dequeue(f, si);
+
+ /* pie did not return a packet */
+ if (!mbuf) {
+ /* If the selected flow belongs to the new flows list, move
+ * it to the tail of the old flows list. Otherwise, deactivate
+ * it and remove it from the old list.
+ */
+ if (fq_pie_flowlist == &si->newflows) {
+ STAILQ_REMOVE_HEAD(fq_pie_flowlist, flowchain);
+ STAILQ_INSERT_TAIL(&si->oldflows, f, flowchain);
+ } else {
+ f->active = 0;
+ fq_deactivate_pie(&f->pst);
+ STAILQ_REMOVE_HEAD(fq_pie_flowlist, flowchain);
+ }
+ /* start again */
+ continue;
+ }
+
+ /* we have a packet to return,
+ * charge its length against the flow deficit and return it */
+ f->deficit -= mbuf->m_pkthdr.len;
+ return mbuf;
+
+ } while (1);
+
+ /* unreachable point */
+ return NULL;
+}
+
+/*
+ * Initialize fq_pie scheduler instance.
+ * also, allocate memory for flows array.
+ */
+static int
+fq_pie_new_sched(struct dn_sch_inst *_si)
+{
+ struct fq_pie_si *si;
+ struct dn_queue *q;
+ struct fq_pie_schk *schk;
+ int i;
+
+ si = (struct fq_pie_si *)_si;
+ schk = (struct fq_pie_schk *)(_si->sched+1);
+
+ /* guard against double initialization of the same instance */
+ if(si->flows) {
+ D("si already configured!");
+ return 0;
+ }
+
+ /* init the main queue */
+ q = &si->main_q;
+ set_oid(&q->ni.oid, DN_QUEUE, sizeof(*q));
+ q->_si = _si;
+ q->fs = _si->sched->fs;
+
+ /* allocate memory for flows array */
+ si->flows = malloc(schk->cfg.flows_cnt * sizeof(struct fq_pie_flow),
+ M_DUMMYNET, M_NOWAIT | M_ZERO);
+ if (si->flows == NULL) {
+ D("cannot allocate memory for fq_pie configuration parameters");
+ return ENOMEM ;
+ }
+
+ /* init perturbation (hash seed) for this si */
+ si->perturbation = random();
+ si->nr_active_q = 0;
+
+ /* init the old and new flows lists */
+ STAILQ_INIT(&si->newflows);
+ STAILQ_INIT(&si->oldflows);
+
+ /* init the flows (sub-queues) */
+ for (i = 0; i < schk->cfg.flows_cnt; i++) {
+ si->flows[i].pst.parms = &schk->cfg.pcfg;
+ si->flows[i].psi = si;
+ pie_init(&si->flows[i]);
+ }
+
+ /* init mtx lock and callout function for free memory.
+ * NOTE(review): ref_count is tested here without holding the mutex;
+ * presumably instance creation is serialized by the dummynet lock --
+ * confirm against the callers.
+ */
+ if (!fq_pie_desc.ref_count) {
+ mtx_init(&freemem_mtx, "mtx_pie", NULL, MTX_DEF);
+ }
+
+ mtx_lock(&freemem_mtx);
+ fq_pie_desc.ref_count++;
+ mtx_unlock(&freemem_mtx);
+
+ return 0;
+}
+
+/*
+ * Free FQ-PIE flows memory callout function.
+ * This function is scheduled when one or more flows are still active and
+ * the scheduler is about to be destroyed, to prevent a memory leak.
+ */
+static void
+free_flows(void *_mem)
+{
+ struct mem_to_free *mem = _mem;
+
+ /* free the deferred flows array, the callout itself and the wrapper */
+ free(mem->mem_flows, M_DUMMYNET);
+ free(mem->mem_callout, M_DUMMYNET);
+ free(_mem, M_DUMMYNET);
+
+ /* runs with freemem_mtx held (callout_init_mtx, RETURNUNLOCKED):
+ * we must unlock on every path, and may destroy the mutex when the
+ * last reference is gone */
+ fq_pie_desc.ref_count--;
+ if (!fq_pie_desc.ref_count) {
+ mtx_unlock(&freemem_mtx);
+ mtx_destroy(&freemem_mtx);
+ } else
+ mtx_unlock(&freemem_mtx);
+ //D("mem freed ok!");
+}
+
+/*
+ * Free fq_pie scheduler instance.
+ */
+static int
+fq_pie_free_sched(struct dn_sch_inst *_si)
+{
+ struct fq_pie_si *si;
+ struct fq_pie_schk *schk;
+ int i;
+
+ si = (struct fq_pie_si *)_si;
+ schk = (struct fq_pie_schk *)(_si->sched+1);
+
+ /* stop PIE timers and release per-flow PIE state */
+ for (i = 0; i < schk->cfg.flows_cnt; i++) {
+ pie_cleanup(&si->flows[i]);
+ }
+
+ /* if there are still some queues have a callout going to start,
+ * we cannot free flows memory. If we do so, a panic can happen
+ * as prob calculate callout function uses flows memory.
+ */
+ if (!si->nr_active_q) {
+ /* free the flows array */
+ free(si->flows , M_DUMMYNET);
+ si->flows = NULL;
+ mtx_lock(&freemem_mtx);
+ fq_pie_desc.ref_count--;
+ if (!fq_pie_desc.ref_count) {
+ mtx_unlock(&freemem_mtx);
+ mtx_destroy(&freemem_mtx);
+ } else
+ mtx_unlock(&freemem_mtx);
+ //D("ok!");
+ return 0;
+ } else {
+ /* Some sub-queues are still active: defer freeing the flows
+ * array to a callout that fires after the last tupdate.
+ */
+ D("unable to stop all fq_pie sub-queues!");
+ mtx_lock(&freemem_mtx);
+
+ struct callout *mem_callout;
+ struct mem_to_free *mem;
+
+ mem = malloc(sizeof(*mem), M_DUMMYNET,
+ M_NOWAIT | M_ZERO);
+ mem_callout = malloc(sizeof(*mem_callout), M_DUMMYNET,
+ M_NOWAIT | M_ZERO);
+
+ /* BUGFIX: both allocations are M_NOWAIT and were previously
+ * used unchecked, so a failed allocation dereferenced NULL.
+ * On failure we must leak si->flows (active callouts may
+ * still reference it) but release our descriptor reference
+ * cleanly instead of panicking.
+ */
+ if (mem == NULL || mem_callout == NULL) {
+ D("cannot allocate memory for deferred free");
+ free(mem, M_DUMMYNET); /* free(NULL) is a no-op */
+ free(mem_callout, M_DUMMYNET);
+ si->flows = NULL;
+ fq_pie_desc.ref_count--;
+ if (!fq_pie_desc.ref_count) {
+ mtx_unlock(&freemem_mtx);
+ mtx_destroy(&freemem_mtx);
+ } else
+ mtx_unlock(&freemem_mtx);
+ return EBUSY;
+ }
+
+ callout_init_mtx(mem_callout, &freemem_mtx,
+ CALLOUT_RETURNUNLOCKED);
+
+ mem->mem_flows = si->flows;
+ mem->mem_callout = mem_callout;
+ callout_reset_sbt(mem_callout,
+ (uint64_t)(si->flows[0].pst.parms->tupdate + 1000) * SBT_1US,
+ 0, free_flows, mem, 0);
+
+ si->flows = NULL;
+ mtx_unlock(&freemem_mtx);
+
+ return EBUSY;
+ }
+}
+
+/*
+ * Configure the FQ-PIE scheduler.
+ * The configuration for the scheduler is passed from ipfw userland.
+ */
+static int
+fq_pie_config(struct dn_schk *_schk)
+{
+ struct fq_pie_schk *schk;
+ struct dn_extra_parms *ep;
+ struct dn_sch_fq_pie_parms *fqp_cfg;
+
+ schk = (struct fq_pie_schk *)(_schk+1);
+ ep = (struct dn_extra_parms *) _schk->cfg;
+
+ /* par array contains fq_pie configuration as follows
+ * PIE: 0- qdelay_ref,1- tupdate, 2- max_burst
+ * 3- max_ecnth, 4- alpha, 5- beta, 6- flags
+ * FQ_PIE: 7- quantum, 8- limit, 9- flows
+ * A negative entry selects the sysctl default for that parameter.
+ */
+ if (ep && ep->oid.len ==sizeof(*ep) &&
+ ep->oid.subtype == DN_SCH_PARAMS) {
+
+ fqp_cfg = &schk->cfg;
+ if (ep->par[0] < 0)
+ fqp_cfg->pcfg.qdelay_ref = fq_pie_sysctl.pcfg.qdelay_ref;
+ else
+ fqp_cfg->pcfg.qdelay_ref = ep->par[0];
+ if (ep->par[1] < 0)
+ fqp_cfg->pcfg.tupdate = fq_pie_sysctl.pcfg.tupdate;
+ else
+ fqp_cfg->pcfg.tupdate = ep->par[1];
+ if (ep->par[2] < 0)
+ fqp_cfg->pcfg.max_burst = fq_pie_sysctl.pcfg.max_burst;
+ else
+ fqp_cfg->pcfg.max_burst = ep->par[2];
+ if (ep->par[3] < 0)
+ fqp_cfg->pcfg.max_ecnth = fq_pie_sysctl.pcfg.max_ecnth;
+ else
+ fqp_cfg->pcfg.max_ecnth = ep->par[3];
+ if (ep->par[4] < 0)
+ fqp_cfg->pcfg.alpha = fq_pie_sysctl.pcfg.alpha;
+ else
+ fqp_cfg->pcfg.alpha = ep->par[4];
+ if (ep->par[5] < 0)
+ fqp_cfg->pcfg.beta = fq_pie_sysctl.pcfg.beta;
+ else
+ fqp_cfg->pcfg.beta = ep->par[5];
+ if (ep->par[6] < 0)
+ fqp_cfg->pcfg.flags = 0;
+ else
+ fqp_cfg->pcfg.flags = ep->par[6];
+
+ /* FQ configurations */
+ if (ep->par[7] < 0)
+ fqp_cfg->quantum = fq_pie_sysctl.quantum;
+ else
+ fqp_cfg->quantum = ep->par[7];
+ if (ep->par[8] < 0)
+ fqp_cfg->limit = fq_pie_sysctl.limit;
+ else
+ fqp_cfg->limit = ep->par[8];
+ if (ep->par[9] < 0)
+ fqp_cfg->flows_cnt = fq_pie_sysctl.flows_cnt;
+ else
+ fqp_cfg->flows_cnt = ep->par[9];
+
+ /* Bound the configurations to sane ranges */
+ fqp_cfg->pcfg.qdelay_ref = BOUND_VAR(fqp_cfg->pcfg.qdelay_ref,
+ 1, 5 * AQM_TIME_1S);
+ fqp_cfg->pcfg.tupdate = BOUND_VAR(fqp_cfg->pcfg.tupdate,
+ 1, 5 * AQM_TIME_1S);
+ fqp_cfg->pcfg.max_burst = BOUND_VAR(fqp_cfg->pcfg.max_burst,
+ 0, 5 * AQM_TIME_1S);
+ fqp_cfg->pcfg.max_ecnth = BOUND_VAR(fqp_cfg->pcfg.max_ecnth,
+ 0, PIE_SCALE);
+ fqp_cfg->pcfg.alpha = BOUND_VAR(fqp_cfg->pcfg.alpha, 0, 7 * PIE_SCALE);
+ fqp_cfg->pcfg.beta = BOUND_VAR(fqp_cfg->pcfg.beta, 0, 7 * PIE_SCALE);
+
+ fqp_cfg->quantum = BOUND_VAR(fqp_cfg->quantum,1,9000);
+ fqp_cfg->limit= BOUND_VAR(fqp_cfg->limit,1,20480);
+ fqp_cfg->flows_cnt= BOUND_VAR(fqp_cfg->flows_cnt,1,65536);
+ }
+ else {
+ D("Wrong parameters for fq_pie scheduler");
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * Return FQ-PIE scheduler configurations
+ * the configurations for the scheduler is passed to userland.
+ */
+static int
+fq_pie_getconfig (struct dn_schk *_schk, struct dn_extra_parms *ep) {
+
+ struct fq_pie_schk *schk = (struct fq_pie_schk *)(_schk+1);
+ struct dn_sch_fq_pie_parms *fqp_cfg;
+
+ fqp_cfg = &schk->cfg;
+
+ /* par[] layout mirrors fq_pie_config(): PIE params first, FQ params last */
+ strcpy(ep->name, fq_pie_desc.name);
+ ep->par[0] = fqp_cfg->pcfg.qdelay_ref;
+ ep->par[1] = fqp_cfg->pcfg.tupdate;
+ ep->par[2] = fqp_cfg->pcfg.max_burst;
+ ep->par[3] = fqp_cfg->pcfg.max_ecnth;
+ ep->par[4] = fqp_cfg->pcfg.alpha;
+ ep->par[5] = fqp_cfg->pcfg.beta;
+ ep->par[6] = fqp_cfg->pcfg.flags;
+
+ ep->par[7] = fqp_cfg->quantum;
+ ep->par[8] = fqp_cfg->limit;
+ ep->par[9] = fqp_cfg->flows_cnt;
+
+ return 0;
+}
+
+/*
+ * FQ-PIE scheduler descriptor
+ * contains the type of the scheduler, the name, the size of extra
+ * data structures, and function pointers.
+ */
+static struct dn_alg fq_pie_desc = {
+ _SI( .type = ) DN_SCHED_FQ_PIE,
+ _SI( .name = ) "FQ_PIE",
+ _SI( .flags = ) 0,
+
+ _SI( .schk_datalen = ) sizeof(struct fq_pie_schk),
+ _SI( .si_datalen = ) sizeof(struct fq_pie_si) - sizeof(struct dn_sch_inst),
+ _SI( .q_datalen = ) 0,
+
+ _SI( .enqueue = ) fq_pie_enqueue,
+ _SI( .dequeue = ) fq_pie_dequeue,
+ _SI( .config = ) fq_pie_config, /* new sched i.e. sched X config ...*/
+ _SI( .destroy = ) NULL, /*sched x delete */
+ _SI( .new_sched = ) fq_pie_new_sched, /* new schd instance */
+ _SI( .free_sched = ) fq_pie_free_sched, /* delete schd instance */
+ _SI( .new_fsk = ) NULL, /* no per-flowset state */
+ _SI( .free_fsk = ) NULL,
+ _SI( .new_queue = ) NULL, /* queues live inside si->flows */
+ _SI( .free_queue = ) NULL,
+ _SI( .getconfig = ) fq_pie_getconfig,
+ _SI( .ref_count = ) 0 /* also counts pending deferred frees */
+};
+
+DECLARE_DNSCHED_MODULE(dn_fq_pie, &fq_pie_desc);
diff --git a/sys/netpfil/ipfw/dn_sched_prio.c b/sys/netpfil/ipfw/dn_sched_prio.c
index b779515..915b4cb 100644
--- a/sys/netpfil/ipfw/dn_sched_prio.c
+++ b/sys/netpfil/ipfw/dn_sched_prio.c
@@ -41,6 +41,9 @@
#include <netinet/ip_dummynet.h>
#include <netpfil/ipfw/dn_heap.h>
#include <netpfil/ipfw/ip_dn_private.h>
+#ifdef NEW_AQM
+#include <netpfil/ipfw/dn_aqm.h>
+#endif
#include <netpfil/ipfw/dn_sched.h>
#else
#include <dn_test.h>
@@ -223,6 +226,9 @@ static struct dn_alg prio_desc = {
_SI( .new_queue = ) prio_new_queue,
_SI( .free_queue = ) prio_free_queue,
+#ifdef NEW_AQM
+ _SI( .getconfig = ) NULL,
+#endif
};
diff --git a/sys/netpfil/ipfw/dn_sched_qfq.c b/sys/netpfil/ipfw/dn_sched_qfq.c
index 02e2d8e..5e2549e 100644
--- a/sys/netpfil/ipfw/dn_sched_qfq.c
+++ b/sys/netpfil/ipfw/dn_sched_qfq.c
@@ -42,6 +42,9 @@
#include <netinet/ip_dummynet.h>
#include <netpfil/ipfw/dn_heap.h>
#include <netpfil/ipfw/ip_dn_private.h>
+#ifdef NEW_AQM
+#include <netpfil/ipfw/dn_aqm.h>
+#endif
#include <netpfil/ipfw/dn_sched.h>
#else
#include <dn_test.h>
@@ -837,6 +840,9 @@ static struct dn_alg qfq_desc = {
_SI( .free_fsk = ) NULL,
_SI( .new_queue = ) qfq_new_queue,
_SI( .free_queue = ) qfq_free_queue,
+#ifdef NEW_AQM
+ _SI( .getconfig = ) NULL,
+#endif
};
DECLARE_DNSCHED_MODULE(dn_qfq, &qfq_desc);
diff --git a/sys/netpfil/ipfw/dn_sched_rr.c b/sys/netpfil/ipfw/dn_sched_rr.c
index a4ec9b6..e45ee63 100644
--- a/sys/netpfil/ipfw/dn_sched_rr.c
+++ b/sys/netpfil/ipfw/dn_sched_rr.c
@@ -42,6 +42,9 @@
#include <netinet/ip_dummynet.h>
#include <netpfil/ipfw/dn_heap.h>
#include <netpfil/ipfw/ip_dn_private.h>
+#ifdef NEW_AQM
+#include <netpfil/ipfw/dn_aqm.h>
+#endif
#include <netpfil/ipfw/dn_sched.h>
#else
#include <dn_test.h>
@@ -309,6 +312,9 @@ static struct dn_alg rr_desc = {
_SI( .free_fsk = ) NULL,
_SI( .new_queue = ) rr_new_queue,
_SI( .free_queue = ) rr_free_queue,
+#ifdef NEW_AQM
+ _SI( .getconfig = ) NULL,
+#endif
};
diff --git a/sys/netpfil/ipfw/dn_sched_wf2q.c b/sys/netpfil/ipfw/dn_sched_wf2q.c
index a91c1ce..06f92a9 100644
--- a/sys/netpfil/ipfw/dn_sched_wf2q.c
+++ b/sys/netpfil/ipfw/dn_sched_wf2q.c
@@ -43,6 +43,9 @@
#include <netinet/ip_dummynet.h>
#include <netpfil/ipfw/dn_heap.h>
#include <netpfil/ipfw/ip_dn_private.h>
+#ifdef NEW_AQM
+#include <netpfil/ipfw/dn_aqm.h>
+#endif
#include <netpfil/ipfw/dn_sched.h>
#else
#include <dn_test.h>
@@ -367,6 +370,10 @@ static struct dn_alg wf2qp_desc = {
_SI( .new_queue = ) wf2qp_new_queue,
_SI( .free_queue = ) wf2qp_free_queue,
+#ifdef NEW_AQM
+ _SI( .getconfig = ) NULL,
+#endif
+
};
diff --git a/sys/netpfil/ipfw/ip_dn_glue.c b/sys/netpfil/ipfw/ip_dn_glue.c
index 02fbf3e..4c4659a 100644
--- a/sys/netpfil/ipfw/ip_dn_glue.c
+++ b/sys/netpfil/ipfw/ip_dn_glue.c
@@ -55,6 +55,9 @@
#include <netpfil/ipfw/ip_fw_private.h>
#include <netpfil/ipfw/dn_heap.h>
#include <netpfil/ipfw/ip_dn_private.h>
+#ifdef NEW_AQM
+#include <netpfil/ipfw/dn_aqm.h>
+#endif
#include <netpfil/ipfw/dn_sched.h>
/* FREEBSD7.2 ip_dummynet.h r191715*/
diff --git a/sys/netpfil/ipfw/ip_dn_io.c b/sys/netpfil/ipfw/ip_dn_io.c
index e8319bb..831b909 100644
--- a/sys/netpfil/ipfw/ip_dn_io.c
+++ b/sys/netpfil/ipfw/ip_dn_io.c
@@ -63,6 +63,9 @@ __FBSDID("$FreeBSD$");
#include <netpfil/ipfw/ip_fw_private.h>
#include <netpfil/ipfw/dn_heap.h>
#include <netpfil/ipfw/ip_dn_private.h>
+#ifdef NEW_AQM
+#include <netpfil/ipfw/dn_aqm.h>
+#endif
#include <netpfil/ipfw/dn_sched.h>
/*
@@ -84,8 +87,12 @@ static long tick_diff;
static unsigned long io_pkt;
static unsigned long io_pkt_fast;
-static unsigned long io_pkt_drop;
+#ifdef NEW_AQM
+unsigned long io_pkt_drop;
+#else
+static unsigned long io_pkt_drop;
+#endif
/*
* We use a heap to store entities for which we have pending timer events.
* The heap is checked at every tick and all entities with expired events
@@ -148,7 +155,11 @@ SYSBEGIN(f4)
SYSCTL_DECL(_net_inet);
SYSCTL_DECL(_net_inet_ip);
+#ifdef NEW_AQM
+SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet");
+#else
static SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet");
+#endif
/* wrapper to pass dn_cfg fields to SYSCTL_* */
//#define DC(x) (&(VNET_NAME(_base_dn_cfg).x))
@@ -250,6 +261,14 @@ static struct dn_pkt_tag *
dn_tag_get(struct mbuf *m)
{
struct m_tag *mtag = m_tag_first(m);
+#ifdef NEW_AQM
+ /* XXX: to skip ts m_tag. For Debugging only*/
+ if (mtag != NULL && mtag->m_tag_id == DN_AQM_MTAG_TS) {
+ m_tag_delete(m,mtag);
+ mtag = m_tag_first(m);
+ D("skip TS tag");
+ }
+#endif
KASSERT(mtag != NULL &&
mtag->m_tag_cookie == MTAG_ABI_COMPAT &&
mtag->m_tag_id == PACKET_TAG_DUMMYNET,
@@ -257,6 +276,7 @@ dn_tag_get(struct mbuf *m)
return (struct dn_pkt_tag *)(mtag+1);
}
+#ifndef NEW_AQM
static inline void
mq_append(struct mq *q, struct mbuf *m)
{
@@ -296,6 +316,7 @@ mq_append(struct mq *q, struct mbuf *m)
q->tail = m;
m->m_nextpkt = NULL;
}
+#endif
/*
* Dispose a list of packet. Use a functions so if we need to do
@@ -420,7 +441,10 @@ red_drops (struct dn_queue *q, int len)
/*
* ECN/ECT Processing (partially adopted from altq)
*/
-static int
+#ifndef NEW_AQM
+static
+#endif
+int
ecn_mark(struct mbuf* m)
{
struct ip *ip;
@@ -503,6 +527,11 @@ dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop)
goto drop;
if (f->plr && random() < f->plr)
goto drop;
+#ifdef NEW_AQM
+ /* Call AQM enqueue function */
+ if (q->fs->aqmfp)
+ return q->fs->aqmfp->enqueue(q ,m);
+#endif
if (f->flags & DN_IS_RED && red_drops(q, m->m_pkthdr.len)) {
if (!(f->flags & DN_IS_ECN) || !ecn_mark(m))
goto drop;
@@ -890,6 +919,10 @@ dummynet_io(struct mbuf **m0, int dir, struct ip_fw_args *fwa)
if (fs->sched->fp->enqueue(si, q, m)) {
/* packet was dropped by enqueue() */
m = *m0 = NULL;
+
+ /* dn_enqueue already increases io_pkt_drop */
+ io_pkt_drop--;
+
goto dropit;
}
diff --git a/sys/netpfil/ipfw/ip_dn_private.h b/sys/netpfil/ipfw/ip_dn_private.h
index 42aa378..2fce136 100644
--- a/sys/netpfil/ipfw/ip_dn_private.h
+++ b/sys/netpfil/ipfw/ip_dn_private.h
@@ -81,6 +81,10 @@ SLIST_HEAD(dn_fsk_head, dn_fsk);
SLIST_HEAD(dn_queue_head, dn_queue);
SLIST_HEAD(dn_alg_head, dn_alg);
+#ifdef NEW_AQM
+SLIST_HEAD(dn_aqm_head, dn_aqm); /* for new AQMs */
+#endif
+
struct mq { /* a basic queue of packets*/
struct mbuf *head, *tail;
int count;
@@ -136,6 +140,9 @@ struct dn_parms {
/* list of flowsets without a scheduler -- use sch_chain */
struct dn_fsk_head fsu; /* list of unlinked flowsets */
struct dn_alg_head schedlist; /* list of algorithms */
+#ifdef NEW_AQM
+ struct dn_aqm_head aqmlist; /* list of AQMs */
+#endif
/* Store the fs/sch to scan when draining. The value is the
* bucket number of the hash table. Expire can be disabled
@@ -232,6 +239,10 @@ struct dn_fsk { /* kernel side of a flowset */
int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */
int avg_pkt_size ; /* medium packet size */
int max_pkt_size ; /* max packet size */
+#ifdef NEW_AQM
+ struct dn_aqm *aqmfp; /* Pointer to AQM functions */
+ void *aqmcfg; /* configuration parameters for AQM */
+#endif
};
/*
@@ -254,6 +265,9 @@ struct dn_queue {
int count; /* arrivals since last RED drop */
int random; /* random value (scaled) */
uint64_t q_time; /* start of queue idle time */
+#ifdef NEW_AQM
+ void *aqm_status; /* per-queue status variables*/
+#endif
};
@@ -401,4 +415,49 @@ int do_config(void *p, int l);
void dn_drain_scheduler(void);
void dn_drain_queue(void);
+#ifdef NEW_AQM
+int ecn_mark(struct mbuf* m);
+
+/* moved from ip_dn_io.c to here to be available for AQMs modules*/
+static inline void
+mq_append(struct mq *q, struct mbuf *m)
+{
+#ifdef USERSPACE
+ // buffers from netmap need to be copied
+ // XXX note that the routine is not expected to fail
+ ND("append %p to %p", m, q);
+ if (m->m_flags & M_STACK) {
+ struct mbuf *m_new;
+ void *p;
+ int l, ofs;
+
+ ofs = m->m_data - m->__m_extbuf;
+ // XXX allocate
+ MGETHDR(m_new, M_NOWAIT, MT_DATA);
+ ND("*** WARNING, volatile buf %p ext %p %d dofs %d m_new %p",
+ m, m->__m_extbuf, m->__m_extlen, ofs, m_new);
+ p = m_new->__m_extbuf; /* new pointer */
+ l = m_new->__m_extlen; /* new len */
+ if (l <= m->__m_extlen) {
+ panic("extlen too large");
+ }
+
+ *m_new = *m; // copy
+ m_new->m_flags &= ~M_STACK;
+ m_new->__m_extbuf = p; // point to new buffer
+ _pkt_copy(m->__m_extbuf, p, m->__m_extlen);
+ m_new->m_data = p + ofs;
+ m = m_new;
+ }
+#endif /* USERSPACE */
+ /* append 'm' to the tail of packet queue 'q' (singly-linked list) */
+ if (q->head == NULL)
+ q->head = m;
+ else
+ q->tail->m_nextpkt = m;
+ q->count++;
+ q->tail = m;
+ m->m_nextpkt = NULL;
+}
+#endif /* NEW_AQM */
+
#endif /* _IP_DN_PRIVATE_H */
diff --git a/sys/netpfil/ipfw/ip_dummynet.c b/sys/netpfil/ipfw/ip_dummynet.c
index 941e0a3..dbd5aa7 100644
--- a/sys/netpfil/ipfw/ip_dummynet.c
+++ b/sys/netpfil/ipfw/ip_dummynet.c
@@ -1,4 +1,11 @@
/*-
+ * Codel/FQ_Codel and PIE/FQ-PIE Code:
+ * Copyright (C) 2016 Centre for Advanced Internet Architectures,
+ * Swinburne University of Technology, Melbourne, Australia.
+ * Portions of this code were made possible in part by a gift from
+ * The Comcast Innovation Fund.
+ * Implemented by Rasool Al-Saadi <ralsaadi@swin.edu.au>
+ *
* Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa
* Portions Copyright (c) 2000 Akamba Corp.
* All rights reserved
@@ -58,6 +65,9 @@ __FBSDID("$FreeBSD$");
#include <netpfil/ipfw/ip_fw_private.h>
#include <netpfil/ipfw/dn_heap.h>
#include <netpfil/ipfw/ip_dn_private.h>
+#ifdef NEW_AQM
+#include <netpfil/ipfw/dn_aqm.h>
+#endif
#include <netpfil/ipfw/dn_sched.h>
/* which objects to copy */
@@ -98,6 +108,21 @@ dn_reschedule(void)
}
/*----- end of callout hooks -----*/
+#ifdef NEW_AQM
+/* Return AQM descriptor for given type or name. */
+static struct dn_aqm *
+find_aqm_type(int type, char *name)
+{
+ struct dn_aqm *d;
+
+ /* match on numeric type, or (if 'name' given) case-insensitive name */
+ SLIST_FOREACH(d, &dn_cfg.aqmlist, next) {
+ if (d->type == type || (name && !strcasecmp(d->name, name)))
+ return d;
+ }
+ return NULL; /* not found */
+}
+#endif
+
/* Return a scheduler descriptor given the type or name. */
static struct dn_alg *
find_sched_type(int type, char *name)
@@ -320,7 +345,15 @@ q_new(uintptr_t key, int flags, void *arg)
if (fs->sched->fp->new_queue)
fs->sched->fp->new_queue(q);
+
+#ifdef NEW_AQM
+ /* call AQM init function after creating a queue*/
+ if (fs->aqmfp && fs->aqmfp->init)
+ if(fs->aqmfp->init(q))
+ D("unable to init AQM for fs %d", fs->fs.fs_nr);
+#endif
dn_cfg.queue_count++;
+
return q;
}
@@ -334,6 +367,13 @@ dn_delete_queue(struct dn_queue *q, int flags)
{
struct dn_fsk *fs = q->fs;
+#ifdef NEW_AQM
+ /* clean up AQM status for queue 'q'
+ * cleanup here is called just with MULTIQUEUE
+ */
+ if (fs && fs->aqmfp && fs->aqmfp->cleanup)
+ fs->aqmfp->cleanup(q);
+#endif
// D("fs %p si %p\n", fs, q->_si);
/* notify the parent scheduler that the queue is going away */
if (fs && fs->sched->fp->free_queue)
@@ -475,6 +515,16 @@ si_new(uintptr_t key, int flags, void *arg)
if (s->sch.flags & DN_HAVE_MASK)
si->ni.fid = *(struct ipfw_flow_id *)key;
+#ifdef NEW_AQM
+ /* init AQM status for !DN_MULTIQUEUE sched*/
+ if (!(s->fp->flags & DN_MULTIQUEUE))
+ if (s->fs->aqmfp && s->fs->aqmfp->init)
+ if(s->fs->aqmfp->init((struct dn_queue *)(si + 1))) {
+ D("unable to init AQM for fs %d", s->fs->fs.fs_nr);
+ goto error;
+ }
+#endif
+
dn_cfg.si_count++;
return si;
@@ -504,6 +554,20 @@ si_destroy(void *_si, void *arg)
dn_free_pkts(dl->mq.head); /* drain delay line */
if (si->kflags & DN_ACTIVE) /* remove si from event heap */
heap_extract(&dn_cfg.evheap, si);
+
+#ifdef NEW_AQM
+ /* clean up AQM status for !DN_MULTIQUEUE sched
+ * Note that all queues belong to fs were cleaned up in fsk_detach.
+ * When drain_scheduler is called s->fs and q->fs are pointing
+ * to a correct fs, so we can use fs in this case.
+ */
+ if (!(s->fp->flags & DN_MULTIQUEUE)) {
+ struct dn_queue *q = (struct dn_queue *)(si + 1);
+ if (q->aqm_status && q->fs->aqmfp)
+ if (q->fs->aqmfp->cleanup)
+ q->fs->aqmfp->cleanup(q);
+ }
+#endif
if (s->fp->free_sched)
s->fp->free_sched(si);
bzero(si, sizeof(*si)); /* safety */
@@ -592,6 +656,67 @@ fsk_new(uintptr_t key, int flags, void *arg)
return fs;
}
+#ifdef NEW_AQM
+/* callback function for cleaning up AQM queue status belongs to a flowset
+ * connected to scheduler instance '_si' (for !DN_MULTIQUEUE only).
+ */
+static int
+si_cleanup_q(void *_si, void *arg)
+{
+ struct dn_sch_inst *si = _si;
+
+ /* for !DN_MULTIQUEUE the single queue sits right after 'si' */
+ if (!(si->sched->fp->flags & DN_MULTIQUEUE)) {
+ if (si->sched->fs->aqmfp && si->sched->fs->aqmfp->cleanup)
+ si->sched->fs->aqmfp->cleanup((struct dn_queue *) (si+1));
+ }
+ return 0; /* 0 keeps the hash-table scan going */
+}
+
+/* callback to clean up queue AQM status.*/
+static int
+q_cleanup_q(void *_q, void *arg)
+{
+ struct dn_queue *q = _q;
+ /* NOTE(review): aqmfp/cleanup are assumed non-NULL here -- callers
+ * should guarantee this; compare the guarded call in si_cleanup_q */
+ q->fs->aqmfp->cleanup(q);
+ return 0; /* 0 keeps the hash-table scan going */
+}
+
+/* Clean up all AQM queues status belongs to flowset 'fs' and then
+ * deconfig AQM for flowset 'fs'
+ */
+static void
+aqm_cleanup_deconfig_fs(struct dn_fsk *fs)
+{
+ struct dn_sch_inst *si;
+
+ /* clean up AQM status for all queues for !DN_MULTIQUEUE sched.
+ * fs_nr > DN_MAX_ID identifies a flowset attached to a scheduler.
+ */
+ if (fs->fs.fs_nr > DN_MAX_ID) {
+ if (fs->sched && !(fs->sched->fp->flags & DN_MULTIQUEUE)) {
+ if (fs->sched->sch.flags & DN_HAVE_MASK)
+ dn_ht_scan(fs->sched->siht, si_cleanup_q, NULL);
+ else {
+ /* single si i.e. no sched mask */
+ si = (struct dn_sch_inst *) fs->sched->siht;
+ if (si && fs->aqmfp && fs->aqmfp->cleanup)
+ fs->aqmfp->cleanup((struct dn_queue *) (si+1));
+ }
+ }
+ }
+
+ /* clean up AQM status for all queues for DN_MULTIQUEUE sched*/
+ if (fs->sched && fs->sched->fp->flags & DN_MULTIQUEUE && fs->qht) {
+ if (fs->fs.flags & DN_QHT_HASH)
+ dn_ht_scan(fs->qht, q_cleanup_q, NULL);
+ else
+ fs->aqmfp->cleanup((struct dn_queue *)(fs->qht));
+ }
+
+ /* deconfig AQM: release per-flowset AQM configuration */
+ if(fs->aqmcfg && fs->aqmfp && fs->aqmfp->deconfig)
+ fs->aqmfp->deconfig(fs);
+}
+#endif
+
/*
* detach flowset from its current scheduler. Flags as follows:
* DN_DETACH removes from the fsk_list
@@ -620,6 +745,10 @@ fsk_detach(struct dn_fsk *fs, int flags)
free(fs->w_q_lookup, M_DUMMYNET);
fs->w_q_lookup = NULL;
qht_delete(fs, flags);
+#ifdef NEW_AQM
+ aqm_cleanup_deconfig_fs(fs);
+#endif
+
if (fs->sched && fs->sched->fp->free_fsk)
fs->sched->fp->free_fsk(fs);
fs->sched = NULL;
@@ -1191,6 +1320,183 @@ update_fs(struct dn_schk *s)
}
}
+#ifdef NEW_AQM
+/* Retrieve AQM configurations to ipfw userland
+ */
+static int
+get_aqm_parms(struct sockopt *sopt)
+{
+ struct dn_extra_parms *ep;
+ struct dn_fsk *fs;
+ size_t sopt_valsize;
+ int l, err = 0;
+
+ /* save and restore sopt_valsize around sooptcopyin, which clobbers it */
+ sopt_valsize = sopt->sopt_valsize;
+ l = sizeof(*ep);
+ if (sopt->sopt_valsize < l) {
+ D("bad len sopt->sopt_valsize %d len %d",
+ (int) sopt->sopt_valsize , l);
+ err = EINVAL;
+ return err;
+ }
+ ep = malloc(l, M_DUMMYNET, M_WAITOK);
+ /* NOTE(review): malloc(M_WAITOK) cannot return NULL on FreeBSD,
+ * so this check is redundant (harmless) */
+ if(!ep) {
+ err = ENOMEM ;
+ return err;
+ }
+ do {
+ err = sooptcopyin(sopt, ep, l, l);
+ if(err)
+ break;
+ sopt->sopt_valsize = sopt_valsize;
+ if (ep->oid.len < l) {
+ err = EINVAL;
+ break;
+ }
+
+ /* look up the flowset the request refers to */
+ fs = dn_ht_find(dn_cfg.fshash, ep->nr, 0, NULL);
+ if (!fs) {
+ D("fs %d not found", ep->nr);
+ err = EINVAL;
+ break;
+ }
+
+ /* delegate to the AQM module's getconfig and copy back to userland */
+ if (fs->aqmfp && fs->aqmfp->getconfig) {
+ if(fs->aqmfp->getconfig(fs, ep)) {
+ D("Error while trying to get AQM params");
+ err = EINVAL;
+ break;
+ }
+ ep->oid.len = l;
+ err = sooptcopyout(sopt, ep, l);
+ }
+ }while(0);
+
+ free(ep, M_DUMMYNET);
+ return err;
+}
+
+/* Retrieve scheduler configurations and pass them to ipfw userland
+ */
+static int
+get_sched_parms(struct sockopt *sopt)
+{
+ struct dn_extra_parms *ep;
+ struct dn_schk *schk;
+ size_t sopt_valsize;
+ int l, err = 0;
+
+ /* save and restore sopt_valsize around sooptcopyin, which clobbers it */
+ sopt_valsize = sopt->sopt_valsize;
+ l = sizeof(*ep);
+ if (sopt->sopt_valsize < l) {
+ D("bad len sopt->sopt_valsize %d len %d",
+ (int) sopt->sopt_valsize , l);
+ err = EINVAL;
+ return err;
+ }
+ ep = malloc(l, M_DUMMYNET, M_WAITOK);
+ /* NOTE(review): malloc(M_WAITOK) cannot return NULL on FreeBSD,
+ * so this check is redundant (harmless) */
+ if(!ep) {
+ err = ENOMEM ;
+ return err;
+ }
+ do {
+ err = sooptcopyin(sopt, ep, l, l);
+ if(err)
+ break;
+ sopt->sopt_valsize = sopt_valsize;
+ if (ep->oid.len < l) {
+ err = EINVAL;
+ break;
+ }
+
+ /* look up the scheduler the request refers to */
+ schk = locate_scheduler(ep->nr);
+ if (!schk) {
+ D("sched %d not found", ep->nr);
+ err = EINVAL;
+ break;
+ }
+
+ /* delegate to the scheduler's getconfig and copy back to userland */
+ if (schk->fp && schk->fp->getconfig) {
+ if(schk->fp->getconfig(schk, ep)) {
+ D("Error while trying to get sched params");
+ err = EINVAL;
+ break;
+ }
+ ep->oid.len = l;
+ err = sooptcopyout(sopt, ep, l);
+ }
+ }while(0);
+ free(ep, M_DUMMYNET);
+
+ return err;
+}
+
+/* Configure AQM for flowset 'fs'.
+ * extra parameters are passed from userland.
+ */
+static int
+config_aqm(struct dn_fsk *fs, struct dn_extra_parms *ep, int busy)
+{
+ int err = 0;
+
+ do {
+ /* no configurations */
+ if (!ep) {
+ err = 0;
+ break;
+ }
+
+ /* empty name means: no AQM for this flowset */
+ if (!strcmp(ep->name,"")) {
+ err = 0;
+ break;
+ }
+ if (ep->oid.len < sizeof(*ep)) {
+ D("short aqm len %d", ep->oid.len);
+ err = EINVAL;
+ break;
+ }
+
+ /* 'busy' means the flowset has live scheduler instance(s) */
+ if (busy) {
+ D("Unable to configure flowset, flowset busy!");
+ err = EINVAL;
+ break;
+ }
+
+ /* deconfigure old aqm if exist */
+ if (fs->aqmcfg && fs->aqmfp && fs->aqmfp->deconfig) {
+ aqm_cleanup_deconfig_fs(fs);
+ }
+
+ /* resolve the AQM module by name and flag the flowset */
+ if (!(fs->aqmfp = find_aqm_type(0, ep->name))) {
+ D("AQM functions not found for type %s!", ep->name);
+ fs->fs.flags &= ~DN_IS_AQM;
+ err = EINVAL;
+ break;
+ } else
+ fs->fs.flags |= DN_IS_AQM;
+
+ if (ep->oid.subtype != DN_AQM_PARAMS) {
+ D("Wrong subtype");
+ err = EINVAL;
+ break;
+ }
+
+ /* hand the parameters to the module; roll back flags on failure */
+ if (fs->aqmfp->config) {
+ err = fs->aqmfp->config(fs, ep, ep->oid.len);
+ if (err) {
+ D("Unable to configure AQM for FS %d", fs->fs.fs_nr );
+ fs->fs.flags &= ~DN_IS_AQM;
+ fs->aqmfp = NULL;
+ break;
+ }
+ }
+ } while(0);
+
+ return err;
+}
+#endif
+
/*
* Configuration -- to preserve backward compatibility we use
* the following scheme (N is 65536)
@@ -1323,6 +1629,14 @@ config_fs(struct dn_fs *nfs, struct dn_id *arg, int locked)
}
if (bcmp(&fs->fs, nfs, sizeof(*nfs)) == 0) {
ND("flowset %d unchanged", i);
+#ifdef NEW_AQM
+ /* reconfigure AQM as the parameters can be changed.
+ * we consider the flowsetis busy if it has scheduler instance(s)
+ */
+ s = locate_scheduler(nfs->sched_nr);
+ config_aqm(fs, (struct dn_extra_parms *) arg,
+ s != NULL && s->siht != NULL);
+#endif
break; /* no change, nothing to do */
}
if (oldc != dn_cfg.fsk_count) /* new item */
@@ -1341,6 +1655,10 @@ config_fs(struct dn_fs *nfs, struct dn_id *arg, int locked)
fsk_detach(fs, flags);
}
fs->fs = *nfs; /* copy configuration */
+#ifdef NEW_AQM
+ fs->aqmfp = NULL;
+ config_aqm(fs, (struct dn_extra_parms *) arg, s != NULL && s->siht != NULL);
+#endif
if (s != NULL)
fsk_attach(fs, s);
} while (0);
@@ -1866,6 +2184,19 @@ dummynet_get(struct sockopt *sopt, void **compat)
// cmd->id = sopt_valsize;
D("compatibility mode");
}
+
+#ifdef NEW_AQM
+ /* get AQM params */
+ if(cmd->subtype == DN_AQM_PARAMS) {
+ error = get_aqm_parms(sopt);
+ goto done;
+ /* get Scheduler params */
+ } else if (cmd->subtype == DN_SCH_PARAMS) {
+ error = get_sched_parms(sopt);
+ goto done;
+ }
+#endif
+
a.extra = (struct copy_range *)cmd;
if (cmd->len == sizeof(*cmd)) { /* no range, create a default */
uint32_t *rp = (uint32_t *)(cmd + 1);
@@ -2318,4 +2649,98 @@ MODULE_VERSION(dummynet, 3);
*/
//VNET_SYSUNINIT(vnet_dn_uninit, DN_SI_SUB, DN_MODEV_ORD+2, ip_dn_destroy, NULL);
+#ifdef NEW_AQM
+
+/* modevent helpers for the AQM modules */
+
+/* Register AQM algorithm 'd' with dummynet: validate mandatory hooks,
+ * then insert it into dn_cfg.aqmlist unless an algorithm with the same
+ * name is already present.
+ * Returns 0 on success, 1 on failure (NULL descriptor, missing hooks,
+ * or duplicate name).
+ * NOTE(review): 1 is not an errno value; the modevent framework expects
+ * errno-style returns — confirm this matches the sibling scheduler
+ * loader's convention.
+ */
+static int
+load_dn_aqm(struct dn_aqm *d)
+{
+ struct dn_aqm *aqm=NULL;
+
+ if (d == NULL)
+ return 1; /* error */
+ ip_dn_init(); /* just in case, we need the lock */
+
+ /* Check that mandatory funcs exists */
+ if (d->enqueue == NULL || d->dequeue == NULL) {
+ D("missing enqueue or dequeue for %s", d->name);
+ return 1;
+ }
+
+ /* Search if AQM already exists; on loop exit 'aqm' is non-NULL
+ * iff a duplicate was found */
+ DN_BH_WLOCK();
+ SLIST_FOREACH(aqm, &dn_cfg.aqmlist, next) {
+ if (strcmp(aqm->name, d->name) == 0) {
+ D("%s already loaded", d->name);
+ break; /* AQM already exists */
+ }
+ }
+ if (aqm == NULL)
+ SLIST_INSERT_HEAD(&dn_cfg.aqmlist, d, next);
+ DN_BH_WUNLOCK();
+ D("dn_aqm %s %sloaded", d->name, aqm ? "not ":"");
+ return aqm ? 1 : 0;
+}
+
+
+/* Callback to clean up AQM status for queues connected to a flowset
+ * and then deconfigure the flowset.
+ * This function is called before an AQM module is unloaded.
+ * '_fs' is the flowset being scanned (dn_ht_scan callback argument);
+ * 'arg' points to the uint32_t type id of the AQM being unloaded.
+ * Always returns 0 so the hash-table scan continues over all flowsets.
+ */
+static int
+fs_cleanup(void *_fs, void *arg)
+{
+ struct dn_fsk *fs = _fs;
+ uint32_t type = *(uint32_t *)arg;
+
+ /* only touch flowsets that use the AQM type being unloaded */
+ if (fs->aqmfp && fs->aqmfp->type == type)
+ aqm_cleanup_deconfig_fs(fs);
+
+ return 0;
+}
+
+/* Unregister AQM algorithm 'aqm' from dummynet: first deconfigure every
+ * flowset that uses it, then remove it from dn_cfg.aqmlist unless it is
+ * still referenced.
+ * Returns 0 on success, EBUSY if queues or flowsets still hold
+ * references.  If the name is not found at all, 0 is returned.
+ */
+static int
+unload_dn_aqm(struct dn_aqm *aqm)
+{
+ struct dn_aqm *tmp, *r;
+ int err = EINVAL;
+ /* NOTE(review): the EINVAL initializer above is a dead store —
+ * it is overwritten immediately below */
+ err = 0;
+ ND("called for %s", aqm->name);
+
+ DN_BH_WLOCK();
+
+ /* clean up AQM status and deconfig flowset */
+ dn_ht_scan(dn_cfg.fshash, fs_cleanup, &aqm->type);
+
+ /* find the entry by name; refuse removal while any queue
+ * (ref_count) or flowset config (cfg_ref_count) still uses it */
+ SLIST_FOREACH_SAFE(r, &dn_cfg.aqmlist, next, tmp) {
+ if (strcmp(aqm->name, r->name) != 0)
+ continue;
+ ND("ref_count = %d", r->ref_count);
+ err = (r->ref_count != 0 || r->cfg_ref_count != 0) ? EBUSY : 0;
+ if (err == 0)
+ SLIST_REMOVE(&dn_cfg.aqmlist, r, dn_aqm, next);
+ break;
+ }
+ DN_BH_WUNLOCK();
+ D("%s %sunloaded", aqm->name, err ? "not ":"");
+ /* err != 0 only when the break above was taken, so 'r' is valid here */
+ if (err)
+ D("ref_count=%d, cfg_ref_count=%d", r->ref_count, r->cfg_ref_count);
+ return err;
+}
+
+/* Generic modevent handler shared by all AQM kernel modules; 'arg' is
+ * the module's struct dn_aqm descriptor.  Dispatches MOD_LOAD/MOD_UNLOAD
+ * to the helpers above and rejects every other event with EINVAL.
+ */
+int
+dn_aqm_modevent(module_t mod, int cmd, void *arg)
+{
+ struct dn_aqm *aqm = arg;
+
+ if (cmd == MOD_LOAD)
+ return load_dn_aqm(aqm);
+ else if (cmd == MOD_UNLOAD)
+ return unload_dn_aqm(aqm);
+ else
+ return EINVAL;
+}
+#endif
+
/* end of file */
+
OpenPOWER on IntegriCloud