Diffstat (limited to 'sys/netpfil')
-rw-r--r--  sys/netpfil/ipfw/dn_heap.c  552
-rw-r--r--  sys/netpfil/ipfw/dn_heap.h  191
-rw-r--r--  sys/netpfil/ipfw/dn_sched.h  191
-rw-r--r--  sys/netpfil/ipfw/dn_sched_fifo.c  120
-rw-r--r--  sys/netpfil/ipfw/dn_sched_prio.c  229
-rw-r--r--  sys/netpfil/ipfw/dn_sched_qfq.c  864
-rw-r--r--  sys/netpfil/ipfw/dn_sched_rr.c  307
-rw-r--r--  sys/netpfil/ipfw/dn_sched_wf2q.c  373
-rw-r--r--  sys/netpfil/ipfw/dummynet.txt  860
-rw-r--r--  sys/netpfil/ipfw/ip_dn_glue.c  846
-rw-r--r--  sys/netpfil/ipfw/ip_dn_io.c  857
-rw-r--r--  sys/netpfil/ipfw/ip_dn_private.h  403
-rw-r--r--  sys/netpfil/ipfw/ip_dummynet.c  2315
-rw-r--r--  sys/netpfil/ipfw/ip_fw2.c  2791
-rw-r--r--  sys/netpfil/ipfw/ip_fw_dynamic.c  1245
-rw-r--r--  sys/netpfil/ipfw/ip_fw_log.c  553
-rw-r--r--  sys/netpfil/ipfw/ip_fw_nat.c  662
-rw-r--r--  sys/netpfil/ipfw/ip_fw_pfil.c  590
-rw-r--r--  sys/netpfil/ipfw/ip_fw_private.h  309
-rw-r--r--  sys/netpfil/ipfw/ip_fw_sockopt.c  1449
-rw-r--r--  sys/netpfil/ipfw/ip_fw_table.c  762
-rw-r--r--  sys/netpfil/ipfw/test/Makefile  51
-rw-r--r--  sys/netpfil/ipfw/test/dn_test.h  175
-rw-r--r--  sys/netpfil/ipfw/test/main.c  636
-rw-r--r--  sys/netpfil/ipfw/test/mylist.h  49
-rw-r--r--  sys/netpfil/ipfw/test/test_dn_heap.c  162
-rw-r--r--  sys/netpfil/ipfw/test/test_dn_sched.c  89
-rw-r--r--  sys/netpfil/pf/if_pflog.c  290
-rw-r--r--  sys/netpfil/pf/if_pfsync.c  2397
-rw-r--r--  sys/netpfil/pf/in4_cksum.c  120
-rw-r--r--  sys/netpfil/pf/pf.c  6271
-rw-r--r--  sys/netpfil/pf/pf_if.c  859
-rw-r--r--  sys/netpfil/pf/pf_ioctl.c  3774
-rw-r--r--  sys/netpfil/pf/pf_lb.c  663
-rw-r--r--  sys/netpfil/pf/pf_norm.c  1999
-rw-r--r--  sys/netpfil/pf/pf_osfp.c  526
-rw-r--r--  sys/netpfil/pf/pf_ruleset.c  424
-rw-r--r--  sys/netpfil/pf/pf_table.c  2191
38 files changed, 37145 insertions, 0 deletions
diff --git a/sys/netpfil/ipfw/dn_heap.c b/sys/netpfil/ipfw/dn_heap.c
new file mode 100644
index 0000000..1d58511
--- /dev/null
+++ b/sys/netpfil/ipfw/dn_heap.c
@@ -0,0 +1,552 @@
+/*-
+ * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Binary heap and hash tables, used in dummynet
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#ifdef _KERNEL
+__FBSDID("$FreeBSD$");
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <netpfil/ipfw/dn_heap.h>
+#ifndef log
+#define log(x, arg...)
+#endif
+
+#else /* !_KERNEL */
+
+#include <stdio.h>
+#include <dn_test.h>
+#include <strings.h>
+#include <stdlib.h>
+
+#include "dn_heap.h"
+#define log(x, arg...) fprintf(stderr, ## arg)
+#define panic(x...) fprintf(stderr, ## x), exit(1)
+#define MALLOC_DEFINE(a, b, c)
+static void *my_malloc(int s) { return malloc(s); }
+static void my_free(void *p) { free(p); }
+#define malloc(s, t, w) my_malloc(s)
+#define free(p, t) my_free(p)
+#endif /* !_KERNEL */
+
+static MALLOC_DEFINE(M_DN_HEAP, "dummynet", "dummynet heap");
+
+/*
+ * Heap management functions.
+ *
+ * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2.
+ * Some macros help finding parent/children so we can optimize them.
+ *
+ * heap_resize() expands the heap when needed, rounding the new size
+ * up (to the next power of two, or to a multiple of 16 entries).
+ * heap_init() (re)initializes the heap; both return 1 on error, 0 on success.
+ */
+#define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 )
+#define HEAP_LEFT(x) ( (x)+(x) + 1 )
+#define HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; }
+#define HEAP_INCREMENT 15
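+
+/*
+ * Example of the index arithmetic above: for the node at index 5,
+ * HEAP_FATHER(5) = (5-1)/2 = 2 and its children are HEAP_LEFT(5) = 11
+ * and 12; element 0 is the root and has no father.
+ */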
+
+static int
+heap_resize(struct dn_heap *h, unsigned int new_size)
+{
+ struct dn_heap_entry *p;
+
+ if (h->size >= new_size ) /* have enough room */
+ return 0;
+#if 1 /* round to the next power of 2 */
+ new_size |= new_size >> 1;
+ new_size |= new_size >> 2;
+ new_size |= new_size >> 4;
+ new_size |= new_size >> 8;
+ new_size |= new_size >> 16;
+#else
+ new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT;
+#endif
+ p = malloc(new_size * sizeof(*p), M_DN_HEAP, M_NOWAIT);
+ if (p == NULL) {
+ printf("--- %s, resize %d failed\n", __func__, new_size );
+ return 1; /* error */
+ }
+ if (h->size > 0) {
+ bcopy(h->p, p, h->size * sizeof(*p) );
+ free(h->p, M_DN_HEAP);
+ }
+ h->p = p;
+ h->size = new_size;
+ return 0;
+}
+
+int
+heap_init(struct dn_heap *h, int size, int ofs)
+{
+ if (heap_resize(h, size))
+ return 1;
+ h->elements = 0;
+ h->ofs = ofs;
+ return 0;
+}
+
+/*
+ * Insert element in heap. Normally, p != NULL, we insert p in
+ * a new position and bubble up. If p == NULL, then the element is
+ * already in place, and key is the position where to start the
+ * bubble-up.
+ * Returns 1 on failure (cannot allocate new heap entry)
+ *
+ * If ofs > 0 the position (index, int) of the element in the heap is
+ * also stored in the element itself at the given offset in bytes.
+ */
+#define SET_OFFSET(h, i) do { \
+ if (h->ofs > 0) \
+ *((int32_t *)((char *)(h->p[i].object) + h->ofs)) = i; \
+ } while (0)
+/*
+ * RESET_OFFSET is used for sanity checks. It sets ofs
+ * to an invalid value.
+ */
+#define RESET_OFFSET(h, i) do { \
+ if (h->ofs > 0) \
+ *((int32_t *)((char *)(h->p[i].object) + h->ofs)) = -16; \
+ } while (0)
+
+int
+heap_insert(struct dn_heap *h, uint64_t key1, void *p)
+{
+ int son = h->elements;
+
+ //log("%s key %llu p %p\n", __FUNCTION__, key1, p);
+ if (p == NULL) { /* data already there, set starting point */
+ son = key1;
+ } else { /* insert new element at the end, possibly resize */
+ son = h->elements;
+ if (son == h->size) /* need resize... */
+ // XXX expand by 16 or so
+ if (heap_resize(h, h->elements+16) )
+ return 1; /* failure... */
+ h->p[son].object = p;
+ h->p[son].key = key1;
+ h->elements++;
+ }
+ /* make sure that son >= father along the path */
+ while (son > 0) {
+ int father = HEAP_FATHER(son);
+ struct dn_heap_entry tmp;
+
+ if (DN_KEY_LT( h->p[father].key, h->p[son].key ) )
+ break; /* found right position */
+ /* son smaller than father, swap and repeat */
+ HEAP_SWAP(h->p[son], h->p[father], tmp);
+ SET_OFFSET(h, son);
+ son = father;
+ }
+ SET_OFFSET(h, son);
+ return 0;
+}
+
+/*
+ * remove top element from heap, or obj if obj != NULL
+ */
+void
+heap_extract(struct dn_heap *h, void *obj)
+{
+ int child, father, max = h->elements - 1;
+
+ if (max < 0) {
+ printf("--- %s: empty heap 0x%p\n", __FUNCTION__, h);
+ return;
+ }
+ if (obj == NULL)
+ father = 0; /* default: move up smallest child */
+ else { /* extract specific element, index is at offset */
+ if (h->ofs <= 0)
+ panic("%s: extract from middle not set on %p\n",
+ __FUNCTION__, h);
+ father = *((int *)((char *)obj + h->ofs));
+ if (father < 0 || father >= h->elements) {
+ panic("%s: father %d out of bound 0..%d\n",
+ __FUNCTION__, father, h->elements);
+ }
+ }
+ /*
+ * below, father is the index of the empty element, which
+ * we replace at each step with the smallest child until we
+ * reach the bottom level.
+ */
+ // XXX why removing RESET_OFFSET increases runtime by 10% ?
+ RESET_OFFSET(h, father);
+ while ( (child = HEAP_LEFT(father)) <= max ) {
+ if (child != max &&
+ DN_KEY_LT(h->p[child+1].key, h->p[child].key) )
+ child++; /* take right child, otherwise left */
+ h->p[father] = h->p[child];
+ SET_OFFSET(h, father);
+ father = child;
+ }
+ h->elements--;
+ if (father != max) {
+ /*
+ * Fill hole with last entry and bubble up,
+ * reusing the insert code
+ */
+ h->p[father] = h->p[max];
+ heap_insert(h, father, NULL);
+ }
+}
+
+#if 0
+/*
+ * change object position and update references
+ * XXX this one is never used!
+ */
+static void
+heap_move(struct dn_heap *h, uint64_t new_key, void *object)
+{
+ int temp, i, max = h->elements-1;
+ struct dn_heap_entry *p, buf;
+
+ if (h->ofs <= 0)
+ panic("cannot move items on this heap");
+ p = h->p; /* shortcut */
+
+ i = *((int *)((char *)object + h->ofs));
+ if (DN_KEY_LT(new_key, p[i].key) ) { /* must move up */
+ p[i].key = new_key;
+ for (; i>0 &&
+ DN_KEY_LT(new_key, p[(temp = HEAP_FATHER(i))].key);
+ i = temp ) { /* bubble up */
+ HEAP_SWAP(p[i], p[temp], buf);
+ SET_OFFSET(h, i);
+ }
+ } else { /* must move down */
+ p[i].key = new_key;
+ while ( (temp = HEAP_LEFT(i)) <= max ) {
+ /* found left child */
+ if (temp != max &&
+ DN_KEY_LT(p[temp+1].key, p[temp].key))
+ temp++; /* select child with min key */
+ if (DN_KEY_LT(p[temp].key, new_key)) {
+ /* go down */
+ HEAP_SWAP(p[i], p[temp], buf);
+ SET_OFFSET(h, i);
+ } else
+ break;
+ i = temp;
+ }
+ }
+ SET_OFFSET(h, i);
+}
+#endif /* heap_move, unused */
+
+/*
+ * heapify() will reorganize data inside an array to maintain the
+ * heap property. It is needed when we delete a bunch of entries.
+ */
+static void
+heapify(struct dn_heap *h)
+{
+ int i;
+
+ for (i = 0; i < h->elements; i++ )
+ heap_insert(h, i , NULL);
+}
+
+int
+heap_scan(struct dn_heap *h, int (*fn)(void *, uintptr_t),
+ uintptr_t arg)
+{
+ int i, ret, found;
+
+ for (i = found = 0 ; i < h->elements ;) {
+ ret = fn(h->p[i].object, arg);
+ if (ret & HEAP_SCAN_DEL) {
+ h->elements-- ;
+ h->p[i] = h->p[h->elements] ;
+ found++ ;
+ } else
+ i++ ;
+ if (ret & HEAP_SCAN_END)
+ break;
+ }
+ if (found)
+ heapify(h);
+ return found;
+}
+
+/*
+ * cleanup the heap and free data structure
+ */
+void
+heap_free(struct dn_heap *h)
+{
+ if (h->size >0 )
+ free(h->p, M_DN_HEAP);
+ bzero(h, sizeof(*h) );
+}
+
+/*
+ * hash table support.
+ */
+
+struct dn_ht {
+ int buckets; /* how many buckets, really buckets - 1*/
+ int entries; /* how many entries */
+ int ofs; /* offset of link field */
+ uint32_t (*hash)(uintptr_t, int, void *arg);
+ int (*match)(void *_el, uintptr_t key, int, void *);
+ void *(*newh)(uintptr_t, int, void *);
+ void **ht; /* bucket heads */
+};
+/*
+ * Initialize, allocating bucket pointers inline.
+ * Recycle previous record if possible.
+ * If the 'newh' function is not supplied, we assume that the
+ * key passed to dn_ht_find() is itself the object to be stored.
+ */
+struct dn_ht *
+dn_ht_init(struct dn_ht *ht, int buckets, int ofs,
+ uint32_t (*h)(uintptr_t, int, void *),
+ int (*match)(void *, uintptr_t, int, void *),
+ void *(*newh)(uintptr_t, int, void *))
+{
+ int l;
+
+ /*
+ * Notes about rounding the bucket size to a power of two.
+ * Given the original bucket size, we compute the nearest lower and
+ * higher power of two, minus 1 (b_min and b_max respectively), because
+ * this value will be ANDed with the index returned by the hash
+ * function.
+ * To choose between the two, the original bucket size is compared
+ * with b_min: if it is larger than 4/3 b_min we round up to b_max,
+ * otherwise down to b_min. This picks the nearest power of two,
+ * favouring the larger size when the gap between the two powers is
+ * relatively big.
+ * Rounding the bucket size to a power of two avoids a modulo
+ * operation when computing the bucket: the ht->buckets variable
+ * stores the bucket size - 1, so an AND between the index returned
+ * by the hash function and ht->buckets replaces the modulo.
+ */
+ int b_min; /* min buckets */
+ int b_max; /* max buckets */
+ int b_ori; /* original buckets */
+
+ if (h == NULL || match == NULL) {
+ printf("--- missing hash or match function");
+ return NULL;
+ }
+ if (buckets < 1 || buckets > 65536)
+ return NULL;
+
+ b_ori = buckets;
+ /* calculate next power of 2, - 1*/
+ buckets |= buckets >> 1;
+ buckets |= buckets >> 2;
+ buckets |= buckets >> 4;
+ buckets |= buckets >> 8;
+ buckets |= buckets >> 16;
+
+ b_max = buckets; /* Next power */
+ b_min = buckets >> 1; /* Previous power */
+
+ /* Calculate the 'nearest' bucket size */
+ if (b_min * 4000 / 3000 < b_ori)
+ buckets = b_max;
+ else
+ buckets = b_min;
+
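+ /*
+ * Worked example of the rounding above: for an original size of 100,
+ * b_max = 127 and b_min = 63; since 63 * 4/3 = 84 < 100 we keep 127,
+ * so 128 bucket pointers are allocated and ht->buckets (127) is used
+ * as a mask on the hash index.
+ */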
+ if (ht) { /* see if we can reuse */
+ if (buckets <= ht->buckets) {
+ ht->buckets = buckets;
+ } else {
+ /* free pointers if not allocated inline */
+ if (ht->ht != (void *)(ht + 1))
+ free(ht->ht, M_DN_HEAP);
+ free(ht, M_DN_HEAP);
+ ht = NULL;
+ }
+ }
+ if (ht == NULL) {
+ /* Allocate buckets + 1 entries because 'buckets' is used to
+ * do the AND with the index returned by the hash function
+ */
+ l = sizeof(*ht) + (buckets + 1) * sizeof(void **);
+ ht = malloc(l, M_DN_HEAP, M_NOWAIT | M_ZERO);
+ }
+ if (ht) {
+ ht->ht = (void **)(ht + 1);
+ ht->buckets = buckets;
+ ht->ofs = ofs;
+ ht->hash = h;
+ ht->match = match;
+ ht->newh = newh;
+ }
+ return ht;
+}
+
+/* dummy callback for dn_ht_free to unlink all */
+static int
+do_del(void *obj, void *arg)
+{
+ return DNHT_SCAN_DEL;
+}
+
+void
+dn_ht_free(struct dn_ht *ht, int flags)
+{
+ if (ht == NULL)
+ return;
+ if (flags & DNHT_REMOVE) {
+ (void)dn_ht_scan(ht, do_del, NULL);
+ } else {
+ if (ht->ht && ht->ht != (void *)(ht + 1))
+ free(ht->ht, M_DN_HEAP);
+ free(ht, M_DN_HEAP);
+ }
+}
+
+int
+dn_ht_entries(struct dn_ht *ht)
+{
+ return ht ? ht->entries : 0;
+}
+
+/* lookup and optionally create or delete element */
+void *
+dn_ht_find(struct dn_ht *ht, uintptr_t key, int flags, void *arg)
+{
+ int i;
+ void **pp, *p;
+
+ if (ht == NULL) /* easy on an empty hash */
+ return NULL;
+ i = (ht->buckets == 1) ? 0 :
+ (ht->hash(key, flags, arg) & ht->buckets);
+
+ for (pp = &ht->ht[i]; (p = *pp); pp = (void **)((char *)p + ht->ofs)) {
+ if (flags & DNHT_MATCH_PTR) {
+ if (key == (uintptr_t)p)
+ break;
+ } else if (ht->match(p, key, flags, arg)) /* found match */
+ break;
+ }
+ if (p) {
+ if (flags & DNHT_REMOVE) {
+ /* link in the next element */
+ *pp = *(void **)((char *)p + ht->ofs);
+ *(void **)((char *)p + ht->ofs) = NULL;
+ ht->entries--;
+ }
+ } else if (flags & DNHT_INSERT) {
+ // printf("%s before calling new, bucket %d ofs %d\n",
+ // __FUNCTION__, i, ht->ofs);
+ p = ht->newh ? ht->newh(key, flags, arg) : (void *)key;
+ // printf("%s newh returns %p\n", __FUNCTION__, p);
+ if (p) {
+ ht->entries++;
+ *(void **)((char *)p + ht->ofs) = ht->ht[i];
+ ht->ht[i] = p;
+ }
+ }
+ return p;
+}
+
+/*
+ * do a scan with the option to delete the object. Extract next before
+ * running the callback because the element may be destroyed there.
+ */
+int
+dn_ht_scan(struct dn_ht *ht, int (*fn)(void *, void *), void *arg)
+{
+ int i, ret, found = 0;
+ void **curp, *cur, *next;
+
+ if (ht == NULL || fn == NULL)
+ return 0;
+ for (i = 0; i <= ht->buckets; i++) {
+ curp = &ht->ht[i];
+ while ( (cur = *curp) != NULL) {
+ next = *(void **)((char *)cur + ht->ofs);
+ ret = fn(cur, arg);
+ if (ret & DNHT_SCAN_DEL) {
+ found++;
+ ht->entries--;
+ *curp = next;
+ } else {
+ curp = (void **)((char *)cur + ht->ofs);
+ }
+ if (ret & DNHT_SCAN_END)
+ return found;
+ }
+ }
+ return found;
+}
+
+/*
+ * Similar to dn_ht_scan(), except that the scan is performed only
+ * on the bucket 'bucket'. If the original bucket number is invalid,
+ * a valid one is stored back through the pointer.
+ * If the callback returns DNHT_SCAN_END, the function moves the
+ * ht->ht[i] pointer to the last entry processed. Moreover, the
+ * bucket number passed by the caller is decremented, because the
+ * caller usually increments it.
+ */
+int
+dn_ht_scan_bucket(struct dn_ht *ht, int *bucket, int (*fn)(void *, void *),
+ void *arg)
+{
+ int i, ret, found = 0;
+ void **curp, *cur, *next;
+
+ if (ht == NULL || fn == NULL)
+ return 0;
+ if (*bucket > ht->buckets)
+ *bucket = 0;
+ i = *bucket;
+
+ curp = &ht->ht[i];
+ while ( (cur = *curp) != NULL) {
+ next = *(void **)((char *)cur + ht->ofs);
+ ret = fn(cur, arg);
+ if (ret & DNHT_SCAN_DEL) {
+ found++;
+ ht->entries--;
+ *curp = next;
+ } else {
+ curp = (void **)((char *)cur + ht->ofs);
+ }
+ if (ret & DNHT_SCAN_END)
+ return found;
+ }
+ return found;
+}
diff --git a/sys/netpfil/ipfw/dn_heap.h b/sys/netpfil/ipfw/dn_heap.h
new file mode 100644
index 0000000..c95473a
--- /dev/null
+++ b/sys/netpfil/ipfw/dn_heap.h
@@ -0,0 +1,191 @@
+/*-
+ * Copyright (c) 1998-2010 Luigi Rizzo, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Binary heap and hash tables, header file
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IP_DN_HEAP_H
+#define _IP_DN_HEAP_H
+
+#define DN_KEY_LT(a,b) ((int64_t)((a)-(b)) < 0)
+#define DN_KEY_LEQ(a,b) ((int64_t)((a)-(b)) <= 0)
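+
+/*
+ * The signed difference makes these comparisons robust to 64-bit key
+ * wraparound: e.g. DN_KEY_LT(0xfffffffffffffffeULL, 2ULL) is true
+ * because (int64_t)(0xfffffffffffffffe - 2) == -4, so a key just
+ * below the wrap point still sorts before one that has wrapped past 0.
+ */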
+
+/*
+ * This module implements a binary heap supporting random extraction.
+ *
+ * A heap entry contains a uint64_t key and an object pointer.
+ * DN_KEY_LT(a,b) returns true if key 'a' is smaller than 'b'.
+ *
+ * The heap is a struct dn_heap plus a dynamically allocated
+ * array of dn_heap_entry entries. 'size' is the size of
+ * the array, 'elements' counts the entries in use. The topmost
+ * element has the smallest key.
+ * The heap supports ordered insert, and extract from the top.
+ * To extract an object from the middle of the heap, the object
+ * must reserve an 'int32_t' field to store its position in the
+ * heap, and the offset of this field must be passed as an
+ * argument to heap_init() -- use -1 if the feature is not used.
+ */
+struct dn_heap_entry {
+ uint64_t key; /* sorting key, smallest comes first */
+ void *object; /* object pointer */
+};
+
+struct dn_heap {
+ int size; /* the size of the array */
+ int elements; /* elements in use */
+ int ofs; /* offset in the object of heap index */
+ struct dn_heap_entry *p; /* array of "size" entries */
+};
+
+enum {
+ HEAP_SCAN_DEL = 1,
+ HEAP_SCAN_END = 2,
+};
+
+/*
+ * heap_init() reinitializes the heap setting the size and the offset
+ * of the index for random extraction (use -1 if not used).
+ * The 'elements' counter is set to 0.
+ *
+ * SET_HEAP_OFS() indicates where, in the object, the index used for
+ * random extractions from the heap is stored.
+ *
+ * heap_free() frees the memory associated to a heap.
+ *
+ * heap_insert() adds a key-pointer pair to the heap
+ *
+ * HEAP_TOP() returns a pointer to the top element of the heap,
+ * but makes no checks on its existence (XXX should we change ?)
+ *
+ * heap_extract() removes the entry at the top, returning the pointer.
+ * (the key should have been read before).
+ *
+ * heap_scan() invokes a callback on each entry of the heap.
+ * The callback can return a combination of HEAP_SCAN_DEL and
+ * HEAP_SCAN_END. HEAP_SCAN_DEL means the current element must
+ * be removed, and HEAP_SCAN_END means to terminate the scan.
+ * heap_scan() returns the number of elements removed.
+ * Because the order is not guaranteed, we should use heap_scan()
+ * only as a last resort mechanism.
+ */
+#define HEAP_TOP(h) ((h)->p)
+#define SET_HEAP_OFS(h, n) do { (h)->ofs = n; } while (0)
+int heap_init(struct dn_heap *h, int size, int ofs);
+int heap_insert(struct dn_heap *h, uint64_t key1, void *p);
+void heap_extract(struct dn_heap *h, void *obj);
+void heap_free(struct dn_heap *h);
+int heap_scan(struct dn_heap *, int (*)(void *, uintptr_t), uintptr_t);
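+
+/*
+ * A minimal usage sketch (illustrative only; 'struct my_obj', 'o' and
+ * 'now' are hypothetical and supplied by the caller):
+ *
+ *	struct my_obj {
+ *		uint64_t deadline;
+ *		int32_t heap_pos;	// heap index, must not be at offset 0
+ *	};
+ *	struct dn_heap h;
+ *	struct my_obj *o;
+ *	uint64_t now;
+ *
+ *	bzero(&h, sizeof(h));
+ *	heap_init(&h, 16, offsetof(struct my_obj, heap_pos));
+ *	heap_insert(&h, o->deadline, o);	// ordered insert
+ *	while (h.elements > 0 && DN_KEY_LEQ(HEAP_TOP(&h)->key, now))
+ *		heap_extract(&h, NULL);		// pop the smallest key
+ *	heap_extract(&h, o);	// or: remove 'o' from the middle, via heap_pos
+ *	heap_free(&h);
+ */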
+
+/*------------------------------------------------------
+ * This module implements a generic hash table with support for
+ * running callbacks on the entire table. To avoid allocating
+ * memory during hash table operations, objects must reserve
+ * space for a link field. XXX if the heap is moderately full,
+ * an SLIST suffices, and we can tolerate the cost of a hash
+ * computation on each removal.
+ *
+ * dn_ht_init() initializes the table, setting the number of
+ * buckets, the offset of the link field, the main callbacks.
+ * Callbacks are:
+ *
+ * hash(key, flags, arg) called to return a bucket index.
+ * match(obj, key, flags, arg) called to determine if key
+ * matches the current 'obj' in the heap
+ * newh(key, flags, arg) optional, used to allocate a new
+ * object during insertions.
+ *
+ * dn_ht_free() frees the table or unlinks its elements:
+ * DNHT_REMOVE unlinks the elements, 0 frees the table itself.
+ * You need two calls to do both.
+ *
+ * dn_ht_find() is the main lookup function, which can also be
+ * used to insert or delete elements in the hash table.
+ * The final 'arg' is passed to all callbacks.
+ *
+ * dn_ht_scan() is used to invoke a callback on all entries of
+ * the heap, or possibly on just one bucket. The callback
+ * is invoked with a pointer to the object, and must return
+ * one of DNHT_SCAN_DEL or DNHT_SCAN_END to request the
+ * removal of the object from the heap and the end of the
+ * scan, respectively.
+ *
+ * dn_ht_scan_bucket() is similar to dn_ht_scan(), except that it scans
+ * only the specified bucket of the table. The bucket is an in-out
+ * parameter and is set to a valid bucket number if the original
+ * is invalid.
+ *
+ * A combination of flags can be used to modify the operation
+ * of the dn_ht_find(), and of the callbacks:
+ *
+ * DNHT_KEY_IS_OBJ means the key is the object pointer.
+ * It is usually of interest for the hash and match functions.
+ *
+ * DNHT_MATCH_PTR during a lookup, match pointers instead
+ * of calling match(). Normally used when removing specific
+ * entries. Does not imply KEY_IS_OBJ as the latter _is_ used
+ * by the match function.
+ *
+ * DNHT_INSERT inserts the element if not found.
+ * Calls newh() to allocate a new object unless
+ * DNHT_KEY_IS_OBJ is set.
+ *
+ * DNHT_UNIQUE only inserts if the object is not found.
+ * XXX should it imply DNHT_INSERT ?
+ *
+ * DNHT_REMOVE remove objects if we find them.
+ */
+struct dn_ht; /* should be opaque */
+
+struct dn_ht *dn_ht_init(struct dn_ht *, int buckets, int ofs,
+ uint32_t (*hash)(uintptr_t, int, void *),
+ int (*match)(void *, uintptr_t, int, void *),
+ void *(*newh)(uintptr_t, int, void *));
+void dn_ht_free(struct dn_ht *, int flags);
+
+void *dn_ht_find(struct dn_ht *, uintptr_t, int, void *);
+int dn_ht_scan(struct dn_ht *, int (*)(void *, void *), void *);
+int dn_ht_scan_bucket(struct dn_ht *, int * , int (*)(void *, void *), void *);
+int dn_ht_entries(struct dn_ht *);
+
+enum { /* flags values.
+ * first two are returned by the scan callback to indicate
+ * to delete the matching element or to end the scan
+ */
+ DNHT_SCAN_DEL = 0x0001,
+ DNHT_SCAN_END = 0x0002,
+ DNHT_KEY_IS_OBJ = 0x0004, /* key is the obj pointer */
+ DNHT_MATCH_PTR = 0x0008, /* match by pointer, not match() */
+ DNHT_INSERT = 0x0010, /* insert if not found */
+ DNHT_UNIQUE = 0x0020, /* report error if already there */
+ DNHT_REMOVE = 0x0040, /* remove on find or dn_ht_free */
+};
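+
+/*
+ * A minimal usage sketch (illustrative only; 'struct obj', obj_hash(),
+ * obj_match(), drop_all() and 'o' are hypothetical, supplied by the
+ * caller; here the lookup key is always the object pointer itself):
+ *
+ *	struct obj {
+ *		struct obj *ht_link;	// link field, offset given to dn_ht_init()
+ *		int id;
+ *	};
+ *	static uint32_t obj_hash(uintptr_t key, int flags, void *arg)
+ *	{ return (uint32_t)(key >> 4); }	// hash of the pointer
+ *	static int obj_match(void *obj, uintptr_t key, int flags, void *arg)
+ *	{ return obj == (void *)key; }
+ *	static int drop_all(void *obj, void *arg)
+ *	{ return DNHT_SCAN_DEL; }
+ *
+ *	struct dn_ht *ht;
+ *	struct obj *o;
+ *
+ *	ht = dn_ht_init(NULL, 128, offsetof(struct obj, ht_link),
+ *	    obj_hash, obj_match, NULL);	// no newh: the key is the object
+ *	dn_ht_find(ht, (uintptr_t)o, DNHT_INSERT | DNHT_KEY_IS_OBJ, NULL);
+ *	dn_ht_find(ht, (uintptr_t)o, DNHT_REMOVE | DNHT_MATCH_PTR, NULL);
+ *	dn_ht_scan(ht, drop_all, NULL);	// unlink any remaining entries
+ *	dn_ht_free(ht, 0);		// then free the table itself
+ */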
+
+#endif /* _IP_DN_HEAP_H */
diff --git a/sys/netpfil/ipfw/dn_sched.h b/sys/netpfil/ipfw/dn_sched.h
new file mode 100644
index 0000000..ab823fe
--- /dev/null
+++ b/sys/netpfil/ipfw/dn_sched.h
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2010 Riccardo Panicucci, Luigi Rizzo, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * The API to write a packet scheduling algorithm for dummynet.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _DN_SCHED_H
+#define _DN_SCHED_H
+
+#define DN_MULTIQUEUE 0x01
+/*
+ * Descriptor for a scheduling algorithm.
+ * Contains all function pointers for a given scheduler
+ * This is typically created when a module is loaded, and stored
+ * in a global list of schedulers.
+ */
+struct dn_alg {
+ uint32_t type; /* the scheduler type */
+ const char *name; /* scheduler name */
+ uint32_t flags; /* DN_MULTIQUEUE if supports multiple queues */
+
+ /*
+ * The following define the size of 3 optional data structures
+ * that may need to be allocated at runtime, and are appended
+ * to each of the base data structures: scheduler, sched.inst,
+ * and queue. We don't have a per-flowset structure.
+ */
+ /* + parameters attached to the template, e.g.
+ * default queue sizes, weights, quantum size, and so on;
+ */
+ size_t schk_datalen;
+
+ /* + per-instance parameters, such as timestamps,
+ * containers for queues, etc;
+ */
+ size_t si_datalen;
+
+ size_t q_datalen; /* per-queue parameters (e.g. S,F) */
+
+ /*
+ * Methods implemented by the scheduler:
+ * enqueue enqueue packet 'm' on scheduler 's', queue 'q'.
+ * q is NULL for !MULTIQUEUE.
+ * Return 0 on success, 1 on drop (packet consumed anyway).
+ * Note that q should be interpreted only as a hint
+ * on the flow that the mbuf belongs to: while a
+ * scheduler will normally enqueue m into q, it is ok
+ * to leave q alone and put the mbuf elsewhere.
+ * This function is called in two cases:
+ * - when a new packet arrives to the scheduler;
+ * - when a scheduler is reconfigured. In this case the
+ * call is issued by the new_queue callback, with a
+ * non empty queue (q) and m pointing to the first
+ * mbuf in the queue. For this reason, the function
+ * should internally check for (m != q->mq.head)
+ * before calling dn_enqueue().
+ *
+ * dequeue Called when scheduler instance 's' can
+ * dequeue a packet. Return NULL if none are available.
+ * XXX what about non work-conserving ?
+ *
+ * config called on 'sched X config ...', normally writes
+ * in the area of size sch_arg
+ *
+ * destroy called on 'sched delete', frees everything
+ * in sch_arg (other parts are handled by more specific
+ * functions)
+ *
+ * new_sched called when a new instance is created, e.g.
+ * to create the local queue for !MULTIQUEUE, set V or
+ * copy parameters for WFQ, and so on.
+ *
+ * free_sched called when deleting an instance, cleans
+ * extra data in the per-instance area.
+ *
+ * new_fsk called when a flowset is linked to a scheduler,
+ * e.g. to validate parameters such as weights etc.
+ * free_fsk when a flowset is unlinked from a scheduler.
+ * (probably unnecessary)
+ *
+ * new_queue called to set the per-queue parameters,
+ * e.g. S and F, adjust sum of weights in the parent, etc.
+ *
+ * The new_queue callback is normally called when a new
+ * queue is created. In some cases (such as a scheduler
+ * change or reconfiguration) it can be called with a
+ * non-empty queue; in that case the callback may need to
+ * call the enqueue function, and should eventually do so
+ * passing as m the first element in the queue.
+ *
+ * free_queue actions related to a queue removal, e.g. undo
+ * all the above. If the queue has data in it, also remove
+ * from the scheduler. This can e.g. happen during a reconfigure.
+ */
+ int (*enqueue)(struct dn_sch_inst *, struct dn_queue *,
+ struct mbuf *);
+ struct mbuf * (*dequeue)(struct dn_sch_inst *);
+
+ int (*config)(struct dn_schk *);
+ int (*destroy)(struct dn_schk*);
+ int (*new_sched)(struct dn_sch_inst *);
+ int (*free_sched)(struct dn_sch_inst *);
+ int (*new_fsk)(struct dn_fsk *f);
+ int (*free_fsk)(struct dn_fsk *f);
+ int (*new_queue)(struct dn_queue *q);
+ int (*free_queue)(struct dn_queue *q);
+
+ /* run-time fields */
+ int ref_count; /* XXX number of instances in the system */
+ SLIST_ENTRY(dn_alg) next; /* Next scheduler in the list */
+};
+
+/* MSVC does not support designated initializers so we need this ugly macro */
+#ifdef _WIN32
+#define _SI(fld)
+#else
+#define _SI(fld) fld
+#endif
+
+/*
+ * Additionally, dummynet exports some functions and macros
+ * to be used by schedulers:
+ */
+
+void dn_free_pkts(struct mbuf *mnext);
+int dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop);
+/* bound a variable between min and max */
+int ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg);
+
+/*
+ * Extract the head of a queue, update stats. Must be the very last
+ * thing done on a dequeue as the queue itself may go away.
+ */
+static __inline struct mbuf*
+dn_dequeue(struct dn_queue *q)
+{
+ struct mbuf *m = q->mq.head;
+ if (m == NULL)
+ return NULL;
+ q->mq.head = m->m_nextpkt;
+
+ /* Update stats for the queue */
+ q->ni.length--;
+ q->ni.len_bytes -= m->m_pkthdr.len;
+ if (q->_si) {
+ q->_si->ni.length--;
+ q->_si->ni.len_bytes -= m->m_pkthdr.len;
+ }
+ if (q->ni.length == 0) /* queue is now idle */
+ q->q_time = dn_cfg.curr_time;
+ return m;
+}
+
+int dn_sched_modevent(module_t mod, int cmd, void *arg);
+
+#define DECLARE_DNSCHED_MODULE(name, dnsched) \
+ static moduledata_t name##_mod = { \
+ #name, dn_sched_modevent, dnsched \
+ }; \
+ DECLARE_MODULE(name, name##_mod, \
+ SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); \
+ MODULE_DEPEND(name, dummynet, 3, 3, 3);
+#endif /* _DN_SCHED_H */
diff --git a/sys/netpfil/ipfw/dn_sched_fifo.c b/sys/netpfil/ipfw/dn_sched_fifo.c
new file mode 100644
index 0000000..e2aa608
--- /dev/null
+++ b/sys/netpfil/ipfw/dn_sched_fifo.c
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ */
+
+#ifdef _KERNEL
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/kernel.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <net/if.h> /* IFNAMSIZ */
+#include <netinet/in.h>
+#include <netinet/ip_var.h> /* ipfw_rule_ref */
+#include <netinet/ip_fw.h> /* flow_id */
+#include <netinet/ip_dummynet.h>
+#include <netpfil/ipfw/dn_heap.h>
+#include <netpfil/ipfw/ip_dn_private.h>
+#include <netpfil/ipfw/dn_sched.h>
+#else
+#include <dn_test.h>
+#endif
+
+/*
+ * This file implements a FIFO scheduler for a single queue.
+ * The queue is allocated as part of the scheduler instance,
+ * and a single flowset in the template stores the
+ * queue size and policy.
+ * Enqueue and dequeue use the default library functions.
+ */
+static int
+fifo_enqueue(struct dn_sch_inst *si, struct dn_queue *q, struct mbuf *m)
+{
+ /* XXX if called with q != NULL and m=NULL, this is a
+ * re-enqueue from an existing scheduler, which we should
+ * handle.
+ */
+ return dn_enqueue((struct dn_queue *)(si+1), m, 0);
+}
+
+static struct mbuf *
+fifo_dequeue(struct dn_sch_inst *si)
+{
+ return dn_dequeue((struct dn_queue *)(si + 1));
+}
+
+static int
+fifo_new_sched(struct dn_sch_inst *si)
+{
+ /* This scheduler instance contains the queue */
+ struct dn_queue *q = (struct dn_queue *)(si + 1);
+
+ set_oid(&q->ni.oid, DN_QUEUE, sizeof(*q));
+ q->_si = si;
+ q->fs = si->sched->fs;
+ return 0;
+}
+
+static int
+fifo_free_sched(struct dn_sch_inst *si)
+{
+ struct dn_queue *q = (struct dn_queue *)(si + 1);
+ dn_free_pkts(q->mq.head);
+ bzero(q, sizeof(*q));
+ return 0;
+}
+
+/*
+ * FIFO scheduler descriptor
+ * contains the type of the scheduler, the name, the size of extra
+ * data structures, and function pointers.
+ */
+static struct dn_alg fifo_desc = {
+ _SI( .type = ) DN_SCHED_FIFO,
+ _SI( .name = ) "FIFO",
+ _SI( .flags = ) 0,
+
+ _SI( .schk_datalen = ) 0,
+ _SI( .si_datalen = ) sizeof(struct dn_queue),
+ _SI( .q_datalen = ) 0,
+
+ _SI( .enqueue = ) fifo_enqueue,
+ _SI( .dequeue = ) fifo_dequeue,
+ _SI( .config = ) NULL,
+ _SI( .destroy = ) NULL,
+ _SI( .new_sched = ) fifo_new_sched,
+ _SI( .free_sched = ) fifo_free_sched,
+ _SI( .new_fsk = ) NULL,
+ _SI( .free_fsk = ) NULL,
+ _SI( .new_queue = ) NULL,
+ _SI( .free_queue = ) NULL,
+};
+
+DECLARE_DNSCHED_MODULE(dn_fifo, &fifo_desc);
diff --git a/sys/netpfil/ipfw/dn_sched_prio.c b/sys/netpfil/ipfw/dn_sched_prio.c
new file mode 100644
index 0000000..b779515
--- /dev/null
+++ b/sys/netpfil/ipfw/dn_sched_prio.c
@@ -0,0 +1,229 @@
+/*
+ * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ */
+#ifdef _KERNEL
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/kernel.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <net/if.h> /* IFNAMSIZ */
+#include <netinet/in.h>
+#include <netinet/ip_var.h> /* ipfw_rule_ref */
+#include <netinet/ip_fw.h> /* flow_id */
+#include <netinet/ip_dummynet.h>
+#include <netpfil/ipfw/dn_heap.h>
+#include <netpfil/ipfw/ip_dn_private.h>
+#include <netpfil/ipfw/dn_sched.h>
+#else
+#include <dn_test.h>
+#endif
+
+#define DN_SCHED_PRIO 5 //XXX
+
+#if !defined(_KERNEL) || !defined(__linux__)
+#define test_bit(ix, pData) ((*pData) & (1<<(ix)))
+#define __set_bit(ix, pData) (*pData) |= (1<<(ix))
+#define __clear_bit(ix, pData) (*pData) &= ~(1<<(ix))
+#endif
+
+#ifdef __MIPSEL__
+#define __clear_bit(ix, pData) (*pData) &= ~(1<<(ix))
+#endif
+
+/* Size of the array of queue pointers. */
+#define BITMAP_T unsigned long
+#define MAXPRIO (sizeof(BITMAP_T) * 8)
+
+/*
+ * The scheduler instance contains an array of pointers to queues,
+ * one for each priority, and a bitmap listing backlogged queues.
+ */
+struct prio_si {
+ BITMAP_T bitmap; /* array bitmap */
+ struct dn_queue *q_array[MAXPRIO]; /* Array of queue pointers */
+};
+
+/*
+ * If a queue with the same priority is already backlogged, use
+ * that one instead of the queue passed as argument.
+ */
+static int
+prio_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m)
+{
+ struct prio_si *si = (struct prio_si *)(_si + 1);
+ int prio = q->fs->fs.par[0];
+
+ if (test_bit(prio, &si->bitmap) == 0) {
+ /* No queue with this priority, insert */
+ __set_bit(prio, &si->bitmap);
+ si->q_array[prio] = q;
+ } else { /* use the existing queue */
+ q = si->q_array[prio];
+ }
+ if (dn_enqueue(q, m, 0))
+ return 1;
+ return 0;
+}
+
+/*
+ * Packets are dequeued only from the highest priority queue.
+ * ffs() returns the 1-based position of the lowest set bit in the
+ * bitmap; minus 1, this is the array index holding the pointer to
+ * the highest priority queue.
+ * After the dequeue, if this queue becomes empty, its index is
+ * removed from the bitmap.
+ * The scheduler is idle if the bitmap is empty.
+ *
+ * NOTE: highest priority is 0, lowest is sched->max_prio_q
+ */
+static struct mbuf *
+prio_dequeue(struct dn_sch_inst *_si)
+{
+ struct prio_si *si = (struct prio_si *)(_si + 1);
+ struct mbuf *m;
+ struct dn_queue *q;
+ int prio;
+
+ if (si->bitmap == 0) /* scheduler idle */
+ return NULL;
+
+ prio = ffs(si->bitmap) - 1;
+
+ /* Take the highest priority queue in the scheduler */
+ q = si->q_array[prio];
+ // assert(q)
+
+ m = dn_dequeue(q);
+ if (q->mq.head == NULL) {
+ /* Queue is now empty, remove from scheduler
+ * and mark it
+ */
+ si->q_array[prio] = NULL;
+ __clear_bit(prio, &si->bitmap);
+ }
+ return m;
+}
+
+static int
+prio_new_sched(struct dn_sch_inst *_si)
+{
+ struct prio_si *si = (struct prio_si *)(_si + 1);
+
+ bzero(si->q_array, sizeof(si->q_array));
+ si->bitmap = 0;
+
+ return 0;
+}
+
+static int
+prio_new_fsk(struct dn_fsk *fs)
+{
+ /* Check if the priority is between 0 and MAXPRIO-1 */
+ ipdn_bound_var(&fs->fs.par[0], 0, 0, MAXPRIO - 1, "PRIO priority");
+ return 0;
+}
+
+static int
+prio_new_queue(struct dn_queue *q)
+{
+ struct prio_si *si = (struct prio_si *)(q->_si + 1);
+ int prio = q->fs->fs.par[0];
+ struct dn_queue *oldq;
+
+ q->ni.oid.subtype = DN_SCHED_PRIO;
+
+ if (q->mq.head == NULL)
+ return 0;
+
+ /* The queue is not empty: insert it into the scheduler or append
+ * its mbufs to the existing queue. This partly duplicates prio_enqueue
+ */
+ if (test_bit(prio, &si->bitmap) == 0) {
+ /* No queue with this priority, insert */
+ __set_bit(prio, &si->bitmap);
+ si->q_array[prio] = q;
+ } else if ( (oldq = si->q_array[prio]) != q) {
+ /* must append to the existing queue.
+ * can simply append q->mq.head to q2->...
+ * and add the counters to those of q2
+ */
+ oldq->mq.tail->m_nextpkt = q->mq.head;
+ oldq->mq.tail = q->mq.tail;
+ oldq->ni.length += q->ni.length;
+ q->ni.length = 0;
+ oldq->ni.len_bytes += q->ni.len_bytes;
+ q->ni.len_bytes = 0;
+ q->mq.tail = q->mq.head = NULL;
+ }
+ return 0;
+}
+
+static int
+prio_free_queue(struct dn_queue *q)
+{
+ int prio = q->fs->fs.par[0];
+ struct prio_si *si = (struct prio_si *)(q->_si + 1);
+
+ if (si->q_array[prio] == q) {
+ si->q_array[prio] = NULL;
+ __clear_bit(prio, &si->bitmap);
+ }
+ return 0;
+}
+
+
+static struct dn_alg prio_desc = {
+ _SI( .type = ) DN_SCHED_PRIO,
+ _SI( .name = ) "PRIO",
+ _SI( .flags = ) DN_MULTIQUEUE,
+
+ /* we need extra space in the si and the queue */
+ _SI( .schk_datalen = ) 0,
+ _SI( .si_datalen = ) sizeof(struct prio_si),
+ _SI( .q_datalen = ) 0,
+
+ _SI( .enqueue = ) prio_enqueue,
+ _SI( .dequeue = ) prio_dequeue,
+
+ _SI( .config = ) NULL,
+ _SI( .destroy = ) NULL,
+ _SI( .new_sched = ) prio_new_sched,
+ _SI( .free_sched = ) NULL,
+
+ _SI( .new_fsk = ) prio_new_fsk,
+ _SI( .free_fsk = ) NULL,
+
+ _SI( .new_queue = ) prio_new_queue,
+ _SI( .free_queue = ) prio_free_queue,
+};
+
+
+DECLARE_DNSCHED_MODULE(dn_prio, &prio_desc);
diff --git a/sys/netpfil/ipfw/dn_sched_qfq.c b/sys/netpfil/ipfw/dn_sched_qfq.c
new file mode 100644
index 0000000..5bbff8a
--- /dev/null
+++ b/sys/netpfil/ipfw/dn_sched_qfq.c
@@ -0,0 +1,864 @@
+/*
+ * Copyright (c) 2010 Fabio Checconi, Luigi Rizzo, Paolo Valente
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ */
+
+#ifdef _KERNEL
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/kernel.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <net/if.h> /* IFNAMSIZ */
+#include <netinet/in.h>
+#include <netinet/ip_var.h> /* ipfw_rule_ref */
+#include <netinet/ip_fw.h> /* flow_id */
+#include <netinet/ip_dummynet.h>
+#include <netpfil/ipfw/dn_heap.h>
+#include <netpfil/ipfw/ip_dn_private.h>
+#include <netpfil/ipfw/dn_sched.h>
+#else
+#include <dn_test.h>
+#endif
+
+#ifdef QFQ_DEBUG
+struct qfq_sched;
+static void dump_sched(struct qfq_sched *q, const char *msg);
+#define NO(x) x
+#else
+#define NO(x)
+#endif
+#define DN_SCHED_QFQ 4 // XXX Where?
+typedef unsigned long bitmap;
+
+/*
+ * bitmap ops are critical. Some Linux versions have __fls
+ * and the bitmap ops. Some machines have ffs.
+ */
+#if defined(_WIN32) || (defined(__MIPSEL__) && defined(LINUX_24))
+int fls(unsigned int n)
+{
+ int i = 0;
+ for (i = 0; n > 0; n >>= 1, i++)
+ ;
+ return i;
+}
+#endif
+
+#if !defined(_KERNEL) || defined( __FreeBSD__ ) || defined(_WIN32) || (defined(__MIPSEL__) && defined(LINUX_24))
+static inline unsigned long __fls(unsigned long word)
+{
+ return fls(word) - 1;
+}
+#endif
+
+#if !defined(_KERNEL) || !defined(__linux__)
+#ifdef QFQ_DEBUG
+int test_bit(int ix, bitmap *p)
+{
+ if (ix < 0 || ix > 31)
+ D("bad index %d", ix);
+ return *p & (1<<ix);
+}
+void __set_bit(int ix, bitmap *p)
+{
+ if (ix < 0 || ix > 31)
+ D("bad index %d", ix);
+ *p |= (1<<ix);
+}
+void __clear_bit(int ix, bitmap *p)
+{
+ if (ix < 0 || ix > 31)
+ D("bad index %d", ix);
+ *p &= ~(1<<ix);
+}
+#else /* !QFQ_DEBUG */
+/* XXX do we have fast version, or leave it to the compiler ? */
+#define test_bit(ix, pData) ((*pData) & (1<<(ix)))
+#define __set_bit(ix, pData) (*pData) |= (1<<(ix))
+#define __clear_bit(ix, pData) (*pData) &= ~(1<<(ix))
+#endif /* !QFQ_DEBUG */
+#endif /* !__linux__ */
+
+#ifdef __MIPSEL__
+#define __clear_bit(ix, pData) (*pData) &= ~(1<<(ix))
+#endif
+
+/*-------------------------------------------*/
+/*
+
+Virtual time computations.
+
+S, F and V are all computed in fixed point arithmetic with
+FRAC_BITS decimal bits.
+
+ QFQ_MAX_INDEX is the maximum index allowed for a group. We need
+ one bit per index.
+ QFQ_MAX_WSHIFT is the maximum power of two supported as a weight.
+ The layout of the bits is as below:
+
+ [ MTU_SHIFT ][ FRAC_BITS ]
+ [ MAX_INDEX ][ MIN_SLOT_SHIFT ]
+ ^.__grp->index = 0
+ *.__grp->slot_shift
+
+ where MIN_SLOT_SHIFT is derived by difference from the others.
+
+The max group index corresponds to Lmax/w_min, where
+Lmax=1<<MTU_SHIFT, w_min = 1 .
+From this, and knowing how many groups (MAX_INDEX) we want,
+we can derive the shift corresponding to each group.
+
+Because we often need to compute
+	F = S + len/w_i  and  V = V + len/wsum
+instead of storing w_i we store the value
+	inv_w = (1<<FRAC_BITS)/w_i
+so we can do F = S + len * inv_w without a division; similarly,
+IWSUM = (1<<FRAC_BITS)/wsum is used for the V update, so we can
+easily move between a static and an adaptive weight sum.
+
+The per-scheduler-instance data contain all the data structures
+for the scheduler: bitmaps and bucket lists.
+
+ */
+/*
+ * Maximum number of consecutive slots occupied by backlogged classes
+ * inside a group. This is approx lmax/lmin + 5.
+ * XXX check because it poses constraints on MAX_INDEX
+ */
+#define QFQ_MAX_SLOTS 32
+/*
+ * Shifts used for class<->group mapping. Class weights are
+ * in the range [1, QFQ_MAX_WEIGHT]; we map each class i to the
+ * group with the smallest index that can support the L_i / r_i
+ * configured for the class.
+ *
+ * grp->index is the index of the group; and grp->slot_shift
+ * is the shift for the corresponding (scaled) sigma_i.
+ *
+ * When computing the group index, we do (len<<FP_SHIFT)/weight,
+ * then compute an FLS (which is like a log2()), and if the result
+ * is below the MAX_INDEX region we use 0 (which is the same as
+ * using a larger len).
+ */
+#define QFQ_MAX_INDEX 19
+#define QFQ_MAX_WSHIFT 16 /* log2(max_weight) */
+
+#define QFQ_MAX_WEIGHT (1<<QFQ_MAX_WSHIFT)
+#define QFQ_MAX_WSUM (2*QFQ_MAX_WEIGHT)
+//#define IWSUM (q->i_wsum)
+#define IWSUM ((1<<FRAC_BITS)/QFQ_MAX_WSUM)
+
+#define FRAC_BITS 30 /* fixed point arithmetic */
+#define ONE_FP (1UL << FRAC_BITS)
+
+#define QFQ_MTU_SHIFT 11 /* log2(max_len) */
+#define QFQ_MIN_SLOT_SHIFT (FRAC_BITS + QFQ_MTU_SHIFT - QFQ_MAX_INDEX)
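+
+/*
+ * Numeric example of the fixed point constants above: with
+ * FRAC_BITS = 30, a class with weight w = 4 stores
+ * inv_w = ONE_FP/4 = 1<<28, so a 1500-byte packet advances its
+ * finish time by len * inv_w = 1500<<28, i.e. 1500/4 = 375 in
+ * unscaled len/weight units.
+ */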
+
+/*
+ * Possible group states, also indexes for the bitmaps array in
+ * struct qfq_queue. We rely on ER, IR, EB, IB being numbered 0..3
+ */
+enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE };
+
+struct qfq_group;
+/*
+ * Additional queue info. Some of this info comes from
+ * the flowset; we copy it here for faster processing.
+ * This is an overlay of the struct dn_queue
+ */
+struct qfq_class {
+ struct dn_queue _q;
+ uint64_t S, F; /* flow timestamps (exact) */
+ struct qfq_class *next; /* Link for the slot list. */
+
+ /* group we belong to. In principle we would need the index,
+ * which is log_2(lmax/weight), but we never reference it
+ * directly, only the group.
+ */
+ struct qfq_group *grp;
+
+ /* these are copied from the flowset. */
+ uint32_t inv_w; /* ONE_FP/weight */
+ uint32_t lmax; /* Max packet size for this flow. */
+};
+
+/* Group descriptor, see the paper for details.
+ * Basically this contains the bucket lists
+ */
+struct qfq_group {
+ uint64_t S, F; /* group timestamps (approx). */
+ unsigned int slot_shift; /* Slot shift. */
+ unsigned int index; /* Group index. */
+ unsigned int front; /* Index of the front slot. */
+ bitmap full_slots; /* non-empty slots */
+
+ /* Array of lists of active classes. */
+ struct qfq_class *slots[QFQ_MAX_SLOTS];
+};
+
+/* scheduler instance descriptor. */
+struct qfq_sched {
+ uint64_t V; /* Precise virtual time. */
+ uint32_t wsum; /* weight sum */
+ NO(uint32_t i_wsum; /* ONE_FP/w_sum */
+ uint32_t _queued; /* debugging */
+ uint32_t loops; /* debugging */)
+ bitmap bitmaps[QFQ_MAX_STATE]; /* Group bitmaps. */
+ struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */
+};
+
+/*---- support functions ----------------------------*/
+
+/* Generic comparison function, handling wraparound. */
+static inline int qfq_gt(uint64_t a, uint64_t b)
+{
+ return (int64_t)(a - b) > 0;
+}
+
+/* Round a precise timestamp to its slotted value. */
+static inline uint64_t qfq_round_down(uint64_t ts, unsigned int shift)
+{
+ return ts & ~((1ULL << shift) - 1);
+}
+
+/* return the pointer to the group with lowest index in the bitmap */
+static inline struct qfq_group *qfq_ffs(struct qfq_sched *q,
+ unsigned long bitmap)
+{
+ int index = ffs(bitmap) - 1; // zero-based
+ return &q->groups[index];
+}
+
+/*
+ * Calculate a flow index, given its weight and maximum packet length.
+ * index = log_2(maxlen/weight) but we need to apply the scaling.
+ * This is used only once at flow creation.
+ */
+static int qfq_calc_index(uint32_t inv_w, unsigned int maxlen)
+{
+ uint64_t slot_size = (uint64_t)maxlen *inv_w;
+ unsigned long size_map;
+ int index = 0;
+
+ size_map = (unsigned long)(slot_size >> QFQ_MIN_SLOT_SHIFT);
+ if (!size_map)
+ goto out;
+
+ index = __fls(size_map) + 1; // basically a log_2()
+ index -= !(slot_size - (1ULL << (index + QFQ_MIN_SLOT_SHIFT - 1)));
+
+ if (index < 0)
+ index = 0;
+
+out:
+ ND("W = %d, L = %d, I = %d\n", ONE_FP/inv_w, maxlen, index);
+ return index;
+}
+/*---- end support functions ----*/
+
+/*-------- API calls --------------------------------*/
+/*
+ * Validate and copy parameters from flowset.
+ */
+static int
+qfq_new_queue(struct dn_queue *_q)
+{
+ struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1);
+ struct qfq_class *cl = (struct qfq_class *)_q;
+ int i;
+ uint32_t w; /* approximated weight */
+
+ /* import parameters from the flowset. They should be correct
+ * already.
+ */
+ w = _q->fs->fs.par[0];
+ cl->lmax = _q->fs->fs.par[1];
+ if (!w || w > QFQ_MAX_WEIGHT) {
+ w = 1;
+ D("rounding weight to 1");
+ }
+ cl->inv_w = ONE_FP/w;
+ w = ONE_FP/cl->inv_w;
+ if (q->wsum + w > QFQ_MAX_WSUM)
+ return EINVAL;
+
+ i = qfq_calc_index(cl->inv_w, cl->lmax);
+ cl->grp = &q->groups[i];
+ q->wsum += w;
+ // XXX cl->S = q->V; ?
+ // XXX compute q->i_wsum
+ return 0;
+}
+
+/* remove an empty queue */
+static int
+qfq_free_queue(struct dn_queue *_q)
+{
+ struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1);
+ struct qfq_class *cl = (struct qfq_class *)_q;
+ if (cl->inv_w) {
+ q->wsum -= ONE_FP/cl->inv_w;
+ cl->inv_w = 0; /* reset weight so this is not done twice */
+ }
+ return 0;
+}
+
+/* Calculate a mask to mimic what would be ffs_from(). */
+static inline unsigned long
+mask_from(unsigned long bitmap, int from)
+{
+ return bitmap & ~((1UL << from) - 1);
+}
+
+/*
+ * The state computation relies on ER=0, IR=1, EB=2, IB=3
+ * First compute eligibility comparing grp->S, q->V,
+ * then check if someone is blocking us and possibly add EB
+ */
+static inline unsigned int
+qfq_calc_state(struct qfq_sched *q, struct qfq_group *grp)
+{
+ /* if S > V we are not eligible */
+ unsigned int state = qfq_gt(grp->S, q->V);
+ unsigned long mask = mask_from(q->bitmaps[ER], grp->index);
+ struct qfq_group *next;
+
+ if (mask) {
+ next = qfq_ffs(q, mask);
+ if (qfq_gt(grp->F, next->F))
+ state |= EB;
+ }
+
+ return state;
+}
+
+/*
+ * In principle
+ * q->bitmaps[dst] |= q->bitmaps[src] & mask;
+ * q->bitmaps[src] &= ~mask;
+ * but we should make sure that src != dst
+ */
+static inline void
+qfq_move_groups(struct qfq_sched *q, unsigned long mask, int src, int dst)
+{
+ q->bitmaps[dst] |= q->bitmaps[src] & mask;
+ q->bitmaps[src] &= ~mask;
+}
+
+static inline void
+qfq_unblock_groups(struct qfq_sched *q, int index, uint64_t old_finish)
+{
+ unsigned long mask = mask_from(q->bitmaps[ER], index + 1);
+ struct qfq_group *next;
+
+ if (mask) {
+ next = qfq_ffs(q, mask);
+ if (!qfq_gt(next->F, old_finish))
+ return;
+ }
+
+ mask = (1UL << index) - 1;
+ qfq_move_groups(q, mask, EB, ER);
+ qfq_move_groups(q, mask, IB, IR);
+}
+
+/*
+ * perhaps
+ *
+ old_V ^= q->V;
+ old_V >>= QFQ_MIN_SLOT_SHIFT;
+ if (old_V) {
+ ...
+ }
+ *
+ */
+static inline void
+qfq_make_eligible(struct qfq_sched *q, uint64_t old_V)
+{
+ unsigned long mask, vslot, old_vslot;
+
+ vslot = q->V >> QFQ_MIN_SLOT_SHIFT;
+ old_vslot = old_V >> QFQ_MIN_SLOT_SHIFT;
+
+ if (vslot != old_vslot) {
+ mask = (2UL << (__fls(vslot ^ old_vslot))) - 1;
+ qfq_move_groups(q, mask, IR, ER);
+ qfq_move_groups(q, mask, IB, EB);
+ }
+}
+
+/*
+ * XXX we should make sure that slot becomes less than 32.
+ * This is guaranteed by the input values.
+ * roundedS is always cl->S rounded on grp->slot_shift bits.
+ */
+static inline void
+qfq_slot_insert(struct qfq_group *grp, struct qfq_class *cl, uint64_t roundedS)
+{
+ uint64_t slot = (roundedS - grp->S) >> grp->slot_shift;
+ unsigned int i = (grp->front + slot) % QFQ_MAX_SLOTS;
+
+ cl->next = grp->slots[i];
+ grp->slots[i] = cl;
+ __set_bit(slot, &grp->full_slots);
+}
+
+/*
+ * remove the entry from the slot
+ */
+static inline void
+qfq_front_slot_remove(struct qfq_group *grp)
+{
+ struct qfq_class **h = &grp->slots[grp->front];
+
+ *h = (*h)->next;
+ if (!*h)
+ __clear_bit(0, &grp->full_slots);
+}
+
+/*
+ * Returns the first full queue in a group. As a side effect,
+ * adjusts the bucket list so the first non-empty bucket is at
+ * position 0 in full_slots.
+ */
+static inline struct qfq_class *
+qfq_slot_scan(struct qfq_group *grp)
+{
+ int i;
+
+ ND("grp %d full %x", grp->index, grp->full_slots);
+ if (!grp->full_slots)
+ return NULL;
+
+ i = ffs(grp->full_slots) - 1; // zero-based
+ if (i > 0) {
+ grp->front = (grp->front + i) % QFQ_MAX_SLOTS;
+ grp->full_slots >>= i;
+ }
+
+ return grp->slots[grp->front];
+}
+
+/*
+ * adjust the bucket list. When the start time of a group decreases,
+ * we move the index down (modulo QFQ_MAX_SLOTS) so we don't need to
+ * move the objects. The mask of occupied slots must be shifted
+ * because we use ffs() to find the first non-empty slot.
+ * This covers decreases in the group's start time, but what about
+ * increases of the start time ?
+ * Here too we should make sure that i is less than 32
+ */
+static inline void
+qfq_slot_rotate(struct qfq_sched *q, struct qfq_group *grp, uint64_t roundedS)
+{
+ unsigned int i = (grp->S - roundedS) >> grp->slot_shift;
+
+ grp->full_slots <<= i;
+ grp->front = (grp->front - i) % QFQ_MAX_SLOTS;
+}
+
+
+static inline void
+qfq_update_eligible(struct qfq_sched *q, uint64_t old_V)
+{
+ bitmap ineligible;
+
+ ineligible = q->bitmaps[IR] | q->bitmaps[IB];
+ if (ineligible) {
+ if (!q->bitmaps[ER]) {
+ struct qfq_group *grp;
+ grp = qfq_ffs(q, ineligible);
+ if (qfq_gt(grp->S, q->V))
+ q->V = grp->S;
+ }
+ qfq_make_eligible(q, old_V);
+ }
+}
+
+/*
+ * Updates the class; returns true if the group also needs to be updated.
+ */
+static inline int
+qfq_update_class(struct qfq_sched *q, struct qfq_group *grp,
+ struct qfq_class *cl)
+{
+
+ cl->S = cl->F;
+ if (cl->_q.mq.head == NULL) {
+ qfq_front_slot_remove(grp);
+ } else {
+ unsigned int len;
+ uint64_t roundedS;
+
+ len = cl->_q.mq.head->m_pkthdr.len;
+ cl->F = cl->S + (uint64_t)len * cl->inv_w;
+ roundedS = qfq_round_down(cl->S, grp->slot_shift);
+ if (roundedS == grp->S)
+ return 0;
+
+ qfq_front_slot_remove(grp);
+ qfq_slot_insert(grp, cl, roundedS);
+ }
+ return 1;
+}
+
+static struct mbuf *
+qfq_dequeue(struct dn_sch_inst *si)
+{
+ struct qfq_sched *q = (struct qfq_sched *)(si + 1);
+ struct qfq_group *grp;
+ struct qfq_class *cl;
+ struct mbuf *m;
+ uint64_t old_V;
+
+ NO(q->loops++;)
+ if (!q->bitmaps[ER]) {
+ NO(if (q->queued)
+ dump_sched(q, "start dequeue");)
+ return NULL;
+ }
+
+ grp = qfq_ffs(q, q->bitmaps[ER]);
+
+ cl = grp->slots[grp->front];
+ /* extract from the first bucket in the bucket list */
+ m = dn_dequeue(&cl->_q);
+
+ if (!m) {
+ D("BUG/* non-workconserving leaf */");
+ return NULL;
+ }
+ NO(q->queued--;)
+ old_V = q->V;
+ q->V += (uint64_t)m->m_pkthdr.len * IWSUM;
+ ND("m is %p F 0x%llx V now 0x%llx", m, cl->F, q->V);
+
+ if (qfq_update_class(q, grp, cl)) {
+ uint64_t old_F = grp->F;
+ cl = qfq_slot_scan(grp);
+ if (!cl) { /* group gone, remove from ER */
+ __clear_bit(grp->index, &q->bitmaps[ER]);
+ // grp->S = grp->F + 1; // XXX debugging only
+ } else {
+ uint64_t roundedS = qfq_round_down(cl->S, grp->slot_shift);
+ unsigned int s;
+
+ if (grp->S == roundedS)
+ goto skip_unblock;
+ grp->S = roundedS;
+ grp->F = roundedS + (2ULL << grp->slot_shift);
+ /* remove from ER and put in the new set */
+ __clear_bit(grp->index, &q->bitmaps[ER]);
+ s = qfq_calc_state(q, grp);
+ __set_bit(grp->index, &q->bitmaps[s]);
+ }
+ /* we need to unblock even if the group has gone away */
+ qfq_unblock_groups(q, grp->index, old_F);
+ }
+
+skip_unblock:
+ qfq_update_eligible(q, old_V);
+ NO(if (!q->bitmaps[ER] && q->queued)
+ dump_sched(q, "end dequeue");)
+
+ return m;
+}
+
+/*
+ * Assign a reasonable start time for a new flow k in group i.
+ * Admissible values for \hat(F) are multiples of \sigma_i
+ * no greater than V+\sigma_i . Larger values mean that
+ * we had a wraparound so we consider the timestamp to be stale.
+ *
+ * If F is not stale and F >= V then we set S = F.
+ * Otherwise we should assign S = V, but this may violate
+ * the ordering in ER. So, if we have groups in ER, set S to
+ * the F_j of the first group j which would be blocking us.
+ * We are guaranteed not to move S backward because
+ * otherwise our group i would still be blocked.
+ */
+static inline void
+qfq_update_start(struct qfq_sched *q, struct qfq_class *cl)
+{
+ unsigned long mask;
+ uint64_t limit, roundedF;
+ int slot_shift = cl->grp->slot_shift;
+
+ roundedF = qfq_round_down(cl->F, slot_shift);
+ limit = qfq_round_down(q->V, slot_shift) + (1UL << slot_shift);
+
+ if (!qfq_gt(cl->F, q->V) || qfq_gt(roundedF, limit)) {
+ /* timestamp was stale */
+ mask = mask_from(q->bitmaps[ER], cl->grp->index);
+ if (mask) {
+ struct qfq_group *next = qfq_ffs(q, mask);
+ if (qfq_gt(roundedF, next->F)) {
+ cl->S = next->F;
+ return;
+ }
+ }
+ cl->S = q->V;
+ } else { /* timestamp is not stale */
+ cl->S = cl->F;
+ }
+}
+
+static int
+qfq_enqueue(struct dn_sch_inst *si, struct dn_queue *_q, struct mbuf *m)
+{
+ struct qfq_sched *q = (struct qfq_sched *)(si + 1);
+ struct qfq_group *grp;
+ struct qfq_class *cl = (struct qfq_class *)_q;
+ uint64_t roundedS;
+ int s;
+
+ NO(q->loops++;)
+ DX(4, "len %d flow %p inv_w 0x%x grp %d", m->m_pkthdr.len,
+ _q, cl->inv_w, cl->grp->index);
+ /* XXX verify that the packet obeys the parameters */
+ if (m != _q->mq.head) {
+ if (dn_enqueue(_q, m, 0)) /* packet was dropped */
+ return 1;
+ NO(q->queued++;)
+ if (m != _q->mq.head)
+ return 0;
+ }
+	/* If we reach this point, queue q was idle */
+ grp = cl->grp;
+ qfq_update_start(q, cl); /* adjust start time */
+ /* compute new finish time and rounded start. */
+ cl->F = cl->S + (uint64_t)(m->m_pkthdr.len) * cl->inv_w;
+ roundedS = qfq_round_down(cl->S, grp->slot_shift);
+
+ /*
+ * insert cl in the correct bucket.
+ * If cl->S >= grp->S we don't need to adjust the
+ * bucket list and simply go to the insertion phase.
+ * Otherwise grp->S is decreasing, we must make room
+ * in the bucket list, and also recompute the group state.
+ * Finally, if there were no flows in this group and nobody
+ * was in ER make sure to adjust V.
+ */
+ if (grp->full_slots) {
+ if (!qfq_gt(grp->S, cl->S))
+ goto skip_update;
+ /* create a slot for this cl->S */
+ qfq_slot_rotate(q, grp, roundedS);
+ /* group was surely ineligible, remove */
+ __clear_bit(grp->index, &q->bitmaps[IR]);
+ __clear_bit(grp->index, &q->bitmaps[IB]);
+ } else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V))
+ q->V = roundedS;
+
+ grp->S = roundedS;
+ grp->F = roundedS + (2ULL << grp->slot_shift); // i.e. 2\sigma_i
+ s = qfq_calc_state(q, grp);
+ __set_bit(grp->index, &q->bitmaps[s]);
+ ND("new state %d 0x%x", s, q->bitmaps[s]);
+ ND("S %llx F %llx V %llx", cl->S, cl->F, q->V);
+skip_update:
+ qfq_slot_insert(grp, cl, roundedS);
+
+ return 0;
+}
+
+
+#if 0
+static inline void
+qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp,
+ struct qfq_class *cl, struct qfq_class **pprev)
+{
+ unsigned int i, offset;
+ uint64_t roundedS;
+
+ roundedS = qfq_round_down(cl->S, grp->slot_shift);
+ offset = (roundedS - grp->S) >> grp->slot_shift;
+ i = (grp->front + offset) % QFQ_MAX_SLOTS;
+
+#ifdef notyet
+ if (!pprev) {
+ pprev = &grp->slots[i];
+ while (*pprev && *pprev != cl)
+ pprev = &(*pprev)->next;
+ }
+#endif
+
+ *pprev = cl->next;
+ if (!grp->slots[i])
+ __clear_bit(offset, &grp->full_slots);
+}
+
+/*
+ * called to forcibly destroy a queue.
+ * If the queue is not in the front bucket, or if it has
+ * other queues in the front bucket, we can simply remove
+ * the queue with no other side effects.
+ * Otherwise we must propagate the event up.
+ * XXX description to be completed.
+ */
+static void
+qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl,
+ struct qfq_class **pprev)
+{
+ struct qfq_group *grp = &q->groups[cl->index];
+ unsigned long mask;
+ uint64_t roundedS;
+ int s;
+
+ cl->F = cl->S; // not needed if the class goes away.
+ qfq_slot_remove(q, grp, cl, pprev);
+
+ if (!grp->full_slots) {
+ /* nothing left in the group, remove from all sets.
+ * Do ER last because if we were blocking other groups
+ * we must unblock them.
+ */
+ __clear_bit(grp->index, &q->bitmaps[IR]);
+ __clear_bit(grp->index, &q->bitmaps[EB]);
+ __clear_bit(grp->index, &q->bitmaps[IB]);
+
+ if (test_bit(grp->index, &q->bitmaps[ER]) &&
+ !(q->bitmaps[ER] & ~((1UL << grp->index) - 1))) {
+ mask = q->bitmaps[ER] & ((1UL << grp->index) - 1);
+ if (mask)
+ mask = ~((1UL << __fls(mask)) - 1);
+ else
+ mask = ~0UL;
+ qfq_move_groups(q, mask, EB, ER);
+ qfq_move_groups(q, mask, IB, IR);
+ }
+ __clear_bit(grp->index, &q->bitmaps[ER]);
+ } else if (!grp->slots[grp->front]) {
+ cl = qfq_slot_scan(grp);
+ roundedS = qfq_round_down(cl->S, grp->slot_shift);
+ if (grp->S != roundedS) {
+ __clear_bit(grp->index, &q->bitmaps[ER]);
+ __clear_bit(grp->index, &q->bitmaps[IR]);
+ __clear_bit(grp->index, &q->bitmaps[EB]);
+ __clear_bit(grp->index, &q->bitmaps[IB]);
+ grp->S = roundedS;
+ grp->F = roundedS + (2ULL << grp->slot_shift);
+ s = qfq_calc_state(q, grp);
+ __set_bit(grp->index, &q->bitmaps[s]);
+ }
+ }
+ qfq_update_eligible(q, q->V);
+}
+#endif
+
+static int
+qfq_new_fsk(struct dn_fsk *f)
+{
+ ipdn_bound_var(&f->fs.par[0], 1, 1, QFQ_MAX_WEIGHT, "qfq weight");
+ ipdn_bound_var(&f->fs.par[1], 1500, 1, 2000, "qfq maxlen");
+ ND("weight %d len %d\n", f->fs.par[0], f->fs.par[1]);
+ return 0;
+}
+
+/*
+ * initialize a new scheduler instance
+ */
+static int
+qfq_new_sched(struct dn_sch_inst *si)
+{
+ struct qfq_sched *q = (struct qfq_sched *)(si + 1);
+ struct qfq_group *grp;
+ int i;
+
+ for (i = 0; i <= QFQ_MAX_INDEX; i++) {
+ grp = &q->groups[i];
+ grp->index = i;
+ grp->slot_shift = QFQ_MTU_SHIFT + FRAC_BITS -
+ (QFQ_MAX_INDEX - i);
+ }
+ return 0;
+}
+
+/*
+ * QFQ scheduler descriptor
+ */
+static struct dn_alg qfq_desc = {
+ _SI( .type = ) DN_SCHED_QFQ,
+ _SI( .name = ) "QFQ",
+ _SI( .flags = ) DN_MULTIQUEUE,
+
+ _SI( .schk_datalen = ) 0,
+ _SI( .si_datalen = ) sizeof(struct qfq_sched),
+ _SI( .q_datalen = ) sizeof(struct qfq_class) - sizeof(struct dn_queue),
+
+ _SI( .enqueue = ) qfq_enqueue,
+ _SI( .dequeue = ) qfq_dequeue,
+
+ _SI( .config = ) NULL,
+ _SI( .destroy = ) NULL,
+ _SI( .new_sched = ) qfq_new_sched,
+ _SI( .free_sched = ) NULL,
+ _SI( .new_fsk = ) qfq_new_fsk,
+ _SI( .free_fsk = ) NULL,
+ _SI( .new_queue = ) qfq_new_queue,
+ _SI( .free_queue = ) qfq_free_queue,
+};
+
+DECLARE_DNSCHED_MODULE(dn_qfq, &qfq_desc);
+
+#ifdef QFQ_DEBUG
+static void
+dump_groups(struct qfq_sched *q, uint32_t mask)
+{
+ int i, j;
+
+ for (i = 0; i < QFQ_MAX_INDEX + 1; i++) {
+ struct qfq_group *g = &q->groups[i];
+
+ if (0 == (mask & (1<<i)))
+ continue;
+ for (j = 0; j < QFQ_MAX_SLOTS; j++) {
+ if (g->slots[j])
+ D(" bucket %d %p", j, g->slots[j]);
+ }
+ D("full_slots 0x%x", g->full_slots);
+ D(" %2d S 0x%20llx F 0x%llx %c", i,
+ g->S, g->F,
+ mask & (1<<i) ? '1' : '0');
+ }
+}
+
+static void
+dump_sched(struct qfq_sched *q, const char *msg)
+{
+ D("--- in %s: ---", msg);
+ ND("loops %d queued %d V 0x%llx", q->loops, q->queued, q->V);
+ D(" ER 0x%08x", q->bitmaps[ER]);
+ D(" EB 0x%08x", q->bitmaps[EB]);
+ D(" IR 0x%08x", q->bitmaps[IR]);
+ D(" IB 0x%08x", q->bitmaps[IB]);
+ dump_groups(q, 0xffffffff);
+};
+#endif /* QFQ_DEBUG */
diff --git a/sys/netpfil/ipfw/dn_sched_rr.c b/sys/netpfil/ipfw/dn_sched_rr.c
new file mode 100644
index 0000000..dd608d7
--- /dev/null
+++ b/sys/netpfil/ipfw/dn_sched_rr.c
@@ -0,0 +1,307 @@
+/*
+ * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ */
+
+#ifdef _KERNEL
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/kernel.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <net/if.h> /* IFNAMSIZ */
+#include <netinet/in.h>
+#include <netinet/ip_var.h> /* ipfw_rule_ref */
+#include <netinet/ip_fw.h> /* flow_id */
+#include <netinet/ip_dummynet.h>
+#include <netpfil/ipfw/dn_heap.h>
+#include <netpfil/ipfw/ip_dn_private.h>
+#include <netpfil/ipfw/dn_sched.h>
+#else
+#include <dn_test.h>
+#endif
+
+#define DN_SCHED_RR 3 // XXX Where?
+
+struct rr_queue {
+ struct dn_queue q; /* Standard queue */
+ int status; /* 1: queue is in the list */
+ int credit; /* Number of bytes to transmit */
+ int quantum; /* quantum * C */
+ struct rr_queue *qnext; /* */
+};
+
+/* struct rr_schk contains global config parameters
+ * and is right after dn_schk
+ */
+struct rr_schk {
+ int min_q; /* Min quantum */
+ int max_q; /* Max quantum */
+ int q_bytes; /* Bytes per quantum */
+};
+
+/* per-instance round robin list, right after dn_sch_inst */
+struct rr_si {
+ struct rr_queue *head, *tail; /* Pointer to current queue */
+};
+
+/* Append a queue to the rr list */
+static inline void
+rr_append(struct rr_queue *q, struct rr_si *si)
+{
+ q->status = 1; /* mark as in-rr_list */
+ q->credit = q->quantum; /* initialize credit */
+
+ /* append to the tail */
+ if (si->head == NULL)
+ si->head = q;
+ else
+ si->tail->qnext = q;
+ si->tail = q; /* advance the tail pointer */
+ q->qnext = si->head; /* make it circular */
+}
+
+/* Remove the head queue from circular list. */
+static inline void
+rr_remove_head(struct rr_si *si)
+{
+ if (si->head == NULL)
+ return; /* empty queue */
+ si->head->status = 0;
+
+ if (si->head == si->tail) {
+ si->head = si->tail = NULL;
+ return;
+ }
+
+ si->head = si->head->qnext;
+ si->tail->qnext = si->head;
+}
+
+/* Remove a queue from circular list.
+ * XXX see if it can be merged with remove_queue()
+ */
+static inline void
+remove_queue_q(struct rr_queue *q, struct rr_si *si)
+{
+ struct rr_queue *prev;
+
+ if (q->status != 1)
+ return;
+ if (q == si->head) {
+ rr_remove_head(si);
+ return;
+ }
+
+ for (prev = si->head; prev; prev = prev->qnext) {
+ if (prev->qnext != q)
+ continue;
+ prev->qnext = q->qnext;
+ if (q == si->tail)
+ si->tail = prev;
+ q->status = 0;
+ break;
+ }
+}
+
+
+static inline void
+next_pointer(struct rr_si *si)
+{
+ if (si->head == NULL)
+ return; /* empty queue */
+
+ si->head = si->head->qnext;
+ si->tail = si->tail->qnext;
+}
+
+static int
+rr_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m)
+{
+ struct rr_si *si;
+ struct rr_queue *rrq;
+
+ if (m != q->mq.head) {
+ if (dn_enqueue(q, m, 0)) /* packet was dropped */
+ return 1;
+ if (m != q->mq.head)
+ return 0;
+ }
+
+	/* If we reach this point, queue q was idle */
+ si = (struct rr_si *)(_si + 1);
+ rrq = (struct rr_queue *)q;
+
+ if (rrq->status == 1) /* Queue is already in the queue list */
+ return 0;
+
+ /* Insert the queue in the queue list */
+ rr_append(rrq, si);
+
+ return 0;
+}
+
+static struct mbuf *
+rr_dequeue(struct dn_sch_inst *_si)
+{
+ /* Access scheduler instance private data */
+ struct rr_si *si = (struct rr_si *)(_si + 1);
+ struct rr_queue *rrq;
+ uint64_t len;
+
+ while ( (rrq = si->head) ) {
+ struct mbuf *m = rrq->q.mq.head;
+ if ( m == NULL) {
+ /* empty queue, remove from list */
+ rr_remove_head(si);
+ continue;
+ }
+ len = m->m_pkthdr.len;
+
+ if (len > rrq->credit) {
+ /* Packet too big */
+ rrq->credit += rrq->quantum;
+ /* Try next queue */
+ next_pointer(si);
+ } else {
+ rrq->credit -= len;
+ return dn_dequeue(&rrq->q);
+ }
+ }
+
+ /* no packet to dequeue*/
+ return NULL;
+}
+
+static int
+rr_config(struct dn_schk *_schk)
+{
+ struct rr_schk *schk = (struct rr_schk *)(_schk + 1);
+ ND("called");
+
+ /* use reasonable quantums (64..2k bytes, default 1500) */
+ schk->min_q = 64;
+ schk->max_q = 2048;
+ schk->q_bytes = 1500; /* quantum */
+
+ return 0;
+}
+
+static int
+rr_new_sched(struct dn_sch_inst *_si)
+{
+ struct rr_si *si = (struct rr_si *)(_si + 1);
+
+ ND("called");
+ si->head = si->tail = NULL;
+
+ return 0;
+}
+
+static int
+rr_free_sched(struct dn_sch_inst *_si)
+{
+ ND("called");
+ /* Nothing to do? */
+ return 0;
+}
+
+static int
+rr_new_fsk(struct dn_fsk *fs)
+{
+ struct rr_schk *schk = (struct rr_schk *)(fs->sched + 1);
+ /* par[0] is the weight, par[1] is the quantum step */
+ ipdn_bound_var(&fs->fs.par[0], 1,
+ 1, 65536, "RR weight");
+ ipdn_bound_var(&fs->fs.par[1], schk->q_bytes,
+ schk->min_q, schk->max_q, "RR quantum");
+ return 0;
+}
+
+static int
+rr_new_queue(struct dn_queue *_q)
+{
+ struct rr_queue *q = (struct rr_queue *)_q;
+
+ _q->ni.oid.subtype = DN_SCHED_RR;
+
+ q->quantum = _q->fs->fs.par[0] * _q->fs->fs.par[1];
+ ND("called, q->quantum %d", q->quantum);
+ q->credit = q->quantum;
+ q->status = 0;
+
+ if (_q->mq.head != NULL) {
+ /* Queue NOT empty, insert in the queue list */
+ rr_append(q, (struct rr_si *)(_q->_si + 1));
+ }
+ return 0;
+}
+
+static int
+rr_free_queue(struct dn_queue *_q)
+{
+ struct rr_queue *q = (struct rr_queue *)_q;
+
+ ND("called");
+ if (q->status == 1) {
+ struct rr_si *si = (struct rr_si *)(_q->_si + 1);
+ remove_queue_q(q, si);
+ }
+ return 0;
+}
+
+/*
+ * RR scheduler descriptor
+ * contains the type of the scheduler, the name, the size of the
+ * structures and function pointers.
+ */
+static struct dn_alg rr_desc = {
+ _SI( .type = ) DN_SCHED_RR,
+ _SI( .name = ) "RR",
+ _SI( .flags = ) DN_MULTIQUEUE,
+
+ _SI( .schk_datalen = ) 0,
+ _SI( .si_datalen = ) sizeof(struct rr_si),
+ _SI( .q_datalen = ) sizeof(struct rr_queue) - sizeof(struct dn_queue),
+
+ _SI( .enqueue = ) rr_enqueue,
+ _SI( .dequeue = ) rr_dequeue,
+
+ _SI( .config = ) rr_config,
+ _SI( .destroy = ) NULL,
+ _SI( .new_sched = ) rr_new_sched,
+ _SI( .free_sched = ) rr_free_sched,
+ _SI( .new_fsk = ) rr_new_fsk,
+ _SI( .free_fsk = ) NULL,
+ _SI( .new_queue = ) rr_new_queue,
+ _SI( .free_queue = ) rr_free_queue,
+};
+
+
+DECLARE_DNSCHED_MODULE(dn_rr, &rr_desc);
diff --git a/sys/netpfil/ipfw/dn_sched_wf2q.c b/sys/netpfil/ipfw/dn_sched_wf2q.c
new file mode 100644
index 0000000..a91c1ce
--- /dev/null
+++ b/sys/netpfil/ipfw/dn_sched_wf2q.c
@@ -0,0 +1,373 @@
+/*
+ * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
+ * Copyright (c) 2000-2002 Luigi Rizzo, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ */
+
+#ifdef _KERNEL
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/kernel.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <net/if.h> /* IFNAMSIZ */
+#include <netinet/in.h>
+#include <netinet/ip_var.h> /* ipfw_rule_ref */
+#include <netinet/ip_fw.h> /* flow_id */
+#include <netinet/ip_dummynet.h>
+#include <netpfil/ipfw/dn_heap.h>
+#include <netpfil/ipfw/ip_dn_private.h>
+#include <netpfil/ipfw/dn_sched.h>
+#else
+#include <dn_test.h>
+#endif
+
+#ifndef MAX64
+#define MAX64(x,y) (( (int64_t) ( (y)-(x) )) > 0 ) ? (y) : (x)
+#endif
+
+/*
+ * timestamps are computed on 64 bit using fixed point arithmetic.
+ * LMAX_BITS, WMAX_BITS are the max number of bits for the packet len
+ * and sum of weights, respectively. FRAC_BITS is the number of
+ * fractional bits. We want FRAC_BITS >> WMAX_BITS to avoid too large
+ * errors when computing the inverse, FRAC_BITS < 32 so we can do 1/w
+ * using an unsigned 32-bit division, and to avoid wraparounds we need
+ * LMAX_BITS + WMAX_BITS + FRAC_BITS << 64
+ * As an example
+ * FRAC_BITS = 26, LMAX_BITS=14, WMAX_BITS = 19
+ */
+#ifndef FRAC_BITS
+#define FRAC_BITS 28 /* shift for fixed point arithmetic */
+#define ONE_FP (1UL << FRAC_BITS)
+#endif
+
+/*
+ * Private information for the scheduler instance:
+ * sch_heap (key is Finish time) returns the next queue to serve
+ * ne_heap (key is Start time) stores not-eligible queues
+ * idle_heap (key=start/finish time) stores idle flows. It must
+ * support extract-from-middle.
+ * A flow is only in 1 of the three heaps.
+ * XXX todo: use a more efficient data structure, e.g. a tree sorted
+ * by F with min_subtree(S) in each node
+ */
+struct wf2qp_si {
+ struct dn_heap sch_heap; /* top extract - key Finish time */
+ struct dn_heap ne_heap; /* top extract - key Start time */
+ struct dn_heap idle_heap; /* random extract - key Start=Finish time */
+ uint64_t V; /* virtual time */
+ uint32_t inv_wsum; /* inverse of sum of weights */
+ uint32_t wsum; /* sum of weights */
+};
+
+struct wf2qp_queue {
+ struct dn_queue _q;
+ uint64_t S, F; /* start time, finish time */
+ uint32_t inv_w; /* ONE_FP / weight */
+ int32_t heap_pos; /* position (index) of struct in heap */
+};
+
+/*
+ * This file implements a WF2Q+ scheduler as it has been in dummynet
+ * since 2000.
+ * The scheduler supports per-flow queues and has O(log N) complexity.
+ *
+ * WF2Q+ needs to drain entries from the idle heap so that we
+ * can keep the sum of weights up to date. We can do it whenever
+ * we get a chance, or periodically, or following some other
+ * strategy. The function idle_check() drains at most N elements
+ * from the idle heap.
+ */
+static void
+idle_check(struct wf2qp_si *si, int n, int force)
+{
+ struct dn_heap *h = &si->idle_heap;
+ while (n-- > 0 && h->elements > 0 &&
+ (force || DN_KEY_LT(HEAP_TOP(h)->key, si->V))) {
+ struct dn_queue *q = HEAP_TOP(h)->object;
+ struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q;
+
+ heap_extract(h, NULL);
+ /* XXX to let the flowset delete the queue we should
+ * mark it as 'unused' by the scheduler.
+ */
+ alg_fq->S = alg_fq->F + 1; /* Mark timestamp as invalid. */
+ si->wsum -= q->fs->fs.par[0]; /* adjust sum of weights */
+ if (si->wsum > 0)
+ si->inv_wsum = ONE_FP/si->wsum;
+ }
+}
+
+static int
+wf2qp_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m)
+{
+ struct dn_fsk *fs = q->fs;
+ struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
+ struct wf2qp_queue *alg_fq;
+ uint64_t len = m->m_pkthdr.len;
+
+ if (m != q->mq.head) {
+ if (dn_enqueue(q, m, 0)) /* packet was dropped */
+ return 1;
+ if (m != q->mq.head) /* queue was already busy */
+ return 0;
+ }
+
+	/* If we reach this point, queue q was idle */
+ alg_fq = (struct wf2qp_queue *)q;
+
+ if (DN_KEY_LT(alg_fq->F, alg_fq->S)) {
+ /* F<S means timestamps are invalid ->brand new queue. */
+ alg_fq->S = si->V; /* init start time */
+ si->wsum += fs->fs.par[0]; /* add weight of new queue. */
+ si->inv_wsum = ONE_FP/si->wsum;
+ } else { /* if it was idle then it was in the idle heap */
+ heap_extract(&si->idle_heap, q);
+ alg_fq->S = MAX64(alg_fq->F, si->V); /* compute new S */
+ }
+ alg_fq->F = alg_fq->S + len * alg_fq->inv_w;
+
+ /* if nothing is backlogged, make sure this flow is eligible */
+ if (si->ne_heap.elements == 0 && si->sch_heap.elements == 0)
+ si->V = MAX64(alg_fq->S, si->V);
+
+ /*
+	 * Look at eligibility. A flow is not eligible if S>V (when
+ * this happens, it means that there is some other flow already
+ * scheduled for the same pipe, so the sch_heap cannot be
+ * empty). If the flow is not eligible we just store it in the
+ * ne_heap. Otherwise, we store in the sch_heap.
+ * Note that for all flows in sch_heap (SCH), S_i <= V,
+ * and for all flows in ne_heap (NEH), S_i > V.
+ * So when we need to compute max(V, min(S_i)) forall i in
+ * SCH+NEH, we only need to look into NEH.
+ */
+ if (DN_KEY_LT(si->V, alg_fq->S)) {
+ /* S>V means flow Not eligible. */
+ if (si->sch_heap.elements == 0)
+ D("++ ouch! not eligible but empty scheduler!");
+ heap_insert(&si->ne_heap, alg_fq->S, q);
+ } else {
+ heap_insert(&si->sch_heap, alg_fq->F, q);
+ }
+ return 0;
+}
+
+/* XXX invariant: sch > 0 || V >= min(S in neh) */
+static struct mbuf *
+wf2qp_dequeue(struct dn_sch_inst *_si)
+{
+ /* Access scheduler instance private data */
+ struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
+ struct mbuf *m;
+ struct dn_queue *q;
+ struct dn_heap *sch = &si->sch_heap;
+ struct dn_heap *neh = &si->ne_heap;
+ struct wf2qp_queue *alg_fq;
+
+ if (sch->elements == 0 && neh->elements == 0) {
+ /* we have nothing to do. We could kill the idle heap
+ * altogether and reset V
+ */
+ idle_check(si, 0x7fffffff, 1);
+ si->V = 0;
+ si->wsum = 0; /* should be set already */
+ return NULL; /* quick return if nothing to do */
+ }
+ idle_check(si, 1, 0); /* drain something from the idle heap */
+
+ /* make sure at least one element is eligible, bumping V
+ * and moving entries that have become eligible.
+ * We need to repeat the first part twice, before and
+ * after extracting the candidate, or enqueue() will
+ * find the data structure in a wrong state.
+ */
+ m = NULL;
+ for(;;) {
+ /*
+ * Compute V = max(V, min(S_i)). Remember that all elements
+ * in sch have by definition S_i <= V so if sch is not empty,
+ * V is surely the max and we must not update it. Conversely,
+ * if sch is empty we only need to look at neh.
+ * We don't need to move the queues, as it will be done at the
+ * next enqueue
+ */
+ if (sch->elements == 0 && neh->elements > 0) {
+ si->V = MAX64(si->V, HEAP_TOP(neh)->key);
+ }
+ while (neh->elements > 0 &&
+ DN_KEY_LEQ(HEAP_TOP(neh)->key, si->V)) {
+ q = HEAP_TOP(neh)->object;
+ alg_fq = (struct wf2qp_queue *)q;
+ heap_extract(neh, NULL);
+ heap_insert(sch, alg_fq->F, q);
+ }
+ if (m) /* pkt found in previous iteration */
+ break;
+ /* ok we have at least one eligible pkt */
+ q = HEAP_TOP(sch)->object;
+ alg_fq = (struct wf2qp_queue *)q;
+ m = dn_dequeue(q);
+ heap_extract(sch, NULL); /* Remove queue from heap. */
+ si->V += (uint64_t)(m->m_pkthdr.len) * si->inv_wsum;
+ alg_fq->S = alg_fq->F; /* Update start time. */
+ if (q->mq.head == 0) { /* not backlogged any more. */
+ heap_insert(&si->idle_heap, alg_fq->F, q);
+ } else { /* Still backlogged. */
+ /* Update F, store in neh or sch */
+ uint64_t len = q->mq.head->m_pkthdr.len;
+ alg_fq->F += len * alg_fq->inv_w;
+ if (DN_KEY_LEQ(alg_fq->S, si->V)) {
+ heap_insert(sch, alg_fq->F, q);
+ } else {
+ heap_insert(neh, alg_fq->S, q);
+ }
+ }
+ }
+ return m;
+}
+
+static int
+wf2qp_new_sched(struct dn_sch_inst *_si)
+{
+ struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
+ int ofs = offsetof(struct wf2qp_queue, heap_pos);
+
+ /* all heaps support extract from middle */
+ if (heap_init(&si->idle_heap, 16, ofs) ||
+ heap_init(&si->sch_heap, 16, ofs) ||
+ heap_init(&si->ne_heap, 16, ofs)) {
+ heap_free(&si->ne_heap);
+ heap_free(&si->sch_heap);
+ heap_free(&si->idle_heap);
+ return ENOMEM;
+ }
+ return 0;
+}
+
+static int
+wf2qp_free_sched(struct dn_sch_inst *_si)
+{
+ struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
+
+ heap_free(&si->sch_heap);
+ heap_free(&si->ne_heap);
+ heap_free(&si->idle_heap);
+
+ return 0;
+}
+
+static int
+wf2qp_new_fsk(struct dn_fsk *fs)
+{
+ ipdn_bound_var(&fs->fs.par[0], 1,
+ 1, 100, "WF2Q+ weight");
+ return 0;
+}
+
+static int
+wf2qp_new_queue(struct dn_queue *_q)
+{
+ struct wf2qp_queue *q = (struct wf2qp_queue *)_q;
+
+ _q->ni.oid.subtype = DN_SCHED_WF2QP;
+ q->F = 0; /* not strictly necessary */
+ q->S = q->F + 1; /* mark timestamp as invalid. */
+ q->inv_w = ONE_FP / _q->fs->fs.par[0];
+ if (_q->mq.head != NULL) {
+ wf2qp_enqueue(_q->_si, _q, _q->mq.head);
+ }
+ return 0;
+}
+
+/*
+ * Called when the infrastructure removes a queue (e.g. flowset
+ * is reconfigured). Nothing to do if we did not 'own' the queue,
+ * otherwise remove it from the right heap and adjust the sum
+ * of weights.
+ */
+static int
+wf2qp_free_queue(struct dn_queue *q)
+{
+ struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q;
+ struct wf2qp_si *si = (struct wf2qp_si *)(q->_si + 1);
+
+ if (alg_fq->S >= alg_fq->F + 1)
+ return 0; /* nothing to do, not in any heap */
+ si->wsum -= q->fs->fs.par[0];
+ if (si->wsum > 0)
+ si->inv_wsum = ONE_FP/si->wsum;
+
+ /* extract from the heap. XXX TODO we may need to adjust V
+ * to make sure the invariants hold.
+ */
+ if (q->mq.head == NULL) {
+ heap_extract(&si->idle_heap, q);
+ } else if (DN_KEY_LT(si->V, alg_fq->S)) {
+ heap_extract(&si->ne_heap, q);
+ } else {
+ heap_extract(&si->sch_heap, q);
+ }
+ return 0;
+}
+
+/*
+ * WF2Q+ scheduler descriptor
+ * contains the type of the scheduler, the name, the size of the
+ * structures and function pointers.
+ */
+static struct dn_alg wf2qp_desc = {
+ _SI( .type = ) DN_SCHED_WF2QP,
+ _SI( .name = ) "WF2Q+",
+ _SI( .flags = ) DN_MULTIQUEUE,
+
+ /* we need extra space in the si and the queue */
+ _SI( .schk_datalen = ) 0,
+ _SI( .si_datalen = ) sizeof(struct wf2qp_si),
+ _SI( .q_datalen = ) sizeof(struct wf2qp_queue) -
+ sizeof(struct dn_queue),
+
+ _SI( .enqueue = ) wf2qp_enqueue,
+ _SI( .dequeue = ) wf2qp_dequeue,
+
+ _SI( .config = ) NULL,
+ _SI( .destroy = ) NULL,
+ _SI( .new_sched = ) wf2qp_new_sched,
+ _SI( .free_sched = ) wf2qp_free_sched,
+
+ _SI( .new_fsk = ) wf2qp_new_fsk,
+ _SI( .free_fsk = ) NULL,
+
+ _SI( .new_queue = ) wf2qp_new_queue,
+ _SI( .free_queue = ) wf2qp_free_queue,
+};
+
+
+DECLARE_DNSCHED_MODULE(dn_wf2qp, &wf2qp_desc);
diff --git a/sys/netpfil/ipfw/dummynet.txt b/sys/netpfil/ipfw/dummynet.txt
new file mode 100644
index 0000000..e8c9725
--- /dev/null
+++ b/sys/netpfil/ipfw/dummynet.txt
@@ -0,0 +1,860 @@
+#
+# $FreeBSD$
+#
+
+Notes on the internal structure of dummynet (2010 version)
+by Riccardo Panicucci and Luigi Rizzo
+Work supported by the EC project ONELAB2
+
+
+*********
+* INDEX *
+*********
+Implementation of new dummynet
+ Internal structure
+ Files
+Packet arrival
+ The reconfiguration routine
+dummynet_task()
+Configuration
+ Add a pipe
+ Add a scheduler
+ Add a flowset
+Listing objects
+Deleting objects
+ Delete a pipe
+ Delete a flowset
+ Delete a scheduler
+Compatibility with FreeBSD 7.2 and FreeBSD 8 ipfw binary
+ ip_dummynet_glue.c
+ ip_fw_glue.c
+How to configure dummynet
+How to implement a new scheduler
+
+
+
+OPEN ISSUES
+------------------------------
+20100131 deleting RR causes infinite loop
+ presumably in the rr_free_queue() call -- seems to hang
+ forever when deleting a live flow
+------------------------------
+
+Dummynet is a traffic shaper and network emulator. Packets are
+selected by an external filter such as ipfw, and passed to the emulator
+with a tag such as "pipe 10" or "queue 5" which tells the emulator what to
+do with the packet. As an example:
+
+ ipfw add queue 5 icmp from 10.0.0.2 to all
+
+All packets with the same tag belong to a "flowset", or a set
+of flows which can be further partitioned according to a mask.
+Flowsets are then passed to a scheduler for processing. The
+association of flowsets and schedulers is configurable e.g.
+
+ ipfw queue 5 config sched 10 weight 3 flow_mask xxxx
+ ipfw queue 8 config sched 10 weight 1 ...
+ ipfw queue 3 config sched 20 weight 1 ...
+
+"sched 10" represents one or more scheduler instances,
+selected through a mask on the 5-tuple itself.
+
+ ipfw sched 20 config type FIFO sched_mask yyy ...
+
+There are in fact two masks applied to each packet:
++ the "sched_mask" sends packets arriving to a scheduler_id to
+ one of many instances.
++ the "flow_mask" together with the flowset_id is used to
+ collect packets into independent flows on each scheduler.
+
+As an example, we can have
+ ipfw queue 5 config sched 10 flow_mask src-ip 0x000000ff
+ ipfw sched 10 config type WF2Q+ sched_mask src-ip 0xffffff00
+
+This means that sched 10 will have one instance per /24 source subnet,
+and within each instance, every individual source will be a flow.
+
+Internal structure
+-----------------
+Dummynet-related data is split into several data structures,
+some of them constituting the userland-kernel API, and others
+specific to the kernel.
+NOTE: for up-to-date details please look at the relevant source
+ headers (ip_dummynet.h, ip_dn_private.h, dn_sched.h)
+
+USERLAND-KERNEL API (ip_dummynet.h)
+
+ struct dn_link:
+ contains data about the physical link such as
+	bandwidth, delay, burst size;
+
+ struct dn_fs:
+ describes a flowset, i.e. a template for queues.
+ Main parameters are the scheduler we attach to, a flow_mask,
+ buckets, queue size, plr, weight, and other scheduler-specific
+ parameters.
+
+ struct dn_flow
+ contains information on a flow, including masks and
+ statistics
+
+ struct dn_sch:
+ defines a scheduler (and a link attached to it).
+ Parameters include scheduler type, sched_mask, number of
+ buckets, and possibly other scheduler-specific parameters,
+
+ struct dn_profile:
+ fields to simulate a delay profile
+
+
+KERNEL REPRESENTATION (ip_dn_private.h)
+
+ struct mq
+ a queue of mbufs with head and tail.
+
+ struct dn_queue
+ individual queue of packets, created by a flowset using
+ flow_mask and attached to a scheduler instance selected
+ through sched_mask.
+ A dn_queue has a pointer to the dn_fsk (which in turn counts
+ how many queues point to it), a pointer to the
+ dn_sch_inst it attaches to, and is in a hash table in the
+	flowset. Scheduler instances should also store queues in
+	their own containers used for scheduling (lists, trees, etc.).
+ CREATE: done on packet arrivals when a flow matches a flowset.
+ DELETE: done only when deleting the parent dn_sch_inst
+ or draining memory.
+
+ struct dn_fsk
+ includes a dn_fs; a pointer to the dn_schk; a link field
+ for the list of dn_fsk attached to the same scheduler,
+ or for the unlinked list;
+ a refcount for the number of queues pointing to it;
+ The dn_fsk is in a hash table, fshash.
+ CREATE: done on configuration commands.
+ DELETE: on configuration commands.
+
+ struct dn_sch_inst
+ a scheduler instance, created from a dn_schk applying sched_mask.
+ Contains a delay line, a reference to the parent, and scheduler-
+ specific info. Both dn_sch_inst and its delay line can be in the
+ evheap if they have events to be processed.
+ CREATE: created from a dn_schk applying sched_mask
+	DELETE: a configuration command deletes the scheduler, which in turn
+	sweeps the hash table of instances, deleting them.
+
+ struct dn_schk
+ includes dn_sch, dn_link, a pointer to dn_profile,
+ a hash table of dn_sch_inst, a list of dn_fsk
+ attached to it.
+ CREATE: configuration command. If there are flowsets that
+ refer to this number, they are attached and moved
+ to the hash table
+ DELETE: manual, see dn_sch_inst
+
+
+ fshash schedhash
+ +---------------+ sched +--------------+
+ | sched-------------------->| NEW_SCHK|
+ -<----*sch_chain |<-----------------*fsk_list |
+ |NEW_FSK |<----. | [dn_link] |
+ +---------------+ | +--------------+
+ |qht (hash) | | | siht(hash) |
+ | [dn_queue] | | | [dn_si] |
+ | [dn_queue] | | | [dn_si] |
+ | ... | | | ... |
+ | +--------+ | | | +---------+ |
+ | |dn_queue| | | | |dn_si | |
+ | | fs *----------' | | | |
+ | | si *---------------------->| | |
+ | +---------+ | | +---------+ |
+ +---------------+ +--------------+
+
+The following global data structures contain all
+schedulers and flowsets.
+
+- schedhash[x]: contains all scheduler templates in the system.
+ Looked up only on manual configurations, where flowsets
+ are attached to matching schedulers.
+ We have one entry per 'sched X config' command
+ (plus one for each 'pipe X config').
+
+- fshash[x]: contains all flowsets.
+ We do a lookup on this for each packet.
+ We have one entry for each 'queue X config'
+ (plus one for each 'pipe X config').
+
+Additionally, there is a list that contains all unlinked flowsets:
+- fsu: contains flowsets that are not linked with any scheduler.
+	Flowsets are put in this list when they refer to a non-existing
+	scheduler.
+	We don't need an efficient data structure here, as we never
+	search this list on packet arrival.
+
+Scheduler instances and the delay lines associated with each scheduler
+instance need to be woken up at certain times. Because we have many
+such objects, we keep them in a priority heap (system_heap).
+
+Almost all objects in this implementation are preceded by a structure
+(struct dn_id) which makes it easier to identify them.
+
+
+Files
+-----
+The dummynet code is split in several files.
+All kernel code is in sys/netpfil/ipfw except ip_dummynet.h
+All userland code is in sbin/ipfw.
+Files are
+- sys/netinet/ip_dummynet.h defines the kernel-userland API
+- ip_dn_private.h contains the kernel-specific APIs
+ and data structures
+- dn_sched.h defines the scheduler API
+- ip_dummynet.c contains module glue and sockopt handlers, with all
+  functions to configure and list objects.
+- ip_dn_io.c contains the functions directly related to packet processing,
+ and run in the critical path. It also contains some functions
+ exported to the schedulers.
+- dn_heap.[ch] implement a binary heap and a generic hash table
+- dn_sched_* implement the various scheduler modules
+
+- dummynet.c implements the userland side of dummynet.
+  It contains the command-line parsing functions, and functions to
+  display dummynet objects.
+Moreover, there are two new files (ip_dummynet_glue.c and ip_fw_glue.c) that
+provide compatibility with the "ipfw" binary from FreeBSD 7.2 and
+FreeBSD 8.
+
+LOCKING
+=======
+At the moment the entire processing occurs under a single lock
+which is expected to be acquired in exclusive mode
+DN_BH_WLOCK() / DN_BH_WUNLOCK().
+
+In the longer term we aim at the following:
+- the 'busy' flag, 'pending' list and all structures modified by packet
+ arrivals and departures are protected by the BH_WLOCK.
+ This is normally acquired in exclusive mode by the packet processing
+ functions for short sections of code (exception -- the timer).
+ If 'busy' is not set, we can do regular packet processing.
+  If 'busy' is set, none of these structures can be accessed.
+ We must enqueue the packet on 'pending' and return immediately.
+
+- the 'busy' flag is set/cleared by long sections of code as follows:
+ UH_WLOCK(); KASSERT(busy == 0);
+ BH_WLOCK(); busy=1; BH_WUNLOCK();
+ ... do processing ...
+ BH_WLOCK(); busy=0; drain_queue(pending); BH_WUNLOCK();
+ UH_WUNLOCK();
+ this normally happens when the upper half has something heavy
+ to do. The prologue and epilogue are not in the critical path.
+
+- the main containers (fshash, schedhash, ...) are protected by
+ UH_WLOCK.
+
+Packet processing
+=================
+A packet enters dummynet through dummynet_io(). We first look up
+the flowset number in fshash using dn_ht_find(), then find the scheduler
+instance using ipdn_si_find(), then possibly identify the correct
+queue with ipdn_q_find().
+If successful, we call the scheduler's enqueue function, and
+if needed start I/O on the link by calling serve_sched().
+If the packet can be returned immediately, this is done by
+leaving *m0 set. Otherwise, the packet is absorbed by dummynet
+and we simply return, possibly with some appropriate error code.
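+
+The sequence of lookups can be summarized with the outline below. This is
+only an illustration: the argument lists are simplified, and locking and
+error handling are omitted (the real code is in ip_dn_io.c):
+
+	/* illustrative outline, not the real dummynet_io() */
+	fs = dn_ht_find(fshash, fs_id, ...);	/* locate the flowset */
+	si = ipdn_si_find(fs->sched, ...);	/* scheduler instance */
+	q  = ipdn_q_find(fs, si, ...);		/* per-flow queue */
+	if (enqueue(si, q, m) == 0)		/* scheduler's enqueue */
+		serve_sched(...);		/* start I/O on the link */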
+
+Reconfiguration
+---------------
+Reconfiguration is the complex part of the system because we need to
+keep track of the various objects and containers.
+At the moment we do not use reference counts for objects so all
+processing must be done under a lock.
+
+The main entry point for configuration is the ip_dn_ctl() handler
+for the IP_DUMMYNET3 sockopt (others are provided only for backward
+compatibility). Modifications to the configuration go through do_config().
+The argument is a sequence of blocks, each starting with a struct dn_id
+which specifies its content.
+The first dn_id must contain DN_API_VERSION as obj.id.
+The obj.type is DN_CMD_CONFIG (followed by actual objects),
+DN_CMD_DELETE (with the correct subtype and list of objects), or
+DN_CMD_FLUSH.
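+
+As an illustration, the blocks in a request can be walked as sketched
+below. This is a simplified sketch: the real struct dn_id carries more
+fields, and the 'len' field and the buffer names used here are assumptions:
+
+	/* illustrative sketch of walking a do_config() argument buffer */
+	struct dn_id *o;
+
+	for (o = (struct dn_id *)buf;
+	    (char *)o < (char *)buf + buflen;
+	    o = (struct dn_id *)((char *)o + o->len)) {
+		if (o->type == DN_CMD_CONFIG) {
+			/* the following blocks describe objects to add */
+		} else if (o->type == DN_CMD_DELETE) {
+			/* subtype + identifiers of objects to remove */
+		} else if (o->type == DN_CMD_FLUSH) {
+			/* wipe the whole configuration */
+		}
+	}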
+
+DN_CMD_CONFIG is followed by objects to add/reconfigure. In general,
+if an object already exists it is reconfigured, otherwise it is
+created in a way that keeps the structure consistent.
+We have the following objects in the system, normally numbered with
+an identifier N between 1 and 65535. For certain objects we have
+"shadow" copies numbered I+NMAX and I+ 2*NMAX which are used to
+implement certain backward compatibility features.
+
+In general we have the following linking
+
+ TRADITIONAL DUMMYNET QUEUES "queue N config ... pipe M ..."
+ corresponds to a dn_fs object numbered N
+
+ TRADITIONAL DUMMYNET PIPES "pipe N config ..."
+ dn_fs N+2*NMAX --> dn_sch N+NMAX type FIFO --> dn_link N+NMAX
+
+ GENERIC SCHEDULER "sched N config ... "
+ [dn_fs N+NMAX] --> dn_sch N --> dn_link N
+ The flowset N+NMAX is created only if the scheduler is not
+ of type MULTIQUEUE.
+
+ DELAY PROFILE "pipe N config profile ..."
+ it is always attached to an existing dn_link N
+
+Because traditional dummynet pipes actually configure both a
+'standalone' instance and one that can be used by queues,
+we do the following:
+
+ "pipe N config ..." configures:
+ dn_sched N type WF2Q+
+ dn_sched N+NMAX type FIFO
+ dn_fs N+2NMAX attached to dn_sched N+NMAX
+ dn_pipe N
+ dn_pipe N+NMAX
+
+ "queue N config" configures
+ dn_fs N
+
+ "sched N config" configures
+ dn_sched N type as desired
+ dn_fs N+NMAX attached to dn_sched N
+
+
+dummynet_task()
+===============
+The dummynet_task() function is the main dummynet processing function and is
+called every tick. It first computes the new current time, then checks
+whether it is time to wake up objects from the system_heap, comparing the
+current time with the keys in the heap. Two types of objects (really, the
+heap contains pointers to objects) are in the
+system_heap:
+
+- scheduler instance: if a scheduler instance is woken up, the dequeue()
+  function is called as long as the instance has credit. If dequeue()
+  returns packets, the scheduler instance is reinserted in the heap with a
+  new key that depends on the data that will be sent out. If the scheduler
+  instance is left with some credit, it means that it has no other packets
+  to send, so the instance is not reinserted in the heap.
+
+  If the scheduler instance extracted from the heap has the DELETE flag set,
+  dequeue() is not called and the instance is destroyed immediately.
+
+- delay line: when a delay line is extracted, the function transmit_event()
+  is called to send out the packets from the delay line.
+
+  If the scheduler instance associated with this delay line no longer
+  exists, the delay line is deleted immediately.
+
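+The per-tick processing can be summarized with the outline below, written
+in terms of the dn_heap primitives used elsewhere in this directory. It is
+only an illustration: 'system_heap', 'curr_time' and the helpers marked as
+placeholders are not the real names used in the code:
+
+	/* illustrative outline of the per-tick loop */
+	while (system_heap.elements > 0 &&
+	    DN_KEY_LEQ(HEAP_TOP(&system_heap)->key, curr_time)) {
+		void *p = HEAP_TOP(&system_heap)->object;
+
+		heap_extract(&system_heap, NULL);
+		if (is_delay_line(p))		/* placeholder test */
+			transmit_event(p);	/* drain the delay line */
+		else if (marked_for_delete(p))	/* placeholder test */
+			destroy_instance(p);	/* placeholder */
+		else
+			serve_instance(p);	/* placeholder: call dequeue()
+						 * while credit lasts, then
+						 * reinsert if needed */
+	}
+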
+Configuration
+=============
+To create a pipe, queue or scheduler, the user should type commands like:
+"ipfw pipe x config"
+"ipfw queue y config pipe x"
+"ipfw pipe x config sched <type>"
+
+The userland side of dummynet prepares a buffer containing the data to pass
+to the kernel side.
+The buffer contains all the structs needed to configure an object. In more
+detail, to configure a pipe all three structs (dn_link, dn_sch, dn_fs) are
+needed, plus the delay profile struct if the pipe has a delay profile.
+
+When configuring a scheduler only the struct dn_sch is written to the
+buffer, while when configuring a flowset only the dn_fs struct is written.
+
+The first struct in the buffer contains the type of the request, that is
+whether a pipe, a queue or a scheduler is being configured. Then come the
+structs needed to configure the object, and finally the struct that marks
+the end of the buffer.
+
+To support the insertion of pipes and queues using the old syntax, adding a
+pipe also creates a FIFO flowset and a FIFO scheduler, which are numbered
+x + DN_PIPEOFFSET.
+
+Add a pipe
+----------
+A pipe is only a template for a link.
+If the pipe already exists, its parameters are updated. If a delay profile
+exists it is deleted and a new one is created.
+If the pipe doesn't exist a new one is created. After the creation, the
+unlinked flowset list is scanned to see whether there are flowsets that
+should be linked with this pipe. If so, these flowsets will be of wf2q+ type
+(for compatibility) and a new wf2q+ scheduler is created now.
+
+Add a scheduler
+---------------
+If the scheduler already exists, and the type and the mask are the same, the
+scheduler is simply reconfigured by calling the config_scheduler() scheduler
+function with the RECONFIGURE flag set.
+If the type or the mask differ, it is necessary to delete the old scheduler
+and create a new one.
+If the scheduler doesn't exist, a new one is created. If the scheduler has
+a mask, a hash table is created to store pointers to scheduler instances.
+When a new scheduler is created, the unlinked flowset list is scanned for
+flowsets that should be linked with this scheduler number. If some are
+found, those flowsets take the type of this scheduler and are configured
+properly.
+
+Add a flowset
+-------------
+Flowset pointers are stored in the system in two lists. The unlinked flowset
+list contains all flowsets that aren't linked with a scheduler; the flowset
+list contains flowsets linked to a scheduler, and so they have a type.
+When adding a new flowset, we first check whether the flowset exists (that
+is, whether it is in the flowset list). If it doesn't exist, a new flowset
+is created and added to the unlinked flowset list if the scheduler it should
+be linked to doesn't exist, or added to the flowset list and configured
+properly if the scheduler exists. If the flowset was previously in the
+unlinked flowset list, it is removed and deleted, and then recreated.
+If the flowset exists, reconfiguration requires that the scheduler number
+and type match the ones in memory. If they don't, the flowset is deleted and
+a new one will be created. Actually, the flowset is not deleted right away:
+it is removed from the flowset list and will be deleted later, because there
+could be some queues still using it.
+
+Listing objects
+===============
+The user can request a list of the objects present in dummynet through the
+command "ipfw [-v] pipe|queue [x] list|show".
+The kernel side of dummynet sends the user side a buffer that contains all
+pipes, all schedulers, all flowsets, plus all scheduler instances and all
+queues. The dummynet userland code formats the output and shows only the
+relevant information.
+The buffer starts with all the pipes in the system. The entire struct
+dn_link is passed, except for the delay_profile struct, which is useless in
+user space.
+After the pipes, all flowsets are written to the buffer. The struct that
+contains the scheduler-specific flowset data is linked to its flowset by
+writing the 'obj' id of the extension into the 'alg_fs' pointer.
+Then the schedulers are written. If a scheduler has one or more scheduler
+instances, these are linked to the parent scheduler by writing the id of the
+parent into the 'ptr_sched' pointer. If a scheduler instance has queues,
+these are written to the buffer and linked through the 'obj' and
+'sched_inst' pointers.
+Finally, the flowsets in the unlinked flowset list are written to the
+buffer, and then a struct gen is saved in the buffer to mark the last struct
+in the buffer.
+
+
+Deleting objects
+================
+An object is usually removed by the user through a command like
+"ipfw pipe|queue x delete". XXX sched?
+ipfw passes the kernel a struct gen that contains the type and the number
+of the object to remove.
+
+Deleting pipe x
+---------------
+A pipe can be deleted by the user through the command 'ipfw pipe x delete'.
+To delete a pipe, the pipe is removed from the pipe list, and then deleted.
+The scheduler associated with this pipe should also be deleted.
+For compatibility with the old dummynet syntax, the associated FIFO
+scheduler and FIFO flowset must be deleted as well.
+
+Deleting flowset x
+------------------
+To remove a flowset, we must be sure that it is no longer referenced by any
+object. If the flowset to remove is in the unlinked flowset list, there is
+no issue: the flowset can be safely removed with a free() (the flowset
+extension is not yet created while the flowset is in this list).
+If the flowset is in the flowset list, we first remove it from the list so
+that new packets are discarded when they arrive. Next, the flowset is
+marked for deletion.
+Now we must check whether some queue is still using this flowset.
+For this purpose a counter (active_f) is provided. This counter indicates
+how many queues exist that use this flowset.
+The active_f counter is automatically incremented when a queue is created
+and decremented when a queue is deleted.
+If the counter is 0, the flowset can be safely deleted, and the
+delete_alg_fs() scheduler function is called before the memory is released.
+If the counter is not 0, the flowset remains in memory until the counter
+becomes zero. When a queue is deleted (by the dn_delete_queue() function) we
+check whether the linked flowset is being deleted, and if so the counter is
+decremented. When the counter reaches 0, the flowset is deleted.
+The deletion of a queue can be done only by the scheduler, or when the
+scheduler is destroyed.
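+
+The reference counting described above boils down to the pattern sketched
+here. This is only an illustration: names not mentioned in the text
+(such as the 'being_deleted' flag and the helper functions) are placeholders:
+
+	/* illustrative sketch of the active_f deferred-delete pattern */
+	void
+	queue_created(struct dn_fsk *fs)
+	{
+		fs->active_f++;			/* one more queue uses fs */
+	}
+
+	void
+	queue_deleted(struct dn_fsk *fs)	/* as in dn_delete_queue() */
+	{
+		fs->active_f--;
+		if (fs->being_deleted && fs->active_f == 0) {
+			delete_alg_fs(fs);	/* scheduler-specific cleanup */
+			free(fs);		/* now safe to release */
+		}
+	}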
+
+Deleting scheduler x
+--------------------
+To delete a scheduler we must be sure that no scheduler instance of this
+type is still in the system_heap. For this purpose a counter (inst_counter)
+is provided.
+This counter is managed by the system: it is incremented every time an
+instance is inserted in the system_heap, and decremented every time one is
+extracted from it.
+To delete the scheduler, we first remove it from the scheduler list, so that
+new packets are discarded when they arrive, and mark the scheduler as being
+deleted.
+
+If the counter is 0, we can remove the scheduler safely by calling the
+really_deletescheduler() function. This function scans all scheduler
+instances and calls the delete_scheduler_instance() function, which deletes
+the instance. When all instances are deleted, the scheduler template is
+deleted by calling delete_scheduler_template(). If the delay line associated
+with the scheduler is empty, it is deleted now, otherwise it will be deleted
+when it becomes empty.
+If the counter is not 0, we wait for it to drain. Every time the
+dummynet_task() function extracts a scheduler instance from the system_heap,
+the counter is decremented.
+If the scheduler has the delete flag set, dequeue() is not called and
+delete_scheduler_instance() is called to delete the instance.
+Obviously this scheduler instance is not reinserted in the system_heap.
+When the counter reaches 0, the delete_scheduler_template() function is
+called and all memory is released.
+NOTE: Flowsets that belong to this scheduler are not deleted, so if a new
+      scheduler with the same number is created it will use these flowsets.
+      The best approach would be to move these flowsets to the unlinked
+      flowset list, but doing this now would be very expensive.
+      So the flowsets will remain in memory, linked with a scheduler that
+      no longer exists, until a packet belonging to one of these flowsets
+      arrives. When this packet arrives, the reconfigure() function is
+      called because the generation number does not match the one stored
+      in the flowset, and so the flowset will be moved to the unlinked
+      flowset list, or linked with the new scheduler if a new one was
+      created.
+
+
+COMPATIBILITY WITH FREEBSD 7.2 AND FREEBSD 8 'IPFW' BINARY
+==========================================================
+The new dummynet is not compatible with the old ipfw binary because the
+internal structs have changed. Moreover, the old ipfw binary is not
+compatible with new kernels because the struct that represents a firewall
+rule has changed. So, if a user installs a new kernel on a FreeBSD 7.2
+system, ipfw (and possibly many other commands) will not work.
+The new dummynet uses a new socket option, IP_DUMMYNET3, used for both set
+and get. The old options can still be used for compatibility with the 'ipfw'
+binary of older FreeBSD versions (tested with 7.2 and 8.0).
+Two files are provided for this purpose:
+- ip_dummynet_glue.c translates old dummynet requests to the new ones,
+- ip_fw_glue.c converts the rule format between the 7.2 and 8 versions.
+Let us see these two files in detail.
+
+IP_DUMMYNET_GLUE.C
+------------------
+The internal structs of the new dummynet are very different from the
+original ones. Because there are some differences between dummynet in
+FreeBSD 7.2 and dummynet in FreeBSD 8 (the FreeBSD 8 version adds support
+for the pipe delay profile and the burst option), I have to include both
+header files. I copied revision 191715 (for version 7.2) and revision
+196045 (for version 8) and appended a number to each struct to mark them.
+
+The main function of this file is ip_dummynet_compat(), which is called by
+ip_dn_ctl() when it receives a request on an old socket option.
+
+A global variable ('is7') stores the version of 'ipfw' that FreeBSD is
+using. This variable is set every time a configuration request is received,
+because with such a request we receive a buffer whose size depends on the
+ipfw version. Since in general the first action is a configuration, this
+variable is usually set correctly. If the first action is a request to list
+pipes or queues, the system cannot know the version of ipfw, and we assume
+that version 7.2 is used. If the guess is wrong, the output can be
+meaningless, but the application should not crash.
+
+There are four requests for the old dummynet:
+- IP_DUMMYNET_FLUSH: the flush option has no parameters, so the
+  dummynet_flush() function is simply called;
+- IP_DUMMYNET_DEL: the delete option needs to be translated.
+  It is only necessary to extract the number and the type of the object
+  (pipe or queue) to delete from the received buffer, build a new struct
+  gen containing the right parameters, and then call the delete_object()
+  function;
+- IP_DUMMYNET_CONFIGURE: the configure command receives a buffer whose
+  layout depends on the ipfw version. After properly extracting all the
+  data, which depends on the ipfw version used, the new structures are
+  filled in and then the dummynet config_link() function is called. Note
+  that the 7.2 version does not support some parameters such as burst or
+  the delay profile.
+- IP_DUMMYNET_GET: the get command must send ipfw the correct buffer for
+  its version. There are two functions that build the corresponding buffer,
+  ip_dummynet_get7() and ip_dummynet_get8(). These functions reproduce the
+  buffer exactly as 'ipfw' expects it. The only difference is that the
+  weight parameter for a queue is no longer reported by dummynet, so it is
+  set to 0.
+  Moreover, because the internal structure has changed, the bucket size of
+  a queue may not be correct, because now all flowsets share the hash
+  table.
+  If the assumed version of ipfw is wrong, the output can be meaningless or
+  truncated, but the application should not crash.
+
+IP_FW_GLUE.C
+------------
+The ipfw binary is also used to add rules to the FreeBSD firewall. Because
+struct ip_fw changed between FreeBSD 7.2 and FreeBSD 8, some glue code is
+needed to allow the ipfw from FreeBSD 7.2 to be used with the kernel
+provided with FreeBSD 8.
+This file contains two functions to convert a rule from the FreeBSD 7.2
+format to the FreeBSD 8 format, and vice versa.
+The conversion is done whenever a rule passes from userspace to kernel space
+and vice versa.
+I had to modify the ip_fw2.c file to handle these two cases, and added a
+variable (is7) to store the ipfw version used, with an approach similar to
+the previous file:
+- when a new rule is added (option IP_FW_ADD) the is7 variable is set if the
+  size of the received rule corresponds to the FreeBSD 7.2 ipfw version. If
+  so, the rule is converted to version 8 by calling convert_rule_to_8().
+  Moreover, after the insertion of the rule, the rule is converted back to
+  version 7 because the ipfw binary will print it.
+- when the user requests a list of rules (option IP_FW_GET) the is7 variable
+  should already be set correctly, because we assume that a configure
+  command was done earlier; otherwise we assume that the FreeBSD version is
+  8. The ipfw_getrules() function in ip_fw2.c returns all rules to the ipfw
+  binary, converted to version 7 if is7 is set.
+The conversion of a rule is quite simple. The only difference between the
+two structures (struct ip_fw) is that the new one has an extra field
+(uint32_t id). So, I copy the entire rule into a buffer and then copy it to
+the right position in the new (or old) struct. The size of the commands is
+not changed, and the copy is done in a loop.
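+
+A minimal, purely illustrative userland sketch of this kind of conversion
+is shown below; rule7/rule8 are hypothetical simplified stand-ins for the
+two struct ip_fw layouts, not the real definitions:
+
+	#include <stdint.h>
+	#include <string.h>
+
+	/* hypothetical, simplified stand-ins for the 7.2 and 8 layouts */
+	struct rule7 { uint16_t act_ofs, cmd_len; uint32_t cmd[16]; };
+	struct rule8 { uint16_t act_ofs, cmd_len; uint32_t id; uint32_t cmd[16]; };
+
+	static void
+	convert_7_to_8(const struct rule7 *r7, struct rule8 *r8)
+	{
+		r8->act_ofs = r7->act_ofs;	/* fields before 'id' */
+		r8->cmd_len = r7->cmd_len;
+		r8->id = 0;			/* new field, absent in 7.2 */
+		/* the commands themselves do not change, copy them over */
+		memcpy(r8->cmd, r7->cmd, sizeof(r7->cmd));
+	}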
+
+How to configure dummynet
+=========================
+It is possible to configure dummynet through two main commands:
+'ipfw pipe' and 'ipfw queue'.
+For compatibility with older versions, dummynet can still be configured
+using the old command syntax; in that case, obviously, only a FIFO
+scheduler or a wf2q+ scheduler can be configured.
+A new command, 'ipfw pipe x config sched <type>', is supported to add a new
+scheduler to the system.
+
+- ipfw pipe x config ...
+ create a new pipe with the link parameters
+ create a new scheduler fifo (x + offset)
+ create a new flowset fifo (x + offset)
+ the mask is eventually stored in the FIFO scheduler
+
+- ipfw queue y config pipe x ...
+ create a new flowset y linked to sched x.
+ The type of the flowset depends on the specified scheduler.
+ If the scheduler does not exist, this flowset is inserted in a special
+ list and will not be active.
+ If pipe x exists but scheduler x does not, a new wf2q+ scheduler is
+ created and the flowset is linked to this new scheduler (this is
+ done for compatibility with the old syntax).
+
+- ipfw pipe x config sched <type> ...
+ create a new scheduler x of type <type>.
+ Search the list of unlinked flowsets for any flowsets that should be
+ linked to this new scheduler.
+
+- ipfw pipe x delete
+ delete the pipe x
+ delete the scheduler fifo (x + offset)
+ delete the scheduler x
+ delete the flowset fifo (x + offset)
+
+- ipfw queue x delete
+ delete the flowset x
+
+- ipfw sched x delete ///XXX
+ delete the scheduler x
+
+Here are some examples of how to configure dummynet:
+- Ex1:
+ ipfw pipe 10 config bw 1M delay 15 // create a pipe with bandwidth and delay
+ A FIFO flowset and scheduler are
+ also created
+ ipfw queue 5 config pipe 10 weight 56 // create a flowset. This flowset
+ will be of wf2q+ because a pipe 10
+ exists. Moreover, the wf2q+
+ scheduler is created now.
+- Ex2:
+ ipfw queue 5 config pipe 10 weight 56 // Create a flowset. Scheduler 10
+ does not exist, so this flowset
+ is inserted in the unlinked
+ flowset list.
+ ipfw pipe 10 config bw... // Create a pipe, a FIFO flowset and scheduler.
+ Because a flowset with 'pipe 10' exists,
+ a wf2q+ scheduler is created now and that
+ flowset is linked with this scheduler.
+
+- Ex3:
+ ipfw pipe 10 config bw... // Create a pipe, a FIFO flowset and scheduler.
+ ipfw pipe 10 config sched rr // Create a scheduler of type RR, linked to
+ pipe 10
+ ipfw queue 5 config pipe 10 weight 56 // Create a flowset 5. This flowset
+ will belong to scheduler 10 and
+ it is of type RR
+
+- Ex4:
+ ipfw pipe 10 config sched rr // Create a scheduler of type RR, linked to
+ pipe 10 (which does not exist yet)
+ ipfw pipe 10 config bw... // Create a pipe, a FIFO flowset and scheduler.
+ ipfw queue 5 config pipe 10 weight 56 // Create a flowset 5. This flowset
+ will belong to scheduler 10 and
+ it is of type RR
+ ipfw pipe 10 config sched wf2q+ // Modify the type of scheduler 10. It
+ becomes a wf2q+ scheduler.
+ When a new packet of flowset 5 arrives,
+ flowset 5 switches to the wf2q+ type.
+
+How to implement a new scheduler
+================================
+In dummynet, a scheduling algorithm is represented by two main structs, some
+functions and a few minor structs.
+- A struct dn_sch_xyz (where xyz is the 'type' of the scheduling algorithm
+ implemented) contains data relative to the scheduler, i.e. global
+ parameters that are common to all instances of the scheduler.
+- A struct dn_sch_inst_xyz contains data relative to a single scheduler
+ instance, i.e. local state variables that depend, for example, on the
+ flows linked to the scheduler, and so on.
+To add a scheduler to dummynet, the user types a command like:
+'ipfw pipe x config sched <type> [mask ... ...]'
+This command creates a new struct dn_sch_xyz of type <type>, and
+stores the optional parameters in that struct.
+
+The mask parameter determines how many instances of this scheduler
+may exist. For example, it is possible to split traffic
+depending on the source port (or destination, or IP address...),
+so that every scheduler instance acts as an independent scheduler.
+If the mask is not set, all traffic goes to the same instance.
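+For example (assuming the usual ipfw mask syntax), a command such as
+ ipfw pipe 10 config sched rr mask src-port 0xffff
+would create one RR scheduler instance per source port.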
+
+When a packet arrives at a scheduler, the system searches for the right
+scheduler instance, creating it if it does not exist (the
+struct dn_sch_inst_xyz is allocated by the system, and the scheduler
+fills its fields correctly). It is a task of the scheduler to create
+the struct that holds all the queues of a scheduler instance.
+Dummynet provides functions to create a hash table to store the
+queues, but the scheduling algorithm can choose its own structure.
+
+To link a flow to a scheduler, the user should type a command like:
+'ipfw queue z config pipe x [mask... ...]'
+
+This command creates a new 'dn_fs' struct that will be inserted
+in the system. If scheduler x exists, this flowset is linked to
+that scheduler and the flowset type becomes the same as the
+scheduler type. At this point the function create_alg_fs_xyz()
+is called to store the scheduler-dependent parameters of the
+flowset, if any (for example the 'weight' parameter for a wf2q+
+scheduler, or some priority...). A mask parameter can also be set
+on a flowset. If the mask is set, the scheduler instance can
+separate packets according to their flow id (src and dst IP,
+ports...) and assign each flow to a separate queue. This is done
+by the scheduler, so it can ignore the mask if it wants.
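+For instance (again assuming the usual ipfw mask syntax), a command such as
+ ipfw queue 5 config pipe 10 weight 10 mask dst-ip 0xffffffff
+would let the scheduler instance keep a separate queue for each
+destination address.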
+
+The two main structs are the following:
+struct dn_sch_xyz {
+ struct gen g; /* important the name g */
+ /* global params */
+};
+struct dn_sch_inst_xyz {
+ struct gen g; /* important the name g */
+ /* params of the instance */
+};
+It is important that struct gen is embedded as the first member. The struct
+gen contains some values that the scheduler must fill in (the 'type' of the
+scheduler, the 'len' of the struct...).
+The function create_scheduler_xyz() should be implemented to initialize the
+global parameters in the first struct; if it allocates memory, it is
+mandatory to implement the delete_scheduler_template() function to free that
+memory.
+The function create_scheduler_instance_xyz() must be implemented even if the
+scheduler instance does not use extra parameters. In this function the struct
+gen fields must be filled with correct information. The
+delete_scheduler_instance_xyz() function must be implemented if the instance
+has allocated some memory in the previous function.
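+A minimal sketch of such a function (the DN_SCHED_XYZ constant and the exact
+struct gen field names are assumptions, used here only for illustration):
+
+static int
+create_scheduler_instance_xyz(void *s)
+{
+	struct dn_sch_inst_xyz *si = s;	/* memory already allocated by the system */
+
+	si->g.type = DN_SCHED_XYZ;	/* the 'type' of the scheduler */
+	si->g.len = sizeof(struct dn_sch_inst_xyz);
+	/* initialize here the private state of the instance, e.g. the
+	 * structure that will hold its queues */
+	return 0;
+}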
+
+To store data belonging to a flowset, the following struct is used:
+struct alg_fs_xyz {
+ struct gen g;
+ /* fill correctly the gen struct
+ g.subtype = DN_XYZ;
+ g.len = sizeof(struct alg_fs_xyz)
+ ...
+ */
+ /* params for the flow */
+};
+The create_alg_fs_xyz() function is mandatory, because it must fill the struct
+gen, but the delete_alg_fs_xyz() is mandatory only if the previous function
+has allocated some memory.
+
+A struct dn_queue contains packets belonging to a queue and some statistical
+data. The scheduler may need to store extra data with each queue; in that
+case it must define a dn_queue_xyz struct:
+struct dn_queue_xyz {
+ struct dn_queue q;
+ /* parameter for a queue */
+};
+
+All structures are allocated by the system. To do so, the scheduler must
+set the size of its structs in the scheduler descriptor:
+scheduler_size: sizeof(dn_sch_xyz)
+scheduler_i_size: sizeof(dn_sch_inst_xyz)
+flowset_size: sizeof(alg_fs_xyz)
+queue_size: sizeof(dn_queue_xyz);
+The scheduler_size may be 0, but the other structs must contain at least a struct gen.
+
+
+After the definition of structs, it is necessary to implement the
+scheduler functions.
+
+- int (*config_scheduler)(char *command, void *sch, int reconfigure);
+ Configure a scheduler, or reconfigure if 'reconfigure' == 1.
+ This function performs additional allocation and initialization of the
+ global parameters of this scheduler.
+ If memory is allocated here, the delete_scheduler_template() function
+ should be implemented to free this memory.
+- int (*delete_scheduler_template)(void* sch);
+ Delete a scheduler template. This function is mandatory if the scheduler
+ uses extra data beyond the struct dn_sch.
+- int (*create_scheduler_instance)(void *s);
+ Create a new scheduler instance. The system allocates the necessary memory
+ and the scheduler can access it through the 's' pointer.
+ The scheduler instance stores all its queues; to do so it can use the
+ hash table provided by the system.
+- int (*delete_scheduler_instance)(void *s);
+ Delete a scheduler instance. It is important to free the memory allocated
+ by the create_scheduler_instance() function. The memory allocated by the
+ system is freed by the system itself. The struct that contains all the
+ queues also has to be deleted.
+- int (*enqueue)(void *s, struct gen *f, struct mbuf *m,
+ struct ipfw_flow_id *id);
+ Called when a packet arrives. The packet 'm' belongs to the scheduler
+ instance 's', has flowset 'f', and its flow id 'id' has already been
+ masked. enqueue() must call the dn_queue_packet(q, m) function to actually
+ enqueue the packet in the queue q. The queue 'q' is chosen by the scheduler
+ and, if it does not exist, should be created by calling the
+ dn_create_queue() function. If the scheduler wants to drop the packet, it
+ must call the dn_drop_packet() function and then return 1.
+- struct mbuf * (*dequeue)(void *s);
+ Called when the timer expires (or when a packet arrives and the scheduler
+ instance is idle).
+ This function is called when at least one packet can be sent out. The
+ scheduler chooses the packet and returns it; if no packets are in the
+ scheduler instance, the function must return NULL.
+ Before returning a packet, it is important to call the
+ dn_return_packet() function to update some statistics of the queue and
+ its counters. (A minimal enqueue/dequeue sketch follows this list.)
+- int (*drain_queue)(void *s, int flag);
+ The system requests the scheduler to delete all queues that are not in
+ use, in order to free memory. The flag parameter indicates whether a queue
+ must be deleted even if it is active.
+
+- int (*create_alg_fs)(char *command, struct gen *g, int reconfigure);
+ It is called when a flowset is linked to a scheduler. This is done
+ when the scheduler is defined, so the type of the flowset is known.
+ The function initializes the flowset parameters by parsing the command
+ line. The parameters are stored in the g struct, which has the right
+ size already allocated by the system. If the reconfigure flag is set, the
+ flowset is being reconfigured.
+- int (*delete_alg_fs)(struct gen *f);
+ It is called when a flowset is being deleted. It must free the memory
+ allocated by the create_alg_fs() function.
+
+- int (*create_queue_alg)(struct dn_queue *q, struct gen *f);
+ Called when a queue is created. The function should link the queue
+ to the struct used by the scheduler instance to store all queues.
+- int (*delete_queue_alg)(struct dn_queue *q);
+ Called when a queue is being deleted. The function should remove any extra
+ data and update the struct that holds all queues in the scheduler instance.
+
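+A minimal, purely illustrative enqueue/dequeue pair for a scheduler with a
+single queue per instance. The helpers lookup_or_create_queue() and
+pick_next_packet() are placeholders for scheduler-specific code, and the
+exact arguments of the dn_*() functions named above are assumptions:
+
+static int
+enqueue_xyz(void *s, struct gen *f, struct mbuf *m, struct ipfw_flow_id *id)
+{
+	struct dn_sch_inst_xyz *si = s;
+	struct dn_queue *q;
+
+	q = lookup_or_create_queue(si, id);	/* e.g. via dn_create_queue() */
+	if (q == NULL) {
+		dn_drop_packet(m);		/* report and free the packet */
+		return 1;
+	}
+	dn_queue_packet(q, m);			/* really enqueue the packet */
+	return 0;
+}
+
+static struct mbuf *
+dequeue_xyz(void *s)
+{
+	struct dn_sch_inst_xyz *si = s;
+	struct mbuf *m;
+
+	m = pick_next_packet(si);		/* scheduler-specific choice */
+	if (m != NULL)
+		dn_return_packet(m);		/* update queue statistics */
+	return m;				/* NULL if nothing can be sent */
+}
+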
+The struct scheduler represents the scheduler descriptor that is passed to
+dummynet when a scheduler module is loaded.
+This struct contains the type of the scheduler, the length of all structs and
+all the function pointers.
+If a function is not implemented it should be initialized to NULL. Some
+functions are mandatory, others are mandatory only if memory must be freed.
+Mandatory functions:
+- create_scheduler_instance()
+- enqueue()
+- dequeue()
+- create_alg_fs()
+- drain_queue()
+Optional functions:
+- config_scheduler()
+- create_queue_alg()
+Mandatory functions if the corresponding create...() has allocated memory:
+- delete_scheduler_template()
+- delete_scheduler_instance()
+- delete_alg_fs()
+- delete_queue_alg()
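+
+As an illustration only (the actual descriptor layout is defined by dummynet;
+the field names below simply mirror the ones used above), a descriptor for an
+'xyz' scheduler might look like:
+
+static struct scheduler dn_sched_xyz_desc = {
+	.type = DN_SCHED_XYZ,			/* hypothetical type constant */
+	.scheduler_size = sizeof(struct dn_sch_xyz),
+	.scheduler_i_size = sizeof(struct dn_sch_inst_xyz),
+	.flowset_size = sizeof(struct alg_fs_xyz),
+	.queue_size = sizeof(struct dn_queue_xyz),
+	.create_scheduler_instance = create_scheduler_instance_xyz,
+	.enqueue = enqueue_xyz,
+	.dequeue = dequeue_xyz,
+	.create_alg_fs = create_alg_fs_xyz,
+	.drain_queue = drain_queue_xyz,
+	/* optional and delete callbacks left NULL if unused */
+};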
+
diff --git a/sys/netpfil/ipfw/ip_dn_glue.c b/sys/netpfil/ipfw/ip_dn_glue.c
new file mode 100644
index 0000000..2e4f460
--- /dev/null
+++ b/sys/netpfil/ipfw/ip_dn_glue.c
@@ -0,0 +1,846 @@
+/*-
+ * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ *
+ * Binary compatibility support for /sbin/ipfw RELENG_7 and RELENG_8
+ */
+
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/time.h>
+#include <sys/taskqueue.h>
+#include <net/if.h> /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */
+#include <netinet/in.h>
+#include <netinet/ip_var.h> /* ip_output(), IP_FORWARDING */
+#include <netinet/ip_fw.h>
+#include <netinet/ip_dummynet.h>
+
+#include <netpfil/ipfw/ip_fw_private.h>
+#include <netpfil/ipfw/dn_heap.h>
+#include <netpfil/ipfw/ip_dn_private.h>
+#include <netpfil/ipfw/dn_sched.h>
+
+/* FREEBSD7.2 ip_dummynet.h r191715*/
+
+struct dn_heap_entry7 {
+ int64_t key; /* sorting key. Topmost element is smallest one */
+ void *object; /* object pointer */
+};
+
+struct dn_heap7 {
+ int size;
+ int elements;
+ int offset; /* XXX if > 0 this is the offset of direct ptr to obj */
+ struct dn_heap_entry7 *p; /* really an array of "size" entries */
+};
+
+/* Common to 7.2 and 8 */
+struct dn_flow_set {
+ SLIST_ENTRY(dn_flow_set) next; /* linked list in a hash slot */
+
+ u_short fs_nr ; /* flow_set number */
+ u_short flags_fs;
+#define DNOLD_HAVE_FLOW_MASK 0x0001
+#define DNOLD_IS_RED 0x0002
+#define DNOLD_IS_GENTLE_RED 0x0004
+#define DNOLD_QSIZE_IS_BYTES 0x0008 /* queue size is measured in bytes */
+#define DNOLD_NOERROR 0x0010 /* do not report ENOBUFS on drops */
+#define DNOLD_HAS_PROFILE 0x0020 /* the pipe has a delay profile. */
+#define DNOLD_IS_PIPE 0x4000
+#define DNOLD_IS_QUEUE 0x8000
+
+ struct dn_pipe7 *pipe ; /* pointer to parent pipe */
+ u_short parent_nr ; /* parent pipe#, 0 if local to a pipe */
+
+ int weight ; /* WFQ queue weight */
+ int qsize ; /* queue size in slots or bytes */
+ int plr ; /* pkt loss rate (2^31-1 means 100%) */
+
+ struct ipfw_flow_id flow_mask ;
+
+ /* hash table of queues onto this flow_set */
+ int rq_size ; /* number of slots */
+ int rq_elements ; /* active elements */
+ struct dn_flow_queue7 **rq; /* array of rq_size entries */
+
+ u_int32_t last_expired ; /* do not expire too frequently */
+ int backlogged ; /* #active queues for this flowset */
+
+ /* RED parameters */
+#define SCALE_RED 16
+#define SCALE(x) ( (x) << SCALE_RED )
+#define SCALE_VAL(x) ( (x) >> SCALE_RED )
+#define SCALE_MUL(x,y) ( ( (x) * (y) ) >> SCALE_RED )
+ int w_q ; /* queue weight (scaled) */
+ int max_th ; /* maximum threshold for queue (scaled) */
+ int min_th ; /* minimum threshold for queue (scaled) */
+ int max_p ; /* maximum value for p_b (scaled) */
+ u_int c_1 ; /* max_p/(max_th-min_th) (scaled) */
+ u_int c_2 ; /* max_p*min_th/(max_th-min_th) (scaled) */
+ u_int c_3 ; /* for GRED, (1-max_p)/max_th (scaled) */
+ u_int c_4 ; /* for GRED, 1 - 2*max_p (scaled) */
+ u_int * w_q_lookup ; /* lookup table for computing (1-w_q)^t */
+ u_int lookup_depth ; /* depth of lookup table */
+ int lookup_step ; /* granularity inside the lookup table */
+ int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */
+ int avg_pkt_size ; /* medium packet size */
+ int max_pkt_size ; /* max packet size */
+};
+SLIST_HEAD(dn_flow_set_head, dn_flow_set);
+
+#define DN_IS_PIPE 0x4000
+#define DN_IS_QUEUE 0x8000
+struct dn_flow_queue7 {
+ struct dn_flow_queue7 *next ;
+ struct ipfw_flow_id id ;
+
+ struct mbuf *head, *tail ; /* queue of packets */
+ u_int len ;
+ u_int len_bytes ;
+
+ u_long numbytes;
+
+ u_int64_t tot_pkts ; /* statistics counters */
+ u_int64_t tot_bytes ;
+ u_int32_t drops ;
+
+ int hash_slot ; /* debugging/diagnostic */
+
+ /* RED parameters */
+ int avg ; /* average queue length est. (scaled) */
+ int count ; /* arrivals since last RED drop */
+ int random ; /* random value (scaled) */
+ u_int32_t q_time; /* start of queue idle time */
+
+ /* WF2Q+ support */
+ struct dn_flow_set *fs ; /* parent flow set */
+ int heap_pos ; /* position (index) of struct in heap */
+ int64_t sched_time ; /* current time when queue enters ready_heap */
+
+ int64_t S,F ; /* start time, finish time */
+};
+
+struct dn_pipe7 { /* a pipe */
+ SLIST_ENTRY(dn_pipe7) next; /* linked list in a hash slot */
+
+ int pipe_nr ; /* number */
+ int bandwidth; /* really, bytes/tick. */
+ int delay ; /* really, ticks */
+
+ struct mbuf *head, *tail ; /* packets in delay line */
+
+ /* WF2Q+ */
+ struct dn_heap7 scheduler_heap ; /* top extract - key Finish time*/
+ struct dn_heap7 not_eligible_heap; /* top extract- key Start time */
+ struct dn_heap7 idle_heap ; /* random extract - key Start=Finish time */
+
+ int64_t V ; /* virtual time */
+ int sum; /* sum of weights of all active sessions */
+
+ int numbytes;
+
+ int64_t sched_time ; /* time pipe was scheduled in ready_heap */
+
+ /*
+ * When the tx clock come from an interface (if_name[0] != '\0'), its name
+ * is stored below, whereas the ifp is filled when the rule is configured.
+ */
+ char if_name[IFNAMSIZ];
+ struct ifnet *ifp ;
+ int ready ; /* set if ifp != NULL and we got a signal from it */
+
+ struct dn_flow_set fs ; /* used with fixed-rate flows */
+};
+SLIST_HEAD(dn_pipe_head7, dn_pipe7);
+
+
+/* FREEBSD8 ip_dummynet.h r196045 */
+struct dn_flow_queue8 {
+ struct dn_flow_queue8 *next ;
+ struct ipfw_flow_id id ;
+
+ struct mbuf *head, *tail ; /* queue of packets */
+ u_int len ;
+ u_int len_bytes ;
+
+ uint64_t numbytes ; /* credit for transmission (dynamic queues) */
+ int64_t extra_bits; /* extra bits simulating unavailable channel */
+
+ u_int64_t tot_pkts ; /* statistics counters */
+ u_int64_t tot_bytes ;
+ u_int32_t drops ;
+
+ int hash_slot ; /* debugging/diagnostic */
+
+ /* RED parameters */
+ int avg ; /* average queue length est. (scaled) */
+ int count ; /* arrivals since last RED drop */
+ int random ; /* random value (scaled) */
+ int64_t idle_time; /* start of queue idle time */
+
+ /* WF2Q+ support */
+ struct dn_flow_set *fs ; /* parent flow set */
+ int heap_pos ; /* position (index) of struct in heap */
+ int64_t sched_time ; /* current time when queue enters ready_heap */
+
+ int64_t S,F ; /* start time, finish time */
+};
+
+struct dn_pipe8 { /* a pipe */
+ SLIST_ENTRY(dn_pipe8) next; /* linked list in a hash slot */
+
+ int pipe_nr ; /* number */
+ int bandwidth; /* really, bytes/tick. */
+ int delay ; /* really, ticks */
+
+ struct mbuf *head, *tail ; /* packets in delay line */
+
+ /* WF2Q+ */
+ struct dn_heap7 scheduler_heap ; /* top extract - key Finish time*/
+ struct dn_heap7 not_eligible_heap; /* top extract- key Start time */
+ struct dn_heap7 idle_heap ; /* random extract - key Start=Finish time */
+
+ int64_t V ; /* virtual time */
+ int sum; /* sum of weights of all active sessions */
+
+ /* Same as in dn_flow_queue, numbytes can become large */
+ int64_t numbytes; /* bits I can transmit (more or less). */
+ uint64_t burst; /* burst size, scaled: bits * hz */
+
+ int64_t sched_time ; /* time pipe was scheduled in ready_heap */
+ int64_t idle_time; /* start of pipe idle time */
+
+ char if_name[IFNAMSIZ];
+ struct ifnet *ifp ;
+ int ready ; /* set if ifp != NULL and we got a signal from it */
+
+ struct dn_flow_set fs ; /* used with fixed-rate flows */
+
+ /* fields to simulate a delay profile */
+#define ED_MAX_NAME_LEN 32
+ char name[ED_MAX_NAME_LEN];
+ int loss_level;
+ int samples_no;
+ int *samples;
+};
+
+#define ED_MAX_SAMPLES_NO 1024
+struct dn_pipe_max8 {
+ struct dn_pipe8 pipe;
+ int samples[ED_MAX_SAMPLES_NO];
+};
+SLIST_HEAD(dn_pipe_head8, dn_pipe8);
+
+/*
+ * Changes from 7.2 to 8:
+ * dn_pipe:
+ * numbytes from int to int64_t
+ * add burst (int64_t)
+ * add idle_time (int64_t)
+ * add profile
+ * add struct dn_pipe_max
+ * add flag DN_HAS_PROFILE
+ *
+ * dn_flow_queue
+ * numbytes from u_long to int64_t
+ * add extra_bits (int64_t)
+ * q_time from u_int32_t to int64_t and name idle_time
+ *
+ * dn_flow_set unchanged
+ *
+ */
+
+/* NOTE:XXX copied from dummynet.c */
+#define O_NEXT(p, len) ((void *)((char *)p + len))
+static void
+oid_fill(struct dn_id *oid, int len, int type, uintptr_t id)
+{
+ oid->len = len;
+ oid->type = type;
+ oid->subtype = 0;
+ oid->id = id;
+}
+/* make room in the buffer and move the pointer forward */
+static void *
+o_next(struct dn_id **o, int len, int type)
+{
+ struct dn_id *ret = *o;
+ oid_fill(ret, len, type, 0);
+ *o = O_NEXT(*o, len);
+ return ret;
+}
+
+
+static size_t pipesize7 = sizeof(struct dn_pipe7);
+static size_t pipesize8 = sizeof(struct dn_pipe8);
+static size_t pipesizemax8 = sizeof(struct dn_pipe_max8);
+
+/* Indicate 'ipfw' version
+ * 1: from FreeBSD 7.2
+ * 0: from FreeBSD 8
+ * -1: unknown (for now unused)
+ *
+ * It is updated when an IP_DUMMYNET_DEL or IP_DUMMYNET_CONFIGURE request
+ * arrives.
+ * NOTE: if an IP_DUMMYNET_GET arrives and the 'ipfw' version is unknown,
+ * it is assumed to be the FreeBSD 8 version.
+ */
+static int is7 = 0;
+
+static int
+convertflags2new(int src)
+{
+ int dst = 0;
+
+ if (src & DNOLD_HAVE_FLOW_MASK)
+ dst |= DN_HAVE_MASK;
+ if (src & DNOLD_QSIZE_IS_BYTES)
+ dst |= DN_QSIZE_BYTES;
+ if (src & DNOLD_NOERROR)
+ dst |= DN_NOERROR;
+ if (src & DNOLD_IS_RED)
+ dst |= DN_IS_RED;
+ if (src & DNOLD_IS_GENTLE_RED)
+ dst |= DN_IS_GENTLE_RED;
+ if (src & DNOLD_HAS_PROFILE)
+ dst |= DN_HAS_PROFILE;
+
+ return dst;
+}
+
+static int
+convertflags2old(int src)
+{
+ int dst = 0;
+
+ if (src & DN_HAVE_MASK)
+ dst |= DNOLD_HAVE_FLOW_MASK;
+ if (src & DN_IS_RED)
+ dst |= DNOLD_IS_RED;
+ if (src & DN_IS_GENTLE_RED)
+ dst |= DNOLD_IS_GENTLE_RED;
+ if (src & DN_NOERROR)
+ dst |= DNOLD_NOERROR;
+ if (src & DN_HAS_PROFILE)
+ dst |= DNOLD_HAS_PROFILE;
+ if (src & DN_QSIZE_BYTES)
+ dst |= DNOLD_QSIZE_IS_BYTES;
+
+ return dst;
+}
+
+static int
+dn_compat_del(void *v)
+{
+ struct dn_pipe7 *p = (struct dn_pipe7 *) v;
+ struct dn_pipe8 *p8 = (struct dn_pipe8 *) v;
+ struct {
+ struct dn_id oid;
+ uintptr_t a[1]; /* add more if we want a list */
+ } cmd;
+
+ /* XXX DN_API_VERSION ??? */
+ oid_fill((void *)&cmd, sizeof(cmd), DN_CMD_DELETE, DN_API_VERSION);
+
+ if (is7) {
+ if (p->pipe_nr == 0 && p->fs.fs_nr == 0)
+ return EINVAL;
+ if (p->pipe_nr != 0 && p->fs.fs_nr != 0)
+ return EINVAL;
+ } else {
+ if (p8->pipe_nr == 0 && p8->fs.fs_nr == 0)
+ return EINVAL;
+ if (p8->pipe_nr != 0 && p8->fs.fs_nr != 0)
+ return EINVAL;
+ }
+
+ if (p->pipe_nr != 0) { /* pipe x delete */
+ cmd.a[0] = p->pipe_nr;
+ cmd.oid.subtype = DN_LINK;
+ } else { /* queue x delete */
+ cmd.oid.subtype = DN_FS;
+ cmd.a[0] = (is7) ? p->fs.fs_nr : p8->fs.fs_nr;
+ }
+
+ return do_config(&cmd, cmd.oid.len);
+}
+
+static int
+dn_compat_config_queue(struct dn_fs *fs, void* v)
+{
+ struct dn_pipe7 *p7 = (struct dn_pipe7 *)v;
+ struct dn_pipe8 *p8 = (struct dn_pipe8 *)v;
+ struct dn_flow_set *f;
+
+ if (is7)
+ f = &p7->fs;
+ else
+ f = &p8->fs;
+
+ fs->fs_nr = f->fs_nr;
+ fs->sched_nr = f->parent_nr;
+ fs->flow_mask = f->flow_mask;
+ fs->buckets = f->rq_size;
+ fs->qsize = f->qsize;
+ fs->plr = f->plr;
+ fs->par[0] = f->weight;
+ fs->flags = convertflags2new(f->flags_fs);
+ if (fs->flags & DN_IS_GENTLE_RED || fs->flags & DN_IS_RED) {
+ fs->w_q = f->w_q;
+ fs->max_th = f->max_th;
+ fs->min_th = f->min_th;
+ fs->max_p = f->max_p;
+ }
+
+ return 0;
+}
+
+static int
+dn_compat_config_pipe(struct dn_sch *sch, struct dn_link *p,
+ struct dn_fs *fs, void* v)
+{
+ struct dn_pipe7 *p7 = (struct dn_pipe7 *)v;
+ struct dn_pipe8 *p8 = (struct dn_pipe8 *)v;
+ int i = p7->pipe_nr;
+
+ sch->sched_nr = i;
+ sch->oid.subtype = 0;
+ p->link_nr = i;
+ fs->fs_nr = i + 2*DN_MAX_ID;
+ fs->sched_nr = i + DN_MAX_ID;
+
+ /* Common to 7 and 8 */
+ p->bandwidth = p7->bandwidth;
+ p->delay = p7->delay;
+ if (!is7) {
+ /* FreeBSD 8 has burst */
+ p->burst = p8->burst;
+ }
+
+ /* fill the fifo flowset */
+ dn_compat_config_queue(fs, v);
+ fs->fs_nr = i + 2*DN_MAX_ID;
+ fs->sched_nr = i + DN_MAX_ID;
+
+ /* Move scheduler related parameter from fs to sch */
+ sch->buckets = fs->buckets; /*XXX*/
+ fs->buckets = 0;
+ if (fs->flags & DN_HAVE_MASK) {
+ sch->flags |= DN_HAVE_MASK;
+ fs->flags &= ~DN_HAVE_MASK;
+ sch->sched_mask = fs->flow_mask;
+ bzero(&fs->flow_mask, sizeof(struct ipfw_flow_id));
+ }
+
+ return 0;
+}
+
+static int
+dn_compat_config_profile(struct dn_profile *pf, struct dn_link *p,
+ void *v)
+{
+ struct dn_pipe8 *p8 = (struct dn_pipe8 *)v;
+
+ p8->samples = &(((struct dn_pipe_max8 *)p8)->samples[0]);
+
+ pf->link_nr = p->link_nr;
+ pf->loss_level = p8->loss_level;
+// pf->bandwidth = p->bandwidth; //XXX bandwidth redundant?
+ pf->samples_no = p8->samples_no;
+ strncpy(pf->name, p8->name,sizeof(pf->name));
+ bcopy(p8->samples, pf->samples, sizeof(pf->samples));
+
+ return 0;
+}
+
+/*
+ * If p->pipe_nr != 0 the command is 'pipe x config', so need to create
+ * the three main struct, else only a flowset is created
+ */
+static int
+dn_compat_configure(void *v)
+{
+ struct dn_id *buf = NULL, *base;
+ struct dn_sch *sch = NULL;
+ struct dn_link *p = NULL;
+ struct dn_fs *fs = NULL;
+ struct dn_profile *pf = NULL;
+ int lmax;
+ int error;
+
+ struct dn_pipe7 *p7 = (struct dn_pipe7 *)v;
+ struct dn_pipe8 *p8 = (struct dn_pipe8 *)v;
+
+ int i; /* number of object to configure */
+
+ lmax = sizeof(struct dn_id); /* command header */
+ lmax += sizeof(struct dn_sch) + sizeof(struct dn_link) +
+ sizeof(struct dn_fs) + sizeof(struct dn_profile);
+
+ base = buf = malloc(lmax, M_DUMMYNET, M_WAIT|M_ZERO);
+ o_next(&buf, sizeof(struct dn_id), DN_CMD_CONFIG);
+ base->id = DN_API_VERSION;
+
+ /* pipe_nr is the same in p7 and p8 */
+ i = p7->pipe_nr;
+ if (i != 0) { /* pipe config */
+ sch = o_next(&buf, sizeof(*sch), DN_SCH);
+ p = o_next(&buf, sizeof(*p), DN_LINK);
+ fs = o_next(&buf, sizeof(*fs), DN_FS);
+
+ error = dn_compat_config_pipe(sch, p, fs, v);
+ if (error) {
+ free(buf, M_DUMMYNET);
+ return error;
+ }
+ if (!is7 && p8->samples_no > 0) {
+ /* Add profiles*/
+ pf = o_next(&buf, sizeof(*pf), DN_PROFILE);
+ error = dn_compat_config_profile(pf, p, v);
+ if (error) {
+ free(buf, M_DUMMYNET);
+ return error;
+ }
+ }
+ } else { /* queue config */
+ fs = o_next(&buf, sizeof(*fs), DN_FS);
+ error = dn_compat_config_queue(fs, v);
+ if (error) {
+ free(buf, M_DUMMYNET);
+ return error;
+ }
+ }
+ error = do_config(base, (char *)buf - (char *)base);
+
+ if (buf)
+ free(buf, M_DUMMYNET);
+ return error;
+}
+
+int
+dn_compat_calc_size(void)
+{
+ int need = 0;
+ /* XXX use FreeBSD 8 struct size */
+ /* NOTE:
+ * - half scheduler: schk_count/2
+ * - all flowset: fsk_count
+ * - all flowset queues: queue_count
+ * - all pipe queue: si_count
+ */
+ need += dn_cfg.schk_count * sizeof(struct dn_pipe8) / 2;
+ need += dn_cfg.fsk_count * sizeof(struct dn_flow_set);
+ need += dn_cfg.si_count * sizeof(struct dn_flow_queue8);
+ need += dn_cfg.queue_count * sizeof(struct dn_flow_queue8);
+
+ return need;
+}
+
+int
+dn_c_copy_q (void *_ni, void *arg)
+{
+ struct copy_args *a = arg;
+ struct dn_flow_queue7 *fq7 = (struct dn_flow_queue7 *)*a->start;
+ struct dn_flow_queue8 *fq8 = (struct dn_flow_queue8 *)*a->start;
+ struct dn_flow *ni = (struct dn_flow *)_ni;
+ int size = 0;
+
+ /* XXX hash slot not set */
+ /* No difference between 7.2/8 */
+ fq7->len = ni->length;
+ fq7->len_bytes = ni->len_bytes;
+ fq7->id = ni->fid;
+
+ if (is7) {
+ size = sizeof(struct dn_flow_queue7);
+ fq7->tot_pkts = ni->tot_pkts;
+ fq7->tot_bytes = ni->tot_bytes;
+ fq7->drops = ni->drops;
+ } else {
+ size = sizeof(struct dn_flow_queue8);
+ fq8->tot_pkts = ni->tot_pkts;
+ fq8->tot_bytes = ni->tot_bytes;
+ fq8->drops = ni->drops;
+ }
+
+ *a->start += size;
+ return 0;
+}
+
+int
+dn_c_copy_pipe(struct dn_schk *s, struct copy_args *a, int nq)
+{
+ struct dn_link *l = &s->link;
+ struct dn_fsk *f = s->fs;
+
+ struct dn_pipe7 *pipe7 = (struct dn_pipe7 *)*a->start;
+ struct dn_pipe8 *pipe8 = (struct dn_pipe8 *)*a->start;
+ struct dn_flow_set *fs;
+ int size = 0;
+
+ if (is7) {
+ fs = &pipe7->fs;
+ size = sizeof(struct dn_pipe7);
+ } else {
+ fs = &pipe8->fs;
+ size = sizeof(struct dn_pipe8);
+ }
+
+ /* These 4 field are the same in pipe7 and pipe8 */
+ pipe7->next.sle_next = (struct dn_pipe7 *)DN_IS_PIPE;
+ pipe7->bandwidth = l->bandwidth;
+ pipe7->delay = l->delay * 1000 / hz;
+ pipe7->pipe_nr = l->link_nr - DN_MAX_ID;
+
+ if (!is7) {
+ if (s->profile) {
+ struct dn_profile *pf = s->profile;
+ strncpy(pipe8->name, pf->name, sizeof(pf->name));
+ pipe8->loss_level = pf->loss_level;
+ pipe8->samples_no = pf->samples_no;
+ }
+ pipe8->burst = div64(l->burst , 8 * hz);
+ }
+
+ fs->flow_mask = s->sch.sched_mask;
+ fs->rq_size = s->sch.buckets ? s->sch.buckets : 1;
+
+ fs->parent_nr = l->link_nr - DN_MAX_ID;
+ fs->qsize = f->fs.qsize;
+ fs->plr = f->fs.plr;
+ fs->w_q = f->fs.w_q;
+ fs->max_th = f->max_th;
+ fs->min_th = f->min_th;
+ fs->max_p = f->fs.max_p;
+ fs->rq_elements = nq;
+
+ fs->flags_fs = convertflags2old(f->fs.flags);
+
+ *a->start += size;
+ return 0;
+}
+
+
+int
+dn_compat_copy_pipe(struct copy_args *a, void *_o)
+{
+ int have = a->end - *a->start;
+ int need = 0;
+ int pipe_size = sizeof(struct dn_pipe8);
+ int queue_size = sizeof(struct dn_flow_queue8);
+ int n_queue = 0; /* number of queues */
+
+ struct dn_schk *s = (struct dn_schk *)_o;
+ /* calculate needed space:
+ * - struct dn_pipe
+ * - if there are instances, dn_queue * n_instances
+ */
+ n_queue = (s->sch.flags & DN_HAVE_MASK ? dn_ht_entries(s->siht) :
+ (s->siht ? 1 : 0));
+ need = pipe_size + queue_size * n_queue;
+ if (have < need) {
+ D("have %d < need %d", have, need);
+ return 1;
+ }
+ /* copy pipe */
+ dn_c_copy_pipe(s, a, n_queue);
+
+ /* copy queues */
+ if (s->sch.flags & DN_HAVE_MASK)
+ dn_ht_scan(s->siht, dn_c_copy_q, a);
+ else if (s->siht)
+ dn_c_copy_q(s->siht, a);
+ return 0;
+}
+
+int
+dn_c_copy_fs(struct dn_fsk *f, struct copy_args *a, int nq)
+{
+ struct dn_flow_set *fs = (struct dn_flow_set *)*a->start;
+
+ fs->next.sle_next = (struct dn_flow_set *)DN_IS_QUEUE;
+ fs->fs_nr = f->fs.fs_nr;
+ fs->qsize = f->fs.qsize;
+ fs->plr = f->fs.plr;
+ fs->w_q = f->fs.w_q;
+ fs->max_th = f->max_th;
+ fs->min_th = f->min_th;
+ fs->max_p = f->fs.max_p;
+ fs->flow_mask = f->fs.flow_mask;
+ fs->rq_elements = nq;
+ fs->rq_size = (f->fs.buckets ? f->fs.buckets : 1);
+ fs->parent_nr = f->fs.sched_nr;
+ fs->weight = f->fs.par[0];
+
+ fs->flags_fs = convertflags2old(f->fs.flags);
+ *a->start += sizeof(struct dn_flow_set);
+ return 0;
+}
+
+int
+dn_compat_copy_queue(struct copy_args *a, void *_o)
+{
+ int have = a->end - *a->start;
+ int need = 0;
+ int fs_size = sizeof(struct dn_flow_set);
+ int queue_size = sizeof(struct dn_flow_queue8);
+
+ struct dn_fsk *fs = (struct dn_fsk *)_o;
+ int n_queue = 0; /* number of queues */
+
+ n_queue = (fs->fs.flags & DN_HAVE_MASK ? dn_ht_entries(fs->qht) :
+ (fs->qht ? 1 : 0));
+
+ need = fs_size + queue_size * n_queue;
+ if (have < need) {
+ D("have < need");
+ return 1;
+ }
+
+ /* copy flowset */
+ dn_c_copy_fs(fs, a, n_queue);
+
+ /* copy queues */
+ if (fs->fs.flags & DN_HAVE_MASK)
+ dn_ht_scan(fs->qht, dn_c_copy_q, a);
+ else if (fs->qht)
+ dn_c_copy_q(fs->qht, a);
+
+ return 0;
+}
+
+int
+copy_data_helper_compat(void *_o, void *_arg)
+{
+ struct copy_args *a = _arg;
+
+ if (a->type == DN_COMPAT_PIPE) {
+ struct dn_schk *s = _o;
+ if (s->sch.oid.subtype != 1 || s->sch.sched_nr <= DN_MAX_ID) {
+ return 0; /* not old type */
+ }
+ /* copy pipe parameters, and if instance exists, copy
+ * other parameters and eventually queues.
+ */
+ if(dn_compat_copy_pipe(a, _o))
+ return DNHT_SCAN_END;
+ } else if (a->type == DN_COMPAT_QUEUE) {
+ struct dn_fsk *fs = _o;
+ if (fs->fs.fs_nr >= DN_MAX_ID)
+ return 0;
+ if (dn_compat_copy_queue(a, _o))
+ return DNHT_SCAN_END;
+ }
+ return 0;
+}
+
+/* Main function to manage old requests */
+int
+ip_dummynet_compat(struct sockopt *sopt)
+{
+ int error=0;
+ void *v = NULL;
+ struct dn_id oid;
+
+ /* Length of data, used to find the ipfw version... */
+ int len = sopt->sopt_valsize;
+
+ /* len can be 0 if command was dummynet_flush */
+ if (len == pipesize7) {
+ D("setting compatibility with FreeBSD 7.2");
+ is7 = 1;
+ }
+ else if (len == pipesize8 || len == pipesizemax8) {
+ D("setting compatibility with FreeBSD 8");
+ is7 = 0;
+ }
+
+ switch (sopt->sopt_name) {
+ default:
+ printf("dummynet: -- unknown option %d", sopt->sopt_name);
+ error = EINVAL;
+ break;
+
+ case IP_DUMMYNET_FLUSH:
+ oid_fill(&oid, sizeof(oid), DN_CMD_FLUSH, DN_API_VERSION);
+ do_config(&oid, oid.len);
+ break;
+
+ case IP_DUMMYNET_DEL:
+ v = malloc(len, M_TEMP, M_WAITOK);
+ error = sooptcopyin(sopt, v, len, len);
+ if (error)
+ break;
+ error = dn_compat_del(v);
+ free(v, M_TEMP);
+ break;
+
+ case IP_DUMMYNET_CONFIGURE:
+ v = malloc(len, M_TEMP, M_WAITOK);
+ error = sooptcopyin(sopt, v, len, len);
+ if (error)
+ break;
+ error = dn_compat_configure(v);
+ free(v, M_TEMP);
+ break;
+
+ case IP_DUMMYNET_GET: {
+ void *buf;
+ int ret;
+ int original_size = sopt->sopt_valsize;
+ int size;
+
+ ret = dummynet_get(sopt, &buf);
+ if (ret)
+ return 0;//XXX ?
+ size = sopt->sopt_valsize;
+ sopt->sopt_valsize = original_size;
+ D("size=%d, buf=%p", size, buf);
+ ret = sooptcopyout(sopt, buf, size);
+ if (ret)
+ printf(" %s ERROR sooptcopyout\n", __FUNCTION__);
+ if (buf)
+ free(buf, M_DUMMYNET);
+ }
+ }
+
+ return error;
+}
+
+
diff --git a/sys/netpfil/ipfw/ip_dn_io.c b/sys/netpfil/ipfw/ip_dn_io.c
new file mode 100644
index 0000000..2047b74
--- /dev/null
+++ b/sys/netpfil/ipfw/ip_dn_io.c
@@ -0,0 +1,857 @@
+/*-
+ * Copyright (c) 2010 Luigi Rizzo, Riccardo Panicucci, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Dummynet portions related to packet handling.
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <sys/sysctl.h>
+
+#include <net/if.h> /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */
+#include <net/netisr.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/ip.h> /* ip_len, ip_off */
+#include <netinet/ip_var.h> /* ip_output(), IP_FORWARDING */
+#include <netinet/ip_fw.h>
+#include <netinet/ip_dummynet.h>
+#include <netinet/if_ether.h> /* various ether_* routines */
+#include <netinet/ip6.h> /* for ip6_input, ip6_output prototypes */
+#include <netinet6/ip6_var.h>
+
+#include <netpfil/ipfw/ip_fw_private.h>
+#include <netpfil/ipfw/dn_heap.h>
+#include <netpfil/ipfw/ip_dn_private.h>
+#include <netpfil/ipfw/dn_sched.h>
+
+/*
+ * We keep a private variable for the simulation time, but we could
+ * probably use an existing one ("softticks" in sys/kern/kern_timeout.c)
+ * instead of dn_cfg.curr_time
+ */
+
+struct dn_parms dn_cfg;
+//VNET_DEFINE(struct dn_parms, _base_dn_cfg);
+
+static long tick_last; /* Last tick duration (usec). */
+static long tick_delta; /* Last vs standard tick diff (usec). */
+static long tick_delta_sum; /* Accumulated tick difference (usec).*/
+static long tick_adjustment; /* Tick adjustments done. */
+static long tick_lost; /* Lost(coalesced) ticks number. */
+/* Adjusted vs non-adjusted curr_time difference (ticks). */
+static long tick_diff;
+
+static unsigned long io_pkt;
+static unsigned long io_pkt_fast;
+static unsigned long io_pkt_drop;
+
+/*
+ * We use a heap to store entities for which we have pending timer events.
+ * The heap is checked at every tick and all entities with expired events
+ * are extracted.
+ */
+
+MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap");
+
+extern void (*bridge_dn_p)(struct mbuf *, struct ifnet *);
+
+#ifdef SYSCTL_NODE
+
+/*
+ * Because of the way the SYSBEGIN/SYSEND macros work on other
+ * platforms, there should not be functions between them.
+ * So keep the handlers outside the block.
+ */
+static int
+sysctl_hash_size(SYSCTL_HANDLER_ARGS)
+{
+ int error, value;
+
+ value = dn_cfg.hash_size;
+ error = sysctl_handle_int(oidp, &value, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ if (value < 16 || value > 65536)
+ return (EINVAL);
+ dn_cfg.hash_size = value;
+ return (0);
+}
+
+static int
+sysctl_limits(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ long value;
+
+ if (arg2 != 0)
+ value = dn_cfg.slot_limit;
+ else
+ value = dn_cfg.byte_limit;
+ error = sysctl_handle_long(oidp, &value, 0, req);
+
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ if (arg2 != 0) {
+ if (value < 1)
+ return (EINVAL);
+ dn_cfg.slot_limit = value;
+ } else {
+ if (value < 1500)
+ return (EINVAL);
+ dn_cfg.byte_limit = value;
+ }
+ return (0);
+}
+
+SYSBEGIN(f4)
+
+SYSCTL_DECL(_net_inet);
+SYSCTL_DECL(_net_inet_ip);
+static SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet");
+
+/* wrapper to pass dn_cfg fields to SYSCTL_* */
+//#define DC(x) (&(VNET_NAME(_base_dn_cfg).x))
+#define DC(x) (&(dn_cfg.x))
+/* parameters */
+
+
+SYSCTL_PROC(_net_inet_ip_dummynet, OID_AUTO, hash_size,
+ CTLTYPE_INT | CTLFLAG_RW, 0, 0, sysctl_hash_size,
+ "I", "Default hash table size");
+
+
+SYSCTL_PROC(_net_inet_ip_dummynet, OID_AUTO, pipe_slot_limit,
+ CTLTYPE_LONG | CTLFLAG_RW, 0, 1, sysctl_limits,
+ "L", "Upper limit in slots for pipe queue.");
+SYSCTL_PROC(_net_inet_ip_dummynet, OID_AUTO, pipe_byte_limit,
+ CTLTYPE_LONG | CTLFLAG_RW, 0, 0, sysctl_limits,
+ "L", "Upper limit in bytes for pipe queue.");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, io_fast,
+ CTLFLAG_RW, DC(io_fast), 0, "Enable fast dummynet io.");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug,
+ CTLFLAG_RW, DC(debug), 0, "Dummynet debug level");
+
+/* RED parameters */
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth,
+ CTLFLAG_RD, DC(red_lookup_depth), 0, "Depth of RED lookup table");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size,
+ CTLFLAG_RD, DC(red_avg_pkt_size), 0, "RED Medium packet size");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size,
+ CTLFLAG_RD, DC(red_max_pkt_size), 0, "RED Max packet size");
+
+/* time adjustment */
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta,
+ CTLFLAG_RD, &tick_delta, 0, "Last vs standard tick difference (usec).");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta_sum,
+ CTLFLAG_RD, &tick_delta_sum, 0, "Accumulated tick difference (usec).");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_adjustment,
+ CTLFLAG_RD, &tick_adjustment, 0, "Tick adjustments done.");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_diff,
+ CTLFLAG_RD, &tick_diff, 0,
+ "Adjusted vs non-adjusted curr_time difference (ticks).");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_lost,
+ CTLFLAG_RD, &tick_lost, 0,
+ "Number of ticks coalesced by dummynet taskqueue.");
+
+/* Drain parameters */
+SYSCTL_UINT(_net_inet_ip_dummynet, OID_AUTO, expire,
+ CTLFLAG_RW, DC(expire), 0, "Expire empty queues/pipes");
+SYSCTL_UINT(_net_inet_ip_dummynet, OID_AUTO, expire_cycle,
+ CTLFLAG_RD, DC(expire_cycle), 0, "Expire cycle for queues/pipes");
+
+/* statistics */
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, schk_count,
+ CTLFLAG_RD, DC(schk_count), 0, "Number of schedulers");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, si_count,
+ CTLFLAG_RD, DC(si_count), 0, "Number of scheduler instances");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, fsk_count,
+ CTLFLAG_RD, DC(fsk_count), 0, "Number of flowsets");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, queue_count,
+ CTLFLAG_RD, DC(queue_count), 0, "Number of queues");
+SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt,
+ CTLFLAG_RD, &io_pkt, 0,
+ "Number of packets passed to dummynet.");
+SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_fast,
+ CTLFLAG_RD, &io_pkt_fast, 0,
+ "Number of packets bypassed dummynet scheduler.");
+SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_drop,
+ CTLFLAG_RD, &io_pkt_drop, 0,
+ "Number of packets dropped by dummynet.");
+#undef DC
+SYSEND
+
+#endif
+
+static void dummynet_send(struct mbuf *);
+
+/*
+ * Packets processed by dummynet have an mbuf tag associated with
+ * them that carries their dummynet state.
+ * Outside dummynet, only the 'rule' field is relevant, and it must
+ * be at the beginning of the structure.
+ */
+struct dn_pkt_tag {
+ struct ipfw_rule_ref rule; /* matching rule */
+
+ /* second part, dummynet specific */
+ int dn_dir; /* action when packet comes out.*/
+ /* see ip_fw_private.h */
+ uint64_t output_time; /* when the pkt is due for delivery*/
+ struct ifnet *ifp; /* interface, for ip_output */
+ struct _ip6dn_args ip6opt; /* XXX ipv6 options */
+};
+
+/*
+ * Return the mbuf tag holding the dummynet state (it should
+ * be the first one on the list).
+ */
+static struct dn_pkt_tag *
+dn_tag_get(struct mbuf *m)
+{
+ struct m_tag *mtag = m_tag_first(m);
+ KASSERT(mtag != NULL &&
+ mtag->m_tag_cookie == MTAG_ABI_COMPAT &&
+ mtag->m_tag_id == PACKET_TAG_DUMMYNET,
+ ("packet on dummynet queue w/o dummynet tag!"));
+ return (struct dn_pkt_tag *)(mtag+1);
+}
+
+static inline void
+mq_append(struct mq *q, struct mbuf *m)
+{
+ if (q->head == NULL)
+ q->head = m;
+ else
+ q->tail->m_nextpkt = m;
+ q->tail = m;
+ m->m_nextpkt = NULL;
+}
+
+/*
+ * Dispose of a list of packets. Use a function so that, if we need to do
+ * more work, this is a central point to do it.
+ */
+void dn_free_pkts(struct mbuf *mnext)
+{
+ struct mbuf *m;
+
+ while ((m = mnext) != NULL) {
+ mnext = m->m_nextpkt;
+ FREE_PKT(m);
+ }
+}
+
+static int
+red_drops (struct dn_queue *q, int len)
+{
+ /*
+ * RED algorithm
+ *
+ * RED calculates the average queue size (avg) using a low-pass filter
+ * with an exponential weighted (w_q) moving average:
+ * avg <- (1-w_q) * avg + w_q * q_size
+ * where q_size is the queue length (measured in bytes or * packets).
+ *
+ * If q_size == 0, we compute the idle time for the link, and set
+ * avg = (1 - w_q)^(idle/s)
+ * where s is the time needed for transmitting a medium-sized packet.
+ *
+ * Now, if avg < min_th the packet is enqueued.
+ * If avg > max_th the packet is dropped. Otherwise, the packet is
+ * dropped with probability P function of avg.
+ */
+
+ struct dn_fsk *fs = q->fs;
+ int64_t p_b = 0;
+
+ /* Queue in bytes or packets? */
+ uint32_t q_size = (fs->fs.flags & DN_QSIZE_BYTES) ?
+ q->ni.len_bytes : q->ni.length;
+
+ /* Average queue size estimation. */
+ if (q_size != 0) {
+ /* Queue is not empty, avg <- avg + (q_size - avg) * w_q */
+ int diff = SCALE(q_size) - q->avg;
+ int64_t v = SCALE_MUL((int64_t)diff, (int64_t)fs->w_q);
+
+ q->avg += (int)v;
+ } else {
+ /*
+ * Queue is empty, find for how long the queue has been
+ * empty and use a lookup table for computing
+ * (1 - * w_q)^(idle_time/s) where s is the time to send a
+ * (small) packet.
+ * XXX check wraps...
+ */
+ if (q->avg) {
+ u_int t = div64((dn_cfg.curr_time - q->q_time), fs->lookup_step);
+
+ q->avg = (t < fs->lookup_depth) ?
+ SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0;
+ }
+ }
+
+ /* Should i drop? */
+ if (q->avg < fs->min_th) {
+ q->count = -1;
+ return (0); /* accept packet */
+ }
+ if (q->avg >= fs->max_th) { /* average queue >= max threshold */
+ if (fs->fs.flags & DN_IS_GENTLE_RED) {
+ /*
+ * According to Gentle-RED, if avg is greater than
+ * max_th the packet is dropped with a probability
+ * p_b = c_3 * avg - c_4
+ * where c_3 = (1 - max_p) / max_th
+ * c_4 = 1 - 2 * max_p
+ */
+ p_b = SCALE_MUL((int64_t)fs->c_3, (int64_t)q->avg) -
+ fs->c_4;
+ } else {
+ q->count = -1;
+ return (1);
+ }
+ } else if (q->avg > fs->min_th) {
+ /*
+ * We compute p_b using the linear dropping function
+ * p_b = c_1 * avg - c_2
+ * where c_1 = max_p / (max_th - min_th)
+ * c_2 = max_p * min_th / (max_th - min_th)
+ */
+ p_b = SCALE_MUL((int64_t)fs->c_1, (int64_t)q->avg) - fs->c_2;
+ }
+
+ if (fs->fs.flags & DN_QSIZE_BYTES)
+ p_b = div64((p_b * len) , fs->max_pkt_size);
+ if (++q->count == 0)
+ q->random = random() & 0xffff;
+ else {
+ /*
+ * q->count counts packets arrived since last drop, so a greater
+ * value of q->count means a greater packet drop probability.
+ */
+ if (SCALE_MUL(p_b, SCALE((int64_t)q->count)) > q->random) {
+ q->count = 0;
+ /* After a drop we calculate a new random value. */
+ q->random = random() & 0xffff;
+ return (1); /* drop */
+ }
+ }
+ /* End of RED algorithm. */
+
+ return (0); /* accept */
+
+}
+
+/*
+ * Enqueue a packet in q, subject to space and queue management policy
+ * (whose parameters are in q->fs).
+ * Update stats for the queue and the scheduler.
+ * Return 0 on success, 1 on drop. The packet is consumed anyways.
+ */
+int
+dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop)
+{
+ struct dn_fs *f;
+ struct dn_flow *ni; /* stats for scheduler instance */
+ uint64_t len;
+
+ if (q->fs == NULL || q->_si == NULL) {
+ printf("%s fs %p si %p, dropping\n",
+ __FUNCTION__, q->fs, q->_si);
+ FREE_PKT(m);
+ return 1;
+ }
+ f = &(q->fs->fs);
+ ni = &q->_si->ni;
+ len = m->m_pkthdr.len;
+ /* Update statistics, then check reasons to drop pkt. */
+ q->ni.tot_bytes += len;
+ q->ni.tot_pkts++;
+ ni->tot_bytes += len;
+ ni->tot_pkts++;
+ if (drop)
+ goto drop;
+ if (f->plr && random() < f->plr)
+ goto drop;
+ if (f->flags & DN_IS_RED && red_drops(q, m->m_pkthdr.len))
+ goto drop;
+ if (f->flags & DN_QSIZE_BYTES) {
+ if (q->ni.len_bytes > f->qsize)
+ goto drop;
+ } else if (q->ni.length >= f->qsize) {
+ goto drop;
+ }
+ mq_append(&q->mq, m);
+ q->ni.length++;
+ q->ni.len_bytes += len;
+ ni->length++;
+ ni->len_bytes += len;
+ return 0;
+
+drop:
+ io_pkt_drop++;
+ q->ni.drops++;
+ ni->drops++;
+ FREE_PKT(m);
+ return 1;
+}
+
+/*
+ * Fetch packets from the delay line which are due now. If there are
+ * leftover packets, reinsert the delay line in the heap.
+ * Runs under scheduler lock.
+ */
+static void
+transmit_event(struct mq *q, struct delay_line *dline, uint64_t now)
+{
+ struct mbuf *m;
+ struct dn_pkt_tag *pkt = NULL;
+
+ dline->oid.subtype = 0; /* not in heap */
+ while ((m = dline->mq.head) != NULL) {
+ pkt = dn_tag_get(m);
+ if (!DN_KEY_LEQ(pkt->output_time, now))
+ break;
+ dline->mq.head = m->m_nextpkt;
+ mq_append(q, m);
+ }
+ if (m != NULL) {
+ dline->oid.subtype = 1; /* in heap */
+ heap_insert(&dn_cfg.evheap, pkt->output_time, dline);
+ }
+}
+
+/*
+ * Convert the additional MAC overheads/delays into an equivalent
+ * number of bits for the given data rate. The samples are
+ * in milliseconds so we need to divide by 1000.
+ */
+static uint64_t
+extra_bits(struct mbuf *m, struct dn_schk *s)
+{
+ int index;
+ uint64_t bits;
+ struct dn_profile *pf = s->profile;
+
+ if (!pf || pf->samples_no == 0)
+ return 0;
+ index = random() % pf->samples_no;
+ bits = div64((uint64_t)pf->samples[index] * s->link.bandwidth, 1000);
+ if (index >= pf->loss_level) {
+ struct dn_pkt_tag *dt = dn_tag_get(m);
+ if (dt)
+ dt->dn_dir = DIR_DROP;
+ }
+ return bits;
+}
+
+/*
+ * Send traffic from a scheduler instance due by 'now'.
+ * Return a pointer to the head of the queue.
+ */
+static struct mbuf *
+serve_sched(struct mq *q, struct dn_sch_inst *si, uint64_t now)
+{
+ struct mq def_q;
+ struct dn_schk *s = si->sched;
+ struct mbuf *m = NULL;
+ int delay_line_idle = (si->dline.mq.head == NULL);
+ int done, bw;
+
+ if (q == NULL) {
+ q = &def_q;
+ q->head = NULL;
+ }
+
+ bw = s->link.bandwidth;
+ si->kflags &= ~DN_ACTIVE;
+
+ if (bw > 0)
+ si->credit += (now - si->sched_time) * bw;
+ else
+ si->credit = 0;
+ si->sched_time = now;
+ done = 0;
+ while (si->credit >= 0 && (m = s->fp->dequeue(si)) != NULL) {
+ uint64_t len_scaled;
+
+ done++;
+ len_scaled = (bw == 0) ? 0 : hz *
+ (m->m_pkthdr.len * 8 + extra_bits(m, s));
+ si->credit -= len_scaled;
+ /* Move packet in the delay line */
+ dn_tag_get(m)->output_time = dn_cfg.curr_time + s->link.delay ;
+ mq_append(&si->dline.mq, m);
+ }
+
+ /*
+ * If credit >= 0 the instance is idle, mark time.
+ * Otherwise put back in the heap, and adjust the output
+ * time of the last inserted packet, m, which was too early.
+ */
+ if (si->credit >= 0) {
+ si->idle_time = now;
+ } else {
+ uint64_t t;
+ KASSERT (bw > 0, ("bw=0 and credit<0 ?"));
+ t = div64(bw - 1 - si->credit, bw);
+ if (m)
+ dn_tag_get(m)->output_time += t;
+ si->kflags |= DN_ACTIVE;
+ heap_insert(&dn_cfg.evheap, now + t, si);
+ }
+ if (delay_line_idle && done)
+ transmit_event(q, &si->dline, now);
+ return q->head;
+}
+
+/*
+ * The timer handler for dummynet. Time is computed in ticks, but
+ * the code is tolerant to the actual rate at which this is called.
+ * Once complete, the function reschedules itself for the next tick.
+ */
+void
+dummynet_task(void *context, int pending)
+{
+ struct timeval t;
+ struct mq q = { NULL, NULL }; /* queue to accumulate results */
+
+ CURVNET_SET((struct vnet *)context);
+
+ DN_BH_WLOCK();
+
+ /* Update number of lost(coalesced) ticks. */
+ tick_lost += pending - 1;
+
+ getmicrouptime(&t);
+ /* Last tick duration (usec). */
+ tick_last = (t.tv_sec - dn_cfg.prev_t.tv_sec) * 1000000 +
+ (t.tv_usec - dn_cfg.prev_t.tv_usec);
+ /* Last tick vs standard tick difference (usec). */
+ tick_delta = (tick_last * hz - 1000000) / hz;
+ /* Accumulated tick difference (usec). */
+ tick_delta_sum += tick_delta;
+
+ dn_cfg.prev_t = t;
+
+ /*
+ * Adjust curr_time if the accumulated tick difference is
+ * greater than the 'standard' tick. Since curr_time should
+ * be monotonically increasing, we do positive adjustments
+ * as required, and throttle curr_time in case of negative
+ * adjustment.
+ */
+ dn_cfg.curr_time++;
+ if (tick_delta_sum - tick >= 0) {
+ int diff = tick_delta_sum / tick;
+
+ dn_cfg.curr_time += diff;
+ tick_diff += diff;
+ tick_delta_sum %= tick;
+ tick_adjustment++;
+ } else if (tick_delta_sum + tick <= 0) {
+ dn_cfg.curr_time--;
+ tick_diff--;
+ tick_delta_sum += tick;
+ tick_adjustment++;
+ }
+
+ /* serve pending events, accumulate in q */
+ for (;;) {
+ struct dn_id *p; /* generic parameter to handler */
+
+ if (dn_cfg.evheap.elements == 0 ||
+ DN_KEY_LT(dn_cfg.curr_time, HEAP_TOP(&dn_cfg.evheap)->key))
+ break;
+ p = HEAP_TOP(&dn_cfg.evheap)->object;
+ heap_extract(&dn_cfg.evheap, NULL);
+
+ if (p->type == DN_SCH_I) {
+ serve_sched(&q, (struct dn_sch_inst *)p, dn_cfg.curr_time);
+ } else { /* extracted a delay line */
+ transmit_event(&q, (struct delay_line *)p, dn_cfg.curr_time);
+ }
+ }
+ if (dn_cfg.expire && ++dn_cfg.expire_cycle >= dn_cfg.expire) {
+ dn_cfg.expire_cycle = 0;
+ dn_drain_scheduler();
+ dn_drain_queue();
+ }
+
+ DN_BH_WUNLOCK();
+ dn_reschedule();
+ if (q.head != NULL)
+ dummynet_send(q.head);
+ CURVNET_RESTORE();
+}
+
+/*
+ * forward a chain of packets to the proper destination.
+ * This runs outside the dummynet lock.
+ */
+static void
+dummynet_send(struct mbuf *m)
+{
+ struct mbuf *n;
+
+ for (; m != NULL; m = n) {
+ struct ifnet *ifp = NULL; /* gcc 3.4.6 complains */
+ struct m_tag *tag;
+ int dst;
+
+ n = m->m_nextpkt;
+ m->m_nextpkt = NULL;
+ tag = m_tag_first(m);
+ if (tag == NULL) { /* should not happen */
+ dst = DIR_DROP;
+ } else {
+ struct dn_pkt_tag *pkt = dn_tag_get(m);
+ /* extract the dummynet info, rename the tag
+ * to carry reinject info.
+ */
+ dst = pkt->dn_dir;
+ ifp = pkt->ifp;
+ tag->m_tag_cookie = MTAG_IPFW_RULE;
+ tag->m_tag_id = 0;
+ }
+
+ switch (dst) {
+ case DIR_OUT:
+ SET_HOST_IPLEN(mtod(m, struct ip *));
+ ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL);
+ break ;
+
+ case DIR_IN :
+ /* put header in network format for ip_input() */
+ //SET_NET_IPLEN(mtod(m, struct ip *));
+ netisr_dispatch(NETISR_IP, m);
+ break;
+
+#ifdef INET6
+ case DIR_IN | PROTO_IPV6:
+ netisr_dispatch(NETISR_IPV6, m);
+ break;
+
+ case DIR_OUT | PROTO_IPV6:
+ ip6_output(m, NULL, NULL, IPV6_FORWARDING, NULL, NULL, NULL);
+ break;
+#endif
+
+ case DIR_FWD | PROTO_IFB: /* DN_TO_IFB_FWD: */
+ if (bridge_dn_p != NULL)
+ ((*bridge_dn_p)(m, ifp));
+ else
+ printf("dummynet: if_bridge not loaded\n");
+
+ break;
+
+ case DIR_IN | PROTO_LAYER2: /* DN_TO_ETH_DEMUX: */
+ /*
+ * The Ethernet code assumes the Ethernet header is
+ * contiguous in the first mbuf header.
+ * Insure this is true.
+ */
+ if (m->m_len < ETHER_HDR_LEN &&
+ (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) {
+ printf("dummynet/ether: pullup failed, "
+ "dropping packet\n");
+ break;
+ }
+ ether_demux(m->m_pkthdr.rcvif, m);
+ break;
+
+ case DIR_OUT | PROTO_LAYER2: /* N_TO_ETH_OUT: */
+ ether_output_frame(ifp, m);
+ break;
+
+ case DIR_DROP:
+ /* drop the packet after some time */
+ FREE_PKT(m);
+ break;
+
+ default:
+ printf("dummynet: bad switch %d!\n", dst);
+ FREE_PKT(m);
+ break;
+ }
+ }
+}
+
+static inline int
+tag_mbuf(struct mbuf *m, int dir, struct ip_fw_args *fwa)
+{
+ struct dn_pkt_tag *dt;
+ struct m_tag *mtag;
+
+ mtag = m_tag_get(PACKET_TAG_DUMMYNET,
+ sizeof(*dt), M_NOWAIT | M_ZERO);
+ if (mtag == NULL)
+ return 1; /* Cannot allocate packet header. */
+ m_tag_prepend(m, mtag); /* Attach to mbuf chain. */
+ dt = (struct dn_pkt_tag *)(mtag + 1);
+ dt->rule = fwa->rule;
+ dt->rule.info &= IPFW_ONEPASS; /* only keep this info */
+ dt->dn_dir = dir;
+ dt->ifp = fwa->oif;
+ /* dt->output_time is updated as we move through */
+ dt->output_time = dn_cfg.curr_time;
+ return 0;
+}
+
+
+/*
+ * dummynet hook for packets.
+ * We use the argument to locate the flowset fs and the sched_set sch
+ * associated to it. Then we apply flow_mask and sched_mask to
+ * determine the queue and scheduler instances.
+ *
+ * dir where shall we send the packet after dummynet.
+ * *m0 the mbuf with the packet
+ * ifp the 'ifp' parameter from the caller.
+ * NULL in ip_input, destination interface in ip_output,
+ */
+int
+dummynet_io(struct mbuf **m0, int dir, struct ip_fw_args *fwa)
+{
+ struct mbuf *m = *m0;
+ struct dn_fsk *fs = NULL;
+ struct dn_sch_inst *si;
+ struct dn_queue *q = NULL; /* default */
+
+ int fs_id = (fwa->rule.info & IPFW_INFO_MASK) +
+ ((fwa->rule.info & IPFW_IS_PIPE) ? 2*DN_MAX_ID : 0);
+ DN_BH_WLOCK();
+ io_pkt++;
+ /* we could actually tag outside the lock, but who cares... */
+ if (tag_mbuf(m, dir, fwa))
+ goto dropit;
+ if (dn_cfg.busy) {
+ /* if the upper half is busy doing something expensive,
+ * lets queue the packet and move forward
+ */
+ mq_append(&dn_cfg.pending, m);
+ m = *m0 = NULL; /* consumed */
+ goto done; /* already active, nothing to do */
+ }
+ /* XXX locate_flowset could be optimised with a direct ref. */
+ fs = dn_ht_find(dn_cfg.fshash, fs_id, 0, NULL);
+ if (fs == NULL)
+ goto dropit; /* This queue/pipe does not exist! */
+ if (fs->sched == NULL) /* should not happen */
+ goto dropit;
+ /* find scheduler instance, possibly applying sched_mask */
+ si = ipdn_si_find(fs->sched, &(fwa->f_id));
+ if (si == NULL)
+ goto dropit;
+ /*
+ * If the scheduler supports multiple queues, find the right one
+ * (otherwise it will be ignored by enqueue).
+ */
+ if (fs->sched->fp->flags & DN_MULTIQUEUE) {
+ q = ipdn_q_find(fs, si, &(fwa->f_id));
+ if (q == NULL)
+ goto dropit;
+ }
+ if (fs->sched->fp->enqueue(si, q, m)) {
+ /* packet was dropped by enqueue() */
+ m = *m0 = NULL;
+ goto dropit;
+ }
+
+ if (si->kflags & DN_ACTIVE) {
+ m = *m0 = NULL; /* consumed */
+ goto done; /* already active, nothing to do */
+ }
+
+ /* compute the initial allowance */
+ if (si->idle_time < dn_cfg.curr_time) {
+ /* Do this only on the first packet on an idle pipe */
+ struct dn_link *p = &fs->sched->link;
+
+ si->sched_time = dn_cfg.curr_time;
+ si->credit = dn_cfg.io_fast ? p->bandwidth : 0;
+ if (p->burst) {
+ uint64_t burst = (dn_cfg.curr_time - si->idle_time) * p->bandwidth;
+ if (burst > p->burst)
+ burst = p->burst;
+ si->credit += burst;
+ }
+ }
+ /* pass through scheduler and delay line */
+ m = serve_sched(NULL, si, dn_cfg.curr_time);
+
+ /* optimization -- pass it back to ipfw for immediate send */
+	/* XXX Don't call dummynet_send() if the scheduler returns the
+	 * packet just enqueued. This avoids a lock order reversal.
+	 */
+ if (/*dn_cfg.io_fast &&*/ m == *m0 && (dir & PROTO_LAYER2) == 0 ) {
+		/* fast io, rename the tag to carry reinject info. */
+ struct m_tag *tag = m_tag_first(m);
+
+ tag->m_tag_cookie = MTAG_IPFW_RULE;
+ tag->m_tag_id = 0;
+ io_pkt_fast++;
+ if (m->m_nextpkt != NULL) {
+ printf("dummynet: fast io: pkt chain detected!\n");
+ m->m_nextpkt = NULL;
+ }
+ m = NULL;
+ } else {
+ *m0 = NULL;
+ }
+done:
+ DN_BH_WUNLOCK();
+ if (m)
+ dummynet_send(m);
+ return 0;
+
+dropit:
+ io_pkt_drop++;
+ DN_BH_WUNLOCK();
+ if (m)
+ FREE_PKT(m);
+ *m0 = NULL;
+ return (fs && (fs->fs.flags & DN_NOERROR)) ? 0 : ENOBUFS;
+}
diff --git a/sys/netpfil/ipfw/ip_dn_private.h b/sys/netpfil/ipfw/ip_dn_private.h
new file mode 100644
index 0000000..159ddc9
--- /dev/null
+++ b/sys/netpfil/ipfw/ip_dn_private.h
@@ -0,0 +1,403 @@
+/*-
+ * Copyright (c) 2010 Luigi Rizzo, Riccardo Panicucci, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * internal dummynet APIs.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IP_DN_PRIVATE_H
+#define _IP_DN_PRIVATE_H
+
+/* debugging support
+ * use ND() to remove debugging, D() to print a line,
+ * DX(level, ...) to print above a certain level
+ * If you redefine D() you are expected to redefine all.
+ */
+#ifndef D
+#define ND(fmt, ...) do {} while (0)
+#define D1(fmt, ...) do {} while (0)
+#define D(fmt, ...) printf("%-10s " fmt "\n", \
+ __FUNCTION__, ## __VA_ARGS__)
+#define DX(lev, fmt, ...) do { \
+ if (dn_cfg.debug > lev) D(fmt, ## __VA_ARGS__); } while (0)
+#endif
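+
+/*
+ * Illustrative usage only (not part of the original sources; fs_nr, nq
+ * and len are placeholder variables), assuming dn_cfg.debug is set to,
+ * e.g., 3 via the sysctl:
+ *
+ *	D("flowset %d has %d queues", fs_nr, nq);   // always printed
+ *	DX(2, "slow path for fs %d", fs_nr);        // printed if debug > 2
+ *	ND("per-packet detail, len %d", len);       // compiled out
+ */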
+
+MALLOC_DECLARE(M_DUMMYNET);
+
+#ifndef __linux__
+#define div64(a, b) ((int64_t)(a) / (int64_t)(b))
+#endif
+
+#define DN_LOCK_INIT() do { \
+ mtx_init(&dn_cfg.uh_mtx, "dn_uh", NULL, MTX_DEF); \
+ mtx_init(&dn_cfg.bh_mtx, "dn_bh", NULL, MTX_DEF); \
+ } while (0)
+#define DN_LOCK_DESTROY() do { \
+ mtx_destroy(&dn_cfg.uh_mtx); \
+ mtx_destroy(&dn_cfg.bh_mtx); \
+ } while (0)
+#if 0 /* not used yet */
+#define DN_UH_RLOCK() mtx_lock(&dn_cfg.uh_mtx)
+#define DN_UH_RUNLOCK() mtx_unlock(&dn_cfg.uh_mtx)
+#define DN_UH_WLOCK() mtx_lock(&dn_cfg.uh_mtx)
+#define DN_UH_WUNLOCK() mtx_unlock(&dn_cfg.uh_mtx)
+#define DN_UH_LOCK_ASSERT() mtx_assert(&dn_cfg.uh_mtx, MA_OWNED)
+#endif
+
+#define DN_BH_RLOCK() mtx_lock(&dn_cfg.uh_mtx)
+#define DN_BH_RUNLOCK() mtx_unlock(&dn_cfg.uh_mtx)
+#define DN_BH_WLOCK() mtx_lock(&dn_cfg.uh_mtx)
+#define DN_BH_WUNLOCK() mtx_unlock(&dn_cfg.uh_mtx)
+#define DN_BH_LOCK_ASSERT() mtx_assert(&dn_cfg.uh_mtx, MA_OWNED)
+
+SLIST_HEAD(dn_schk_head, dn_schk);
+SLIST_HEAD(dn_sch_inst_head, dn_sch_inst);
+SLIST_HEAD(dn_fsk_head, dn_fsk);
+SLIST_HEAD(dn_queue_head, dn_queue);
+SLIST_HEAD(dn_alg_head, dn_alg);
+
+struct mq {	/* a basic queue of packets */
+ struct mbuf *head, *tail;
+};
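+
+/*
+ * A minimal sketch (not part of the original code) of how such a
+ * head/tail mbuf queue is typically appended to; the real helper used
+ * by the I/O path is mq_append():
+ *
+ *	static inline void
+ *	mq_append_sketch(struct mq *q, struct mbuf *m)
+ *	{
+ *		m->m_nextpkt = NULL;
+ *		if (q->head == NULL)
+ *			q->head = m;
+ *		else
+ *			q->tail->m_nextpkt = m;
+ *		q->tail = m;
+ *	}
+ */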
+
+static inline void
+set_oid(struct dn_id *o, int type, int len)
+{
+ o->type = type;
+ o->len = len;
+ o->subtype = 0;
+}
+
+/*
+ * configuration and global data for a dummynet instance
+ *
+ * When a configuration is modified from userland, 'id' is incremented
+ * so we can use the value to check for stale pointers.
+ */
+struct dn_parms {
+ uint32_t id; /* configuration version */
+
+ /* defaults (sysctl-accessible) */
+ int red_lookup_depth;
+ int red_avg_pkt_size;
+ int red_max_pkt_size;
+ int hash_size;
+ int max_hash_size;
+ long byte_limit; /* max queue sizes */
+ long slot_limit;
+
+ int io_fast;
+ int debug;
+
+ /* timekeeping */
+ struct timeval prev_t; /* last time dummynet_tick ran */
+ struct dn_heap evheap; /* scheduled events */
+
+ /* counters of objects -- used for reporting space */
+ int schk_count;
+ int si_count;
+ int fsk_count;
+ int queue_count;
+
+ /* ticks and other stuff */
+ uint64_t curr_time;
+ /* flowsets and schedulers are in hash tables, with 'hash_size'
+ * buckets. fshash is looked up at every packet arrival
+ * so better be generous if we expect many entries.
+ */
+ struct dn_ht *fshash;
+ struct dn_ht *schedhash;
+ /* list of flowsets without a scheduler -- use sch_chain */
+ struct dn_fsk_head fsu; /* list of unlinked flowsets */
+ struct dn_alg_head schedlist; /* list of algorithms */
+
+ /* Store the fs/sch to scan when draining. The value is the
+ * bucket number of the hash table. Expire can be disabled
+ * with net.inet.ip.dummynet.expire=0, or it happens every
+ * expire ticks.
+	 */
+ int drain_fs;
+ int drain_sch;
+ uint32_t expire;
+ uint32_t expire_cycle; /* tick count */
+
+ int init_done;
+
+ /* if the upper half is busy doing something long,
+ * can set the busy flag and we will enqueue packets in
+ * a queue for later processing.
+ */
+ int busy;
+ struct mq pending;
+
+#ifdef _KERNEL
+ /*
+ * This file is normally used in the kernel, unless we do
+ * some userland tests, in which case we do not need a mtx.
+ * uh_mtx arbitrates between system calls and also
+ * protects fshash, schedhash and fsunlinked.
+ * These structures are readonly for the lower half.
+ * bh_mtx protects all other structures which may be
+ * modified upon packet arrivals
+ */
+#if defined( __linux__ ) || defined( _WIN32 )
+ spinlock_t uh_mtx;
+ spinlock_t bh_mtx;
+#else
+ struct mtx uh_mtx;
+ struct mtx bh_mtx;
+#endif
+
+#endif /* _KERNEL */
+};
+
+/*
+ * Delay line, contains all packets on output from a link.
+ * Every scheduler instance has one.
+ */
+struct delay_line {
+ struct dn_id oid;
+ struct dn_sch_inst *si;
+ struct mq mq;
+};
+
+/*
+ * The kernel side of a flowset. It is linked in a hash table
+ * of flowsets, and in a list of children of their parent scheduler.
+ * qht is either the queue or (if HAVE_MASK) a hash table queues.
+ * Note that the mask to use is the (flow_mask|sched_mask), which
+ * changes as we attach/detach schedulers. So we store it here.
+ *
+ * XXX If we want to add scheduler-specific parameters, we need to
+ * put them in external storage because the scheduler may not be
+ * available when the fsk is created.
+ */
+struct dn_fsk { /* kernel side of a flowset */
+ struct dn_fs fs;
+ SLIST_ENTRY(dn_fsk) fsk_next; /* hash chain for fshash */
+
+ struct ipfw_flow_id fsk_mask;
+
+	/* qht is a hash table of queues, or just a single queue;
+	 * a bit in fs.flags tells us which one
+ */
+ struct dn_ht *qht;
+ struct dn_schk *sched; /* Sched we are linked to */
+ SLIST_ENTRY(dn_fsk) sch_chain; /* list of fsk attached to sched */
+
+ /* bucket index used by drain routine to drain queues for this
+ * flowset
+ */
+ int drain_bucket;
+	/* Parameters related to RED / GRED */
+ /* original values are in dn_fs*/
+ int w_q ; /* queue weight (scaled) */
+ int max_th ; /* maximum threshold for queue (scaled) */
+ int min_th ; /* minimum threshold for queue (scaled) */
+ int max_p ; /* maximum value for p_b (scaled) */
+
+ u_int c_1 ; /* max_p/(max_th-min_th) (scaled) */
+ u_int c_2 ; /* max_p*min_th/(max_th-min_th) (scaled) */
+ u_int c_3 ; /* for GRED, (1-max_p)/max_th (scaled) */
+ u_int c_4 ; /* for GRED, 1 - 2*max_p (scaled) */
+ u_int * w_q_lookup ; /* lookup table for computing (1-w_q)^t */
+ u_int lookup_depth ; /* depth of lookup table */
+ int lookup_step ; /* granularity inside the lookup table */
+ int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */
+	int avg_pkt_size ;	/* average packet size */
+ int max_pkt_size ; /* max packet size */
+};
+
+/*
+ * A queue is created as a child of a flowset unless it belongs to
+ * a !MULTIQUEUE scheduler. It is normally in a hash table in the
+ * flowset. fs always points to the parent flowset.
+ * si normally points to the sch_inst, unless the flowset has been
+ * detached from the scheduler -- in this case si == NULL and we
+ * should not enqueue.
+ */
+struct dn_queue {
+ struct dn_flow ni; /* oid, flow_id, stats */
+ struct mq mq; /* packets queue */
+ struct dn_sch_inst *_si; /* owner scheduler instance */
+ SLIST_ENTRY(dn_queue) q_next; /* hash chain list for qht */
+ struct dn_fsk *fs; /* parent flowset. */
+
+ /* RED parameters */
+ int avg; /* average queue length est. (scaled) */
+ int count; /* arrivals since last RED drop */
+ int random; /* random value (scaled) */
+ uint64_t q_time; /* start of queue idle time */
+
+};
+
+/*
+ * The kernel side of a scheduler. Contains the userland config,
+ * a link, pointer to extra config arguments from command line,
+ * kernel flags, and a pointer to the scheduler methods.
+ * It is stored in a hash table, and holds a list of all
+ * flowsets and scheduler instances.
+ * XXX sch must be at the beginning, see schk_hash().
+ */
+struct dn_schk {
+ struct dn_sch sch;
+ struct dn_alg *fp; /* Pointer to scheduler functions */
+ struct dn_link link; /* The link, embedded */
+ struct dn_profile *profile; /* delay profile, if any */
+ struct dn_id *cfg; /* extra config arguments */
+
+ SLIST_ENTRY(dn_schk) schk_next; /* hash chain for schedhash */
+
+ struct dn_fsk_head fsk_list; /* all fsk linked to me */
+ struct dn_fsk *fs; /* Flowset for !MULTIQUEUE */
+
+ /* bucket index used by the drain routine to drain the scheduler
+ * instance for this flowset.
+ */
+ int drain_bucket;
+
+ /* Hash table of all instances (through sch.sched_mask)
+ * or single instance if no mask. Always valid.
+ */
+ struct dn_ht *siht;
+};
+
+
+/*
+ * Scheduler instance.
+ * Contains variables and all queues relative to this instance.
+ * This struct is created at runtime.
+ */
+struct dn_sch_inst {
+ struct dn_flow ni; /* oid, flowid and stats */
+ SLIST_ENTRY(dn_sch_inst) si_next; /* hash chain for siht */
+ struct delay_line dline;
+ struct dn_schk *sched; /* the template */
+ int kflags; /* DN_ACTIVE */
+
+ int64_t credit; /* bits I can transmit (more or less). */
+ uint64_t sched_time; /* time link was scheduled in ready_heap */
+ uint64_t idle_time; /* start of scheduler instance idle time */
+
+ /* q_count is the number of queues that this instance is using.
+ * The counter is incremented or decremented when
+ * a reference from the queue is created or deleted.
+ * It is used to make sure that a scheduler instance can be safely
+ * deleted by the drain routine. See notes below.
+ */
+ int q_count;
+
+};
+
+/*
+ * NOTE about object drain.
+ * The system will automatically (XXX check when) drain queues and
+ * scheduler instances when they are idle.
+ * A queue is idle when it has no packets; an instance is idle when
+ * it is not in the evheap, and the corresponding delay line is empty.
+ * A queue can be safely deleted when it is idle because the scheduler
+ * function xxx_free_queue() will remove any references to it.
+ * An instance can only be deleted when no queues reference it. To be sure
+ * of that, a counter (q_count) stores the number of queues that are pointing
+ * to the instance.
+ *
+ * XXX
+ * Order of scan:
+ * - take all flowset in a bucket for the flowset hash table
+ * - take all queues in a bucket for the flowset
+ * - increment the queue bucket
+ * - scan next flowset bucket
+ * Nothing is done if a bucket contains no entries.
+ *
+ * The same scheme is used for scheduler instances.
+ */
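+
+/*
+ * Illustrative sketch of the scan order described above (this is not
+ * the actual drain code; see dn_drain_queue()/dn_drain_scheduler()
+ * declared below):
+ *
+ *	// for each fs in fshash bucket dn_cfg.drain_fs:
+ *	//	delete idle queues in qht bucket fs->drain_bucket;
+ *	//	fs->drain_bucket++;
+ *	// dn_cfg.drain_fs++;	// both counters wrap around their tables
+ */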
+
+
+/* kernel-side flags. Linux has DN_DELETE in fcntl.h
+ */
+enum {
+ /* 1 and 2 are reserved for the SCAN flags */
+ DN_DESTROY = 0x0004, /* destroy */
+ DN_DELETE_FS = 0x0008, /* destroy flowset */
+ DN_DETACH = 0x0010,
+ DN_ACTIVE = 0x0020, /* object is in evheap */
+ DN_F_DLINE = 0x0040, /* object is a delay line */
+ DN_DEL_SAFE = 0x0080, /* delete a queue only if no longer needed
+ * by scheduler */
+ DN_QHT_IS_Q = 0x0100, /* in flowset, qht is a single queue */
+};
+
+extern struct dn_parms dn_cfg;
+//VNET_DECLARE(struct dn_parms, _base_dn_cfg);
+//#define dn_cfg VNET(_base_dn_cfg)
+
+int dummynet_io(struct mbuf **, int , struct ip_fw_args *);
+void dummynet_task(void *context, int pending);
+void dn_reschedule(void);
+
+struct dn_queue *ipdn_q_find(struct dn_fsk *, struct dn_sch_inst *,
+ struct ipfw_flow_id *);
+struct dn_sch_inst *ipdn_si_find(struct dn_schk *, struct ipfw_flow_id *);
+
+/*
+ * copy_range is a template for requests for ranges of pipes/queues/scheds.
+ * The number of ranges is variable and can be derived by o.len.
+ * As a default, we use a small number of entries so that the struct
+ * fits easily on the stack and is sufficient for most common requests.
+ */
+#define DEFAULT_RANGES 5
+struct copy_range {
+ struct dn_id o;
+ uint32_t r[ 2 * DEFAULT_RANGES ];
+};
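+
+/*
+ * Illustrative sketch (not in the original sources) of how a request
+ * covering pipes/queues 1..10 might fill a copy_range on the stack;
+ * ranges are pairs of inclusive bounds stored in r[]:
+ *
+ *	struct copy_range cr;
+ *	bzero(&cr, sizeof(cr));
+ *	cr.o.len = sizeof(cr.o) + 2 * sizeof(uint32_t);
+ *	cr.r[0] = 1;	// first object number in the range
+ *	cr.r[1] = 10;	// last object number, inclusive
+ */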
+
+struct copy_args {
+ char **start;
+ char *end;
+ int flags;
+ int type;
+ struct copy_range *extra; /* extra filtering */
+};
+
+struct sockopt;
+int ip_dummynet_compat(struct sockopt *sopt);
+int dummynet_get(struct sockopt *sopt, void **compat);
+int dn_c_copy_q (void *_ni, void *arg);
+int dn_c_copy_pipe(struct dn_schk *s, struct copy_args *a, int nq);
+int dn_c_copy_fs(struct dn_fsk *f, struct copy_args *a, int nq);
+int dn_compat_copy_queue(struct copy_args *a, void *_o);
+int dn_compat_copy_pipe(struct copy_args *a, void *_o);
+int copy_data_helper_compat(void *_o, void *_arg);
+int dn_compat_calc_size(void);
+int do_config(void *p, int l);
+
+/* function to drain idle object */
+void dn_drain_scheduler(void);
+void dn_drain_queue(void);
+
+#endif /* _IP_DN_PRIVATE_H */
diff --git a/sys/netpfil/ipfw/ip_dummynet.c b/sys/netpfil/ipfw/ip_dummynet.c
new file mode 100644
index 0000000..509aeb7
--- /dev/null
+++ b/sys/netpfil/ipfw/ip_dummynet.c
@@ -0,0 +1,2315 @@
+/*-
+ * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa
+ * Portions Copyright (c) 2000 Akamba Corp.
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ * Configuration and internal object management for dummynet.
+ */
+
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/time.h>
+#include <sys/taskqueue.h>
+#include <net/if.h> /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */
+#include <netinet/in.h>
+#include <netinet/ip_var.h> /* ip_output(), IP_FORWARDING */
+#include <netinet/ip_fw.h>
+#include <netinet/ip_dummynet.h>
+
+#include <netpfil/ipfw/ip_fw_private.h>
+#include <netpfil/ipfw/dn_heap.h>
+#include <netpfil/ipfw/ip_dn_private.h>
+#include <netpfil/ipfw/dn_sched.h>
+
+/* which objects to copy */
+#define DN_C_LINK 0x01
+#define DN_C_SCH 0x02
+#define DN_C_FLOW 0x04
+#define DN_C_FS 0x08
+#define DN_C_QUEUE 0x10
+
+/* we use this argument in case of a schk_new */
+struct schk_new_arg {
+ struct dn_alg *fp;
+ struct dn_sch *sch;
+};
+
+/*---- callout hooks. ----*/
+static struct callout dn_timeout;
+static struct task dn_task;
+static struct taskqueue *dn_tq = NULL;
+
+static void
+dummynet(void *arg)
+{
+
+ (void)arg; /* UNUSED */
+ taskqueue_enqueue(dn_tq, &dn_task);
+}
+
+void
+dn_reschedule(void)
+{
+ callout_reset(&dn_timeout, 1, dummynet, NULL);
+}
+/*----- end of callout hooks -----*/
+
+/* Return a scheduler descriptor given the type or name. */
+static struct dn_alg *
+find_sched_type(int type, char *name)
+{
+ struct dn_alg *d;
+
+ SLIST_FOREACH(d, &dn_cfg.schedlist, next) {
+ if (d->type == type || (name && !strcasecmp(d->name, name)))
+ return d;
+ }
+ return NULL; /* not found */
+}
+
+int
+ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg)
+{
+ int oldv = *v;
+ const char *op = NULL;
+ if (dflt < lo)
+ dflt = lo;
+ if (dflt > hi)
+ dflt = hi;
+ if (oldv < lo) {
+ *v = dflt;
+ op = "Bump";
+ } else if (oldv > hi) {
+ *v = hi;
+ op = "Clamp";
+ } else
+ return *v;
+ if (op && msg)
+ printf("%s %s to %d (was %d)\n", op, msg, *v, oldv);
+ return *v;
+}
+
+/*---- flow_id mask, hash and compare functions ---*/
+/*
+ * The flow_id includes the 5-tuple, the queue/pipe number
+ * which we store in the extra area in host order,
+ * and for ipv6 also the flow_id6.
+ * XXX see if we want the tos byte (can store in 'flags')
+ */
+static struct ipfw_flow_id *
+flow_id_mask(struct ipfw_flow_id *mask, struct ipfw_flow_id *id)
+{
+ int is_v6 = IS_IP6_FLOW_ID(id);
+
+ id->dst_port &= mask->dst_port;
+ id->src_port &= mask->src_port;
+ id->proto &= mask->proto;
+ id->extra &= mask->extra;
+ if (is_v6) {
+ APPLY_MASK(&id->dst_ip6, &mask->dst_ip6);
+ APPLY_MASK(&id->src_ip6, &mask->src_ip6);
+ id->flow_id6 &= mask->flow_id6;
+ } else {
+ id->dst_ip &= mask->dst_ip;
+ id->src_ip &= mask->src_ip;
+ }
+ return id;
+}
+
+/* computes an OR of two masks, result in dst and also returned */
+static struct ipfw_flow_id *
+flow_id_or(struct ipfw_flow_id *src, struct ipfw_flow_id *dst)
+{
+ int is_v6 = IS_IP6_FLOW_ID(dst);
+
+ dst->dst_port |= src->dst_port;
+ dst->src_port |= src->src_port;
+ dst->proto |= src->proto;
+ dst->extra |= src->extra;
+ if (is_v6) {
+#define OR_MASK(_d, _s) \
+ (_d)->__u6_addr.__u6_addr32[0] |= (_s)->__u6_addr.__u6_addr32[0]; \
+ (_d)->__u6_addr.__u6_addr32[1] |= (_s)->__u6_addr.__u6_addr32[1]; \
+ (_d)->__u6_addr.__u6_addr32[2] |= (_s)->__u6_addr.__u6_addr32[2]; \
+ (_d)->__u6_addr.__u6_addr32[3] |= (_s)->__u6_addr.__u6_addr32[3];
+ OR_MASK(&dst->dst_ip6, &src->dst_ip6);
+ OR_MASK(&dst->src_ip6, &src->src_ip6);
+#undef OR_MASK
+ dst->flow_id6 |= src->flow_id6;
+ } else {
+ dst->dst_ip |= src->dst_ip;
+ dst->src_ip |= src->src_ip;
+ }
+ return dst;
+}
+
+static int
+nonzero_mask(struct ipfw_flow_id *m)
+{
+ if (m->dst_port || m->src_port || m->proto || m->extra)
+ return 1;
+ if (IS_IP6_FLOW_ID(m)) {
+ return
+ m->dst_ip6.__u6_addr.__u6_addr32[0] ||
+ m->dst_ip6.__u6_addr.__u6_addr32[1] ||
+ m->dst_ip6.__u6_addr.__u6_addr32[2] ||
+ m->dst_ip6.__u6_addr.__u6_addr32[3] ||
+ m->src_ip6.__u6_addr.__u6_addr32[0] ||
+ m->src_ip6.__u6_addr.__u6_addr32[1] ||
+ m->src_ip6.__u6_addr.__u6_addr32[2] ||
+ m->src_ip6.__u6_addr.__u6_addr32[3] ||
+ m->flow_id6;
+ } else {
+ return m->dst_ip || m->src_ip;
+ }
+}
+
+/* XXX we may want a better hash function */
+static uint32_t
+flow_id_hash(struct ipfw_flow_id *id)
+{
+ uint32_t i;
+
+ if (IS_IP6_FLOW_ID(id)) {
+ uint32_t *d = (uint32_t *)&id->dst_ip6;
+ uint32_t *s = (uint32_t *)&id->src_ip6;
+ i = (d[0] ) ^ (d[1]) ^
+ (d[2] ) ^ (d[3]) ^
+ (d[0] >> 15) ^ (d[1] >> 15) ^
+ (d[2] >> 15) ^ (d[3] >> 15) ^
+ (s[0] << 1) ^ (s[1] << 1) ^
+ (s[2] << 1) ^ (s[3] << 1) ^
+ (s[0] << 16) ^ (s[1] << 16) ^
+ (s[2] << 16) ^ (s[3] << 16) ^
+ (id->dst_port << 1) ^ (id->src_port) ^
+ (id->extra) ^
+ (id->proto ) ^ (id->flow_id6);
+ } else {
+ i = (id->dst_ip) ^ (id->dst_ip >> 15) ^
+ (id->src_ip << 1) ^ (id->src_ip >> 16) ^
+ (id->extra) ^
+ (id->dst_port << 1) ^ (id->src_port) ^ (id->proto);
+ }
+ return i;
+}
+
+/* Like bcmp, returns 0 if ids match, 1 otherwise. */
+static int
+flow_id_cmp(struct ipfw_flow_id *id1, struct ipfw_flow_id *id2)
+{
+ int is_v6 = IS_IP6_FLOW_ID(id1);
+
+ if (!is_v6) {
+ if (IS_IP6_FLOW_ID(id2))
+ return 1; /* different address families */
+
+ return (id1->dst_ip == id2->dst_ip &&
+ id1->src_ip == id2->src_ip &&
+ id1->dst_port == id2->dst_port &&
+ id1->src_port == id2->src_port &&
+ id1->proto == id2->proto &&
+ id1->extra == id2->extra) ? 0 : 1;
+ }
+ /* the ipv6 case */
+ return (
+ !bcmp(&id1->dst_ip6,&id2->dst_ip6, sizeof(id1->dst_ip6)) &&
+ !bcmp(&id1->src_ip6,&id2->src_ip6, sizeof(id1->src_ip6)) &&
+ id1->dst_port == id2->dst_port &&
+ id1->src_port == id2->src_port &&
+ id1->proto == id2->proto &&
+ id1->extra == id2->extra &&
+ id1->flow_id6 == id2->flow_id6) ? 0 : 1;
+}
+/*--------- end of flow-id mask, hash and compare ---------*/
+
+/*--- support functions for the qht hashtable ----
+ * Entries are hashed by flow-id
+ */
+static uint32_t
+q_hash(uintptr_t key, int flags, void *arg)
+{
+ /* compute the hash slot from the flow id */
+ struct ipfw_flow_id *id = (flags & DNHT_KEY_IS_OBJ) ?
+ &((struct dn_queue *)key)->ni.fid :
+ (struct ipfw_flow_id *)key;
+
+ return flow_id_hash(id);
+}
+
+static int
+q_match(void *obj, uintptr_t key, int flags, void *arg)
+{
+ struct dn_queue *o = (struct dn_queue *)obj;
+ struct ipfw_flow_id *id2;
+
+ if (flags & DNHT_KEY_IS_OBJ) {
+ /* compare pointers */
+ id2 = &((struct dn_queue *)key)->ni.fid;
+ } else {
+ id2 = (struct ipfw_flow_id *)key;
+ }
+ return (0 == flow_id_cmp(&o->ni.fid, id2));
+}
+
+/*
+ * create a new queue instance for the given 'key'.
+ */
+static void *
+q_new(uintptr_t key, int flags, void *arg)
+{
+ struct dn_queue *q, *template = arg;
+ struct dn_fsk *fs = template->fs;
+ int size = sizeof(*q) + fs->sched->fp->q_datalen;
+
+ q = malloc(size, M_DUMMYNET, M_NOWAIT | M_ZERO);
+ if (q == NULL) {
+ D("no memory for new queue");
+ return NULL;
+ }
+
+ set_oid(&q->ni.oid, DN_QUEUE, size);
+ if (fs->fs.flags & DN_QHT_HASH)
+ q->ni.fid = *(struct ipfw_flow_id *)key;
+ q->fs = fs;
+ q->_si = template->_si;
+ q->_si->q_count++;
+
+ if (fs->sched->fp->new_queue)
+ fs->sched->fp->new_queue(q);
+ dn_cfg.queue_count++;
+ return q;
+}
+
+/*
+ * Notify schedulers that a queue is going away.
+ * If (flags & DN_DESTROY), also free the packets.
+ * The version for callbacks is called q_delete_cb().
+ */
+static void
+dn_delete_queue(struct dn_queue *q, int flags)
+{
+ struct dn_fsk *fs = q->fs;
+
+ // D("fs %p si %p\n", fs, q->_si);
+ /* notify the parent scheduler that the queue is going away */
+ if (fs && fs->sched->fp->free_queue)
+ fs->sched->fp->free_queue(q);
+ q->_si->q_count--;
+ q->_si = NULL;
+ if (flags & DN_DESTROY) {
+ if (q->mq.head)
+ dn_free_pkts(q->mq.head);
+ bzero(q, sizeof(*q)); // safety
+ free(q, M_DUMMYNET);
+ dn_cfg.queue_count--;
+ }
+}
+
+static int
+q_delete_cb(void *q, void *arg)
+{
+ int flags = (int)(uintptr_t)arg;
+ dn_delete_queue(q, flags);
+ return (flags & DN_DESTROY) ? DNHT_SCAN_DEL : 0;
+}
+
+/*
+ * calls dn_delete_queue/q_delete_cb on all queues,
+ * which notifies the parent scheduler and possibly drains packets.
+ * flags & DN_DESTROY: drains queues and destroy qht;
+ */
+static void
+qht_delete(struct dn_fsk *fs, int flags)
+{
+ ND("fs %d start flags %d qht %p",
+ fs->fs.fs_nr, flags, fs->qht);
+ if (!fs->qht)
+ return;
+ if (fs->fs.flags & DN_QHT_HASH) {
+ dn_ht_scan(fs->qht, q_delete_cb, (void *)(uintptr_t)flags);
+ if (flags & DN_DESTROY) {
+ dn_ht_free(fs->qht, 0);
+ fs->qht = NULL;
+ }
+ } else {
+ dn_delete_queue((struct dn_queue *)(fs->qht), flags);
+ if (flags & DN_DESTROY)
+ fs->qht = NULL;
+ }
+}
+
+/*
+ * Find and possibly create the queue for a MULTIQUEUE scheduler.
+ * We never call it for !MULTIQUEUE (the queue is in the sch_inst).
+ */
+struct dn_queue *
+ipdn_q_find(struct dn_fsk *fs, struct dn_sch_inst *si,
+ struct ipfw_flow_id *id)
+{
+ struct dn_queue template;
+
+ template._si = si;
+ template.fs = fs;
+
+ if (fs->fs.flags & DN_QHT_HASH) {
+ struct ipfw_flow_id masked_id;
+ if (fs->qht == NULL) {
+ fs->qht = dn_ht_init(NULL, fs->fs.buckets,
+ offsetof(struct dn_queue, q_next),
+ q_hash, q_match, q_new);
+ if (fs->qht == NULL)
+ return NULL;
+ }
+ masked_id = *id;
+ flow_id_mask(&fs->fsk_mask, &masked_id);
+ return dn_ht_find(fs->qht, (uintptr_t)&masked_id,
+ DNHT_INSERT, &template);
+ } else {
+ if (fs->qht == NULL)
+ fs->qht = q_new(0, 0, &template);
+ return (struct dn_queue *)fs->qht;
+ }
+}
+/*--- end of queue hash table ---*/
+
+/*--- support functions for the sch_inst hashtable ----
+ *
+ * These are hashed by flow-id
+ */
+static uint32_t
+si_hash(uintptr_t key, int flags, void *arg)
+{
+ /* compute the hash slot from the flow id */
+ struct ipfw_flow_id *id = (flags & DNHT_KEY_IS_OBJ) ?
+ &((struct dn_sch_inst *)key)->ni.fid :
+ (struct ipfw_flow_id *)key;
+
+ return flow_id_hash(id);
+}
+
+static int
+si_match(void *obj, uintptr_t key, int flags, void *arg)
+{
+ struct dn_sch_inst *o = obj;
+ struct ipfw_flow_id *id2;
+
+ id2 = (flags & DNHT_KEY_IS_OBJ) ?
+ &((struct dn_sch_inst *)key)->ni.fid :
+ (struct ipfw_flow_id *)key;
+ return flow_id_cmp(&o->ni.fid, id2) == 0;
+}
+
+/*
+ * create a new instance for the given 'key'
+ * Allocate memory for instance, delay line and scheduler private data.
+ */
+static void *
+si_new(uintptr_t key, int flags, void *arg)
+{
+ struct dn_schk *s = arg;
+ struct dn_sch_inst *si;
+ int l = sizeof(*si) + s->fp->si_datalen;
+
+ si = malloc(l, M_DUMMYNET, M_NOWAIT | M_ZERO);
+ if (si == NULL)
+ goto error;
+
+ /* Set length only for the part passed up to userland. */
+ set_oid(&si->ni.oid, DN_SCH_I, sizeof(struct dn_flow));
+ set_oid(&(si->dline.oid), DN_DELAY_LINE,
+ sizeof(struct delay_line));
+ /* mark si and dline as outside the event queue */
+ si->ni.oid.id = si->dline.oid.id = -1;
+
+ si->sched = s;
+ si->dline.si = si;
+
+ if (s->fp->new_sched && s->fp->new_sched(si)) {
+ D("new_sched error");
+ goto error;
+ }
+ if (s->sch.flags & DN_HAVE_MASK)
+ si->ni.fid = *(struct ipfw_flow_id *)key;
+
+ dn_cfg.si_count++;
+ return si;
+
+error:
+ if (si) {
+ bzero(si, sizeof(*si)); // safety
+ free(si, M_DUMMYNET);
+ }
+ return NULL;
+}
+
+/*
+ * Callback from siht to delete all scheduler instances. Remove
+ * si and delay line from the system heap, destroy all queues.
+ * We assume that all flowsets have been notified and do not
+ * point to us anymore.
+ */
+static int
+si_destroy(void *_si, void *arg)
+{
+ struct dn_sch_inst *si = _si;
+ struct dn_schk *s = si->sched;
+ struct delay_line *dl = &si->dline;
+
+ if (dl->oid.subtype) /* remove delay line from event heap */
+ heap_extract(&dn_cfg.evheap, dl);
+ dn_free_pkts(dl->mq.head); /* drain delay line */
+ if (si->kflags & DN_ACTIVE) /* remove si from event heap */
+ heap_extract(&dn_cfg.evheap, si);
+ if (s->fp->free_sched)
+ s->fp->free_sched(si);
+ bzero(si, sizeof(*si)); /* safety */
+ free(si, M_DUMMYNET);
+ dn_cfg.si_count--;
+ return DNHT_SCAN_DEL;
+}
+
+/*
+ * Find the scheduler instance for this packet. If we need to apply
+ * a mask, do so on a local copy of the flow_id to preserve the original.
+ * Assume siht is always initialized if we have a mask.
+ */
+struct dn_sch_inst *
+ipdn_si_find(struct dn_schk *s, struct ipfw_flow_id *id)
+{
+
+ if (s->sch.flags & DN_HAVE_MASK) {
+ struct ipfw_flow_id id_t = *id;
+ flow_id_mask(&s->sch.sched_mask, &id_t);
+ return dn_ht_find(s->siht, (uintptr_t)&id_t,
+ DNHT_INSERT, s);
+ }
+ if (!s->siht)
+ s->siht = si_new(0, 0, s);
+ return (struct dn_sch_inst *)s->siht;
+}
+
+/* callback to flush credit for the scheduler instance */
+static int
+si_reset_credit(void *_si, void *arg)
+{
+ struct dn_sch_inst *si = _si;
+ struct dn_link *p = &si->sched->link;
+
+ si->credit = p->burst + (dn_cfg.io_fast ? p->bandwidth : 0);
+ return 0;
+}
+
+static void
+schk_reset_credit(struct dn_schk *s)
+{
+ if (s->sch.flags & DN_HAVE_MASK)
+ dn_ht_scan(s->siht, si_reset_credit, NULL);
+ else if (s->siht)
+ si_reset_credit(s->siht, NULL);
+}
+/*---- end of sch_inst hashtable ---------------------*/
+
+/*-------------------------------------------------------
+ * flowset hash (fshash) support. Entries are hashed by fs_nr.
+ * New allocations are put in the fsunlinked list, from which
+ * they are removed when they point to a specific scheduler.
+ */
+static uint32_t
+fsk_hash(uintptr_t key, int flags, void *arg)
+{
+ uint32_t i = !(flags & DNHT_KEY_IS_OBJ) ? key :
+ ((struct dn_fsk *)key)->fs.fs_nr;
+
+ return ( (i>>8)^(i>>4)^i );
+}
+
+static int
+fsk_match(void *obj, uintptr_t key, int flags, void *arg)
+{
+ struct dn_fsk *fs = obj;
+ int i = !(flags & DNHT_KEY_IS_OBJ) ? key :
+ ((struct dn_fsk *)key)->fs.fs_nr;
+
+ return (fs->fs.fs_nr == i);
+}
+
+static void *
+fsk_new(uintptr_t key, int flags, void *arg)
+{
+ struct dn_fsk *fs;
+
+ fs = malloc(sizeof(*fs), M_DUMMYNET, M_NOWAIT | M_ZERO);
+ if (fs) {
+ set_oid(&fs->fs.oid, DN_FS, sizeof(fs->fs));
+ dn_cfg.fsk_count++;
+ fs->drain_bucket = 0;
+ SLIST_INSERT_HEAD(&dn_cfg.fsu, fs, sch_chain);
+ }
+ return fs;
+}
+
+/*
+ * detach flowset from its current scheduler. Flags as follows:
+ * DN_DETACH removes from the fsk_list
+ * DN_DESTROY deletes individual queues
+ * DN_DELETE_FS destroys the flowset (otherwise goes in unlinked).
+ */
+static void
+fsk_detach(struct dn_fsk *fs, int flags)
+{
+ if (flags & DN_DELETE_FS)
+ flags |= DN_DESTROY;
+ ND("fs %d from sched %d flags %s %s %s",
+ fs->fs.fs_nr, fs->fs.sched_nr,
+ (flags & DN_DELETE_FS) ? "DEL_FS":"",
+ (flags & DN_DESTROY) ? "DEL":"",
+ (flags & DN_DETACH) ? "DET":"");
+ if (flags & DN_DETACH) { /* detach from the list */
+ struct dn_fsk_head *h;
+ h = fs->sched ? &fs->sched->fsk_list : &dn_cfg.fsu;
+ SLIST_REMOVE(h, fs, dn_fsk, sch_chain);
+ }
+ /* Free the RED parameters, they will be recomputed on
+ * subsequent attach if needed.
+ */
+ if (fs->w_q_lookup)
+ free(fs->w_q_lookup, M_DUMMYNET);
+ fs->w_q_lookup = NULL;
+ qht_delete(fs, flags);
+ if (fs->sched && fs->sched->fp->free_fsk)
+ fs->sched->fp->free_fsk(fs);
+ fs->sched = NULL;
+ if (flags & DN_DELETE_FS) {
+		bzero(fs, sizeof(*fs));	/* safety */
+ free(fs, M_DUMMYNET);
+ dn_cfg.fsk_count--;
+ } else {
+ SLIST_INSERT_HEAD(&dn_cfg.fsu, fs, sch_chain);
+ }
+}
+
+/*
+ * Detach or destroy all flowsets in a list.
+ * flags specifies what to do:
+ * DN_DESTROY: flush all queues
+ * DN_DELETE_FS: DN_DESTROY + destroy flowset
+ * DN_DELETE_FS implies DN_DESTROY
+ */
+static void
+fsk_detach_list(struct dn_fsk_head *h, int flags)
+{
+ struct dn_fsk *fs;
+ int n = 0; /* only for stats */
+
+ ND("head %p flags %x", h, flags);
+ while ((fs = SLIST_FIRST(h))) {
+ SLIST_REMOVE_HEAD(h, sch_chain);
+ n++;
+ fsk_detach(fs, flags);
+ }
+ ND("done %d flowsets", n);
+}
+
+/*
+ * called on 'queue X delete' -- removes the flowset from fshash,
+ * deletes all queues for the flowset, and removes the flowset.
+ */
+static int
+delete_fs(int i, int locked)
+{
+ struct dn_fsk *fs;
+ int err = 0;
+
+ if (!locked)
+ DN_BH_WLOCK();
+ fs = dn_ht_find(dn_cfg.fshash, i, DNHT_REMOVE, NULL);
+ ND("fs %d found %p", i, fs);
+ if (fs) {
+ fsk_detach(fs, DN_DETACH | DN_DELETE_FS);
+ err = 0;
+ } else
+ err = EINVAL;
+ if (!locked)
+ DN_BH_WUNLOCK();
+ return err;
+}
+
+/*----- end of flowset hashtable support -------------*/
+
+/*------------------------------------------------------------
+ * Scheduler hash. When searching by index we pass sched_nr,
+ * otherwise we pass struct dn_sch * which is the first field in
+ * struct dn_schk so we can cast between the two. We use this trick
+ * because in the create phase only a dn_sch may be available
+ * (but it should be fixed).
+ */
+static uint32_t
+schk_hash(uintptr_t key, int flags, void *_arg)
+{
+ uint32_t i = !(flags & DNHT_KEY_IS_OBJ) ? key :
+ ((struct dn_schk *)key)->sch.sched_nr;
+ return ( (i>>8)^(i>>4)^i );
+}
+
+static int
+schk_match(void *obj, uintptr_t key, int flags, void *_arg)
+{
+ struct dn_schk *s = (struct dn_schk *)obj;
+ int i = !(flags & DNHT_KEY_IS_OBJ) ? key :
+ ((struct dn_schk *)key)->sch.sched_nr;
+ return (s->sch.sched_nr == i);
+}
+
+/*
+ * Create the entry and initialize the instance hash table (siht) if needed.
+ * Leave s->fp unset so we can tell whether a dn_ht_find() returns
+ * a new object or a previously existing one.
+ */
+static void *
+schk_new(uintptr_t key, int flags, void *arg)
+{
+ struct schk_new_arg *a = arg;
+ struct dn_schk *s;
+ int l = sizeof(*s) +a->fp->schk_datalen;
+
+ s = malloc(l, M_DUMMYNET, M_NOWAIT | M_ZERO);
+ if (s == NULL)
+ return NULL;
+ set_oid(&s->link.oid, DN_LINK, sizeof(s->link));
+ s->sch = *a->sch; // copy initial values
+ s->link.link_nr = s->sch.sched_nr;
+ SLIST_INIT(&s->fsk_list);
+ /* initialize the hash table or create the single instance */
+ s->fp = a->fp; /* si_new needs this */
+ s->drain_bucket = 0;
+ if (s->sch.flags & DN_HAVE_MASK) {
+ s->siht = dn_ht_init(NULL, s->sch.buckets,
+ offsetof(struct dn_sch_inst, si_next),
+ si_hash, si_match, si_new);
+ if (s->siht == NULL) {
+ free(s, M_DUMMYNET);
+ return NULL;
+ }
+ }
+ s->fp = NULL; /* mark as a new scheduler */
+ dn_cfg.schk_count++;
+ return s;
+}
+
+/*
+ * Callback for sched delete. Notify all attached flowsets to
+ * detach from the scheduler, destroy the internal flowset, and
+ * all instances. The scheduler goes away too.
+ * arg is 0 (only detach flowsets and destroy instances)
+ * DN_DESTROY (detach & delete queues, delete schk)
+ * or DN_DELETE_FS (delete queues and flowsets, delete schk)
+ */
+static int
+schk_delete_cb(void *obj, void *arg)
+{
+ struct dn_schk *s = obj;
+#if 0
+ int a = (int)arg;
+ ND("sched %d arg %s%s",
+ s->sch.sched_nr,
+ a&DN_DESTROY ? "DEL ":"",
+ a&DN_DELETE_FS ? "DEL_FS":"");
+#endif
+ fsk_detach_list(&s->fsk_list, arg ? DN_DESTROY : 0);
+ /* no more flowset pointing to us now */
+ if (s->sch.flags & DN_HAVE_MASK) {
+ dn_ht_scan(s->siht, si_destroy, NULL);
+ dn_ht_free(s->siht, 0);
+ } else if (s->siht)
+ si_destroy(s->siht, NULL);
+ if (s->profile) {
+ free(s->profile, M_DUMMYNET);
+ s->profile = NULL;
+ }
+ s->siht = NULL;
+ if (s->fp->destroy)
+ s->fp->destroy(s);
+ bzero(s, sizeof(*s)); // safety
+ free(obj, M_DUMMYNET);
+ dn_cfg.schk_count--;
+ return DNHT_SCAN_DEL;
+}
+
+/*
+ * called on a 'sched X delete' command. Deletes a single scheduler.
+ * This is done by removing from the schedhash, unlinking all
+ * flowsets and deleting their traffic.
+ */
+static int
+delete_schk(int i)
+{
+ struct dn_schk *s;
+
+ s = dn_ht_find(dn_cfg.schedhash, i, DNHT_REMOVE, NULL);
+ ND("%d %p", i, s);
+ if (!s)
+ return EINVAL;
+ delete_fs(i + DN_MAX_ID, 1); /* first delete internal fs */
+ /* then detach flowsets, delete traffic */
+ schk_delete_cb(s, (void*)(uintptr_t)DN_DESTROY);
+ return 0;
+}
+/*--- end of schk hashtable support ---*/
+
+static int
+copy_obj(char **start, char *end, void *_o, const char *msg, int i)
+{
+ struct dn_id *o = _o;
+ int have = end - *start;
+
+ if (have < o->len || o->len == 0 || o->type == 0) {
+ D("(WARN) type %d %s %d have %d need %d",
+ o->type, msg, i, have, o->len);
+ return 1;
+ }
+ ND("type %d %s %d len %d", o->type, msg, i, o->len);
+ bcopy(_o, *start, o->len);
+ if (o->type == DN_LINK) {
+ /* Adjust burst parameter for link */
+ struct dn_link *l = (struct dn_link *)*start;
+ l->burst = div64(l->burst, 8 * hz);
+ l->delay = l->delay * 1000 / hz;
+ } else if (o->type == DN_SCH) {
+ /* Set id->id to the number of instances */
+ struct dn_schk *s = _o;
+ struct dn_id *id = (struct dn_id *)(*start);
+ id->id = (s->sch.flags & DN_HAVE_MASK) ?
+ dn_ht_entries(s->siht) : (s->siht ? 1 : 0);
+ }
+ *start += o->len;
+ return 0;
+}
+
+/* Specific function to copy a queue.
+ * Copies only the user-visible part of a queue (which is in
+ * a struct dn_flow), and sets len accordingly.
+ */
+static int
+copy_obj_q(char **start, char *end, void *_o, const char *msg, int i)
+{
+ struct dn_id *o = _o;
+ int have = end - *start;
+ int len = sizeof(struct dn_flow); /* see above comment */
+
+ if (have < len || o->len == 0 || o->type != DN_QUEUE) {
+ D("ERROR type %d %s %d have %d need %d",
+ o->type, msg, i, have, len);
+ return 1;
+ }
+ ND("type %d %s %d len %d", o->type, msg, i, len);
+ bcopy(_o, *start, len);
+ ((struct dn_id*)(*start))->len = len;
+ *start += len;
+ return 0;
+}
+
+static int
+copy_q_cb(void *obj, void *arg)
+{
+ struct dn_queue *q = obj;
+ struct copy_args *a = arg;
+ struct dn_flow *ni = (struct dn_flow *)(*a->start);
+ if (copy_obj_q(a->start, a->end, &q->ni, "queue", -1))
+ return DNHT_SCAN_END;
+ ni->oid.type = DN_FLOW; /* override the DN_QUEUE */
+ ni->oid.id = si_hash((uintptr_t)&ni->fid, 0, NULL);
+ return 0;
+}
+
+static int
+copy_q(struct copy_args *a, struct dn_fsk *fs, int flags)
+{
+ if (!fs->qht)
+ return 0;
+ if (fs->fs.flags & DN_QHT_HASH)
+ dn_ht_scan(fs->qht, copy_q_cb, a);
+ else
+ copy_q_cb(fs->qht, a);
+ return 0;
+}
+
+/*
+ * This routine only copies the initial part of a profile? XXX
+ */
+static int
+copy_profile(struct copy_args *a, struct dn_profile *p)
+{
+ int have = a->end - *a->start;
+ /* XXX here we check for max length */
+ int profile_len = sizeof(struct dn_profile) -
+ ED_MAX_SAMPLES_NO*sizeof(int);
+
+ if (p == NULL)
+ return 0;
+ if (have < profile_len) {
+ D("error have %d need %d", have, profile_len);
+ return 1;
+ }
+ bcopy(p, *a->start, profile_len);
+ ((struct dn_id *)(*a->start))->len = profile_len;
+ *a->start += profile_len;
+ return 0;
+}
+
+static int
+copy_flowset(struct copy_args *a, struct dn_fsk *fs, int flags)
+{
+ struct dn_fs *ufs = (struct dn_fs *)(*a->start);
+ if (!fs)
+ return 0;
+ ND("flowset %d", fs->fs.fs_nr);
+ if (copy_obj(a->start, a->end, &fs->fs, "flowset", fs->fs.fs_nr))
+ return DNHT_SCAN_END;
+ ufs->oid.id = (fs->fs.flags & DN_QHT_HASH) ?
+ dn_ht_entries(fs->qht) : (fs->qht ? 1 : 0);
+ if (flags) { /* copy queues */
+ copy_q(a, fs, 0);
+ }
+ return 0;
+}
+
+static int
+copy_si_cb(void *obj, void *arg)
+{
+ struct dn_sch_inst *si = obj;
+ struct copy_args *a = arg;
+ struct dn_flow *ni = (struct dn_flow *)(*a->start);
+ if (copy_obj(a->start, a->end, &si->ni, "inst",
+ si->sched->sch.sched_nr))
+ return DNHT_SCAN_END;
+ ni->oid.type = DN_FLOW; /* override the DN_SCH_I */
+ ni->oid.id = si_hash((uintptr_t)si, DNHT_KEY_IS_OBJ, NULL);
+ return 0;
+}
+
+static int
+copy_si(struct copy_args *a, struct dn_schk *s, int flags)
+{
+ if (s->sch.flags & DN_HAVE_MASK)
+ dn_ht_scan(s->siht, copy_si_cb, a);
+ else if (s->siht)
+ copy_si_cb(s->siht, a);
+ return 0;
+}
+
+/*
+ * compute a list of children of a scheduler and copy up
+ */
+static int
+copy_fsk_list(struct copy_args *a, struct dn_schk *s, int flags)
+{
+ struct dn_fsk *fs;
+ struct dn_id *o;
+ uint32_t *p;
+
+ int n = 0, space = sizeof(*o);
+ SLIST_FOREACH(fs, &s->fsk_list, sch_chain) {
+ if (fs->fs.fs_nr < DN_MAX_ID)
+ n++;
+ }
+ space += n * sizeof(uint32_t);
+ DX(3, "sched %d has %d flowsets", s->sch.sched_nr, n);
+ if (a->end - *(a->start) < space)
+ return DNHT_SCAN_END;
+ o = (struct dn_id *)(*(a->start));
+ o->len = space;
+ *a->start += o->len;
+ o->type = DN_TEXT;
+ p = (uint32_t *)(o+1);
+ SLIST_FOREACH(fs, &s->fsk_list, sch_chain)
+ if (fs->fs.fs_nr < DN_MAX_ID)
+ *p++ = fs->fs.fs_nr;
+ return 0;
+}
+
+static int
+copy_data_helper(void *_o, void *_arg)
+{
+ struct copy_args *a = _arg;
+ uint32_t *r = a->extra->r; /* start of first range */
+ uint32_t *lim; /* first invalid pointer */
+ int n;
+
+ lim = (uint32_t *)((char *)(a->extra) + a->extra->o.len);
+
+ if (a->type == DN_LINK || a->type == DN_SCH) {
+ /* pipe|sched show, we receive a dn_schk */
+ struct dn_schk *s = _o;
+
+ n = s->sch.sched_nr;
+ if (a->type == DN_SCH && n >= DN_MAX_ID)
+ return 0; /* not a scheduler */
+ if (a->type == DN_LINK && n <= DN_MAX_ID)
+ return 0; /* not a pipe */
+
+ /* see if the object is within one of our ranges */
+ for (;r < lim; r += 2) {
+ if (n < r[0] || n > r[1])
+ continue;
+ /* Found a valid entry, copy and we are done */
+ if (a->flags & DN_C_LINK) {
+ if (copy_obj(a->start, a->end,
+ &s->link, "link", n))
+ return DNHT_SCAN_END;
+ if (copy_profile(a, s->profile))
+ return DNHT_SCAN_END;
+ if (copy_flowset(a, s->fs, 0))
+ return DNHT_SCAN_END;
+ }
+ if (a->flags & DN_C_SCH) {
+ if (copy_obj(a->start, a->end,
+ &s->sch, "sched", n))
+ return DNHT_SCAN_END;
+ /* list all attached flowsets */
+ if (copy_fsk_list(a, s, 0))
+ return DNHT_SCAN_END;
+ }
+ if (a->flags & DN_C_FLOW)
+ copy_si(a, s, 0);
+ break;
+ }
+ } else if (a->type == DN_FS) {
+ /* queue show, skip internal flowsets */
+ struct dn_fsk *fs = _o;
+
+ n = fs->fs.fs_nr;
+ if (n >= DN_MAX_ID)
+ return 0;
+ /* see if the object is within one of our ranges */
+ for (;r < lim; r += 2) {
+ if (n < r[0] || n > r[1])
+ continue;
+ if (copy_flowset(a, fs, 0))
+ return DNHT_SCAN_END;
+ copy_q(a, fs, 0);
+ break; /* we are done */
+ }
+ }
+ return 0;
+}
+
+static inline struct dn_schk *
+locate_scheduler(int i)
+{
+ return dn_ht_find(dn_cfg.schedhash, i, 0, NULL);
+}
+
+/*
+ * RED parameters are in fixed-point arithmetic.
+ */
+static int
+config_red(struct dn_fsk *fs)
+{
+ int64_t s, idle, weight, w0;
+ int t, i;
+
+ fs->w_q = fs->fs.w_q;
+ fs->max_p = fs->fs.max_p;
+ ND("called");
+ /* Doing stuff that was in userland */
+ i = fs->sched->link.bandwidth;
+ s = (i <= 0) ? 0 :
+ hz * dn_cfg.red_avg_pkt_size * 8 * SCALE(1) / i;
+
+ idle = div64((s * 3) , fs->w_q); /* s, fs->w_q scaled; idle not scaled */
+ fs->lookup_step = div64(idle , dn_cfg.red_lookup_depth);
+ /* fs->lookup_step not scaled, */
+ if (!fs->lookup_step)
+ fs->lookup_step = 1;
+ w0 = weight = SCALE(1) - fs->w_q; //fs->w_q scaled
+
+ for (t = fs->lookup_step; t > 1; --t)
+ weight = SCALE_MUL(weight, w0);
+ fs->lookup_weight = (int)(weight); // scaled
+
+ /* Now doing stuff that was in kerneland */
+ fs->min_th = SCALE(fs->fs.min_th);
+ fs->max_th = SCALE(fs->fs.max_th);
+
+ fs->c_1 = fs->max_p / (fs->fs.max_th - fs->fs.min_th);
+ fs->c_2 = SCALE_MUL(fs->c_1, SCALE(fs->fs.min_th));
+
+ if (fs->fs.flags & DN_IS_GENTLE_RED) {
+ fs->c_3 = (SCALE(1) - fs->max_p) / fs->fs.max_th;
+ fs->c_4 = SCALE(1) - 2 * fs->max_p;
+ }
+
+	/* If the lookup table already exists, free and create it again. */
+ if (fs->w_q_lookup) {
+ free(fs->w_q_lookup, M_DUMMYNET);
+ fs->w_q_lookup = NULL;
+ }
+ if (dn_cfg.red_lookup_depth == 0) {
+ printf("\ndummynet: net.inet.ip.dummynet.red_lookup_depth"
+		    " must be > 0\n");
+ fs->fs.flags &= ~DN_IS_RED;
+ fs->fs.flags &= ~DN_IS_GENTLE_RED;
+ return (EINVAL);
+ }
+ fs->lookup_depth = dn_cfg.red_lookup_depth;
+ fs->w_q_lookup = (u_int *)malloc(fs->lookup_depth * sizeof(int),
+ M_DUMMYNET, M_NOWAIT);
+ if (fs->w_q_lookup == NULL) {
+ printf("dummynet: sorry, cannot allocate red lookup table\n");
+ fs->fs.flags &= ~DN_IS_RED;
+ fs->fs.flags &= ~DN_IS_GENTLE_RED;
+ return(ENOSPC);
+ }
+
+ /* Fill the lookup table with (1 - w_q)^x */
+ fs->w_q_lookup[0] = SCALE(1) - fs->w_q;
+
+ for (i = 1; i < fs->lookup_depth; i++)
+ fs->w_q_lookup[i] =
+ SCALE_MUL(fs->w_q_lookup[i - 1], fs->lookup_weight);
+
+ if (dn_cfg.red_avg_pkt_size < 1)
+ dn_cfg.red_avg_pkt_size = 512;
+ fs->avg_pkt_size = dn_cfg.red_avg_pkt_size;
+ if (dn_cfg.red_max_pkt_size < 1)
+ dn_cfg.red_max_pkt_size = 1500;
+ fs->max_pkt_size = dn_cfg.red_max_pkt_size;
+ ND("exit");
+ return 0;
+}
+
+/* Scan all flowsets attached to this scheduler and update RED */
+static void
+update_red(struct dn_schk *s)
+{
+ struct dn_fsk *fs;
+ SLIST_FOREACH(fs, &s->fsk_list, sch_chain) {
+ if (fs && (fs->fs.flags & DN_IS_RED))
+ config_red(fs);
+ }
+}
+
+/* attach flowset to scheduler s, possibly requeue */
+static void
+fsk_attach(struct dn_fsk *fs, struct dn_schk *s)
+{
+ ND("remove fs %d from fsunlinked, link to sched %d",
+ fs->fs.fs_nr, s->sch.sched_nr);
+ SLIST_REMOVE(&dn_cfg.fsu, fs, dn_fsk, sch_chain);
+ fs->sched = s;
+ SLIST_INSERT_HEAD(&s->fsk_list, fs, sch_chain);
+ if (s->fp->new_fsk)
+ s->fp->new_fsk(fs);
+ /* XXX compute fsk_mask */
+ fs->fsk_mask = fs->fs.flow_mask;
+ if (fs->sched->sch.flags & DN_HAVE_MASK)
+ flow_id_or(&fs->sched->sch.sched_mask, &fs->fsk_mask);
+ if (fs->qht) {
+ /*
+ * we must drain qht according to the old
+ * type, and reinsert according to the new one.
+ * The requeue is complex -- in general we need to
+ * reclassify every single packet.
+ * For the time being, let's hope qht is never set
+ * when we reach this point.
+ */
+ D("XXX TODO requeue from fs %d to sch %d",
+ fs->fs.fs_nr, s->sch.sched_nr);
+ fs->qht = NULL;
+ }
+ /* set the new type for qht */
+ if (nonzero_mask(&fs->fsk_mask))
+ fs->fs.flags |= DN_QHT_HASH;
+ else
+ fs->fs.flags &= ~DN_QHT_HASH;
+
+ /* XXX config_red() can fail... */
+ if (fs->fs.flags & DN_IS_RED)
+ config_red(fs);
+}
+
+/* update all flowsets which may refer to this scheduler */
+static void
+update_fs(struct dn_schk *s)
+{
+ struct dn_fsk *fs, *tmp;
+
+ SLIST_FOREACH_SAFE(fs, &dn_cfg.fsu, sch_chain, tmp) {
+ if (s->sch.sched_nr != fs->fs.sched_nr) {
+ D("fs %d for sch %d not %d still unlinked",
+ fs->fs.fs_nr, fs->fs.sched_nr,
+ s->sch.sched_nr);
+ continue;
+ }
+ fsk_attach(fs, s);
+ }
+}
+
+/*
+ * Configuration -- to preserve backward compatibility we use
+ * the following scheme (N is 65536)
+ * NUMBER SCHED LINK FLOWSET
+ * 1 .. N-1 (1)WFQ (2)WFQ (3)queue
+ * N+1 .. 2N-1 (4)FIFO (5)FIFO (6)FIFO for sched 1..N-1
+ * 2N+1 .. 3N-1 -- -- (7)FIFO for sched N+1..2N-1
+ *
+ * "pipe i config" configures #1, #2 and #3
+ * "sched i config" configures #1 and possibly #6
+ * "queue i config" configures #3
+ * #1 is configured with 'pipe i config' or 'sched i config'
+ * #2 is configured with 'pipe i config', and created if not
+ * existing with 'sched i config'
+ * #3 is configured with 'queue i config'
+ * #4 is automatically configured after #1, can only be FIFO
+ * #5 is automatically configured after #2
+ * #6 is automatically created when #1 is !MULTIQUEUE,
+ * and can be updated.
+ * #7 is automatically configured after #2
+ */
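+
+/*
+ * Worked example (one possible reading of the scheme above, with
+ * N = DN_MAX_ID): "pipe 5 config" sets up scheduler #5 (WF2Q+ by
+ * default) with link #5 and internal flowset 5+N, plus the FIFO
+ * scheduler 5+N with link 5+N and internal flowset 5+2N. Packets
+ * matched by an ipfw "pipe 5" rule are mapped to flowset 5+2N (the
+ * FIFO path), while "queue 5" rules use flowset 5, configured with
+ * "queue 5 config".
+ */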
+
+/*
+ * configure a link (and its FIFO instance)
+ */
+static int
+config_link(struct dn_link *p, struct dn_id *arg)
+{
+ int i;
+
+ if (p->oid.len != sizeof(*p)) {
+ D("invalid pipe len %d", p->oid.len);
+ return EINVAL;
+ }
+ i = p->link_nr;
+ if (i <= 0 || i >= DN_MAX_ID)
+ return EINVAL;
+ /*
+ * The config program passes parameters as follows:
+ * bw = bits/second (0 means no limits),
+ * delay = ms, must be translated into ticks.
+ * qsize = slots/bytes
+ * burst ???
+ */
+ p->delay = (p->delay * hz) / 1000;
+ /* Scale burst size: bytes -> bits * hz */
+ p->burst *= 8 * hz;
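+	/*
+	 * Worked example (illustrative, assuming hz = 1000): a requested
+	 * delay of 20 ms becomes 20 ticks, and a 1500-byte burst is kept
+	 * as 1500 * 8 * 1000 = 12000000, which copy_obj() scales back
+	 * with div64(burst, 8 * hz) when reporting to userland.
+	 */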
+
+ DN_BH_WLOCK();
+ /* do it twice, base link and FIFO link */
+ for (; i < 2*DN_MAX_ID; i += DN_MAX_ID) {
+ struct dn_schk *s = locate_scheduler(i);
+ if (s == NULL) {
+ DN_BH_WUNLOCK();
+ D("sched %d not found", i);
+ return EINVAL;
+ }
+ /* remove profile if exists */
+ if (s->profile) {
+ free(s->profile, M_DUMMYNET);
+ s->profile = NULL;
+ }
+ /* copy all parameters */
+ s->link.oid = p->oid;
+ s->link.link_nr = i;
+ s->link.delay = p->delay;
+ if (s->link.bandwidth != p->bandwidth) {
+ /* XXX bandwidth changes, need to update red params */
+ s->link.bandwidth = p->bandwidth;
+ update_red(s);
+ }
+ s->link.burst = p->burst;
+ schk_reset_credit(s);
+ }
+ dn_cfg.id++;
+ DN_BH_WUNLOCK();
+ return 0;
+}
+
+/*
+ * configure a flowset. Can be called from inside with locked=1,
+ */
+static struct dn_fsk *
+config_fs(struct dn_fs *nfs, struct dn_id *arg, int locked)
+{
+ int i;
+ struct dn_fsk *fs;
+
+ if (nfs->oid.len != sizeof(*nfs)) {
+ D("invalid flowset len %d", nfs->oid.len);
+ return NULL;
+ }
+ i = nfs->fs_nr;
+ if (i <= 0 || i >= 3*DN_MAX_ID)
+ return NULL;
+ ND("flowset %d", i);
+ /* XXX other sanity checks */
+ if (nfs->flags & DN_QSIZE_BYTES) {
+ ipdn_bound_var(&nfs->qsize, 16384,
+ 1500, dn_cfg.byte_limit, NULL); // "queue byte size");
+ } else {
+ ipdn_bound_var(&nfs->qsize, 50,
+ 1, dn_cfg.slot_limit, NULL); // "queue slot size");
+ }
+ if (nfs->flags & DN_HAVE_MASK) {
+ /* make sure we have some buckets */
+ ipdn_bound_var((int *)&nfs->buckets, dn_cfg.hash_size,
+ 1, dn_cfg.max_hash_size, "flowset buckets");
+ } else {
+ nfs->buckets = 1; /* we only need 1 */
+ }
+ if (!locked)
+ DN_BH_WLOCK();
+ do { /* exit with break when done */
+ struct dn_schk *s;
+ int flags = nfs->sched_nr ? DNHT_INSERT : 0;
+ int j;
+ int oldc = dn_cfg.fsk_count;
+ fs = dn_ht_find(dn_cfg.fshash, i, flags, NULL);
+ if (fs == NULL) {
+ D("missing sched for flowset %d", i);
+ break;
+ }
+ /* grab some defaults from the existing one */
+ if (nfs->sched_nr == 0) /* reuse */
+ nfs->sched_nr = fs->fs.sched_nr;
+ for (j = 0; j < sizeof(nfs->par)/sizeof(nfs->par[0]); j++) {
+ if (nfs->par[j] == -1) /* reuse */
+ nfs->par[j] = fs->fs.par[j];
+ }
+ if (bcmp(&fs->fs, nfs, sizeof(*nfs)) == 0) {
+ ND("flowset %d unchanged", i);
+ break; /* no change, nothing to do */
+ }
+ if (oldc != dn_cfg.fsk_count) /* new item */
+ dn_cfg.id++;
+ s = locate_scheduler(nfs->sched_nr);
+ /* detach from old scheduler if needed, preserving
+ * queues if we need to reattach. Then update the
+ * configuration, and possibly attach to the new sched.
+ */
+ DX(2, "fs %d changed sched %d@%p to %d@%p",
+ fs->fs.fs_nr,
+ fs->fs.sched_nr, fs->sched, nfs->sched_nr, s);
+ if (fs->sched) {
+ int flags = s ? DN_DETACH : (DN_DETACH | DN_DESTROY);
+ flags |= DN_DESTROY; /* XXX temporary */
+ fsk_detach(fs, flags);
+ }
+ fs->fs = *nfs; /* copy configuration */
+ if (s != NULL)
+ fsk_attach(fs, s);
+ } while (0);
+ if (!locked)
+ DN_BH_WUNLOCK();
+ return fs;
+}
+
+/*
+ * config/reconfig a scheduler and its FIFO variant.
+ * For !MULTIQUEUE schedulers, also set up the flowset.
+ *
+ * On reconfigurations (detected because s->fp is set),
+ * detach existing flowsets preserving traffic, preserve link,
+ * and delete the old scheduler creating a new one.
+ */
+static int
+config_sched(struct dn_sch *_nsch, struct dn_id *arg)
+{
+ struct dn_schk *s;
+ struct schk_new_arg a; /* argument for schk_new */
+ int i;
+ struct dn_link p; /* copy of oldlink */
+ struct dn_profile *pf = NULL; /* copy of old link profile */
+	/* Used to preserve the mask parameter */
+ struct ipfw_flow_id new_mask;
+ int new_buckets = 0;
+ int new_flags = 0;
+ int pipe_cmd;
+ int err = ENOMEM;
+
+ a.sch = _nsch;
+ if (a.sch->oid.len != sizeof(*a.sch)) {
+ D("bad sched len %d", a.sch->oid.len);
+ return EINVAL;
+ }
+ i = a.sch->sched_nr;
+ if (i <= 0 || i >= DN_MAX_ID)
+ return EINVAL;
+ /* make sure we have some buckets */
+ if (a.sch->flags & DN_HAVE_MASK)
+ ipdn_bound_var((int *)&a.sch->buckets, dn_cfg.hash_size,
+ 1, dn_cfg.max_hash_size, "sched buckets");
+ /* XXX other sanity checks */
+ bzero(&p, sizeof(p));
+
+ pipe_cmd = a.sch->flags & DN_PIPE_CMD;
+	a.sch->flags &= ~DN_PIPE_CMD; //XXX do it even if it is not set?
+ if (pipe_cmd) {
+ /* Copy mask parameter */
+ new_mask = a.sch->sched_mask;
+ new_buckets = a.sch->buckets;
+ new_flags = a.sch->flags;
+ }
+ DN_BH_WLOCK();
+again: /* run twice, for wfq and fifo */
+ /*
+ * lookup the type. If not supplied, use the previous one
+ * or default to WF2Q+. Otherwise, return an error.
+ */
+ dn_cfg.id++;
+ a.fp = find_sched_type(a.sch->oid.subtype, a.sch->name);
+ if (a.fp != NULL) {
+ /* found. Lookup or create entry */
+ s = dn_ht_find(dn_cfg.schedhash, i, DNHT_INSERT, &a);
+ } else if (a.sch->oid.subtype == 0 && !a.sch->name[0]) {
+ /* No type. search existing s* or retry with WF2Q+ */
+ s = dn_ht_find(dn_cfg.schedhash, i, 0, &a);
+ if (s != NULL) {
+ a.fp = s->fp;
+ /* Scheduler exists, skip to FIFO scheduler
+ * if command was pipe config...
+ */
+ if (pipe_cmd)
+ goto next;
+ } else {
+ /* New scheduler, create a wf2q+ with no mask
+ * if command was pipe config...
+ */
+ if (pipe_cmd) {
+ /* clear mask parameter */
+ bzero(&a.sch->sched_mask, sizeof(new_mask));
+ a.sch->buckets = 0;
+ a.sch->flags &= ~DN_HAVE_MASK;
+ }
+ a.sch->oid.subtype = DN_SCHED_WF2QP;
+ goto again;
+ }
+ } else {
+ D("invalid scheduler type %d %s",
+ a.sch->oid.subtype, a.sch->name);
+ err = EINVAL;
+ goto error;
+ }
+ /* normalize name and subtype */
+ a.sch->oid.subtype = a.fp->type;
+ bzero(a.sch->name, sizeof(a.sch->name));
+ strlcpy(a.sch->name, a.fp->name, sizeof(a.sch->name));
+ if (s == NULL) {
+ D("cannot allocate scheduler %d", i);
+ goto error;
+ }
+ /* restore existing link if any */
+ if (p.link_nr) {
+ s->link = p;
+ if (!pf || pf->link_nr != p.link_nr) { /* no saved value */
+ s->profile = NULL; /* XXX maybe not needed */
+ } else {
+ s->profile = malloc(sizeof(struct dn_profile),
+ M_DUMMYNET, M_NOWAIT | M_ZERO);
+ if (s->profile == NULL) {
+ D("cannot allocate profile");
+ goto error; //XXX
+ }
+ bcopy(pf, s->profile, sizeof(*pf));
+ }
+ }
+ p.link_nr = 0;
+ if (s->fp == NULL) {
+ DX(2, "sched %d new type %s", i, a.fp->name);
+ } else if (s->fp != a.fp ||
+ bcmp(a.sch, &s->sch, sizeof(*a.sch)) ) {
+ /* already existing. */
+ DX(2, "sched %d type changed from %s to %s",
+ i, s->fp->name, a.fp->name);
+ DX(4, " type/sub %d/%d -> %d/%d",
+ s->sch.oid.type, s->sch.oid.subtype,
+ a.sch->oid.type, a.sch->oid.subtype);
+ if (s->link.link_nr == 0)
+ D("XXX WARNING link 0 for sched %d", i);
+ p = s->link; /* preserve link */
+ if (s->profile) {/* preserve profile */
+ if (!pf)
+ pf = malloc(sizeof(*pf),
+ M_DUMMYNET, M_NOWAIT | M_ZERO);
+ if (pf) /* XXX should issue a warning otherwise */
+ bcopy(s->profile, pf, sizeof(*pf));
+ }
+ /* remove from the hash */
+ dn_ht_find(dn_cfg.schedhash, i, DNHT_REMOVE, NULL);
+ /* Detach flowsets, preserve queues. */
+ // schk_delete_cb(s, NULL);
+ // XXX temporarily, kill queues
+ schk_delete_cb(s, (void *)DN_DESTROY);
+ goto again;
+ } else {
+ DX(4, "sched %d unchanged type %s", i, a.fp->name);
+ }
+ /* complete initialization */
+ s->sch = *a.sch;
+ s->fp = a.fp;
+ s->cfg = arg;
+ // XXX schk_reset_credit(s);
+ /* create the internal flowset if needed,
+ * trying to reuse existing ones if available
+ */
+ if (!(s->fp->flags & DN_MULTIQUEUE) && !s->fs) {
+ s->fs = dn_ht_find(dn_cfg.fshash, i, 0, NULL);
+ if (!s->fs) {
+ struct dn_fs fs;
+ bzero(&fs, sizeof(fs));
+ set_oid(&fs.oid, DN_FS, sizeof(fs));
+ fs.fs_nr = i + DN_MAX_ID;
+ fs.sched_nr = i;
+ s->fs = config_fs(&fs, NULL, 1 /* locked */);
+ }
+ if (!s->fs) {
+ schk_delete_cb(s, (void *)DN_DESTROY);
+ D("error creating internal fs for %d", i);
+ goto error;
+ }
+ }
+ /* call init function after the flowset is created */
+ if (s->fp->config)
+ s->fp->config(s);
+ update_fs(s);
+next:
+ if (i < DN_MAX_ID) { /* now configure the FIFO instance */
+ i += DN_MAX_ID;
+ if (pipe_cmd) {
+ /* Restore mask parameter for FIFO */
+ a.sch->sched_mask = new_mask;
+ a.sch->buckets = new_buckets;
+ a.sch->flags = new_flags;
+ } else {
+ /* sched config shouldn't modify the FIFO scheduler */
+ if (dn_ht_find(dn_cfg.schedhash, i, 0, &a) != NULL) {
+ /* FIFO already exist, don't touch it */
+ err = 0; /* and this is not an error */
+ goto error;
+ }
+ }
+ a.sch->sched_nr = i;
+ a.sch->oid.subtype = DN_SCHED_FIFO;
+ bzero(a.sch->name, sizeof(a.sch->name));
+ goto again;
+ }
+ err = 0;
+error:
+ DN_BH_WUNLOCK();
+ if (pf)
+ free(pf, M_DUMMYNET);
+ return err;
+}
+
+/*
+ * attach a profile to a link
+ */
+static int
+config_profile(struct dn_profile *pf, struct dn_id *arg)
+{
+ struct dn_schk *s;
+ int i, olen, err = 0;
+
+ if (pf->oid.len < sizeof(*pf)) {
+ D("short profile len %d", pf->oid.len);
+ return EINVAL;
+ }
+ i = pf->link_nr;
+ if (i <= 0 || i >= DN_MAX_ID)
+ return EINVAL;
+ /* XXX other sanity checks */
+ DN_BH_WLOCK();
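+ /* apply the profile to both the base scheduler i and its
+ * FIFO companion at i + DN_MAX_ID
+ */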
+ for (; i < 2*DN_MAX_ID; i += DN_MAX_ID) {
+ s = locate_scheduler(i);
+
+ if (s == NULL) {
+ err = EINVAL;
+ break;
+ }
+ dn_cfg.id++;
+ /*
+ * If we had a profile and the new one does not fit,
+ * or it is deleted, then we need to free memory.
+ */
+ if (s->profile && (pf->samples_no == 0 ||
+ s->profile->oid.len < pf->oid.len)) {
+ free(s->profile, M_DUMMYNET);
+ s->profile = NULL;
+ }
+ if (pf->samples_no == 0)
+ continue;
+ /*
+ * new profile, possibly allocate memory
+ * and copy data.
+ */
+ if (s->profile == NULL)
+ s->profile = malloc(pf->oid.len,
+ M_DUMMYNET, M_NOWAIT | M_ZERO);
+ if (s->profile == NULL) {
+ D("no memory for profile %d", i);
+ err = ENOMEM;
+ break;
+ }
+ /* preserve larger length XXX double check */
+ olen = s->profile->oid.len;
+ if (olen < pf->oid.len)
+ olen = pf->oid.len;
+ bcopy(pf, s->profile, pf->oid.len);
+ s->profile->oid.len = olen;
+ }
+ DN_BH_WUNLOCK();
+ return err;
+}
+
+/*
+ * Delete all objects:
+ */
+static void
+dummynet_flush(void)
+{
+
+ /* delete all schedulers and related links/queues/flowsets */
+ dn_ht_scan(dn_cfg.schedhash, schk_delete_cb,
+ (void *)(uintptr_t)DN_DELETE_FS);
+ /* delete all remaining (unlinked) flowsets */
+ DX(4, "still %d unlinked fs", dn_cfg.fsk_count);
+ dn_ht_free(dn_cfg.fshash, DNHT_REMOVE);
+ fsk_detach_list(&dn_cfg.fsu, DN_DELETE_FS);
+ /* Reinitialize system heap... */
+ heap_init(&dn_cfg.evheap, 16, offsetof(struct dn_id, id));
+}
+
+/*
+ * Main handler for configuration. We are guaranteed to be called
+ * with an oid which is at least a dn_id.
+ * - the first object is the command (config, delete, flush, ...)
+ * - config_link must be issued after the corresponding config_sched
+ * - parameters (DN_TEXT) for an object must precede the object
+ *   processed on a config_sched.
+ */
+int
+do_config(void *p, int l)
+{
+ struct dn_id *next, *o;
+ int err = 0, err2 = 0;
+ struct dn_id *arg = NULL;
+ uintptr_t *a;
+
+ o = p;
+ if (o->id != DN_API_VERSION) {
+ D("invalid api version got %d need %d",
+ o->id, DN_API_VERSION);
+ return EINVAL;
+ }
+ for (; l >= sizeof(*o); o = next) {
+ struct dn_id *prev = arg;
+ if (o->len < sizeof(*o) || l < o->len) {
+ D("bad len o->len %d len %d", o->len, l);
+ err = EINVAL;
+ break;
+ }
+ l -= o->len;
+ next = (struct dn_id *)((char *)o + o->len);
+ err = 0;
+ switch (o->type) {
+ default:
+ D("cmd %d not implemented", o->type);
+ break;
+
+#ifdef EMULATE_SYSCTL
+ /* sysctl emulation.
+ * if we recognize the command, jump to the correct
+ * handler and return
+ */
+ case DN_SYSCTL_SET:
+ err = kesysctl_emu_set(p, l);
+ return err;
+#endif
+
+ case DN_CMD_CONFIG: /* simply a header */
+ break;
+
+ case DN_CMD_DELETE:
+ /* the argument is in the first uintptr_t after o */
+ a = (uintptr_t *)(o+1);
+ if (o->len < sizeof(*o) + sizeof(*a)) {
+ err = EINVAL;
+ break;
+ }
+ switch (o->subtype) {
+ case DN_LINK:
+ /* delete base and derived schedulers */
+ DN_BH_WLOCK();
+ err = delete_schk(*a);
+ err2 = delete_schk(*a + DN_MAX_ID);
+ DN_BH_WUNLOCK();
+ if (!err)
+ err = err2;
+ break;
+
+ default:
+ D("invalid delete type %d",
+ o->subtype);
+ err = EINVAL;
+ break;
+
+ case DN_FS:
+ err = (*a <1 || *a >= DN_MAX_ID) ?
+ EINVAL : delete_fs(*a, 0) ;
+ break;
+ }
+ break;
+
+ case DN_CMD_FLUSH:
+ DN_BH_WLOCK();
+ dummynet_flush();
+ DN_BH_WUNLOCK();
+ break;
+ case DN_TEXT: /* store the argument for the next block */
+ prev = NULL;
+ arg = o;
+ break;
+ case DN_LINK:
+ err = config_link((struct dn_link *)o, arg);
+ break;
+ case DN_PROFILE:
+ err = config_profile((struct dn_profile *)o, arg);
+ break;
+ case DN_SCH:
+ err = config_sched((struct dn_sch *)o, arg);
+ break;
+ case DN_FS:
+ err = (NULL==config_fs((struct dn_fs *)o, arg, 0));
+ break;
+ }
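+ /* a DN_TEXT argument only applies to the object that
+ * immediately follows it; once that object has been
+ * processed (prev != NULL), drop the pending argument
+ */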
+ if (prev)
+ arg = NULL;
+ if (err != 0)
+ break;
+ }
+ return err;
+}
+
+static int
+compute_space(struct dn_id *cmd, struct copy_args *a)
+{
+ int x = 0, need = 0;
+ int profile_size = sizeof(struct dn_profile) -
+ ED_MAX_SAMPLES_NO*sizeof(int);
+
+ /* NOTE about compute space:
+ * NP = dn_cfg.schk_count
+ * NSI = dn_cfg.si_count
+ * NF = dn_cfg.fsk_count
+ * NQ = dn_cfg.queue_count
+ * - ipfw pipe show
+ * (NP/2)*(dn_link + dn_sch + dn_id + dn_fs) only half scheduler
+ * link, scheduler template, flowset
+ * integrated in scheduler and header
+ * for flowset list
+ * (NSI)*(dn_flow) all scheduler instance (includes
+ * the queue instance)
+ * - ipfw sched show
+ * (NP/2)*(dn_link + dn_sch + dn_id + dn_fs) only half scheduler
+ * link, scheduler template, flowset
+ * integrated in scheduler and header
+ * for flowset list
+ * (NSI * dn_flow) all scheduler instances
+ * (NF * sizeof(uint_32)) space for flowset list linked to scheduler
+ * (NQ * dn_queue) all queues [XXX for now not listed]
+ * - ipfw queue show
+ * (NF * dn_fs) all flowset
+ * (NQ * dn_queue) all queues
+ */
+ switch (cmd->subtype) {
+ default:
+ return -1;
+ /* XXX where do LINK and SCH differ ? */
+ * 'ipfw sched show' could list all queues associated with
+ * a scheduler. This feature is disabled for now.
+ */
+ case DN_LINK: /* pipe show */
+ x = DN_C_LINK | DN_C_SCH | DN_C_FLOW;
+ need += dn_cfg.schk_count *
+ (sizeof(struct dn_fs) + profile_size) / 2;
+ need += dn_cfg.fsk_count * sizeof(uint32_t);
+ break;
+ case DN_SCH: /* sched show */
+ need += dn_cfg.schk_count *
+ (sizeof(struct dn_fs) + profile_size) / 2;
+ need += dn_cfg.fsk_count * sizeof(uint32_t);
+ x = DN_C_SCH | DN_C_LINK | DN_C_FLOW;
+ break;
+ case DN_FS: /* queue show */
+ x = DN_C_FS | DN_C_QUEUE;
+ break;
+ case DN_GET_COMPAT: /* compatibility mode */
+ need = dn_compat_calc_size();
+ break;
+ }
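+ /* x collects the DN_C_* classes of objects to export,
+ * 'need' accumulates the required buffer size in bytes
+ */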
+ a->flags = x;
+ if (x & DN_C_SCH) {
+ need += dn_cfg.schk_count * sizeof(struct dn_sch) / 2;
+ /* NOTE also, each fs might be attached to a sched */
+ need += dn_cfg.schk_count * sizeof(struct dn_id) / 2;
+ }
+ if (x & DN_C_FS)
+ need += dn_cfg.fsk_count * sizeof(struct dn_fs);
+ if (x & DN_C_LINK) {
+ need += dn_cfg.schk_count * sizeof(struct dn_link) / 2;
+ }
+ /*
+ * When exporting a queue to userland, only pass up the
+ * struct dn_flow, which is the only visible part.
+ */
+
+ if (x & DN_C_QUEUE)
+ need += dn_cfg.queue_count * sizeof(struct dn_flow);
+ if (x & DN_C_FLOW)
+ need += dn_cfg.si_count * (sizeof(struct dn_flow));
+ return need;
+}
+
+/*
+ * If compat != NULL dummynet_get is called in compatibility mode.
+ * *compat will be the pointer to the buffer to pass to ipfw
+ */
+int
+dummynet_get(struct sockopt *sopt, void **compat)
+{
+ int have, i, need, error;
+ char *start = NULL, *buf;
+ size_t sopt_valsize;
+ struct dn_id *cmd;
+ struct copy_args a;
+ struct copy_range r;
+ int l = sizeof(struct dn_id);
+
+ bzero(&a, sizeof(a));
+ bzero(&r, sizeof(r));
+
+ /* save and restore original sopt_valsize around copyin */
+ sopt_valsize = sopt->sopt_valsize;
+
+ cmd = &r.o;
+
+ if (!compat) {
+ /* copy at least an oid, and possibly a full object */
+ error = sooptcopyin(sopt, cmd, sizeof(r), sizeof(*cmd));
+ sopt->sopt_valsize = sopt_valsize;
+ if (error)
+ goto done;
+ l = cmd->len;
+#ifdef EMULATE_SYSCTL
+ /* sysctl emulation. */
+ if (cmd->type == DN_SYSCTL_GET)
+ return kesysctl_emu_get(sopt);
+#endif
+ if (l > sizeof(r)) {
+ /* request larger than default, allocate buffer */
+ cmd = malloc(l, M_DUMMYNET, M_WAITOK);
+ error = sooptcopyin(sopt, cmd, l, l);
+ sopt->sopt_valsize = sopt_valsize;
+ if (error)
+ goto done;
+ }
+ } else { /* compatibility */
+ error = 0;
+ cmd->type = DN_CMD_GET;
+ cmd->len = sizeof(struct dn_id);
+ cmd->subtype = DN_GET_COMPAT;
+ // cmd->id = sopt_valsize;
+ D("compatibility mode");
+ }
+ a.extra = (struct copy_range *)cmd;
+ if (cmd->len == sizeof(*cmd)) { /* no range, create a default */
+ uint32_t *rp = (uint32_t *)(cmd + 1);
+ cmd->len += 2* sizeof(uint32_t);
+ rp[0] = 1;
+ rp[1] = DN_MAX_ID - 1;
+ if (cmd->subtype == DN_LINK) {
+ rp[0] += DN_MAX_ID;
+ rp[1] += DN_MAX_ID;
+ }
+ }
+ /* Count space (under lock) and allocate (outside lock).
+ * Exit with lock held if we manage to get enough buffer.
+ * Try a few times then give up.
+ */
+ for (have = 0, i = 0; i < 10; i++) {
+ DN_BH_WLOCK();
+ need = compute_space(cmd, &a);
+
+ /* if there is a range, ignore value from compute_space() */
+ if (l > sizeof(*cmd))
+ need = sopt_valsize - sizeof(*cmd);
+
+ if (need < 0) {
+ DN_BH_WUNLOCK();
+ error = EINVAL;
+ goto done;
+ }
+ need += sizeof(*cmd);
+ cmd->id = need;
+ if (have >= need)
+ break;
+
+ DN_BH_WUNLOCK();
+ if (start)
+ free(start, M_DUMMYNET);
+ start = NULL;
+ if (need > sopt_valsize)
+ break;
+
+ have = need;
+ start = malloc(have, M_DUMMYNET, M_WAITOK | M_ZERO);
+ }
+
+ if (start == NULL) {
+ if (compat) {
+ *compat = NULL;
+ error = 1; // XXX
+ } else {
+ error = sooptcopyout(sopt, cmd, sizeof(*cmd));
+ }
+ goto done;
+ }
+ ND("have %d:%d sched %d, %d:%d links %d, %d:%d flowsets %d, "
+ "%d:%d si %d, %d:%d queues %d",
+ dn_cfg.schk_count, sizeof(struct dn_sch), DN_SCH,
+ dn_cfg.schk_count, sizeof(struct dn_link), DN_LINK,
+ dn_cfg.fsk_count, sizeof(struct dn_fs), DN_FS,
+ dn_cfg.si_count, sizeof(struct dn_flow), DN_SCH_I,
+ dn_cfg.queue_count, sizeof(struct dn_queue), DN_QUEUE);
+ sopt->sopt_valsize = sopt_valsize;
+ a.type = cmd->subtype;
+
+ if (compat == NULL) {
+ bcopy(cmd, start, sizeof(*cmd));
+ ((struct dn_id*)(start))->len = sizeof(struct dn_id);
+ buf = start + sizeof(*cmd);
+ } else
+ buf = start;
+ a.start = &buf;
+ a.end = start + have;
+ /* start copying other objects */
+ if (compat) {
+ a.type = DN_COMPAT_PIPE;
+ dn_ht_scan(dn_cfg.schedhash, copy_data_helper_compat, &a);
+ a.type = DN_COMPAT_QUEUE;
+ dn_ht_scan(dn_cfg.fshash, copy_data_helper_compat, &a);
+ } else if (a.type == DN_FS) {
+ dn_ht_scan(dn_cfg.fshash, copy_data_helper, &a);
+ } else {
+ dn_ht_scan(dn_cfg.schedhash, copy_data_helper, &a);
+ }
+ DN_BH_WUNLOCK();
+
+ if (compat) {
+ *compat = start;
+ sopt->sopt_valsize = buf - start;
+ /* free() is done by ip_dummynet_compat() */
+ start = NULL; //XXX hack
+ } else {
+ error = sooptcopyout(sopt, start, buf - start);
+ }
+done:
+ if (cmd && cmd != &r.o)
+ free(cmd, M_DUMMYNET);
+ if (start)
+ free(start, M_DUMMYNET);
+ return error;
+}
+
+/* Callback called on scheduler instance to delete it if idle */
+static int
+drain_scheduler_cb(void *_si, void *arg)
+{
+ struct dn_sch_inst *si = _si;
+
+ if ((si->kflags & DN_ACTIVE) || si->dline.mq.head != NULL)
+ return 0;
+
+ if (si->sched->fp->flags & DN_MULTIQUEUE) {
+ if (si->q_count == 0)
+ return si_destroy(si, NULL);
+ else
+ return 0;
+ } else { /* !DN_MULTIQUEUE */
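+ /* for non-multiqueue schedulers the single queue is
+ * allocated right after the scheduler instance, hence
+ * the (si+1) below
+ */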
+ if ((si+1)->ni.length == 0)
+ return si_destroy(si, NULL);
+ else
+ return 0;
+ }
+ return 0; /* unreachable */
+}
+
+/* Callback called on scheduler to check if it has instances */
+static int
+drain_scheduler_sch_cb(void *_s, void *arg)
+{
+ struct dn_schk *s = _s;
+
+ if (s->sch.flags & DN_HAVE_MASK) {
+ dn_ht_scan_bucket(s->siht, &s->drain_bucket,
+ drain_scheduler_cb, NULL);
+ s->drain_bucket++;
+ } else {
+ if (s->siht) {
+ if (drain_scheduler_cb(s->siht, NULL) == DNHT_SCAN_DEL)
+ s->siht = NULL;
+ }
+ }
+ return 0;
+}
+
+/* Called every tick, try to delete a 'bucket' of schedulers */
+void
+dn_drain_scheduler(void)
+{
+ dn_ht_scan_bucket(dn_cfg.schedhash, &dn_cfg.drain_sch,
+ drain_scheduler_sch_cb, NULL);
+ dn_cfg.drain_sch++;
+}
+
+/* Callback called on queue to delete if it is idle */
+static int
+drain_queue_cb(void *_q, void *arg)
+{
+ struct dn_queue *q = _q;
+
+ if (q->ni.length == 0) {
+ dn_delete_queue(q, DN_DESTROY);
+ return DNHT_SCAN_DEL; /* queue is deleted */
+ }
+
+ return 0; /* queue isn't deleted */
+}
+
+/* Callback called on flowset used to check if it has queues */
+static int
+drain_queue_fs_cb(void *_fs, void *arg)
+{
+ struct dn_fsk *fs = _fs;
+
+ if (fs->fs.flags & DN_QHT_HASH) {
+ /* Flowset has a hash table for queues */
+ dn_ht_scan_bucket(fs->qht, &fs->drain_bucket,
+ drain_queue_cb, NULL);
+ fs->drain_bucket++;
+ } else {
+ /* No hash table for this flowset, null the pointer
+ * if the queue is deleted
+ */
+ if (fs->qht) {
+ if (drain_queue_cb(fs->qht, NULL) == DNHT_SCAN_DEL)
+ fs->qht = NULL;
+ }
+ }
+ return 0;
+}
+
+/* Called every tick, try to delete a 'bucket' of queues */
+void
+dn_drain_queue(void)
+{
+ /* scan a bucket of flowsets */
+ dn_ht_scan_bucket(dn_cfg.fshash, &dn_cfg.drain_fs,
+ drain_queue_fs_cb, NULL);
+ dn_cfg.drain_fs++;
+}
+
+/*
+ * Handler for the various dummynet socket options
+ */
+static int
+ip_dn_ctl(struct sockopt *sopt)
+{
+ void *p = NULL;
+ int error, l;
+
+ error = priv_check(sopt->sopt_td, PRIV_NETINET_DUMMYNET);
+ if (error)
+ return (error);
+
+ /* Disallow sets in really-really secure mode. */
+ if (sopt->sopt_dir == SOPT_SET) {
+ error = securelevel_ge(sopt->sopt_td->td_ucred, 3);
+ if (error)
+ return (error);
+ }
+
+ switch (sopt->sopt_name) {
+ default :
+ D("dummynet: unknown option %d", sopt->sopt_name);
+ error = EINVAL;
+ break;
+
+ case IP_DUMMYNET_FLUSH:
+ case IP_DUMMYNET_CONFIGURE:
+ case IP_DUMMYNET_DEL: /* remove a pipe or queue */
+ case IP_DUMMYNET_GET:
+ D("dummynet: compat option %d", sopt->sopt_name);
+ error = ip_dummynet_compat(sopt);
+ break;
+
+ case IP_DUMMYNET3 :
+ if (sopt->sopt_dir == SOPT_GET) {
+ error = dummynet_get(sopt, NULL);
+ break;
+ }
+ l = sopt->sopt_valsize;
+ if (l < sizeof(struct dn_id) || l > 12000) {
+ D("argument len %d invalid", l);
+ break;
+ }
+ p = malloc(l, M_TEMP, M_WAITOK); // XXX can it fail ?
+ error = sooptcopyin(sopt, p, l, l);
+ if (error)
+ break ;
+ error = do_config(p, l);
+ break;
+ }
+
+ if (p != NULL)
+ free(p, M_TEMP);
+
+ return error ;
+}
+
+
+static void
+ip_dn_init(void)
+{
+ if (dn_cfg.init_done)
+ return;
+ printf("DUMMYNET %p with IPv6 initialized (100409)\n", curvnet);
+ dn_cfg.init_done = 1;
+ /* Set defaults here. MSVC does not accept initializers,
+ * and this is also useful for vimages
+ */
+ /* queue limits */
+ dn_cfg.slot_limit = 100; /* Foot shooting limit for queues. */
+ dn_cfg.byte_limit = 1024 * 1024;
+ dn_cfg.expire = 1;
+
+ /* RED parameters */
+ dn_cfg.red_lookup_depth = 256; /* default lookup table depth */
+ dn_cfg.red_avg_pkt_size = 512; /* default medium packet size */
+ dn_cfg.red_max_pkt_size = 1500; /* default max packet size */
+
+ /* hash tables */
+ dn_cfg.max_hash_size = 65536; /* max in the hash tables */
+ dn_cfg.hash_size = 64; /* default hash size */
+
+ /* create hash tables for schedulers and flowsets.
+ * In both we search by key and by pointer.
+ */
+ dn_cfg.schedhash = dn_ht_init(NULL, dn_cfg.hash_size,
+ offsetof(struct dn_schk, schk_next),
+ schk_hash, schk_match, schk_new);
+ dn_cfg.fshash = dn_ht_init(NULL, dn_cfg.hash_size,
+ offsetof(struct dn_fsk, fsk_next),
+ fsk_hash, fsk_match, fsk_new);
+
+ /* bucket index to drain object */
+ dn_cfg.drain_fs = 0;
+ dn_cfg.drain_sch = 0;
+
+ heap_init(&dn_cfg.evheap, 16, offsetof(struct dn_id, id));
+ SLIST_INIT(&dn_cfg.fsu);
+ SLIST_INIT(&dn_cfg.schedlist);
+
+ DN_LOCK_INIT();
+
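+ /* the periodic callout below hands the actual work to
+ * dummynet_task() through the taskqueue
+ */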
+ TASK_INIT(&dn_task, 0, dummynet_task, curvnet);
+ dn_tq = taskqueue_create("dummynet", M_WAITOK,
+ taskqueue_thread_enqueue, &dn_tq);
+ taskqueue_start_threads(&dn_tq, 1, PI_NET, "dummynet");
+
+ callout_init(&dn_timeout, CALLOUT_MPSAFE);
+ callout_reset(&dn_timeout, 1, dummynet, NULL);
+
+ /* Initialize curr_time adjustment mechanics. */
+ getmicrouptime(&dn_cfg.prev_t);
+}
+
+#ifdef KLD_MODULE
+static void
+ip_dn_destroy(int last)
+{
+ callout_drain(&dn_timeout);
+
+ DN_BH_WLOCK();
+ if (last) {
+ ND("removing last instance\n");
+ ip_dn_ctl_ptr = NULL;
+ ip_dn_io_ptr = NULL;
+ }
+
+ dummynet_flush();
+ DN_BH_WUNLOCK();
+ taskqueue_drain(dn_tq, &dn_task);
+ taskqueue_free(dn_tq);
+
+ dn_ht_free(dn_cfg.schedhash, 0);
+ dn_ht_free(dn_cfg.fshash, 0);
+ heap_free(&dn_cfg.evheap);
+
+ DN_LOCK_DESTROY();
+}
+#endif /* KLD_MODULE */
+
+static int
+dummynet_modevent(module_t mod, int type, void *data)
+{
+
+ if (type == MOD_LOAD) {
+ if (ip_dn_io_ptr) {
+ printf("DUMMYNET already loaded\n");
+ return EEXIST ;
+ }
+ ip_dn_init();
+ ip_dn_ctl_ptr = ip_dn_ctl;
+ ip_dn_io_ptr = dummynet_io;
+ return 0;
+ } else if (type == MOD_UNLOAD) {
+#if !defined(KLD_MODULE)
+ printf("dummynet statically compiled, cannot unload\n");
+ return EINVAL ;
+#else
+ ip_dn_destroy(1 /* last */);
+ return 0;
+#endif
+ } else
+ return EOPNOTSUPP;
+}
+
+/* modevent helpers for the modules */
+static int
+load_dn_sched(struct dn_alg *d)
+{
+ struct dn_alg *s;
+
+ if (d == NULL)
+ return 1; /* error */
+ ip_dn_init(); /* just in case, we need the lock */
+
+ /* Check that mandatory funcs exist */
+ if (d->enqueue == NULL || d->dequeue == NULL) {
+ D("missing enqueue or dequeue for %s", d->name);
+ return 1;
+ }
+
+ /* Search if scheduler already exists */
+ DN_BH_WLOCK();
+ SLIST_FOREACH(s, &dn_cfg.schedlist, next) {
+ if (strcmp(s->name, d->name) == 0) {
+ D("%s already loaded", d->name);
+ break; /* scheduler already exists */
+ }
+ }
+ if (s == NULL)
+ SLIST_INSERT_HEAD(&dn_cfg.schedlist, d, next);
+ DN_BH_WUNLOCK();
+ D("dn_sched %s %sloaded", d->name, s ? "not ":"");
+ return s ? 1 : 0;
+}
+
+static int
+unload_dn_sched(struct dn_alg *s)
+{
+ struct dn_alg *tmp, *r;
+ int err = EINVAL;
+
+ ND("called for %s", s->name);
+
+ DN_BH_WLOCK();
+ SLIST_FOREACH_SAFE(r, &dn_cfg.schedlist, next, tmp) {
+ if (strcmp(s->name, r->name) != 0)
+ continue;
+ ND("ref_count = %d", r->ref_count);
+ err = (r->ref_count != 0) ? EBUSY : 0;
+ if (err == 0)
+ SLIST_REMOVE(&dn_cfg.schedlist, r, dn_alg, next);
+ break;
+ }
+ DN_BH_WUNLOCK();
+ D("dn_sched %s %sunloaded", s->name, err ? "not ":"");
+ return err;
+}
+
+int
+dn_sched_modevent(module_t mod, int cmd, void *arg)
+{
+ struct dn_alg *sch = arg;
+
+ if (cmd == MOD_LOAD)
+ return load_dn_sched(sch);
+ else if (cmd == MOD_UNLOAD)
+ return unload_dn_sched(sch);
+ else
+ return EINVAL;
+}
+
+static moduledata_t dummynet_mod = {
+ "dummynet", dummynet_modevent, NULL
+};
+
+#define DN_SI_SUB SI_SUB_PROTO_IFATTACHDOMAIN
+#define DN_MODEV_ORD (SI_ORDER_ANY - 128) /* after ipfw */
+DECLARE_MODULE(dummynet, dummynet_mod, DN_SI_SUB, DN_MODEV_ORD);
+MODULE_DEPEND(dummynet, ipfw, 2, 2, 2);
+MODULE_VERSION(dummynet, 3);
+
+/*
+ * Starting up. Done in order after dummynet_modevent() has been called.
+ * VNET_SYSINIT is also called for each existing vnet and each new vnet.
+ */
+//VNET_SYSINIT(vnet_dn_init, DN_SI_SUB, DN_MODEV_ORD+2, ip_dn_init, NULL);
+
+/*
+ * Shutting down: handlers are run in REVERSE ORDER, but still
+ * after dummynet_modevent() has been called. Not called on reboot.
+ * VNET_SYSUNINIT is also called for each exiting vnet as it exits
+ * or when the module is unloaded.
+ */
+//VNET_SYSUNINIT(vnet_dn_uninit, DN_SI_SUB, DN_MODEV_ORD+2, ip_dn_destroy, NULL);
+
+/* end of file */
diff --git a/sys/netpfil/ipfw/ip_fw2.c b/sys/netpfil/ipfw/ip_fw2.c
new file mode 100644
index 0000000..e43a0ef
--- /dev/null
+++ b/sys/netpfil/ipfw/ip_fw2.c
@@ -0,0 +1,2791 @@
+/*-
+ * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ * The FreeBSD IP packet firewall, main file
+ */
+
+#include "opt_ipfw.h"
+#include "opt_ipdivert.h"
+#include "opt_inet.h"
+#ifndef INET
+#error "IPFIREWALL requires INET"
+#endif /* INET */
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/condvar.h>
+#include <sys/eventhandler.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/jail.h>
+#include <sys/module.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/ucred.h>
+#include <net/ethernet.h> /* for ETHERTYPE_IP */
+#include <net/if.h>
+#include <net/route.h>
+#include <net/pf_mtag.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/in_pcb.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip_icmp.h>
+#include <netinet/ip_fw.h>
+#include <netinet/ip_carp.h>
+#include <netinet/pim.h>
+#include <netinet/tcp_var.h>
+#include <netinet/udp.h>
+#include <netinet/udp_var.h>
+#include <netinet/sctp.h>
+
+#include <netinet/ip6.h>
+#include <netinet/icmp6.h>
+#ifdef INET6
+#include <netinet6/in6_pcb.h>
+#include <netinet6/scope6_var.h>
+#include <netinet6/ip6_var.h>
+#endif
+
+#include <netpfil/ipfw/ip_fw_private.h>
+
+#include <machine/in_cksum.h> /* XXX for in_cksum */
+
+#ifdef MAC
+#include <security/mac/mac_framework.h>
+#endif
+
+/*
+ * static variables followed by global ones.
+ * All ipfw global variables are here.
+ */
+
+/* ipfw_vnet_ready controls when we are open for business */
+static VNET_DEFINE(int, ipfw_vnet_ready) = 0;
+#define V_ipfw_vnet_ready VNET(ipfw_vnet_ready)
+
+static VNET_DEFINE(int, fw_deny_unknown_exthdrs);
+#define V_fw_deny_unknown_exthdrs VNET(fw_deny_unknown_exthdrs)
+
+static VNET_DEFINE(int, fw_permit_single_frag6) = 1;
+#define V_fw_permit_single_frag6 VNET(fw_permit_single_frag6)
+
+#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT
+static int default_to_accept = 1;
+#else
+static int default_to_accept;
+#endif
+
+VNET_DEFINE(int, autoinc_step);
+VNET_DEFINE(int, fw_one_pass) = 1;
+
+VNET_DEFINE(unsigned int, fw_tables_max);
+/* Use 128 tables by default */
+static unsigned int default_fw_tables = IPFW_TABLES_DEFAULT;
+
+/*
+ * Each rule belongs to one of 32 different sets (0..31).
+ * The variable set_disable contains one bit per set.
+ * If the bit is set, all rules in the corresponding set
+ * are disabled. Set RESVD_SET(31) is reserved for the default rule
+ * and rules that are not deleted by the flush command,
+ * and CANNOT be disabled.
+ * Rules in set RESVD_SET can only be deleted individually.
+ */
+VNET_DEFINE(u_int32_t, set_disable);
+#define V_set_disable VNET(set_disable)
+
+VNET_DEFINE(int, fw_verbose);
+/* counter for ipfw_log(NULL...) */
+VNET_DEFINE(u_int64_t, norule_counter);
+VNET_DEFINE(int, verbose_limit);
+
+/* layer3_chain contains the list of rules for layer 3 */
+VNET_DEFINE(struct ip_fw_chain, layer3_chain);
+
+ipfw_nat_t *ipfw_nat_ptr = NULL;
+struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int);
+ipfw_nat_cfg_t *ipfw_nat_cfg_ptr;
+ipfw_nat_cfg_t *ipfw_nat_del_ptr;
+ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr;
+ipfw_nat_cfg_t *ipfw_nat_get_log_ptr;
+
+#ifdef SYSCTL_NODE
+uint32_t dummy_def = IPFW_DEFAULT_RULE;
+static int sysctl_ipfw_table_num(SYSCTL_HANDLER_ARGS);
+
+SYSBEGIN(f3)
+
+SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, one_pass,
+ CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_one_pass), 0,
+ "Only do a single pass through ipfw when using dummynet(4)");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step,
+ CTLFLAG_RW, &VNET_NAME(autoinc_step), 0,
+ "Rule number auto-increment step");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose,
+ CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_verbose), 0,
+ "Log matches to ipfw rules");
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit,
+ CTLFLAG_RW, &VNET_NAME(verbose_limit), 0,
+ "Set upper limit of matches of ipfw rules logged");
+SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, default_rule, CTLFLAG_RD,
+ &dummy_def, 0,
+ "The default/max possible rule number.");
+SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, tables_max,
+ CTLTYPE_UINT|CTLFLAG_RW, 0, 0, sysctl_ipfw_table_num, "IU",
+ "Maximum number of tables");
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, default_to_accept, CTLFLAG_RDTUN,
+ &default_to_accept, 0,
+ "Make the default rule accept all packets.");
+TUNABLE_INT("net.inet.ip.fw.default_to_accept", &default_to_accept);
+TUNABLE_INT("net.inet.ip.fw.tables_max", (int *)&default_fw_tables);
+SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, static_count,
+ CTLFLAG_RD, &VNET_NAME(layer3_chain.n_rules), 0,
+ "Number of static rules");
+
+#ifdef INET6
+SYSCTL_DECL(_net_inet6_ip6);
+SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
+SYSCTL_VNET_INT(_net_inet6_ip6_fw, OID_AUTO, deny_unknown_exthdrs,
+ CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(fw_deny_unknown_exthdrs), 0,
+ "Deny packets with unknown IPv6 Extension Headers");
+SYSCTL_VNET_INT(_net_inet6_ip6_fw, OID_AUTO, permit_single_frag6,
+ CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(fw_permit_single_frag6), 0,
+ "Permit single packet IPv6 fragments");
+#endif /* INET6 */
+
+SYSEND
+
+#endif /* SYSCTL_NODE */
+
+
+/*
+ * Some macros used in the various matching options.
+ * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T
+ * Other macros just cast void * into the appropriate type
+ */
+#define L3HDR(T, ip) ((T *)((u_int32_t *)(ip) + (ip)->ip_hl))
+#define TCP(p) ((struct tcphdr *)(p))
+#define SCTP(p) ((struct sctphdr *)(p))
+#define UDP(p) ((struct udphdr *)(p))
+#define ICMP(p) ((struct icmphdr *)(p))
+#define ICMP6(p) ((struct icmp6_hdr *)(p))
+
+static __inline int
+icmptype_match(struct icmphdr *icmp, ipfw_insn_u32 *cmd)
+{
+ int type = icmp->icmp_type;
+
+ return (type <= ICMP_MAXTYPE && (cmd->d[0] & (1<<type)) );
+}
+
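+/* bitmask of the ICMP types considered queries */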
+#define TT ( (1 << ICMP_ECHO) | (1 << ICMP_ROUTERSOLICIT) | \
+ (1 << ICMP_TSTAMP) | (1 << ICMP_IREQ) | (1 << ICMP_MASKREQ) )
+
+static int
+is_icmp_query(struct icmphdr *icmp)
+{
+ int type = icmp->icmp_type;
+
+ return (type <= ICMP_MAXTYPE && (TT & (1<<type)) );
+}
+#undef TT
+
+/*
+ * The following checks use two arrays of 8 or 16 bits to store the
+ * bits that we want set or clear, respectively. They are in the
+ * low and high half of cmd->arg1 or cmd->d[0].
+ *
+ * We scan options and store the bits we find set. We succeed if
+ *
+ * (want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear
+ *
+ * The code is sometimes optimized not to store additional variables.
+ */
+
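+/*
+ * Example (illustrative): with the low byte of arg1 = TH_SYN
+ * (bits that must be set) and the high byte = TH_RST (bits that
+ * must be clear), a packet carrying SYN|ACK matches, while one
+ * carrying SYN|RST does not.
+ */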
+static int
+flags_match(ipfw_insn *cmd, u_int8_t bits)
+{
+ u_char want_clear;
+ bits = ~bits;
+
+ if ( ((cmd->arg1 & 0xff) & bits) != 0)
+ return 0; /* some bits we want set were clear */
+ want_clear = (cmd->arg1 >> 8) & 0xff;
+ if ( (want_clear & bits) != want_clear)
+ return 0; /* some bits we want clear were set */
+ return 1;
+}
+
+static int
+ipopts_match(struct ip *ip, ipfw_insn *cmd)
+{
+ int optlen, bits = 0;
+ u_char *cp = (u_char *)(ip + 1);
+ int x = (ip->ip_hl << 2) - sizeof (struct ip);
+
+ for (; x > 0; x -= optlen, cp += optlen) {
+ int opt = cp[IPOPT_OPTVAL];
+
+ if (opt == IPOPT_EOL)
+ break;
+ if (opt == IPOPT_NOP)
+ optlen = 1;
+ else {
+ optlen = cp[IPOPT_OLEN];
+ if (optlen <= 0 || optlen > x)
+ return 0; /* invalid or truncated */
+ }
+ switch (opt) {
+
+ default:
+ break;
+
+ case IPOPT_LSRR:
+ bits |= IP_FW_IPOPT_LSRR;
+ break;
+
+ case IPOPT_SSRR:
+ bits |= IP_FW_IPOPT_SSRR;
+ break;
+
+ case IPOPT_RR:
+ bits |= IP_FW_IPOPT_RR;
+ break;
+
+ case IPOPT_TS:
+ bits |= IP_FW_IPOPT_TS;
+ break;
+ }
+ }
+ return (flags_match(cmd, bits));
+}
+
+static int
+tcpopts_match(struct tcphdr *tcp, ipfw_insn *cmd)
+{
+ int optlen, bits = 0;
+ u_char *cp = (u_char *)(tcp + 1);
+ int x = (tcp->th_off << 2) - sizeof(struct tcphdr);
+
+ for (; x > 0; x -= optlen, cp += optlen) {
+ int opt = cp[0];
+ if (opt == TCPOPT_EOL)
+ break;
+ if (opt == TCPOPT_NOP)
+ optlen = 1;
+ else {
+ optlen = cp[1];
+ if (optlen <= 0)
+ break;
+ }
+
+ switch (opt) {
+
+ default:
+ break;
+
+ case TCPOPT_MAXSEG:
+ bits |= IP_FW_TCPOPT_MSS;
+ break;
+
+ case TCPOPT_WINDOW:
+ bits |= IP_FW_TCPOPT_WINDOW;
+ break;
+
+ case TCPOPT_SACK_PERMITTED:
+ case TCPOPT_SACK:
+ bits |= IP_FW_TCPOPT_SACK;
+ break;
+
+ case TCPOPT_TIMESTAMP:
+ bits |= IP_FW_TCPOPT_TS;
+ break;
+
+ }
+ }
+ return (flags_match(cmd, bits));
+}
+
+static int
+iface_match(struct ifnet *ifp, ipfw_insn_if *cmd, struct ip_fw_chain *chain, uint32_t *tablearg)
+{
+ if (ifp == NULL) /* no iface with this packet, match fails */
+ return 0;
+ /* Check by name or by IP address */
+ if (cmd->name[0] != '\0') { /* match by name */
+ if (cmd->name[0] == '\1') /* use tablearg to match */
+ return ipfw_lookup_table_extended(chain, cmd->p.glob,
+ ifp->if_xname, tablearg, IPFW_TABLE_INTERFACE);
+ /* Check name */
+ if (cmd->p.glob) {
+ if (fnmatch(cmd->name, ifp->if_xname, 0) == 0)
+ return(1);
+ } else {
+ if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0)
+ return(1);
+ }
+ } else {
+#ifdef __FreeBSD__ /* and OSX too ? */
+ struct ifaddr *ia;
+
+ if_addr_rlock(ifp);
+ TAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) {
+ if (ia->ifa_addr->sa_family != AF_INET)
+ continue;
+ if (cmd->p.ip.s_addr == ((struct sockaddr_in *)
+ (ia->ifa_addr))->sin_addr.s_addr) {
+ if_addr_runlock(ifp);
+ return(1); /* match */
+ }
+ }
+ if_addr_runlock(ifp);
+#endif /* __FreeBSD__ */
+ }
+ return(0); /* no match, fail ... */
+}
+
+/*
+ * The verify_path function checks if a route to the src exists and
+ * if it is reachable via ifp (when provided).
+ *
+ * The 'verrevpath' option checks that the interface that an IP packet
+ * arrives on is the same interface that traffic destined for the
+ * packet's source address would be routed out of.
+ * The 'versrcreach' option just checks that the source address is
+ * reachable via any route (except default) in the routing table.
+ * These two are a measure to block forged packets. This is also
+ * commonly known as "anti-spoofing" or Unicast Reverse Path
+ * Forwarding (Unicast RPF) in Cisco-ese. The names of the knobs
+ * are purposely reminiscent of the Cisco IOS commands,
+ *
+ * ip verify unicast reverse-path
+ * ip verify unicast source reachable-via any
+ *
+ * which implements the same functionality. But note that the syntax
+ * is misleading, and the check may be performed on all IP packets
+ * whether unicast, multicast, or broadcast.
+ */
+static int
+verify_path(struct in_addr src, struct ifnet *ifp, u_int fib)
+{
+#ifndef __FreeBSD__
+ return 0;
+#else
+ struct route ro;
+ struct sockaddr_in *dst;
+
+ bzero(&ro, sizeof(ro));
+
+ dst = (struct sockaddr_in *)&(ro.ro_dst);
+ dst->sin_family = AF_INET;
+ dst->sin_len = sizeof(*dst);
+ dst->sin_addr = src;
+ in_rtalloc_ign(&ro, 0, fib);
+
+ if (ro.ro_rt == NULL)
+ return 0;
+
+ /*
+ * If ifp is provided, check for equality with rtentry.
+ * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp,
+ * in order to pass packets injected back by if_simloop():
+ * if useloopback == 1 routing entry (via lo0) for our own address
+ * may exist, so we need to handle routing asymmetry.
+ */
+ if (ifp != NULL && ro.ro_rt->rt_ifa->ifa_ifp != ifp) {
+ RTFREE(ro.ro_rt);
+ return 0;
+ }
+
+ /* if no ifp provided, check if rtentry is not default route */
+ if (ifp == NULL &&
+ satosin(rt_key(ro.ro_rt))->sin_addr.s_addr == INADDR_ANY) {
+ RTFREE(ro.ro_rt);
+ return 0;
+ }
+
+ /* or if this is a blackhole/reject route */
+ if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
+ RTFREE(ro.ro_rt);
+ return 0;
+ }
+
+ /* found valid route */
+ RTFREE(ro.ro_rt);
+ return 1;
+#endif /* __FreeBSD__ */
+}
+
+#ifdef INET6
+/*
+ * ipv6 specific rules here...
+ */
+static __inline int
+icmp6type_match (int type, ipfw_insn_u32 *cmd)
+{
+ return (type <= ICMP6_MAXTYPE && (cmd->d[type/32] & (1<<(type%32)) ) );
+}
+
+static int
+flow6id_match( int curr_flow, ipfw_insn_u32 *cmd )
+{
+ int i;
+ for (i=0; i <= cmd->o.arg1; ++i )
+ if (curr_flow == cmd->d[i] )
+ return 1;
+ return 0;
+}
+
+/* support for IP6_*_ME opcodes */
+static int
+search_ip6_addr_net (struct in6_addr * ip6_addr)
+{
+ struct ifnet *mdc;
+ struct ifaddr *mdc2;
+ struct in6_ifaddr *fdm;
+ struct in6_addr copia;
+
+ TAILQ_FOREACH(mdc, &V_ifnet, if_link) {
+ if_addr_rlock(mdc);
+ TAILQ_FOREACH(mdc2, &mdc->if_addrhead, ifa_link) {
+ if (mdc2->ifa_addr->sa_family == AF_INET6) {
+ fdm = (struct in6_ifaddr *)mdc2;
+ copia = fdm->ia_addr.sin6_addr;
+ /* clear the scope_id embedded in the sockaddr before comparing */
+ in6_clearscope(&copia);
+ if (IN6_ARE_ADDR_EQUAL(ip6_addr, &copia)) {
+ if_addr_runlock(mdc);
+ return 1;
+ }
+ }
+ }
+ if_addr_runlock(mdc);
+ }
+ return 0;
+}
+
+static int
+verify_path6(struct in6_addr *src, struct ifnet *ifp, u_int fib)
+{
+ struct route_in6 ro;
+ struct sockaddr_in6 *dst;
+
+ bzero(&ro, sizeof(ro));
+
+ dst = (struct sockaddr_in6 * )&(ro.ro_dst);
+ dst->sin6_family = AF_INET6;
+ dst->sin6_len = sizeof(*dst);
+ dst->sin6_addr = *src;
+
+ in6_rtalloc_ign(&ro, 0, fib);
+ if (ro.ro_rt == NULL)
+ return 0;
+
+ /*
+ * if ifp is provided, check for equality with rtentry
+ * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp,
+ * to support the case of sending packets to an address of our own.
+ * (where the former interface is the first argument of if_simloop()
+ * (=ifp), the latter is lo0)
+ */
+ if (ifp != NULL && ro.ro_rt->rt_ifa->ifa_ifp != ifp) {
+ RTFREE(ro.ro_rt);
+ return 0;
+ }
+
+ /* if no ifp provided, check if rtentry is not default route */
+ if (ifp == NULL &&
+ IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(ro.ro_rt))->sin6_addr)) {
+ RTFREE(ro.ro_rt);
+ return 0;
+ }
+
+ /* or if this is a blackhole/reject route */
+ if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
+ RTFREE(ro.ro_rt);
+ return 0;
+ }
+
+ /* found valid route */
+ RTFREE(ro.ro_rt);
+ return 1;
+
+}
+
+static int
+is_icmp6_query(int icmp6_type)
+{
+ if ((icmp6_type <= ICMP6_MAXTYPE) &&
+ (icmp6_type == ICMP6_ECHO_REQUEST ||
+ icmp6_type == ICMP6_MEMBERSHIP_QUERY ||
+ icmp6_type == ICMP6_WRUREQUEST ||
+ icmp6_type == ICMP6_FQDN_QUERY ||
+ icmp6_type == ICMP6_NI_QUERY))
+ return (1);
+
+ return (0);
+}
+
+static void
+send_reject6(struct ip_fw_args *args, int code, u_int hlen, struct ip6_hdr *ip6)
+{
+ struct mbuf *m;
+
+ m = args->m;
+ if (code == ICMP6_UNREACH_RST && args->f_id.proto == IPPROTO_TCP) {
+ struct tcphdr *tcp;
+ tcp = (struct tcphdr *)((char *)ip6 + hlen);
+
+ if ((tcp->th_flags & TH_RST) == 0) {
+ struct mbuf *m0;
+ m0 = ipfw_send_pkt(args->m, &(args->f_id),
+ ntohl(tcp->th_seq), ntohl(tcp->th_ack),
+ tcp->th_flags | TH_RST);
+ if (m0 != NULL)
+ ip6_output(m0, NULL, NULL, 0, NULL, NULL,
+ NULL);
+ }
+ FREE_PKT(m);
+ } else if (code != ICMP6_UNREACH_RST) { /* Send an ICMPv6 unreach. */
+#if 0
+ /*
+ * Unlike above, the mbufs need to line up with the ip6 hdr,
+ * as the contents are read. We need to m_adj() the
+ * needed amount.
+ * The mbuf will however be thrown away so we can adjust it.
+ * Remember we did an m_pullup on it already so we
+ * can make some assumptions about contiguousness.
+ */
+ if (args->L3offset)
+ m_adj(m, args->L3offset);
+#endif
+ icmp6_error(m, ICMP6_DST_UNREACH, code, 0);
+ } else
+ FREE_PKT(m);
+
+ args->m = NULL;
+}
+
+#endif /* INET6 */
+
+
+/*
+ * sends a reject message, consuming the mbuf passed as an argument.
+ */
+static void
+send_reject(struct ip_fw_args *args, int code, int iplen, struct ip *ip)
+{
+
+#if 0
+ /* XXX When ip is not guaranteed to be at mtod() we will
+ * need to account for this.
+ * The mbuf will however be thrown away so we can adjust it.
+ * Remember we did an m_pullup on it already so we
+ * can make some assumptions about contiguousness.
+ */
+ if (args->L3offset)
+ m_adj(m, args->L3offset);
+#endif
+ if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */
+ /* We need the IP header in host order for icmp_error(). */
+ SET_HOST_IPLEN(ip);
+ icmp_error(args->m, ICMP_UNREACH, code, 0L, 0);
+ } else if (args->f_id.proto == IPPROTO_TCP) {
+ struct tcphdr *const tcp =
+ L3HDR(struct tcphdr, mtod(args->m, struct ip *));
+ if ( (tcp->th_flags & TH_RST) == 0) {
+ struct mbuf *m;
+ m = ipfw_send_pkt(args->m, &(args->f_id),
+ ntohl(tcp->th_seq), ntohl(tcp->th_ack),
+ tcp->th_flags | TH_RST);
+ if (m != NULL)
+ ip_output(m, NULL, NULL, 0, NULL, NULL);
+ }
+ FREE_PKT(args->m);
+ } else
+ FREE_PKT(args->m);
+ args->m = NULL;
+}
+
+/*
+ * Support for uid/gid/jail lookup. These tests are expensive
+ * (because we may need to look into the list of active sockets)
+ * so we cache the results. ugid_lookupp is 0 if we have not
+ * yet done a lookup, 1 if we succeeded, and -1 if we tried
+ * and failed. The function always returns the match value.
+ * We could actually spare the variable and use *uc, setting
+ * it to (void *)check_uidgid if we have no info, NULL if
+ * we tried and failed, or any other value if successful.
+ */
+static int
+check_uidgid(ipfw_insn_u32 *insn, struct ip_fw_args *args, int *ugid_lookupp,
+ struct ucred **uc)
+{
+#ifndef __FreeBSD__
+ /* XXX */
+ return cred_check(insn, proto, oif,
+ dst_ip, dst_port, src_ip, src_port,
+ (struct bsd_ucred *)uc, ugid_lookupp, ((struct mbuf *)inp)->m_skb);
+#else /* FreeBSD */
+ struct in_addr src_ip, dst_ip;
+ struct inpcbinfo *pi;
+ struct ipfw_flow_id *id;
+ struct inpcb *pcb, *inp;
+ struct ifnet *oif;
+ int lookupflags;
+ int match;
+
+ id = &args->f_id;
+ inp = args->inp;
+ oif = args->oif;
+
+ /*
+ * Check to see if the UDP or TCP stack supplied us with
+ * the PCB. If so, rather than holding a lock and looking
+ * up the PCB, we can use the one that was supplied.
+ */
+ if (inp && *ugid_lookupp == 0) {
+ INP_LOCK_ASSERT(inp);
+ if (inp->inp_socket != NULL) {
+ *uc = crhold(inp->inp_cred);
+ *ugid_lookupp = 1;
+ } else
+ *ugid_lookupp = -1;
+ }
+ /*
+ * If we have already been here and the packet has no
+ * PCB entry associated with it, then we can safely
+ * assume that this is a no match.
+ */
+ if (*ugid_lookupp == -1)
+ return (0);
+ if (id->proto == IPPROTO_TCP) {
+ lookupflags = 0;
+ pi = &V_tcbinfo;
+ } else if (id->proto == IPPROTO_UDP) {
+ lookupflags = INPLOOKUP_WILDCARD;
+ pi = &V_udbinfo;
+ } else
+ return 0;
+ lookupflags |= INPLOOKUP_RLOCKPCB;
+ match = 0;
+ if (*ugid_lookupp == 0) {
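+ /* the pcblookup calls take (foreign, local): for an inbound
+ * packet the source is the foreign end, for an outbound one
+ * (oif != NULL) the destination is, hence the two argument
+ * orders below
+ */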
+ if (id->addr_type == 6) {
+#ifdef INET6
+ if (oif == NULL)
+ pcb = in6_pcblookup_mbuf(pi,
+ &id->src_ip6, htons(id->src_port),
+ &id->dst_ip6, htons(id->dst_port),
+ lookupflags, oif, args->m);
+ else
+ pcb = in6_pcblookup_mbuf(pi,
+ &id->dst_ip6, htons(id->dst_port),
+ &id->src_ip6, htons(id->src_port),
+ lookupflags, oif, args->m);
+#else
+ *ugid_lookupp = -1;
+ return (0);
+#endif
+ } else {
+ src_ip.s_addr = htonl(id->src_ip);
+ dst_ip.s_addr = htonl(id->dst_ip);
+ if (oif == NULL)
+ pcb = in_pcblookup_mbuf(pi,
+ src_ip, htons(id->src_port),
+ dst_ip, htons(id->dst_port),
+ lookupflags, oif, args->m);
+ else
+ pcb = in_pcblookup_mbuf(pi,
+ dst_ip, htons(id->dst_port),
+ src_ip, htons(id->src_port),
+ lookupflags, oif, args->m);
+ }
+ if (pcb != NULL) {
+ INP_RLOCK_ASSERT(pcb);
+ *uc = crhold(pcb->inp_cred);
+ *ugid_lookupp = 1;
+ INP_RUNLOCK(pcb);
+ }
+ if (*ugid_lookupp == 0) {
+ /*
+ * We tried and failed, set the variable to -1
+ * so we will not try again on this packet.
+ */
+ *ugid_lookupp = -1;
+ return (0);
+ }
+ }
+ if (insn->o.opcode == O_UID)
+ match = ((*uc)->cr_uid == (uid_t)insn->d[0]);
+ else if (insn->o.opcode == O_GID)
+ match = groupmember((gid_t)insn->d[0], *uc);
+ else if (insn->o.opcode == O_JAIL)
+ match = ((*uc)->cr_prison->pr_id == (int)insn->d[0]);
+ return (match);
+#endif /* __FreeBSD__ */
+}
+
+/*
+ * Helper function to set args with info on the rule after the matching
+ * one. slot is precise, whereas we guess rule_id as they are
+ * assigned sequentially.
+ */
+static inline void
+set_match(struct ip_fw_args *args, int slot,
+ struct ip_fw_chain *chain)
+{
+ args->rule.chain_id = chain->id;
+ args->rule.slot = slot + 1; /* we use 0 as a marker */
+ args->rule.rule_id = 1 + chain->map[slot]->id;
+ args->rule.rulenum = chain->map[slot]->rulenum;
+}
+
+/*
+ * The main check routine for the firewall.
+ *
+ * All arguments are in args so we can modify them and return them
+ * back to the caller.
+ *
+ * Parameters:
+ *
+ * args->m (in/out) The packet; we set to NULL when/if we nuke it.
+ * Starts with the IP header.
+ * args->eh (in) Mac header if present, NULL for layer3 packet.
+ * args->L3offset Number of bytes bypassed if we came from L2.
+ * e.g. often sizeof(eh) ** NOTYET **
+ * args->oif Outgoing interface, NULL if packet is incoming.
+ * The incoming interface is in the mbuf. (in)
+ * args->divert_rule (in/out)
+ * Skip up to the first rule past this rule number;
+ * upon return, non-zero port number for divert or tee.
+ *
+ * args->rule Pointer to the last matching rule (in/out)
+ * args->next_hop Socket we are forwarding to (out).
+ * args->next_hop6 IPv6 next hop we are forwarding to (out).
+ * args->f_id Addresses grabbed from the packet (out)
+ * args->rule.info a cookie depending on rule action
+ *
+ * Return value:
+ *
+ * IP_FW_PASS the packet must be accepted
+ * IP_FW_DENY the packet must be dropped
+ * IP_FW_DIVERT divert packet, port in m_tag
+ * IP_FW_TEE tee packet, port in m_tag
+ * IP_FW_DUMMYNET to dummynet, pipe in args->cookie
+ * IP_FW_NETGRAPH into netgraph, cookie args->cookie
+ * args->rule contains the matching rule,
+ * args->rule.info has additional information.
+ *
+ */
+int
+ipfw_chk(struct ip_fw_args *args)
+{
+
+ /*
+ * Local variables holding state while processing a packet:
+ *
+ * IMPORTANT NOTE: to speed up the processing of rules, there
+ * are some assumptions about the values of the variables, which
+ * are documented here. Should you change them, please check
+ * the implementation of the various instructions to make sure
+ * that they still work.
+ *
+ * args->eh The MAC header. It is non-null for a layer2
+ * packet, it is NULL for a layer-3 packet.
+ * **notyet**
+ * args->L3offset Offset in the packet to the L3 (IP or equiv.) header.
+ *
+ * m | args->m Pointer to the mbuf, as received from the caller.
+ * It may change if ipfw_chk() does an m_pullup, or if it
+ * consumes the packet because it calls send_reject().
+ * XXX This has to change, so that ipfw_chk() never modifies
+ * or consumes the buffer.
+ * ip is the beginning of the ip(4 or 6) header.
+ * Calculated by adding the L3offset to the start of data.
+ * (Until we start using L3offset, the packet is
+ * supposed to start with the ip header).
+ */
+ struct mbuf *m = args->m;
+ struct ip *ip = mtod(m, struct ip *);
+
+ /*
+ * For rules which contain uid/gid or jail constraints, cache
+ * a copy of the users credentials after the pcb lookup has been
+ * executed. This will speed up the processing of rules with
+ * these types of constraints, as well as decrease contention
+ * on pcb related locks.
+ */
+#ifndef __FreeBSD__
+ struct bsd_ucred ucred_cache;
+#else
+ struct ucred *ucred_cache = NULL;
+#endif
+ int ucred_lookup = 0;
+
+ /*
+ * oif | args->oif If NULL, ipfw_chk has been called on the
+ * inbound path (ether_input, ip_input).
+ * If non-NULL, ipfw_chk has been called on the outbound path
+ * (ether_output, ip_output).
+ */
+ struct ifnet *oif = args->oif;
+
+ int f_pos = 0; /* index of current rule in the array */
+ int retval = 0;
+
+ /*
+ * hlen The length of the IP header.
+ */
+ u_int hlen = 0; /* hlen >0 means we have an IP pkt */
+
+ /*
+ * offset The offset of a fragment. offset != 0 means that
+ * we have a fragment at this offset of an IPv4 packet.
+ * offset == 0 means that (if this is an IPv4 packet)
+ * this is the first or only fragment.
+ * For IPv6 offset|ip6f_mf == 0 means there is no Fragment Header
+ * or there is a single-packet fragment (a fragment header added
+ * where none was needed). We will treat a single packet fragment as if
+ * there was no fragment header (or log/block depending on the
+ * V_fw_permit_single_frag6 sysctl setting).
+ */
+ u_short offset = 0;
+ u_short ip6f_mf = 0;
+
+ /*
+ * Local copies of addresses. They are only valid if we have
+ * an IP packet.
+ *
+ * proto The protocol. Set to 0 for non-ip packets,
+ * or to the protocol read from the packet otherwise.
+ * proto != 0 means that we have an IPv4 packet.
+ *
+ * src_port, dst_port port numbers, in HOST format. Only
+ * valid for TCP and UDP packets.
+ *
+ * src_ip, dst_ip ip addresses, in NETWORK format.
+ * Only valid for IPv4 packets.
+ */
+ uint8_t proto;
+ uint16_t src_port = 0, dst_port = 0; /* NOTE: host format */
+ struct in_addr src_ip, dst_ip; /* NOTE: network format */
+ uint16_t iplen=0;
+ int pktlen;
+ uint16_t etype = 0; /* Host order stored ether type */
+
+ /*
+ * dyn_dir = MATCH_UNKNOWN when rules unchecked,
+ * MATCH_NONE when checked and not matched (q = NULL),
+ * MATCH_FORWARD or MATCH_REVERSE otherwise (q != NULL)
+ */
+ int dyn_dir = MATCH_UNKNOWN;
+ ipfw_dyn_rule *q = NULL;
+ struct ip_fw_chain *chain = &V_layer3_chain;
+
+ /*
+ * We store in ulp a pointer to the upper layer protocol header.
+ * In the ipv4 case this is easy to determine from the header,
+ * but for ipv6 we might have some additional headers in the middle.
+ * ulp is NULL if not found.
+ */
+ void *ulp = NULL; /* upper layer protocol pointer. */
+
+ /* XXX ipv6 variables */
+ int is_ipv6 = 0;
+ uint8_t icmp6_type = 0;
+ uint16_t ext_hd = 0; /* bits vector for extension header filtering */
+ /* end of ipv6 variables */
+
+ int is_ipv4 = 0;
+
+ int done = 0; /* flag to exit the outer loop */
+
+ if (m->m_flags & M_SKIP_FIREWALL || (! V_ipfw_vnet_ready))
+ return (IP_FW_PASS); /* accept */
+
+ dst_ip.s_addr = 0; /* make sure it is initialized */
+ src_ip.s_addr = 0; /* make sure it is initialized */
+ pktlen = m->m_pkthdr.len;
+ args->f_id.fib = M_GETFIB(m); /* note mbuf not altered */
+ proto = args->f_id.proto = 0; /* mark f_id invalid */
+ /* XXX 0 is a valid proto: IP/IPv6 Hop-by-Hop Option */
+
+/*
+ * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous,
+ * then it sets p to point at the offset "len" in the mbuf. WARNING: the
+ * pointer might become stale after other pullups (but we never use it
+ * this way).
+ */
+#define PULLUP_TO(_len, p, T) PULLUP_LEN(_len, p, sizeof(T))
+#define PULLUP_LEN(_len, p, T) \
+do { \
+ int x = (_len) + T; \
+ if ((m)->m_len < x) { \
+ args->m = m = m_pullup(m, x); \
+ if (m == NULL) \
+ goto pullup_failed; \
+ } \
+ p = (mtod(m, char *) + (_len)); \
+} while (0)
+
+ /*
+ * if we have an ether header,
+ */
+ if (args->eh)
+ etype = ntohs(args->eh->ether_type);
+
+ /* Identify IP packets and fill up variables. */
+ if (pktlen >= sizeof(struct ip6_hdr) &&
+ (args->eh == NULL || etype == ETHERTYPE_IPV6) && ip->ip_v == 6) {
+ struct ip6_hdr *ip6 = (struct ip6_hdr *)ip;
+ is_ipv6 = 1;
+ args->f_id.addr_type = 6;
+ hlen = sizeof(struct ip6_hdr);
+ proto = ip6->ip6_nxt;
+
+ /* Search extension headers to find upper layer protocols */
+ while (ulp == NULL && offset == 0) {
+ switch (proto) {
+ case IPPROTO_ICMPV6:
+ PULLUP_TO(hlen, ulp, struct icmp6_hdr);
+ icmp6_type = ICMP6(ulp)->icmp6_type;
+ break;
+
+ case IPPROTO_TCP:
+ PULLUP_TO(hlen, ulp, struct tcphdr);
+ dst_port = TCP(ulp)->th_dport;
+ src_port = TCP(ulp)->th_sport;
+ /* save flags for dynamic rules */
+ args->f_id._flags = TCP(ulp)->th_flags;
+ break;
+
+ case IPPROTO_SCTP:
+ PULLUP_TO(hlen, ulp, struct sctphdr);
+ src_port = SCTP(ulp)->src_port;
+ dst_port = SCTP(ulp)->dest_port;
+ break;
+
+ case IPPROTO_UDP:
+ PULLUP_TO(hlen, ulp, struct udphdr);
+ dst_port = UDP(ulp)->uh_dport;
+ src_port = UDP(ulp)->uh_sport;
+ break;
+
+ case IPPROTO_HOPOPTS: /* RFC 2460 */
+ PULLUP_TO(hlen, ulp, struct ip6_hbh);
+ ext_hd |= EXT_HOPOPTS;
+ hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
+ proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
+ ulp = NULL;
+ break;
+
+ case IPPROTO_ROUTING: /* RFC 2460 */
+ PULLUP_TO(hlen, ulp, struct ip6_rthdr);
+ switch (((struct ip6_rthdr *)ulp)->ip6r_type) {
+ case 0:
+ ext_hd |= EXT_RTHDR0;
+ break;
+ case 2:
+ ext_hd |= EXT_RTHDR2;
+ break;
+ default:
+ if (V_fw_verbose)
+ printf("IPFW2: IPV6 - Unknown "
+ "Routing Header type(%d)\n",
+ ((struct ip6_rthdr *)
+ ulp)->ip6r_type);
+ if (V_fw_deny_unknown_exthdrs)
+ return (IP_FW_DENY);
+ break;
+ }
+ ext_hd |= EXT_ROUTING;
+ hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3;
+ proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt;
+ ulp = NULL;
+ break;
+
+ case IPPROTO_FRAGMENT: /* RFC 2460 */
+ PULLUP_TO(hlen, ulp, struct ip6_frag);
+ ext_hd |= EXT_FRAGMENT;
+ hlen += sizeof (struct ip6_frag);
+ proto = ((struct ip6_frag *)ulp)->ip6f_nxt;
+ offset = ((struct ip6_frag *)ulp)->ip6f_offlg &
+ IP6F_OFF_MASK;
+ ip6f_mf = ((struct ip6_frag *)ulp)->ip6f_offlg &
+ IP6F_MORE_FRAG;
+ if (V_fw_permit_single_frag6 == 0 &&
+ offset == 0 && ip6f_mf == 0) {
+ if (V_fw_verbose)
+ printf("IPFW2: IPV6 - Invalid "
+ "Fragment Header\n");
+ if (V_fw_deny_unknown_exthdrs)
+ return (IP_FW_DENY);
+ break;
+ }
+ args->f_id.extra =
+ ntohl(((struct ip6_frag *)ulp)->ip6f_ident);
+ ulp = NULL;
+ break;
+
+ case IPPROTO_DSTOPTS: /* RFC 2460 */
+ PULLUP_TO(hlen, ulp, struct ip6_hbh);
+ ext_hd |= EXT_DSTOPTS;
+ hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
+ proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
+ ulp = NULL;
+ break;
+
+ case IPPROTO_AH: /* RFC 2402 */
+ PULLUP_TO(hlen, ulp, struct ip6_ext);
+ ext_hd |= EXT_AH;
+ hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2;
+ proto = ((struct ip6_ext *)ulp)->ip6e_nxt;
+ ulp = NULL;
+ break;
+
+ case IPPROTO_ESP: /* RFC 2406 */
+ PULLUP_TO(hlen, ulp, uint32_t); /* SPI, Seq# */
+ /* Anything past Seq# is variable length and
+ * data past this ext. header is encrypted. */
+ ext_hd |= EXT_ESP;
+ break;
+
+ case IPPROTO_NONE: /* RFC 2460 */
+ /*
+ * Packet ends here, and IPv6 header has
+ * already been pulled up. If ip6e_len!=0
+ * then octets must be ignored.
+ */
+ ulp = ip; /* non-NULL to get out of loop. */
+ break;
+
+ case IPPROTO_OSPFIGP:
+ /* XXX OSPF header check? */
+ PULLUP_TO(hlen, ulp, struct ip6_ext);
+ break;
+
+ case IPPROTO_PIM:
+ /* XXX PIM header check? */
+ PULLUP_TO(hlen, ulp, struct pim);
+ break;
+
+ case IPPROTO_CARP:
+ PULLUP_TO(hlen, ulp, struct carp_header);
+ if (((struct carp_header *)ulp)->carp_version !=
+ CARP_VERSION)
+ return (IP_FW_DENY);
+ if (((struct carp_header *)ulp)->carp_type !=
+ CARP_ADVERTISEMENT)
+ return (IP_FW_DENY);
+ break;
+
+ case IPPROTO_IPV6: /* RFC 2893 */
+ PULLUP_TO(hlen, ulp, struct ip6_hdr);
+ break;
+
+ case IPPROTO_IPV4: /* RFC 2893 */
+ PULLUP_TO(hlen, ulp, struct ip);
+ break;
+
+ default:
+ if (V_fw_verbose)
+ printf("IPFW2: IPV6 - Unknown "
+ "Extension Header(%d), ext_hd=%x\n",
+ proto, ext_hd);
+ if (V_fw_deny_unknown_exthdrs)
+ return (IP_FW_DENY);
+ PULLUP_TO(hlen, ulp, struct ip6_ext);
+ break;
+ } /*switch */
+ }
+ ip = mtod(m, struct ip *);
+ ip6 = (struct ip6_hdr *)ip;
+ args->f_id.src_ip6 = ip6->ip6_src;
+ args->f_id.dst_ip6 = ip6->ip6_dst;
+ args->f_id.src_ip = 0;
+ args->f_id.dst_ip = 0;
+ args->f_id.flow_id6 = ntohl(ip6->ip6_flow);
+ } else if (pktlen >= sizeof(struct ip) &&
+ (args->eh == NULL || etype == ETHERTYPE_IP) && ip->ip_v == 4) {
+ is_ipv4 = 1;
+ hlen = ip->ip_hl << 2;
+ args->f_id.addr_type = 4;
+
+ /*
+ * Collect parameters into local variables for faster matching.
+ */
+ proto = ip->ip_p;
+ src_ip = ip->ip_src;
+ dst_ip = ip->ip_dst;
+ offset = ntohs(ip->ip_off) & IP_OFFMASK;
+ iplen = ntohs(ip->ip_len);
+ pktlen = iplen < pktlen ? iplen : pktlen;
+
+ if (offset == 0) {
+ switch (proto) {
+ case IPPROTO_TCP:
+ PULLUP_TO(hlen, ulp, struct tcphdr);
+ dst_port = TCP(ulp)->th_dport;
+ src_port = TCP(ulp)->th_sport;
+ /* save flags for dynamic rules */
+ args->f_id._flags = TCP(ulp)->th_flags;
+ break;
+
+ case IPPROTO_SCTP:
+ PULLUP_TO(hlen, ulp, struct sctphdr);
+ src_port = SCTP(ulp)->src_port;
+ dst_port = SCTP(ulp)->dest_port;
+ break;
+
+ case IPPROTO_UDP:
+ PULLUP_TO(hlen, ulp, struct udphdr);
+ dst_port = UDP(ulp)->uh_dport;
+ src_port = UDP(ulp)->uh_sport;
+ break;
+
+ case IPPROTO_ICMP:
+ PULLUP_TO(hlen, ulp, struct icmphdr);
+ //args->f_id.flags = ICMP(ulp)->icmp_type;
+ break;
+
+ default:
+ break;
+ }
+ }
+
+ ip = mtod(m, struct ip *);
+ args->f_id.src_ip = ntohl(src_ip.s_addr);
+ args->f_id.dst_ip = ntohl(dst_ip.s_addr);
+ }
+#undef PULLUP_TO
+ if (proto) { /* we may have port numbers, store them */
+ args->f_id.proto = proto;
+ args->f_id.src_port = src_port = ntohs(src_port);
+ args->f_id.dst_port = dst_port = ntohs(dst_port);
+ }
+
+ IPFW_RLOCK(chain);
+ if (! V_ipfw_vnet_ready) { /* shutting down, leave NOW. */
+ IPFW_RUNLOCK(chain);
+ return (IP_FW_PASS); /* accept */
+ }
+ if (args->rule.slot) {
+ /*
+ * Packet has already been tagged as a result of a previous
+ * match on rule args->rule aka args->rule_id (PIPE, QUEUE,
+ * REASS, NETGRAPH, DIVERT/TEE...)
+ * Validate the slot and continue from the next one
+ * if still present, otherwise do a lookup.
+ */
+ f_pos = (args->rule.chain_id == chain->id) ?
+ args->rule.slot :
+ ipfw_find_rule(chain, args->rule.rulenum,
+ args->rule.rule_id);
+ } else {
+ f_pos = 0;
+ }
+
+ /*
+ * Now scan the rules, and parse microinstructions for each rule.
+ * We have two nested loops and an inner switch. Sometimes we
+ * need to break out of one or both loops, or re-enter one of
+ * the loops with updated variables. Loop variables are:
+ *
+ * f_pos (outer loop) points to the current rule.
+ * On output it points to the matching rule.
+ * done (outer loop) is used as a flag to break the loop.
+ * l (inner loop) residual length of current rule.
+ * cmd points to the current microinstruction.
+ *
+ * We break the inner loop by setting l=0 and possibly
+ * cmdlen=0 if we don't want to advance cmd.
+ * We break the outer loop by setting done=1
+ * We can restart the inner loop by setting l>0 and f_pos, f, cmd
+ * as needed.
+ */
+ for (; f_pos < chain->n_rules; f_pos++) {
+ ipfw_insn *cmd;
+ uint32_t tablearg = 0;
+ int l, cmdlen, skip_or; /* skip rest of OR block */
+ struct ip_fw *f;
+
+ f = chain->map[f_pos];
+ if (V_set_disable & (1 << f->set) )
+ continue;
+
+ skip_or = 0;
+ for (l = f->cmd_len, cmd = f->cmd ; l > 0 ;
+ l -= cmdlen, cmd += cmdlen) {
+ int match;
+
+ /*
+ * check_body is a jump target used when we find a
+ * CHECK_STATE, and need to jump to the body of
+ * the target rule.
+ */
+
+/* check_body: */
+ cmdlen = F_LEN(cmd);
+ /*
+ * An OR block (insn_1 || .. || insn_n) has the
+ * F_OR bit set in all but the last instruction.
+ * The first match will set "skip_or", and cause
+ * the following instructions to be skipped until
+ * past the one with the F_OR bit clear.
+ */
+ if (skip_or) { /* skip this instruction */
+ if ((cmd->len & F_OR) == 0)
+ skip_or = 0; /* next one is good */
+ continue;
+ }
+ match = 0; /* set to 1 if we succeed */
+
+ switch (cmd->opcode) {
+ /*
+ * The first set of opcodes compares the packet's
+ * fields with some pattern, setting 'match' if a
+ * match is found. At the end of the loop there is
+ * logic to deal with F_NOT and F_OR flags associated
+ * with the opcode.
+ */
+ case O_NOP:
+ match = 1;
+ break;
+
+ case O_FORWARD_MAC:
+ printf("ipfw: opcode %d unimplemented\n",
+ cmd->opcode);
+ break;
+
+ case O_GID:
+ case O_UID:
+ case O_JAIL:
+ /*
+ * We only check offset == 0 && proto != 0,
+ * as this ensures that we have a
+ * packet with the ports info.
+ */
+ if (offset != 0)
+ break;
+ if (proto == IPPROTO_TCP ||
+ proto == IPPROTO_UDP)
+ match = check_uidgid(
+ (ipfw_insn_u32 *)cmd,
+ args, &ucred_lookup,
+#ifdef __FreeBSD__
+ &ucred_cache);
+#else
+ (void *)&ucred_cache);
+#endif
+ break;
+
+ case O_RECV:
+ match = iface_match(m->m_pkthdr.rcvif,
+ (ipfw_insn_if *)cmd, chain, &tablearg);
+ break;
+
+ case O_XMIT:
+ match = iface_match(oif, (ipfw_insn_if *)cmd,
+ chain, &tablearg);
+ break;
+
+ case O_VIA:
+ match = iface_match(oif ? oif :
+ m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd,
+ chain, &tablearg);
+ break;
+
+ case O_MACADDR2:
+ if (args->eh != NULL) { /* have MAC header */
+ u_int32_t *want = (u_int32_t *)
+ ((ipfw_insn_mac *)cmd)->addr;
+ u_int32_t *mask = (u_int32_t *)
+ ((ipfw_insn_mac *)cmd)->mask;
+ u_int32_t *hdr = (u_int32_t *)args->eh;
+
+ match =
+ ( want[0] == (hdr[0] & mask[0]) &&
+ want[1] == (hdr[1] & mask[1]) &&
+ want[2] == (hdr[2] & mask[2]) );
+ }
+ break;
+
+ case O_MAC_TYPE:
+ if (args->eh != NULL) {
+ u_int16_t *p =
+ ((ipfw_insn_u16 *)cmd)->ports;
+ int i;
+
+ for (i = cmdlen - 1; !match && i>0;
+ i--, p += 2)
+ match = (etype >= p[0] &&
+ etype <= p[1]);
+ }
+ break;
+
+ case O_FRAG:
+ match = (offset != 0);
+ break;
+
+ case O_IN: /* "out" is "not in" */
+ match = (oif == NULL);
+ break;
+
+ case O_LAYER2:
+ match = (args->eh != NULL);
+ break;
+
+ case O_DIVERTED:
+ {
+ /* For diverted packets, args->rule.info
+ * contains the divert port (in host format),
+ * the reason and the direction.
+ */
+ uint32_t i = args->rule.info;
+ match = (i&IPFW_IS_MASK) == IPFW_IS_DIVERT &&
+ cmd->arg1 & ((i & IPFW_INFO_IN) ? 1 : 2);
+ }
+ break;
+
+ case O_PROTO:
+ /*
+ * We do not allow an arg of 0, so the
+ * check of "proto" alone suffices.
+ */
+ match = (proto == cmd->arg1);
+ break;
+
+ case O_IP_SRC:
+ match = is_ipv4 &&
+ (((ipfw_insn_ip *)cmd)->addr.s_addr ==
+ src_ip.s_addr);
+ break;
+
+ case O_IP_SRC_LOOKUP:
+ case O_IP_DST_LOOKUP:
+ if (is_ipv4) {
+ uint32_t key =
+ (cmd->opcode == O_IP_DST_LOOKUP) ?
+ dst_ip.s_addr : src_ip.s_addr;
+ uint32_t v = 0;
+
+ if (cmdlen > F_INSN_SIZE(ipfw_insn_u32)) {
+ /* generic lookup. The key must be
+ * in 32bit big-endian format.
+ */
+ v = ((ipfw_insn_u32 *)cmd)->d[1];
+ if (v == 0)
+ key = dst_ip.s_addr;
+ else if (v == 1)
+ key = src_ip.s_addr;
+ else if (v == 6) /* dscp */
+ key = (ip->ip_tos >> 2) & 0x3f;
+ else if (offset != 0)
+ break;
+ else if (proto != IPPROTO_TCP &&
+ proto != IPPROTO_UDP)
+ break;
+ else if (v == 2)
+ key = htonl(dst_port);
+ else if (v == 3)
+ key = htonl(src_port);
+ else if (v == 4 || v == 5) {
+ check_uidgid(
+ (ipfw_insn_u32 *)cmd,
+ args, &ucred_lookup,
+#ifdef __FreeBSD__
+ &ucred_cache);
+ if (v == 4 /* O_UID */)
+ key = ucred_cache->cr_uid;
+ else if (v == 5 /* O_JAIL */)
+ key = ucred_cache->cr_prison->pr_id;
+#else /* !__FreeBSD__ */
+ (void *)&ucred_cache);
+ if (v ==4 /* O_UID */)
+ key = ucred_cache.uid;
+ else if (v == 5 /* O_JAIL */)
+ key = ucred_cache.xid;
+#endif /* !__FreeBSD__ */
+ key = htonl(key);
+ } else
+ break;
+ }
+ match = ipfw_lookup_table(chain,
+ cmd->arg1, key, &v);
+ if (!match)
+ break;
+ if (cmdlen == F_INSN_SIZE(ipfw_insn_u32))
+ match =
+ ((ipfw_insn_u32 *)cmd)->d[0] == v;
+ else
+ tablearg = v;
+ } else if (is_ipv6) {
+ uint32_t v = 0;
+ void *pkey = (cmd->opcode == O_IP_DST_LOOKUP) ?
+ &args->f_id.dst_ip6: &args->f_id.src_ip6;
+ match = ipfw_lookup_table_extended(chain,
+ cmd->arg1, pkey, &v,
+ IPFW_TABLE_CIDR);
+ if (cmdlen == F_INSN_SIZE(ipfw_insn_u32))
+ match = ((ipfw_insn_u32 *)cmd)->d[0] == v;
+ if (match)
+ tablearg = v;
+ }
+ break;
+
+ case O_IP_SRC_MASK:
+ case O_IP_DST_MASK:
+ if (is_ipv4) {
+ uint32_t a =
+ (cmd->opcode == O_IP_DST_MASK) ?
+ dst_ip.s_addr : src_ip.s_addr;
+ uint32_t *p = ((ipfw_insn_u32 *)cmd)->d;
+ int i = cmdlen-1;
+
+ for (; !match && i>0; i-= 2, p+= 2)
+ match = (p[0] == (a & p[1]));
+ }
+ break;
+
+ case O_IP_SRC_ME:
+ if (is_ipv4) {
+ struct ifnet *tif;
+
+ INADDR_TO_IFP(src_ip, tif);
+ match = (tif != NULL);
+ break;
+ }
+#ifdef INET6
+ /* FALLTHROUGH */
+ case O_IP6_SRC_ME:
+ match= is_ipv6 && search_ip6_addr_net(&args->f_id.src_ip6);
+#endif
+ break;
+
+ case O_IP_DST_SET:
+ case O_IP_SRC_SET:
+ if (is_ipv4) {
+ u_int32_t *d = (u_int32_t *)(cmd+1);
+ u_int32_t addr =
+ cmd->opcode == O_IP_DST_SET ?
+ args->f_id.dst_ip :
+ args->f_id.src_ip;
+
+ if (addr < d[0])
+ break;
+ addr -= d[0]; /* subtract base */
+ match = (addr < cmd->arg1) &&
+ ( d[ 1 + (addr>>5)] &
+ (1<<(addr & 0x1f)) );
+ }
+ break;
+
+ case O_IP_DST:
+ match = is_ipv4 &&
+ (((ipfw_insn_ip *)cmd)->addr.s_addr ==
+ dst_ip.s_addr);
+ break;
+
+ case O_IP_DST_ME:
+ if (is_ipv4) {
+ struct ifnet *tif;
+
+ INADDR_TO_IFP(dst_ip, tif);
+ match = (tif != NULL);
+ break;
+ }
+#ifdef INET6
+ /* FALLTHROUGH */
+ case O_IP6_DST_ME:
+ match= is_ipv6 && search_ip6_addr_net(&args->f_id.dst_ip6);
+#endif
+ break;
+
+
+ case O_IP_SRCPORT:
+ case O_IP_DSTPORT:
+ /*
+ * offset == 0 && proto != 0 is enough
+ * to guarantee that we have a
+ * packet with port info.
+ */
+ if ((proto==IPPROTO_UDP || proto==IPPROTO_TCP)
+ && offset == 0) {
+ u_int16_t x =
+ (cmd->opcode == O_IP_SRCPORT) ?
+ src_port : dst_port ;
+ u_int16_t *p =
+ ((ipfw_insn_u16 *)cmd)->ports;
+ int i;
+
+ for (i = cmdlen - 1; !match && i>0;
+ i--, p += 2)
+ match = (x>=p[0] && x<=p[1]);
+ }
+ break;
+
+ case O_ICMPTYPE:
+ match = (offset == 0 && proto==IPPROTO_ICMP &&
+ icmptype_match(ICMP(ulp), (ipfw_insn_u32 *)cmd) );
+ break;
+
+#ifdef INET6
+ case O_ICMP6TYPE:
+ match = is_ipv6 && offset == 0 &&
+ proto==IPPROTO_ICMPV6 &&
+ icmp6type_match(
+ ICMP6(ulp)->icmp6_type,
+ (ipfw_insn_u32 *)cmd);
+ break;
+#endif /* INET6 */
+
+ case O_IPOPT:
+ match = (is_ipv4 &&
+ ipopts_match(ip, cmd) );
+ break;
+
+ case O_IPVER:
+ match = (is_ipv4 &&
+ cmd->arg1 == ip->ip_v);
+ break;
+
+ case O_IPID:
+ case O_IPLEN:
+ case O_IPTTL:
+ if (is_ipv4) { /* only for IP packets */
+ uint16_t x;
+ uint16_t *p;
+ int i;
+
+ if (cmd->opcode == O_IPLEN)
+ x = iplen;
+ else if (cmd->opcode == O_IPTTL)
+ x = ip->ip_ttl;
+ else /* must be IPID */
+ x = ntohs(ip->ip_id);
+ if (cmdlen == 1) {
+ match = (cmd->arg1 == x);
+ break;
+ }
+ /* otherwise we have ranges */
+ p = ((ipfw_insn_u16 *)cmd)->ports;
+ i = cmdlen - 1;
+ for (; !match && i>0; i--, p += 2)
+ match = (x >= p[0] && x <= p[1]);
+ }
+ break;
+
+ case O_IPPRECEDENCE:
+ match = (is_ipv4 &&
+ (cmd->arg1 == (ip->ip_tos & 0xe0)) );
+ break;
+
+ case O_IPTOS:
+ match = (is_ipv4 &&
+ flags_match(cmd, ip->ip_tos));
+ break;
+
+ case O_TCPDATALEN:
+ if (proto == IPPROTO_TCP && offset == 0) {
+ struct tcphdr *tcp;
+ uint16_t x;
+ uint16_t *p;
+ int i;
+
+ tcp = TCP(ulp);
+ x = iplen -
+ ((ip->ip_hl + tcp->th_off) << 2);
+ if (cmdlen == 1) {
+ match = (cmd->arg1 == x);
+ break;
+ }
+ /* otherwise we have ranges */
+ p = ((ipfw_insn_u16 *)cmd)->ports;
+ i = cmdlen - 1;
+ for (; !match && i>0; i--, p += 2)
+ match = (x >= p[0] && x <= p[1]);
+ }
+ break;
+
+ case O_TCPFLAGS:
+ match = (proto == IPPROTO_TCP && offset == 0 &&
+ flags_match(cmd, TCP(ulp)->th_flags));
+ break;
+
+ case O_TCPOPTS:
+ PULLUP_LEN(hlen, ulp, (TCP(ulp)->th_off << 2));
+ match = (proto == IPPROTO_TCP && offset == 0 &&
+ tcpopts_match(TCP(ulp), cmd));
+ break;
+
+ case O_TCPSEQ:
+ match = (proto == IPPROTO_TCP && offset == 0 &&
+ ((ipfw_insn_u32 *)cmd)->d[0] ==
+ TCP(ulp)->th_seq);
+ break;
+
+ case O_TCPACK:
+ match = (proto == IPPROTO_TCP && offset == 0 &&
+ ((ipfw_insn_u32 *)cmd)->d[0] ==
+ TCP(ulp)->th_ack);
+ break;
+
+ case O_TCPWIN:
+ if (proto == IPPROTO_TCP && offset == 0) {
+ uint16_t x;
+ uint16_t *p;
+ int i;
+
+ x = ntohs(TCP(ulp)->th_win);
+ if (cmdlen == 1) {
+ match = (cmd->arg1 == x);
+ break;
+ }
+ /* Otherwise we have ranges. */
+ p = ((ipfw_insn_u16 *)cmd)->ports;
+ i = cmdlen - 1;
+ for (; !match && i > 0; i--, p += 2)
+ match = (x >= p[0] && x <= p[1]);
+ }
+ break;
+
+ case O_ESTAB:
+ /* reject packets which have SYN only */
+ /* XXX should I also check for TH_ACK ? */
+ match = (proto == IPPROTO_TCP && offset == 0 &&
+ (TCP(ulp)->th_flags &
+ (TH_RST | TH_ACK | TH_SYN)) != TH_SYN);
+ break;
+
+ case O_ALTQ: {
+ struct pf_mtag *at;
+ struct m_tag *mtag;
+ ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd;
+
+ /*
+ * ALTQ uses mbuf tags from another
+ * packet filtering system - pf(4).
+ * We allocate a tag in its format
+ * and fill it in, pretending to be pf(4).
+ */
+ match = 1;
+ at = pf_find_mtag(m);
+ if (at != NULL && at->qid != 0)
+ break;
+ mtag = m_tag_get(PACKET_TAG_PF,
+ sizeof(struct pf_mtag), M_NOWAIT | M_ZERO);
+ if (mtag == NULL) {
+ /*
+ * Let the packet fall back to the
+ * default ALTQ.
+ */
+ break;
+ }
+ m_tag_prepend(m, mtag);
+ at = (struct pf_mtag *)(mtag + 1);
+ at->qid = altq->qid;
+ at->hdr = ip;
+ break;
+ }
+
+ case O_LOG:
+ ipfw_log(f, hlen, args, m,
+ oif, offset | ip6f_mf, tablearg, ip);
+ match = 1;
+ break;
+
+ case O_PROB:
+ match = (random()<((ipfw_insn_u32 *)cmd)->d[0]);
+ break;
+
+ case O_VERREVPATH:
+ /* Outgoing packets automatically pass/match */
+ match = ((oif != NULL) ||
+ (m->m_pkthdr.rcvif == NULL) ||
+ (
+#ifdef INET6
+ is_ipv6 ?
+ verify_path6(&(args->f_id.src_ip6),
+ m->m_pkthdr.rcvif, args->f_id.fib) :
+#endif
+ verify_path(src_ip, m->m_pkthdr.rcvif,
+ args->f_id.fib)));
+ break;
+
+ case O_VERSRCREACH:
+ /* Outgoing packets automatically pass/match */
+ match = (hlen > 0 && ((oif != NULL) ||
+#ifdef INET6
+ is_ipv6 ?
+ verify_path6(&(args->f_id.src_ip6),
+ NULL, args->f_id.fib) :
+#endif
+ verify_path(src_ip, NULL, args->f_id.fib)));
+ break;
+
+ case O_ANTISPOOF:
+ /* Outgoing packets automatically pass/match */
+ if (oif == NULL && hlen > 0 &&
+ ( (is_ipv4 && in_localaddr(src_ip))
+#ifdef INET6
+ || (is_ipv6 &&
+ in6_localaddr(&(args->f_id.src_ip6)))
+#endif
+ ))
+ match =
+#ifdef INET6
+ is_ipv6 ? verify_path6(
+ &(args->f_id.src_ip6),
+ m->m_pkthdr.rcvif,
+ args->f_id.fib) :
+#endif
+ verify_path(src_ip,
+ m->m_pkthdr.rcvif,
+ args->f_id.fib);
+ else
+ match = 1;
+ break;
+
+ case O_IPSEC:
+#ifdef IPSEC
+ match = (m_tag_find(m,
+ PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL);
+#endif
+ /* otherwise no match */
+ break;
+
+#ifdef INET6
+ case O_IP6_SRC:
+ match = is_ipv6 &&
+ IN6_ARE_ADDR_EQUAL(&args->f_id.src_ip6,
+ &((ipfw_insn_ip6 *)cmd)->addr6);
+ break;
+
+ case O_IP6_DST:
+ match = is_ipv6 &&
+ IN6_ARE_ADDR_EQUAL(&args->f_id.dst_ip6,
+ &((ipfw_insn_ip6 *)cmd)->addr6);
+ break;
+ case O_IP6_SRC_MASK:
+ case O_IP6_DST_MASK:
+ if (is_ipv6) {
+ int i = cmdlen - 1;
+ struct in6_addr p;
+ struct in6_addr *d =
+ &((ipfw_insn_ip6 *)cmd)->addr6;
+
+ for (; !match && i > 0; d += 2,
+ i -= F_INSN_SIZE(struct in6_addr)
+ * 2) {
+ p = (cmd->opcode ==
+ O_IP6_SRC_MASK) ?
+ args->f_id.src_ip6:
+ args->f_id.dst_ip6;
+ APPLY_MASK(&p, &d[1]);
+ match =
+ IN6_ARE_ADDR_EQUAL(&d[0],
+ &p);
+ }
+ }
+ break;
+
+ case O_FLOW6ID:
+ match = is_ipv6 &&
+ flow6id_match(args->f_id.flow_id6,
+ (ipfw_insn_u32 *) cmd);
+ break;
+
+ case O_EXT_HDR:
+ match = is_ipv6 &&
+ (ext_hd & ((ipfw_insn *) cmd)->arg1);
+ break;
+
+ case O_IP6:
+ match = is_ipv6;
+ break;
+#endif
+
+ case O_IP4:
+ match = is_ipv4;
+ break;
+
+ case O_TAG: {
+ struct m_tag *mtag;
+ uint32_t tag = (cmd->arg1 == IP_FW_TABLEARG) ?
+ tablearg : cmd->arg1;
+
+ /* Packet is already tagged with this tag? */
+ mtag = m_tag_locate(m, MTAG_IPFW, tag, NULL);
+
+ /* We have the `untag' action when the F_NOT flag
+ * is present: remove this mtag from the mbuf and
+ * reset `match' to zero (`match' is inverted
+ * later).
+ * Otherwise allocate a new mtag and prepend it
+ * to the mbuf.
+ */
+ if (cmd->len & F_NOT) { /* `untag' action */
+ if (mtag != NULL)
+ m_tag_delete(m, mtag);
+ match = 0;
+ } else {
+ if (mtag == NULL) {
+ mtag = m_tag_alloc( MTAG_IPFW,
+ tag, 0, M_NOWAIT);
+ if (mtag != NULL)
+ m_tag_prepend(m, mtag);
+ }
+ match = 1;
+ }
+ break;
+ }
+
+ case O_FIB: /* try match the specified fib */
+ if (args->f_id.fib == cmd->arg1)
+ match = 1;
+ break;
+
+ case O_SOCKARG: {
+ struct inpcb *inp = args->inp;
+ struct inpcbinfo *pi;
+
+ if (is_ipv6) /* XXX can we remove this ? */
+ break;
+
+ if (proto == IPPROTO_TCP)
+ pi = &V_tcbinfo;
+ else if (proto == IPPROTO_UDP)
+ pi = &V_udbinfo;
+ else
+ break;
+
+ /*
+ * XXXRW: so_user_cookie should almost
+ * certainly be inp_user_cookie?
+ */
+
+ /* For an incoming packet, look up the
+ inpcb using the src/dst ip/port tuple */
+ if (inp == NULL) {
+ inp = in_pcblookup(pi,
+ src_ip, htons(src_port),
+ dst_ip, htons(dst_port),
+ INPLOOKUP_RLOCKPCB, NULL);
+ if (inp != NULL) {
+ tablearg =
+ inp->inp_socket->so_user_cookie;
+ if (tablearg)
+ match = 1;
+ INP_RUNLOCK(inp);
+ }
+ } else {
+ if (inp->inp_socket) {
+ tablearg =
+ inp->inp_socket->so_user_cookie;
+ if (tablearg)
+ match = 1;
+ }
+ }
+ break;
+ }
+
+ case O_TAGGED: {
+ struct m_tag *mtag;
+ uint32_t tag = (cmd->arg1 == IP_FW_TABLEARG) ?
+ tablearg : cmd->arg1;
+
+ if (cmdlen == 1) {
+ match = m_tag_locate(m, MTAG_IPFW,
+ tag, NULL) != NULL;
+ break;
+ }
+
+ /* we have ranges */
+ for (mtag = m_tag_first(m);
+ mtag != NULL && !match;
+ mtag = m_tag_next(m, mtag)) {
+ uint16_t *p;
+ int i;
+
+ if (mtag->m_tag_cookie != MTAG_IPFW)
+ continue;
+
+ p = ((ipfw_insn_u16 *)cmd)->ports;
+ i = cmdlen - 1;
+ for(; !match && i > 0; i--, p += 2)
+ match =
+ mtag->m_tag_id >= p[0] &&
+ mtag->m_tag_id <= p[1];
+ }
+ break;
+ }
+
+ /*
+ * The second set of opcodes represents 'actions',
+ * i.e. the terminal part of a rule once the packet
+ * matches all previous patterns.
+ * Typically there is only one action for each rule,
+ * and the opcode is stored at the end of the rule
+ * (but there are exceptions -- see below).
+ *
+ * In general, here we set retval and terminate the
+ * outer loop (would be a 'break 3' in some language,
+ * but we need to set l=0, done=1)
+ *
+ * Exceptions:
+ * O_COUNT and O_SKIPTO actions:
+ * instead of terminating, we jump to the next rule
+ * (setting l=0), or to the SKIPTO target (setting
+ * f/f_len, cmd and l as needed), respectively.
+ *
+ * O_TAG, O_LOG and O_ALTQ action parameters:
+ * perform some action and set match = 1;
+ *
+ * O_LIMIT and O_KEEP_STATE: these opcodes are
+ * not real 'actions', and are stored right
+ * before the 'action' part of the rule.
+ * These opcodes try to install an entry in the
+ * state tables; if successful, we continue with
+ * the next opcode (match=1; break;), otherwise
+ * the packet must be dropped (set retval,
+ * break loops with l=0, done=1)
+ *
+ * O_PROBE_STATE and O_CHECK_STATE: these opcodes
+ * cause a lookup of the state table, and a jump
+ * to the 'action' part of the parent rule
+ * if an entry is found, or
+ * (CHECK_STATE only) a jump to the next rule if
+ * the entry is not found.
+ * The result of the lookup is cached so that
+ * further instances of these opcodes become NOPs.
+ * The jump to the next rule is done by setting
+ * l=0, cmdlen=0.
+ */
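+ /*
+ * Example of the O_SKIPTO exception above (hypothetical rule
+ * numbers, shown only to illustrate the control flow):
+ * ipfw add 100 skipto 1000 ip from 10.0.0.0/8 to any
+ * does not terminate the search; the O_SKIPTO handler below
+ * reloads f_pos/f/cmd at rule 1000 and re-enters the inner
+ * loop there.
+ */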
+ case O_LIMIT:
+ case O_KEEP_STATE:
+ if (ipfw_install_state(f,
+ (ipfw_insn_limit *)cmd, args, tablearg)) {
+ /* error or limit violation */
+ retval = IP_FW_DENY;
+ l = 0; /* exit inner loop */
+ done = 1; /* exit outer loop */
+ }
+ match = 1;
+ break;
+
+ case O_PROBE_STATE:
+ case O_CHECK_STATE:
+ /*
+ * dynamic rules are checked at the first
+ * keep-state or check-state occurrence,
+ * with the result being stored in dyn_dir.
+ * The compiler introduces a PROBE_STATE
+ * instruction for us when we have a
+ * KEEP_STATE (because PROBE_STATE needs
+ * to be run first).
+ */
+ if (dyn_dir == MATCH_UNKNOWN &&
+ (q = ipfw_lookup_dyn_rule(&args->f_id,
+ &dyn_dir, proto == IPPROTO_TCP ?
+ TCP(ulp) : NULL))
+ != NULL) {
+ /*
+ * Found dynamic entry, update stats
+ * and jump to the 'action' part of
+ * the parent rule by setting
+ * f, cmd, l and clearing cmdlen.
+ */
+ q->pcnt++;
+ q->bcnt += pktlen;
+ /* XXX we would like to have f_pos
+ * readily accessible in the dynamic
+ * rule, instead of having to
+ * lookup q->rule.
+ */
+ f = q->rule;
+ f_pos = ipfw_find_rule(chain,
+ f->rulenum, f->id);
+ cmd = ACTION_PTR(f);
+ l = f->cmd_len - f->act_ofs;
+ ipfw_dyn_unlock();
+ cmdlen = 0;
+ match = 1;
+ break;
+ }
+ /*
+ * Dynamic entry not found. If CHECK_STATE,
+ * skip to next rule, if PROBE_STATE just
+ * ignore and continue with next opcode.
+ */
+ if (cmd->opcode == O_CHECK_STATE)
+ l = 0; /* exit inner loop */
+ match = 1;
+ break;
+
+ case O_ACCEPT:
+ retval = 0; /* accept */
+ l = 0; /* exit inner loop */
+ done = 1; /* exit outer loop */
+ break;
+
+ case O_PIPE:
+ case O_QUEUE:
+ set_match(args, f_pos, chain);
+ args->rule.info = (cmd->arg1 == IP_FW_TABLEARG) ?
+ tablearg : cmd->arg1;
+ if (cmd->opcode == O_PIPE)
+ args->rule.info |= IPFW_IS_PIPE;
+ if (V_fw_one_pass)
+ args->rule.info |= IPFW_ONEPASS;
+ retval = IP_FW_DUMMYNET;
+ l = 0; /* exit inner loop */
+ done = 1; /* exit outer loop */
+ break;
+
+ case O_DIVERT:
+ case O_TEE:
+ if (args->eh) /* not on layer 2 */
+ break;
+ /* otherwise this is terminal */
+ l = 0; /* exit inner loop */
+ done = 1; /* exit outer loop */
+ retval = (cmd->opcode == O_DIVERT) ?
+ IP_FW_DIVERT : IP_FW_TEE;
+ set_match(args, f_pos, chain);
+ args->rule.info = (cmd->arg1 == IP_FW_TABLEARG) ?
+ tablearg : cmd->arg1;
+ break;
+
+ case O_COUNT:
+ f->pcnt++; /* update stats */
+ f->bcnt += pktlen;
+ f->timestamp = time_uptime;
+ l = 0; /* exit inner loop */
+ break;
+
+ case O_SKIPTO:
+ f->pcnt++; /* update stats */
+ f->bcnt += pktlen;
+ f->timestamp = time_uptime;
+ /* If possible use cached f_pos (in f->next_rule),
+ * whose version is written in f->x_next
+ * (horrible hacks to avoid changing the ABI).
+ */
+ if (cmd->arg1 != IP_FW_TABLEARG &&
+ (uintptr_t)f->x_next == chain->id) {
+ f_pos = (uintptr_t)f->next_rule;
+ } else {
+ int i = (cmd->arg1 == IP_FW_TABLEARG) ?
+ tablearg : cmd->arg1;
+ /* make sure we do not jump backward */
+ if (i <= f->rulenum)
+ i = f->rulenum + 1;
+ f_pos = ipfw_find_rule(chain, i, 0);
+ /* update the cache */
+ if (cmd->arg1 != IP_FW_TABLEARG) {
+ f->next_rule =
+ (void *)(uintptr_t)f_pos;
+ f->x_next =
+ (void *)(uintptr_t)chain->id;
+ }
+ }
+ /*
+ * Skip disabled rules, and re-enter
+ * the inner loop with the correct
+ * f_pos, f, l and cmd.
+ * Also clear cmdlen and skip_or
+ */
+ for (; f_pos < chain->n_rules - 1 &&
+ (V_set_disable &
+ (1 << chain->map[f_pos]->set));
+ f_pos++)
+ ;
+ /* Re-enter the inner loop at the skipto rule. */
+ f = chain->map[f_pos];
+ l = f->cmd_len;
+ cmd = f->cmd;
+ match = 1;
+ cmdlen = 0;
+ skip_or = 0;
+ continue;
+ break; /* not reached */
+
+ case O_CALLRETURN: {
+ /*
+ * Implementation of `subroutine' call/return,
+ * in the stack carried in an mbuf tag. This
+ * is different from `skipto' in that any call
+ * address is possible (`skipto' must prevent
+ * backward jumps to avoid endless loops).
+ * We have `return' action when F_NOT flag is
+ * present. The `m_tag_id' field is used as
+ * stack pointer.
+ */
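+ /*
+ * Illustrative usage, assuming the ipfw(8) call/return
+ * syntax and hypothetical rule numbers:
+ * ipfw add 200 call 1000 tcp from any to any 80
+ * ipfw add 1010 return ip from any to any
+ * Rule 200 pushes its own number on the per-packet stack
+ * and jumps to rule 1000; the return pops it and resumes
+ * at the first rule after 200 (stack value + 1 below).
+ */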
+ struct m_tag *mtag;
+ uint16_t jmpto, *stack;
+
+#define IS_CALL ((cmd->len & F_NOT) == 0)
+#define IS_RETURN ((cmd->len & F_NOT) != 0)
+ /*
+ * Hand-rolled version of m_tag_locate() with
+ * wildcard `type'.
+ * If not already tagged, allocate new tag.
+ */
+ mtag = m_tag_first(m);
+ while (mtag != NULL) {
+ if (mtag->m_tag_cookie ==
+ MTAG_IPFW_CALL)
+ break;
+ mtag = m_tag_next(m, mtag);
+ }
+ if (mtag == NULL && IS_CALL) {
+ mtag = m_tag_alloc(MTAG_IPFW_CALL, 0,
+ IPFW_CALLSTACK_SIZE *
+ sizeof(uint16_t), M_NOWAIT);
+ if (mtag != NULL)
+ m_tag_prepend(m, mtag);
+ }
+
+ /*
+ * On error both `call' and `return' just
+ * continue with next rule.
+ */
+ if (IS_RETURN && (mtag == NULL ||
+ mtag->m_tag_id == 0)) {
+ l = 0; /* exit inner loop */
+ break;
+ }
+ if (IS_CALL && (mtag == NULL ||
+ mtag->m_tag_id >= IPFW_CALLSTACK_SIZE)) {
+ printf("ipfw: call stack error, "
+ "go to next rule\n");
+ l = 0; /* exit inner loop */
+ break;
+ }
+
+ f->pcnt++; /* update stats */
+ f->bcnt += pktlen;
+ f->timestamp = time_uptime;
+ stack = (uint16_t *)(mtag + 1);
+
+ /*
+ * The `call' action may use cached f_pos
+ * (in f->next_rule), whose version is written
+ * in f->x_next.
+ * The `return' action, however, doesn't have a
+ * fixed jump address in cmd->arg1 and can't use
+ * the cache.
+ */
+ if (IS_CALL) {
+ stack[mtag->m_tag_id] = f->rulenum;
+ mtag->m_tag_id++;
+ if (cmd->arg1 != IP_FW_TABLEARG &&
+ (uintptr_t)f->x_next == chain->id) {
+ f_pos = (uintptr_t)f->next_rule;
+ } else {
+ jmpto = (cmd->arg1 ==
+ IP_FW_TABLEARG) ? tablearg:
+ cmd->arg1;
+ f_pos = ipfw_find_rule(chain,
+ jmpto, 0);
+ /* update the cache */
+ if (cmd->arg1 !=
+ IP_FW_TABLEARG) {
+ f->next_rule =
+ (void *)(uintptr_t)
+ f_pos;
+ f->x_next =
+ (void *)(uintptr_t)
+ chain->id;
+ }
+ }
+ } else { /* `return' action */
+ mtag->m_tag_id--;
+ jmpto = stack[mtag->m_tag_id] + 1;
+ f_pos = ipfw_find_rule(chain, jmpto, 0);
+ }
+
+ /*
+ * Skip disabled rules, and re-enter
+ * the inner loop with the correct
+ * f_pos, f, l and cmd.
+ * Also clear cmdlen and skip_or
+ */
+ for (; f_pos < chain->n_rules - 1 &&
+ (V_set_disable &
+ (1 << chain->map[f_pos]->set)); f_pos++)
+ ;
+ /* Re-enter the inner loop at the dest rule. */
+ f = chain->map[f_pos];
+ l = f->cmd_len;
+ cmd = f->cmd;
+ cmdlen = 0;
+ skip_or = 0;
+ continue;
+ break; /* NOTREACHED */
+ }
+#undef IS_CALL
+#undef IS_RETURN
+
+ case O_REJECT:
+ /*
+ * Drop the packet and send a reject notice
+ * if the packet is not ICMP (or is an ICMP
+ * query), and it is not multicast/broadcast.
+ */
+ if (hlen > 0 && is_ipv4 && offset == 0 &&
+ (proto != IPPROTO_ICMP ||
+ is_icmp_query(ICMP(ulp))) &&
+ !(m->m_flags & (M_BCAST|M_MCAST)) &&
+ !IN_MULTICAST(ntohl(dst_ip.s_addr))) {
+ send_reject(args, cmd->arg1, iplen, ip);
+ m = args->m;
+ }
+ /* FALLTHROUGH */
+#ifdef INET6
+ case O_UNREACH6:
+ if (hlen > 0 && is_ipv6 &&
+ ((offset & IP6F_OFF_MASK) == 0) &&
+ (proto != IPPROTO_ICMPV6 ||
+ (is_icmp6_query(icmp6_type) == 1)) &&
+ !(m->m_flags & (M_BCAST|M_MCAST)) &&
+ !IN6_IS_ADDR_MULTICAST(&args->f_id.dst_ip6)) {
+ send_reject6(
+ args, cmd->arg1, hlen,
+ (struct ip6_hdr *)ip);
+ m = args->m;
+ }
+ /* FALLTHROUGH */
+#endif
+ case O_DENY:
+ retval = IP_FW_DENY;
+ l = 0; /* exit inner loop */
+ done = 1; /* exit outer loop */
+ break;
+
+ case O_FORWARD_IP:
+ if (args->eh) /* not valid on layer2 pkts */
+ break;
+ if (q == NULL || q->rule != f ||
+ dyn_dir == MATCH_FORWARD) {
+ struct sockaddr_in *sa;
+ sa = &(((ipfw_insn_sa *)cmd)->sa);
+ if (sa->sin_addr.s_addr == INADDR_ANY) {
+ bcopy(sa, &args->hopstore,
+ sizeof(*sa));
+ args->hopstore.sin_addr.s_addr =
+ htonl(tablearg);
+ args->next_hop = &args->hopstore;
+ } else {
+ args->next_hop = sa;
+ }
+ }
+ retval = IP_FW_PASS;
+ l = 0; /* exit inner loop */
+ done = 1; /* exit outer loop */
+ break;
+
+#ifdef INET6
+ case O_FORWARD_IP6:
+ if (args->eh) /* not valid on layer2 pkts */
+ break;
+ if (q == NULL || q->rule != f ||
+ dyn_dir == MATCH_FORWARD) {
+ struct sockaddr_in6 *sin6;
+
+ sin6 = &(((ipfw_insn_sa6 *)cmd)->sa);
+ args->next_hop6 = sin6;
+ }
+ retval = IP_FW_PASS;
+ l = 0; /* exit inner loop */
+ done = 1; /* exit outer loop */
+ break;
+#endif
+
+ case O_NETGRAPH:
+ case O_NGTEE:
+ set_match(args, f_pos, chain);
+ args->rule.info = (cmd->arg1 == IP_FW_TABLEARG) ?
+ tablearg : cmd->arg1;
+ if (V_fw_one_pass)
+ args->rule.info |= IPFW_ONEPASS;
+ retval = (cmd->opcode == O_NETGRAPH) ?
+ IP_FW_NETGRAPH : IP_FW_NGTEE;
+ l = 0; /* exit inner loop */
+ done = 1; /* exit outer loop */
+ break;
+
+ case O_SETFIB: {
+ uint32_t fib;
+
+ f->pcnt++; /* update stats */
+ f->bcnt += pktlen;
+ f->timestamp = time_uptime;
+ fib = (cmd->arg1 == IP_FW_TABLEARG) ? tablearg:
+ cmd->arg1;
+ if (fib >= rt_numfibs)
+ fib = 0;
+ M_SETFIB(m, fib);
+ args->f_id.fib = fib;
+ l = 0; /* exit inner loop */
+ break;
+ }
+
+ case O_NAT:
+ if (!IPFW_NAT_LOADED) {
+ retval = IP_FW_DENY;
+ } else {
+ struct cfg_nat *t;
+ int nat_id;
+
+ set_match(args, f_pos, chain);
+ /* Check if this is a 'global' nat rule */
+ if (cmd->arg1 == 0) {
+ retval = ipfw_nat_ptr(args, NULL, m);
+ l = 0;
+ done = 1;
+ break;
+ }
+ t = ((ipfw_insn_nat *)cmd)->nat;
+ if (t == NULL) {
+ nat_id = (cmd->arg1 == IP_FW_TABLEARG) ?
+ tablearg : cmd->arg1;
+ t = (*lookup_nat_ptr)(&chain->nat, nat_id);
+
+ if (t == NULL) {
+ retval = IP_FW_DENY;
+ l = 0; /* exit inner loop */
+ done = 1; /* exit outer loop */
+ break;
+ }
+ if (cmd->arg1 != IP_FW_TABLEARG)
+ ((ipfw_insn_nat *)cmd)->nat = t;
+ }
+ retval = ipfw_nat_ptr(args, t, m);
+ }
+ l = 0; /* exit inner loop */
+ done = 1; /* exit outer loop */
+ break;
+
+ case O_REASS: {
+ int ip_off;
+
+ f->pcnt++;
+ f->bcnt += pktlen;
+ l = 0; /* in any case exit inner loop */
+ ip_off = ntohs(ip->ip_off);
+
+ /* if not fragmented, go to next rule */
+ if ((ip_off & (IP_MF | IP_OFFMASK)) == 0)
+ break;
+ /*
+ * ip_reass() expects len & off in host
+ * byte order.
+ */
+ SET_HOST_IPLEN(ip);
+
+ args->m = m = ip_reass(m);
+
+ /*
+ * do IP header checksum fixup.
+ */
+ if (m == NULL) { /* fragment got swallowed */
+ retval = IP_FW_DENY;
+ } else { /* good, packet complete */
+ int hlen;
+
+ ip = mtod(m, struct ip *);
+ hlen = ip->ip_hl << 2;
+ SET_NET_IPLEN(ip);
+ ip->ip_sum = 0;
+ if (hlen == sizeof(struct ip))
+ ip->ip_sum = in_cksum_hdr(ip);
+ else
+ ip->ip_sum = in_cksum(m, hlen);
+ retval = IP_FW_REASS;
+ set_match(args, f_pos, chain);
+ }
+ done = 1; /* exit outer loop */
+ break;
+ }
+
+ default:
+ panic("-- unknown opcode %d\n", cmd->opcode);
+ } /* end of switch() on opcodes */
+ /*
+ * if we get here with l=0, then match is irrelevant.
+ */
+
+ if (cmd->len & F_NOT)
+ match = !match;
+
+ if (match) {
+ if (cmd->len & F_OR)
+ skip_or = 1;
+ } else {
+ if (!(cmd->len & F_OR)) /* not an OR block, */
+ break; /* try next rule */
+ }
+
+ } /* end of inner loop, scan opcodes */
+#undef PULLUP_LEN
+
+ if (done)
+ break;
+
+/* next_rule:; */ /* try next rule */
+
+ } /* end of outer for, scan rules */
+
+ if (done) {
+ struct ip_fw *rule = chain->map[f_pos];
+ /* Update statistics */
+ rule->pcnt++;
+ rule->bcnt += pktlen;
+ rule->timestamp = time_uptime;
+ } else {
+ retval = IP_FW_DENY;
+ printf("ipfw: ouch!, skip past end of rules, denying packet\n");
+ }
+ IPFW_RUNLOCK(chain);
+#ifdef __FreeBSD__
+ if (ucred_cache != NULL)
+ crfree(ucred_cache);
+#endif
+ return (retval);
+
+pullup_failed:
+ if (V_fw_verbose)
+ printf("ipfw: pullup failed\n");
+ return (IP_FW_DENY);
+}
+
+/*
+ * Set the maximum number of tables that can be used in a given VNET ipfw instance.
+ */
+#ifdef SYSCTL_NODE
+static int
+sysctl_ipfw_table_num(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ unsigned int ntables;
+
+ ntables = V_fw_tables_max;
+
+ error = sysctl_handle_int(oidp, &ntables, 0, req);
+ /* Read operation or some error */
+ if ((error != 0) || (req->newptr == NULL))
+ return (error);
+
+ return (ipfw_resize_tables(&V_layer3_chain, ntables));
+}
+#endif
+/*
+ * Module and VNET glue
+ */
+
+/*
+ * Stuff that must be initialised only on boot or module load
+ */
+static int
+ipfw_init(void)
+{
+ int error = 0;
+
+ ipfw_dyn_attach();
+ /*
+ * Only print out this stuff the first time around,
+ * when called from the sysinit code.
+ */
+ printf("ipfw2 "
+#ifdef INET6
+ "(+ipv6) "
+#endif
+ "initialized, divert %s, nat %s, "
+ "rule-based forwarding "
+#ifdef IPFIREWALL_FORWARD
+ "enabled, "
+#else
+ "disabled, "
+#endif
+ "default to %s, logging ",
+#ifdef IPDIVERT
+ "enabled",
+#else
+ "loadable",
+#endif
+#ifdef IPFIREWALL_NAT
+ "enabled",
+#else
+ "loadable",
+#endif
+ default_to_accept ? "accept" : "deny");
+
+ /*
+ * Note: V_xxx variables can be accessed here but the vnet-specific
+ * initializer may not have been called yet for the VIMAGE case.
+ * Tunables will have been processed. We will print out values for
+ * the default vnet.
+ * XXX This should all be rationalized AFTER 8.0
+ */
+ if (V_fw_verbose == 0)
+ printf("disabled\n");
+ else if (V_verbose_limit == 0)
+ printf("unlimited\n");
+ else
+ printf("limited to %d packets/entry by default\n",
+ V_verbose_limit);
+
+ /* Check user-supplied table count for validity */
+ if (default_fw_tables > IPFW_TABLES_MAX)
+ default_fw_tables = IPFW_TABLES_MAX;
+
+ ipfw_log_bpf(1); /* init */
+ return (error);
+}
+
+/*
+ * Called for the removal of the last instance only on module unload.
+ */
+static void
+ipfw_destroy(void)
+{
+
+ ipfw_log_bpf(0); /* uninit */
+ ipfw_dyn_detach();
+ printf("IP firewall unloaded\n");
+}
+
+/*
+ * Stuff that must be initialized for every instance
+ * (including the first of course).
+ */
+static int
+vnet_ipfw_init(const void *unused)
+{
+ int error;
+ struct ip_fw *rule = NULL;
+ struct ip_fw_chain *chain;
+
+ chain = &V_layer3_chain;
+
+ /* First set up some values that are compile time options */
+ V_autoinc_step = 100; /* bounded to 1..1000 in add_rule() */
+ V_fw_deny_unknown_exthdrs = 1;
+#ifdef IPFIREWALL_VERBOSE
+ V_fw_verbose = 1;
+#endif
+#ifdef IPFIREWALL_VERBOSE_LIMIT
+ V_verbose_limit = IPFIREWALL_VERBOSE_LIMIT;
+#endif
+#ifdef IPFIREWALL_NAT
+ LIST_INIT(&chain->nat);
+#endif
+
+ /* insert the default rule and create the initial map */
+ chain->n_rules = 1;
+ chain->static_len = sizeof(struct ip_fw);
+ chain->map = malloc(sizeof(struct ip_fw *), M_IPFW, M_WAITOK | M_ZERO);
+ if (chain->map)
+ rule = malloc(chain->static_len, M_IPFW, M_WAITOK | M_ZERO);
+
+ /* Set initial number of tables */
+ V_fw_tables_max = default_fw_tables;
+ error = ipfw_init_tables(chain);
+ if (error) {
+ printf("ipfw2: setting up tables failed\n");
+ free(chain->map, M_IPFW);
+ free(rule, M_IPFW);
+ return (ENOSPC);
+ }
+
+ /* fill and insert the default rule */
+ rule->act_ofs = 0;
+ rule->rulenum = IPFW_DEFAULT_RULE;
+ rule->cmd_len = 1;
+ rule->set = RESVD_SET;
+ rule->cmd[0].len = 1;
+ rule->cmd[0].opcode = default_to_accept ? O_ACCEPT : O_DENY;
+ chain->rules = chain->default_rule = chain->map[0] = rule;
+ chain->id = rule->id = 1;
+
+ IPFW_LOCK_INIT(chain);
+ ipfw_dyn_init();
+
+ /* First set up some values that are compile time options */
+ V_ipfw_vnet_ready = 1; /* Open for business */
+
+ /*
+ * Hook the sockopt handler and pfil hooks for ipv4 and ipv6.
+ * Even if the latter two fail we still keep the module alive
+ * because the sockopt and layer2 paths are still useful.
+ * ipfw[6]_hook return 0 on success, ENOENT on failure,
+ * so we can ignore the exact return value and just set a flag.
+ *
+ * Note that V_fw[6]_enable are manipulated by a SYSCTL_PROC so
+ * changes in the underlying (per-vnet) variables trigger
+ * immediate hook()/unhook() calls.
+ * In layer2 we have the same behaviour, except that V_ether_ipfw
+ * is checked on each packet because there are no pfil hooks.
+ */
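+ /*
+ * For example (illustrative, assuming the standard sysctl names),
+ * setting net.inet.ip.fw.enable to 0 or 1 goes through the
+ * SYSCTL_PROC mentioned above and immediately unhooks/hooks the
+ * ipv4 pfil filter.
+ */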
+ V_ip_fw_ctl_ptr = ipfw_ctl;
+ error = ipfw_attach_hooks(1);
+ return (error);
+}
+
+/*
+ * Called for the removal of each instance.
+ */
+static int
+vnet_ipfw_uninit(const void *unused)
+{
+ struct ip_fw *reap, *rule;
+ struct ip_fw_chain *chain = &V_layer3_chain;
+ int i;
+
+ V_ipfw_vnet_ready = 0; /* tell new callers to go away */
+ /*
+ * disconnect from ipv4, ipv6, layer2 and sockopt.
+ * Then grab, release and grab again the WLOCK so we make
+ * sure the update is propagated and nobody will be in.
+ */
+ (void)ipfw_attach_hooks(0 /* detach */);
+ V_ip_fw_ctl_ptr = NULL;
+ IPFW_UH_WLOCK(chain);
+ IPFW_UH_WUNLOCK(chain);
+ IPFW_UH_WLOCK(chain);
+
+ IPFW_WLOCK(chain);
+ ipfw_dyn_uninit(0); /* run the callout_drain */
+ IPFW_WUNLOCK(chain);
+
+ ipfw_destroy_tables(chain);
+ reap = NULL;
+ IPFW_WLOCK(chain);
+ for (i = 0; i < chain->n_rules; i++) {
+ rule = chain->map[i];
+ rule->x_next = reap;
+ reap = rule;
+ }
+ if (chain->map)
+ free(chain->map, M_IPFW);
+ IPFW_WUNLOCK(chain);
+ IPFW_UH_WUNLOCK(chain);
+ if (reap != NULL)
+ ipfw_reap_rules(reap);
+ IPFW_LOCK_DESTROY(chain);
+ ipfw_dyn_uninit(1); /* free the remaining parts */
+ return 0;
+}
+
+/*
+ * Module event handler.
+ * In general we have the choice of handling most of these events by the
+ * event handler or by the (VNET_)SYS(UN)INIT handlers. I have chosen to
+ * use the SYSINIT handlers as they are more capable of expressing the
+ * flow of control during module and vnet operations, so this is just
+ * a skeleton. Note there is no SYSINIT equivalent of the module
+ * SHUTDOWN handler, but we don't have anything to do in that case anyhow.
+ */
+static int
+ipfw_modevent(module_t mod, int type, void *unused)
+{
+ int err = 0;
+
+ switch (type) {
+ case MOD_LOAD:
+ /* Called once at module load or
+ * system boot if compiled in. */
+ break;
+ case MOD_QUIESCE:
+ /* Called before unload. May veto unloading. */
+ break;
+ case MOD_UNLOAD:
+ /* Called during unload. */
+ break;
+ case MOD_SHUTDOWN:
+ /* Called during system shutdown. */
+ break;
+ default:
+ err = EOPNOTSUPP;
+ break;
+ }
+ return err;
+}
+
+static moduledata_t ipfwmod = {
+ "ipfw",
+ ipfw_modevent,
+ 0
+};
+
+/* Define startup order. */
+#define IPFW_SI_SUB_FIREWALL SI_SUB_PROTO_IFATTACHDOMAIN
+#define IPFW_MODEVENT_ORDER (SI_ORDER_ANY - 255) /* On boot slot in here. */
+#define IPFW_MODULE_ORDER (IPFW_MODEVENT_ORDER + 1) /* A little later. */
+#define IPFW_VNET_ORDER (IPFW_MODEVENT_ORDER + 2) /* Later still. */
+
+DECLARE_MODULE(ipfw, ipfwmod, IPFW_SI_SUB_FIREWALL, IPFW_MODEVENT_ORDER);
+MODULE_VERSION(ipfw, 2);
+/* should declare some dependencies here */
+
+/*
+ * Starting up. Done in order after ipfwmod() has been called.
+ * VNET_SYSINIT is also called for each existing vnet and each new vnet.
+ */
+SYSINIT(ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER,
+ ipfw_init, NULL);
+VNET_SYSINIT(vnet_ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER,
+ vnet_ipfw_init, NULL);
+
+/*
+ * Closing up shop. These are done in REVERSE ORDER, but still
+ * after ipfwmod() has been called. Not called on reboot.
+ * VNET_SYSUNINIT is also called for each exiting vnet as it exits,
+ * or when the module is unloaded.
+ */
+SYSUNINIT(ipfw_destroy, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER,
+ ipfw_destroy, NULL);
+VNET_SYSUNINIT(vnet_ipfw_uninit, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER,
+ vnet_ipfw_uninit, NULL);
+/* end of file */
diff --git a/sys/netpfil/ipfw/ip_fw_dynamic.c b/sys/netpfil/ipfw/ip_fw_dynamic.c
new file mode 100644
index 0000000..28d2d51
--- /dev/null
+++ b/sys/netpfil/ipfw/ip_fw_dynamic.c
@@ -0,0 +1,1245 @@
+/*-
+ * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#define DEB(x)
+#define DDB(x) x
+
+/*
+ * Dynamic rule support for ipfw
+ */
+
+#include "opt_ipfw.h"
+#include "opt_inet.h"
+#ifndef INET
+#error IPFIREWALL requires INET.
+#endif /* INET */
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <net/ethernet.h> /* for ETHERTYPE_IP */
+#include <net/if.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h> /* ip_defttl */
+#include <netinet/ip_fw.h>
+#include <netinet/tcp_var.h>
+#include <netinet/udp.h>
+
+#include <netinet/ip6.h> /* IN6_ARE_ADDR_EQUAL */
+#ifdef INET6
+#include <netinet6/in6_var.h>
+#include <netinet6/ip6_var.h>
+#endif
+
+#include <netpfil/ipfw/ip_fw_private.h>
+
+#include <machine/in_cksum.h> /* XXX for in_cksum */
+
+#ifdef MAC
+#include <security/mac/mac_framework.h>
+#endif
+
+/*
+ * Description of dynamic rules.
+ *
+ * Dynamic rules are stored in lists accessed through a hash table
+ * (ipfw_dyn_v) whose size is curr_dyn_buckets. This value can
+ * be modified through the sysctl variable dyn_buckets; the new
+ * size takes effect when the table becomes empty.
+ *
+ * XXX currently there is only one list, ipfw_dyn.
+ *
+ * When a packet is received, its address fields are first masked
+ * with the mask defined for the rule, then hashed, then matched
+ * against the entries in the corresponding list.
+ * Dynamic rules can be used for different purposes:
+ * + stateful rules;
+ * + enforcing limits on the number of sessions;
+ * + in-kernel NAT (not implemented yet)
+ *
+ * The lifetime of dynamic rules is regulated by dyn_*_lifetime,
+ * measured in seconds and depending on the flags.
+ *
+ * The total number of dynamic rules is stored in dyn_count.
+ * The max number of dynamic rules is dyn_max. When we reach
+ * the maximum number of rules we do not create any more. This is
+ * done to avoid consuming too much memory, but also too much
+ * time when searching on each packet (ideally, we should try instead
+ * to put a limit on the length of the list on each bucket...).
+ *
+ * Each dynamic rule holds a pointer to the parent ipfw rule so
+ * we know what action to perform. Dynamic rules are removed when
+ * the parent rule is deleted. XXX we should make them survive.
+ *
+ * There are some limitations with dynamic rules -- we do not
+ * obey the 'randomized match', and we do not do multiple
+ * passes through the firewall. XXX check the latter!!!
+ */
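+
+/*
+ * Illustrative example (standard ipfw(8) usage, given here only to
+ * make the description above concrete): a stateful ruleset such as
+ * ipfw add check-state
+ * ipfw add allow tcp from me to any setup keep-state
+ * installs an O_KEEP_STATE dynamic rule on the outgoing SYN; later
+ * packets of the same flow, in either direction, are matched by the
+ * check-state lookup because hashing and matching are bidirectional.
+ */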
+
+/*
+ * Static variables followed by global ones
+ */
+static VNET_DEFINE(ipfw_dyn_rule **, ipfw_dyn_v);
+static VNET_DEFINE(u_int32_t, dyn_buckets);
+static VNET_DEFINE(u_int32_t, curr_dyn_buckets);
+static VNET_DEFINE(struct callout, ipfw_timeout);
+#define V_ipfw_dyn_v VNET(ipfw_dyn_v)
+#define V_dyn_buckets VNET(dyn_buckets)
+#define V_curr_dyn_buckets VNET(curr_dyn_buckets)
+#define V_ipfw_timeout VNET(ipfw_timeout)
+
+static uma_zone_t ipfw_dyn_rule_zone;
+#ifndef __FreeBSD__
+DEFINE_SPINLOCK(ipfw_dyn_mtx);
+#else
+static struct mtx ipfw_dyn_mtx; /* mutex guarding dynamic rules */
+#endif
+
+#define IPFW_DYN_LOCK_INIT() \
+ mtx_init(&ipfw_dyn_mtx, "IPFW dynamic rules", NULL, MTX_DEF)
+#define IPFW_DYN_LOCK_DESTROY() mtx_destroy(&ipfw_dyn_mtx)
+#define IPFW_DYN_LOCK() mtx_lock(&ipfw_dyn_mtx)
+#define IPFW_DYN_UNLOCK() mtx_unlock(&ipfw_dyn_mtx)
+#define IPFW_DYN_LOCK_ASSERT() mtx_assert(&ipfw_dyn_mtx, MA_OWNED)
+
+void
+ipfw_dyn_unlock(void)
+{
+ IPFW_DYN_UNLOCK();
+}
+
+/*
+ * Timeouts for various events in handing dynamic rules.
+ */
+static VNET_DEFINE(u_int32_t, dyn_ack_lifetime);
+static VNET_DEFINE(u_int32_t, dyn_syn_lifetime);
+static VNET_DEFINE(u_int32_t, dyn_fin_lifetime);
+static VNET_DEFINE(u_int32_t, dyn_rst_lifetime);
+static VNET_DEFINE(u_int32_t, dyn_udp_lifetime);
+static VNET_DEFINE(u_int32_t, dyn_short_lifetime);
+
+#define V_dyn_ack_lifetime VNET(dyn_ack_lifetime)
+#define V_dyn_syn_lifetime VNET(dyn_syn_lifetime)
+#define V_dyn_fin_lifetime VNET(dyn_fin_lifetime)
+#define V_dyn_rst_lifetime VNET(dyn_rst_lifetime)
+#define V_dyn_udp_lifetime VNET(dyn_udp_lifetime)
+#define V_dyn_short_lifetime VNET(dyn_short_lifetime)
+
+/*
+ * Keepalives are sent if dyn_keepalive is set. They are sent every
+ * dyn_keepalive_period seconds, in the last dyn_keepalive_interval
+ * seconds of lifetime of a rule.
+ * dyn_rst_lifetime and dyn_fin_lifetime should be strictly lower
+ * than dyn_keepalive_period.
+ */
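+
+/*
+ * Worked example (hypothetical values, purely for illustration):
+ * with dyn_ack_lifetime = 300, dyn_keepalive_interval = 20 and
+ * dyn_keepalive_period = 5, an established entry that would expire
+ * at T+300 starts receiving keepalives around T+280, one every
+ * 5 seconds, until traffic refreshes the entry or it expires.
+ */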
+
+static VNET_DEFINE(u_int32_t, dyn_keepalive_interval);
+static VNET_DEFINE(u_int32_t, dyn_keepalive_period);
+static VNET_DEFINE(u_int32_t, dyn_keepalive);
+
+#define V_dyn_keepalive_interval VNET(dyn_keepalive_interval)
+#define V_dyn_keepalive_period VNET(dyn_keepalive_period)
+#define V_dyn_keepalive VNET(dyn_keepalive)
+
+static VNET_DEFINE(u_int32_t, dyn_count); /* # of dynamic rules */
+static VNET_DEFINE(u_int32_t, dyn_max); /* max # of dynamic rules */
+
+#define V_dyn_count VNET(dyn_count)
+#define V_dyn_max VNET(dyn_max)
+
+#ifdef SYSCTL_NODE
+
+SYSBEGIN(f2)
+
+SYSCTL_DECL(_net_inet_ip_fw);
+SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_buckets,
+ CTLFLAG_RW, &VNET_NAME(dyn_buckets), 0,
+ "Number of dyn. buckets");
+SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets,
+ CTLFLAG_RD, &VNET_NAME(curr_dyn_buckets), 0,
+ "Current Number of dyn. buckets");
+SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_count,
+ CTLFLAG_RD, &VNET_NAME(dyn_count), 0,
+ "Number of dyn. rules");
+SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_max,
+ CTLFLAG_RW, &VNET_NAME(dyn_max), 0,
+ "Max number of dyn. rules");
+SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime,
+ CTLFLAG_RW, &VNET_NAME(dyn_ack_lifetime), 0,
+ "Lifetime of dyn. rules for acks");
+SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime,
+ CTLFLAG_RW, &VNET_NAME(dyn_syn_lifetime), 0,
+ "Lifetime of dyn. rules for syn");
+SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime,
+ CTLFLAG_RW, &VNET_NAME(dyn_fin_lifetime), 0,
+ "Lifetime of dyn. rules for fin");
+SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime,
+ CTLFLAG_RW, &VNET_NAME(dyn_rst_lifetime), 0,
+ "Lifetime of dyn. rules for rst");
+SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime,
+ CTLFLAG_RW, &VNET_NAME(dyn_udp_lifetime), 0,
+ "Lifetime of dyn. rules for UDP");
+SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime,
+ CTLFLAG_RW, &VNET_NAME(dyn_short_lifetime), 0,
+ "Lifetime of dyn. rules for other situations");
+SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive,
+ CTLFLAG_RW, &VNET_NAME(dyn_keepalive), 0,
+ "Enable keepalives for dyn. rules");
+
+SYSEND
+
+#endif /* SYSCTL_NODE */
+
+
+static __inline int
+hash_packet6(struct ipfw_flow_id *id)
+{
+ u_int32_t i;
+ i = (id->dst_ip6.__u6_addr.__u6_addr32[2]) ^
+ (id->dst_ip6.__u6_addr.__u6_addr32[3]) ^
+ (id->src_ip6.__u6_addr.__u6_addr32[2]) ^
+ (id->src_ip6.__u6_addr.__u6_addr32[3]) ^
+ (id->dst_port) ^ (id->src_port);
+ return i;
+}
+
+/*
+ * IMPORTANT: the hash function for dynamic rules must be commutative
+ * in source and destination (ip,port), because rules are bidirectional
+ * and we want to find both in the same bucket.
+ */
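+
+/*
+ * Illustrative check of the property above: XOR is commutative, so
+ * for the IPv4 case below
+ * hash(src=A, dst=B, sp=x, dp=y) = B ^ A ^ y ^ x
+ * = hash(src=B, dst=A, sp=y, dp=x)
+ * i.e. the forward and reverse halves of a flow fall in the same
+ * bucket, which is what lookup_dyn_rule_locked() relies on.
+ */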
+static __inline int
+hash_packet(struct ipfw_flow_id *id)
+{
+ u_int32_t i;
+
+#ifdef INET6
+ if (IS_IP6_FLOW_ID(id))
+ i = hash_packet6(id);
+ else
+#endif /* INET6 */
+ i = (id->dst_ip) ^ (id->src_ip) ^ (id->dst_port) ^ (id->src_port);
+ i &= (V_curr_dyn_buckets - 1);
+ return i;
+}
+
+static __inline void
+unlink_dyn_rule_print(struct ipfw_flow_id *id)
+{
+ struct in_addr da;
+#ifdef INET6
+ char src[INET6_ADDRSTRLEN], dst[INET6_ADDRSTRLEN];
+#else
+ char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN];
+#endif
+
+#ifdef INET6
+ if (IS_IP6_FLOW_ID(id)) {
+ ip6_sprintf(src, &id->src_ip6);
+ ip6_sprintf(dst, &id->dst_ip6);
+ } else
+#endif
+ {
+ da.s_addr = htonl(id->src_ip);
+ inet_ntop(AF_INET, &da, src, sizeof(src));
+ da.s_addr = htonl(id->dst_ip);
+ inet_ntop(AF_INET, &da, dst, sizeof(dst));
+ }
+ printf("ipfw: unlink entry %s %d -> %s %d, %d left\n",
+ src, id->src_port, dst, id->dst_port, V_dyn_count - 1);
+}
+
+/**
+ * unlink a dynamic rule from a chain. prev is a pointer to
+ * the previous one, q is a pointer to the rule to delete,
+ * head is a pointer to the head of the queue.
+ * Modifies q and potentially also head.
+ */
+#define UNLINK_DYN_RULE(prev, head, q) { \
+ ipfw_dyn_rule *old_q = q; \
+ \
+ /* remove a refcount to the parent */ \
+ if (q->dyn_type == O_LIMIT) \
+ q->parent->count--; \
+ DEB(unlink_dyn_rule_print(&q->id);) \
+ if (prev != NULL) \
+ prev->next = q = q->next; \
+ else \
+ head = q = q->next; \
+ V_dyn_count--; \
+ uma_zfree(ipfw_dyn_rule_zone, old_q); }
+
+#define TIME_LEQ(a,b) ((int)((a)-(b)) <= 0)
+
+/**
+ * Remove dynamic rules pointing to "rule", or all of them if rule == NULL.
+ *
+ * If keep_me == NULL, rules are deleted even if not expired,
+ * otherwise only expired rules are removed.
+ *
+ * The value of the second parameter is also used to identify
+ * a rule we absolutely do not want to remove (e.g. because we are
+ * holding a reference to it -- this is the case with O_LIMIT_PARENT
+ * rules). The pointer is only used for comparison, so any non-null
+ * value will do.
+ */
+static void
+remove_dyn_rule(struct ip_fw *rule, ipfw_dyn_rule *keep_me)
+{
+ static u_int32_t last_remove = 0;
+
+#define FORCE (keep_me == NULL)
+
+ ipfw_dyn_rule *prev, *q;
+ int i, pass = 0, max_pass = 0;
+
+ IPFW_DYN_LOCK_ASSERT();
+
+ if (V_ipfw_dyn_v == NULL || V_dyn_count == 0)
+ return;
+ /* do not expire more than once per second, it is useless */
+ if (!FORCE && last_remove == time_uptime)
+ return;
+ last_remove = time_uptime;
+
+ /*
+ * Because O_LIMIT rules refer to parent rules, during the first pass
+ * we only remove child rules and mark any pending LIMIT_PARENT rules,
+ * then remove those in a second pass.
+ */
+next_pass:
+ for (i = 0 ; i < V_curr_dyn_buckets ; i++) {
+ for (prev=NULL, q = V_ipfw_dyn_v[i] ; q ; ) {
+ /*
+ * Logic can become complex here, so we split tests.
+ */
+ if (q == keep_me)
+ goto next;
+ if (rule != NULL && rule != q->rule)
+ goto next; /* not the one we are looking for */
+ if (q->dyn_type == O_LIMIT_PARENT) {
+ /*
+ * handle parent in the second pass,
+ * record we need one.
+ */
+ max_pass = 1;
+ if (pass == 0)
+ goto next;
+ if (FORCE && q->count != 0 ) {
+ /* XXX should not happen! */
+ printf("ipfw: OUCH! cannot remove rule,"
+ " count %d\n", q->count);
+ }
+ } else {
+ if (!FORCE &&
+ !TIME_LEQ( q->expire, time_uptime ))
+ goto next;
+ }
+ if (q->dyn_type != O_LIMIT_PARENT || !q->count) {
+ UNLINK_DYN_RULE(prev, V_ipfw_dyn_v[i], q);
+ continue;
+ }
+next:
+ prev=q;
+ q=q->next;
+ }
+ }
+ if (pass++ < max_pass)
+ goto next_pass;
+}
+
+void
+ipfw_remove_dyn_children(struct ip_fw *rule)
+{
+ IPFW_DYN_LOCK();
+ remove_dyn_rule(rule, NULL /* force removal */);
+ IPFW_DYN_UNLOCK();
+}
+
+/*
+ * Lookup a dynamic rule, locked version.
+ */
+static ipfw_dyn_rule *
+lookup_dyn_rule_locked(struct ipfw_flow_id *pkt, int *match_direction,
+ struct tcphdr *tcp)
+{
+ /*
+ * Stateful ipfw extensions.
+ * Lookup into dynamic session queue.
+ */
+#define MATCH_REVERSE 0
+#define MATCH_FORWARD 1
+#define MATCH_NONE 2
+#define MATCH_UNKNOWN 3
+ int i, dir = MATCH_NONE;
+ ipfw_dyn_rule *prev, *q = NULL;
+
+ IPFW_DYN_LOCK_ASSERT();
+
+ if (V_ipfw_dyn_v == NULL)
+ goto done; /* not found */
+ i = hash_packet(pkt);
+ for (prev = NULL, q = V_ipfw_dyn_v[i]; q != NULL;) {
+ if (q->dyn_type == O_LIMIT_PARENT && q->count)
+ goto next;
+ if (TIME_LEQ(q->expire, time_uptime)) { /* expire entry */
+ UNLINK_DYN_RULE(prev, V_ipfw_dyn_v[i], q);
+ continue;
+ }
+ if (pkt->proto != q->id.proto || q->dyn_type == O_LIMIT_PARENT)
+ goto next;
+
+ if (IS_IP6_FLOW_ID(pkt)) {
+ if (IN6_ARE_ADDR_EQUAL(&pkt->src_ip6, &q->id.src_ip6) &&
+ IN6_ARE_ADDR_EQUAL(&pkt->dst_ip6, &q->id.dst_ip6) &&
+ pkt->src_port == q->id.src_port &&
+ pkt->dst_port == q->id.dst_port) {
+ dir = MATCH_FORWARD;
+ break;
+ }
+ if (IN6_ARE_ADDR_EQUAL(&pkt->src_ip6, &q->id.dst_ip6) &&
+ IN6_ARE_ADDR_EQUAL(&pkt->dst_ip6, &q->id.src_ip6) &&
+ pkt->src_port == q->id.dst_port &&
+ pkt->dst_port == q->id.src_port) {
+ dir = MATCH_REVERSE;
+ break;
+ }
+ } else {
+ if (pkt->src_ip == q->id.src_ip &&
+ pkt->dst_ip == q->id.dst_ip &&
+ pkt->src_port == q->id.src_port &&
+ pkt->dst_port == q->id.dst_port) {
+ dir = MATCH_FORWARD;
+ break;
+ }
+ if (pkt->src_ip == q->id.dst_ip &&
+ pkt->dst_ip == q->id.src_ip &&
+ pkt->src_port == q->id.dst_port &&
+ pkt->dst_port == q->id.src_port) {
+ dir = MATCH_REVERSE;
+ break;
+ }
+ }
+next:
+ prev = q;
+ q = q->next;
+ }
+ if (q == NULL)
+ goto done; /* q = NULL, not found */
+
+ if (prev != NULL) { /* found and not in front */
+ prev->next = q->next;
+ q->next = V_ipfw_dyn_v[i];
+ V_ipfw_dyn_v[i] = q;
+ }
+ if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */
+ uint32_t ack;
+ u_char flags = pkt->_flags & (TH_FIN | TH_SYN | TH_RST);
+
+#define BOTH_SYN (TH_SYN | (TH_SYN << 8))
+#define BOTH_FIN (TH_FIN | (TH_FIN << 8))
+#define TCP_FLAGS (TH_FLAGS | (TH_FLAGS << 8))
+#define ACK_FWD 0x10000 /* fwd ack seen */
+#define ACK_REV 0x20000 /* rev ack seen */
+
+ q->state |= (dir == MATCH_FORWARD) ? flags : (flags << 8);
+ switch (q->state & TCP_FLAGS) {
+ case TH_SYN: /* opening */
+ q->expire = time_uptime + V_dyn_syn_lifetime;
+ break;
+
+ case BOTH_SYN: /* move to established */
+ case BOTH_SYN | TH_FIN: /* one side tries to close */
+ case BOTH_SYN | (TH_FIN << 8):
+#define _SEQ_GE(a,b) ((int)(a) - (int)(b) >= 0)
+ if (tcp == NULL)
+ break;
+
+ ack = ntohl(tcp->th_ack);
+ if (dir == MATCH_FORWARD) {
+ if (q->ack_fwd == 0 ||
+ _SEQ_GE(ack, q->ack_fwd)) {
+ q->ack_fwd = ack;
+ q->state |= ACK_FWD;
+ }
+ } else {
+ if (q->ack_rev == 0 ||
+ _SEQ_GE(ack, q->ack_rev)) {
+ q->ack_rev = ack;
+ q->state |= ACK_REV;
+ }
+ }
+ if ((q->state & (ACK_FWD | ACK_REV)) ==
+ (ACK_FWD | ACK_REV)) {
+ q->expire = time_uptime + V_dyn_ack_lifetime;
+ q->state &= ~(ACK_FWD | ACK_REV);
+ }
+ break;
+
+ case BOTH_SYN | BOTH_FIN: /* both sides closed */
+ if (V_dyn_fin_lifetime >= V_dyn_keepalive_period)
+ V_dyn_fin_lifetime = V_dyn_keepalive_period - 1;
+ q->expire = time_uptime + V_dyn_fin_lifetime;
+ break;
+
+ default:
+#if 0
+ /*
+ * reset or some invalid combination, but can also
+ * occur if we use keep-state the wrong way.
+ */
+ if ( (q->state & ((TH_RST << 8)|TH_RST)) == 0)
+ printf("invalid state: 0x%x\n", q->state);
+#endif
+ if (V_dyn_rst_lifetime >= V_dyn_keepalive_period)
+ V_dyn_rst_lifetime = V_dyn_keepalive_period - 1;
+ q->expire = time_uptime + V_dyn_rst_lifetime;
+ break;
+ }
+ } else if (pkt->proto == IPPROTO_UDP) {
+ q->expire = time_uptime + V_dyn_udp_lifetime;
+ } else {
+ /* other protocols */
+ q->expire = time_uptime + V_dyn_short_lifetime;
+ }
+done:
+ if (match_direction != NULL)
+ *match_direction = dir;
+ return (q);
+}
+
+ipfw_dyn_rule *
+ipfw_lookup_dyn_rule(struct ipfw_flow_id *pkt, int *match_direction,
+ struct tcphdr *tcp)
+{
+ ipfw_dyn_rule *q;
+
+ IPFW_DYN_LOCK();
+ q = lookup_dyn_rule_locked(pkt, match_direction, tcp);
+ if (q == NULL)
+ IPFW_DYN_UNLOCK();
+ /* NB: return table locked when q is not NULL */
+ return q;
+}
+
+static void
+realloc_dynamic_table(void)
+{
+ IPFW_DYN_LOCK_ASSERT();
+
+ /*
+ * Try reallocation, make sure we have a power of 2 and do
+ * not allow more than 64k entries. In case of overflow,
+ * default to 1024.
+ */
+
+ if (V_dyn_buckets > 65536)
+ V_dyn_buckets = 1024;
+ if ((V_dyn_buckets & (V_dyn_buckets-1)) != 0) { /* not a power of 2 */
+ V_dyn_buckets = V_curr_dyn_buckets; /* reset */
+ return;
+ }
+ V_curr_dyn_buckets = V_dyn_buckets;
+ if (V_ipfw_dyn_v != NULL)
+ free(V_ipfw_dyn_v, M_IPFW);
+ for (;;) {
+ V_ipfw_dyn_v = malloc(V_curr_dyn_buckets * sizeof(ipfw_dyn_rule *),
+ M_IPFW, M_NOWAIT | M_ZERO);
+ if (V_ipfw_dyn_v != NULL || V_curr_dyn_buckets <= 2)
+ break;
+ V_curr_dyn_buckets /= 2;
+ }
+}
+
+/**
+ * Install state of type 'type' for a dynamic session.
+ * The hash table contains three types of rules:
+ * - regular rules (O_KEEP_STATE)
+ * - rules for sessions with a limited number of sessions per user
+ * (O_LIMIT). When they are created, the parent is
+ * increased by 1, and decreased on delete. In this case,
+ * the third parameter is the parent rule and not the chain.
+ * - "parent" rules for the above (O_LIMIT_PARENT).
+ */
+static ipfw_dyn_rule *
+add_dyn_rule(struct ipfw_flow_id *id, u_int8_t dyn_type, struct ip_fw *rule)
+{
+ ipfw_dyn_rule *r;
+ int i;
+
+ IPFW_DYN_LOCK_ASSERT();
+
+ if (V_ipfw_dyn_v == NULL ||
+ (V_dyn_count == 0 && V_dyn_buckets != V_curr_dyn_buckets)) {
+ realloc_dynamic_table();
+ if (V_ipfw_dyn_v == NULL)
+ return NULL; /* failed ! */
+ }
+ i = hash_packet(id);
+
+ r = uma_zalloc(ipfw_dyn_rule_zone, M_NOWAIT | M_ZERO);
+ if (r == NULL) {
+ printf ("ipfw: sorry cannot allocate state\n");
+ return NULL;
+ }
+
+ /* increase refcount on parent, and set pointer */
+ if (dyn_type == O_LIMIT) {
+ ipfw_dyn_rule *parent = (ipfw_dyn_rule *)rule;
+ if ( parent->dyn_type != O_LIMIT_PARENT)
+ panic("invalid parent");
+ parent->count++;
+ r->parent = parent;
+ rule = parent->rule;
+ }
+
+ r->id = *id;
+ r->expire = time_uptime + V_dyn_syn_lifetime;
+ r->rule = rule;
+ r->dyn_type = dyn_type;
+ r->pcnt = r->bcnt = 0;
+ r->count = 0;
+
+ r->bucket = i;
+ r->next = V_ipfw_dyn_v[i];
+ V_ipfw_dyn_v[i] = r;
+ V_dyn_count++;
+ DEB({
+ struct in_addr da;
+#ifdef INET6
+ char src[INET6_ADDRSTRLEN];
+ char dst[INET6_ADDRSTRLEN];
+#else
+ char src[INET_ADDRSTRLEN];
+ char dst[INET_ADDRSTRLEN];
+#endif
+
+#ifdef INET6
+ if (IS_IP6_FLOW_ID(&(r->id))) {
+ ip6_sprintf(src, &r->id.src_ip6);
+ ip6_sprintf(dst, &r->id.dst_ip6);
+ } else
+#endif
+ {
+ da.s_addr = htonl(r->id.src_ip);
+ inet_ntop(AF_INET, &da, src, sizeof(src));
+ da.s_addr = htonl(r->id.dst_ip);
+ inet_ntop(AF_INET, &da, dst, sizeof(dst));
+ }
+ printf("ipfw: add dyn entry ty %d %s %d -> %s %d, total %d\n",
+ dyn_type, src, r->id.src_port, dst, r->id.dst_port,
+ V_dyn_count);
+ })
+ return r;
+}
+
+/**
+ * lookup dynamic parent rule using pkt and rule as search keys.
+ * If the lookup fails, then install one.
+ */
+static ipfw_dyn_rule *
+lookup_dyn_parent(struct ipfw_flow_id *pkt, struct ip_fw *rule)
+{
+ ipfw_dyn_rule *q;
+ int i;
+
+ IPFW_DYN_LOCK_ASSERT();
+
+ if (V_ipfw_dyn_v) {
+ int is_v6 = IS_IP6_FLOW_ID(pkt);
+ i = hash_packet( pkt );
+ for (q = V_ipfw_dyn_v[i] ; q != NULL ; q=q->next)
+ if (q->dyn_type == O_LIMIT_PARENT &&
+ rule== q->rule &&
+ pkt->proto == q->id.proto &&
+ pkt->src_port == q->id.src_port &&
+ pkt->dst_port == q->id.dst_port &&
+ (
+ (is_v6 &&
+ IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6),
+ &(q->id.src_ip6)) &&
+ IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6),
+ &(q->id.dst_ip6))) ||
+ (!is_v6 &&
+ pkt->src_ip == q->id.src_ip &&
+ pkt->dst_ip == q->id.dst_ip)
+ )
+ ) {
+ q->expire = time_uptime + V_dyn_short_lifetime;
+ DEB(printf("ipfw: lookup_dyn_parent found 0x%p\n",q);)
+ return q;
+ }
+ }
+ return add_dyn_rule(pkt, O_LIMIT_PARENT, rule);
+}
+
+/**
+ * Install dynamic state for rule type cmd->o.opcode
+ *
+ * Returns 1 (failure) if state is not installed because of errors or because
+ * session limitations are enforced.
+ */
+int
+ipfw_install_state(struct ip_fw *rule, ipfw_insn_limit *cmd,
+ struct ip_fw_args *args, uint32_t tablearg)
+{
+ static int last_log;
+ ipfw_dyn_rule *q;
+ struct in_addr da;
+#ifdef INET6
+ char src[INET6_ADDRSTRLEN + 2], dst[INET6_ADDRSTRLEN + 2];
+#else
+ char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN];
+#endif
+
+ src[0] = '\0';
+ dst[0] = '\0';
+
+ IPFW_DYN_LOCK();
+
+ DEB(
+#ifdef INET6
+ if (IS_IP6_FLOW_ID(&(args->f_id))) {
+ ip6_sprintf(src, &args->f_id.src_ip6);
+ ip6_sprintf(dst, &args->f_id.dst_ip6);
+ } else
+#endif
+ {
+ da.s_addr = htonl(args->f_id.src_ip);
+ inet_ntop(AF_INET, &da, src, sizeof(src));
+ da.s_addr = htonl(args->f_id.dst_ip);
+ inet_ntop(AF_INET, &da, dst, sizeof(dst));
+ }
+ printf("ipfw: %s: type %d %s %u -> %s %u\n",
+ __func__, cmd->o.opcode, src, args->f_id.src_port,
+ dst, args->f_id.dst_port);
+ src[0] = '\0';
+ dst[0] = '\0';
+ )
+
+ q = lookup_dyn_rule_locked(&args->f_id, NULL, NULL);
+
+ if (q != NULL) { /* should never occur */
+ DEB(
+ if (last_log != time_uptime) {
+ last_log = time_uptime;
+ printf("ipfw: %s: entry already present, done\n",
+ __func__);
+ })
+ IPFW_DYN_UNLOCK();
+ return (0);
+ }
+
+ if (V_dyn_count >= V_dyn_max)
+ /* Run out of slots, try to remove any expired rule. */
+ remove_dyn_rule(NULL, (ipfw_dyn_rule *)1);
+
+ if (V_dyn_count >= V_dyn_max) {
+ if (last_log != time_uptime) {
+ last_log = time_uptime;
+ printf("ipfw: %s: Too many dynamic rules\n", __func__);
+ }
+ IPFW_DYN_UNLOCK();
+ return (1); /* cannot install, notify caller */
+ }
+
+ switch (cmd->o.opcode) {
+ case O_KEEP_STATE: /* bidir rule */
+ add_dyn_rule(&args->f_id, O_KEEP_STATE, rule);
+ break;
+
+ case O_LIMIT: { /* limit number of sessions */
+ struct ipfw_flow_id id;
+ ipfw_dyn_rule *parent;
+ uint32_t conn_limit;
+ uint16_t limit_mask = cmd->limit_mask;
+
+ conn_limit = (cmd->conn_limit == IP_FW_TABLEARG) ?
+ tablearg : cmd->conn_limit;
+
+ DEB(
+ if (cmd->conn_limit == IP_FW_TABLEARG)
+ printf("ipfw: %s: O_LIMIT rule, conn_limit: %u "
+ "(tablearg)\n", __func__, conn_limit);
+ else
+ printf("ipfw: %s: O_LIMIT rule, conn_limit: %u\n",
+ __func__, conn_limit);
+ )
+
+ id.dst_ip = id.src_ip = id.dst_port = id.src_port = 0;
+ id.proto = args->f_id.proto;
+ id.addr_type = args->f_id.addr_type;
+ id.fib = M_GETFIB(args->m);
+
+ if (IS_IP6_FLOW_ID (&(args->f_id))) {
+ if (limit_mask & DYN_SRC_ADDR)
+ id.src_ip6 = args->f_id.src_ip6;
+ if (limit_mask & DYN_DST_ADDR)
+ id.dst_ip6 = args->f_id.dst_ip6;
+ } else {
+ if (limit_mask & DYN_SRC_ADDR)
+ id.src_ip = args->f_id.src_ip;
+ if (limit_mask & DYN_DST_ADDR)
+ id.dst_ip = args->f_id.dst_ip;
+ }
+ if (limit_mask & DYN_SRC_PORT)
+ id.src_port = args->f_id.src_port;
+ if (limit_mask & DYN_DST_PORT)
+ id.dst_port = args->f_id.dst_port;
+ if ((parent = lookup_dyn_parent(&id, rule)) == NULL) {
+ printf("ipfw: %s: add parent failed\n", __func__);
+ IPFW_DYN_UNLOCK();
+ return (1);
+ }
+
+ if (parent->count >= conn_limit) {
+ /* See if we can remove some expired rule. */
+ remove_dyn_rule(rule, parent);
+ if (parent->count >= conn_limit) {
+ if (V_fw_verbose && last_log != time_uptime) {
+ last_log = time_uptime;
+#ifdef INET6
+ /*
+ * XXX IPv6 flows are not
+ * supported yet.
+ */
+ if (IS_IP6_FLOW_ID(&(args->f_id))) {
+ char ip6buf[INET6_ADDRSTRLEN];
+ snprintf(src, sizeof(src),
+ "[%s]", ip6_sprintf(ip6buf,
+ &args->f_id.src_ip6));
+ snprintf(dst, sizeof(dst),
+ "[%s]", ip6_sprintf(ip6buf,
+ &args->f_id.dst_ip6));
+ } else
+#endif
+ {
+ da.s_addr =
+ htonl(args->f_id.src_ip);
+ inet_ntop(AF_INET, &da, src,
+ sizeof(src));
+ da.s_addr =
+ htonl(args->f_id.dst_ip);
+ inet_ntop(AF_INET, &da, dst,
+ sizeof(dst));
+ }
+ log(LOG_SECURITY | LOG_DEBUG,
+ "ipfw: %d %s %s:%u -> %s:%u, %s\n",
+ parent->rule->rulenum,
+ "drop session",
+ src, (args->f_id.src_port),
+ dst, (args->f_id.dst_port),
+ "too many entries");
+ }
+ IPFW_DYN_UNLOCK();
+ return (1);
+ }
+ }
+ add_dyn_rule(&args->f_id, O_LIMIT, (struct ip_fw *)parent);
+ break;
+ }
+ default:
+ printf("ipfw: %s: unknown dynamic rule type %u\n",
+ __func__, cmd->o.opcode);
+ IPFW_DYN_UNLOCK();
+ return (1);
+ }
+
+ /* XXX just set lifetime */
+ lookup_dyn_rule_locked(&args->f_id, NULL, NULL);
+
+ IPFW_DYN_UNLOCK();
+ return (0);
+}
+
+/*
+ * Generate a TCP packet, containing either a RST or a keepalive.
+ * When flags & TH_RST, we are sending a RST packet because a
+ * "reset" action matched the packet.
+ * Otherwise we are sending a keepalive with the ACK flag set and
+ * the sequence/ack numbers supplied by the caller.
+ * The 'replyto' mbuf is the mbuf being replied to, if any, and is required
+ * so that MAC can label the reply appropriately.
+ */
+struct mbuf *
+ipfw_send_pkt(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t seq,
+ u_int32_t ack, int flags)
+{
+ struct mbuf *m = NULL; /* stupid compiler */
+ int len, dir;
+ struct ip *h = NULL; /* stupid compiler */
+#ifdef INET6
+ struct ip6_hdr *h6 = NULL;
+#endif
+ struct tcphdr *th = NULL;
+
+ MGETHDR(m, M_DONTWAIT, MT_DATA);
+ if (m == NULL)
+ return (NULL);
+
+ M_SETFIB(m, id->fib);
+#ifdef MAC
+ if (replyto != NULL)
+ mac_netinet_firewall_reply(replyto, m);
+ else
+ mac_netinet_firewall_send(m);
+#else
+ (void)replyto; /* don't warn about unused arg */
+#endif
+
+ switch (id->addr_type) {
+ case 4:
+ len = sizeof(struct ip) + sizeof(struct tcphdr);
+ break;
+#ifdef INET6
+ case 6:
+ len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
+ break;
+#endif
+ default:
+ /* XXX: log me?!? */
+ FREE_PKT(m);
+ return (NULL);
+ }
+ dir = ((flags & (TH_SYN | TH_RST)) == TH_SYN);
+
+ m->m_data += max_linkhdr;
+ m->m_flags |= M_SKIP_FIREWALL;
+ m->m_pkthdr.len = m->m_len = len;
+ m->m_pkthdr.rcvif = NULL;
+ bzero(m->m_data, len);
+
+ switch (id->addr_type) {
+ case 4:
+ h = mtod(m, struct ip *);
+
+ /* prepare for checksum */
+ h->ip_p = IPPROTO_TCP;
+ h->ip_len = htons(sizeof(struct tcphdr));
+ if (dir) {
+ h->ip_src.s_addr = htonl(id->src_ip);
+ h->ip_dst.s_addr = htonl(id->dst_ip);
+ } else {
+ h->ip_src.s_addr = htonl(id->dst_ip);
+ h->ip_dst.s_addr = htonl(id->src_ip);
+ }
+
+ th = (struct tcphdr *)(h + 1);
+ break;
+#ifdef INET6
+ case 6:
+ h6 = mtod(m, struct ip6_hdr *);
+
+ /* prepare for checksum */
+ h6->ip6_nxt = IPPROTO_TCP;
+ h6->ip6_plen = htons(sizeof(struct tcphdr));
+ if (dir) {
+ h6->ip6_src = id->src_ip6;
+ h6->ip6_dst = id->dst_ip6;
+ } else {
+ h6->ip6_src = id->dst_ip6;
+ h6->ip6_dst = id->src_ip6;
+ }
+
+ th = (struct tcphdr *)(h6 + 1);
+ break;
+#endif
+ }
+
+ if (dir) {
+ th->th_sport = htons(id->src_port);
+ th->th_dport = htons(id->dst_port);
+ } else {
+ th->th_sport = htons(id->dst_port);
+ th->th_dport = htons(id->src_port);
+ }
+ th->th_off = sizeof(struct tcphdr) >> 2;
+
+ if (flags & TH_RST) {
+ if (flags & TH_ACK) {
+ th->th_seq = htonl(ack);
+ th->th_flags = TH_RST;
+ } else {
+ if (flags & TH_SYN)
+ seq++;
+ th->th_ack = htonl(seq);
+ th->th_flags = TH_RST | TH_ACK;
+ }
+ } else {
+ /*
+ * Keepalive - use caller provided sequence numbers
+ */
+ th->th_seq = htonl(seq);
+ th->th_ack = htonl(ack);
+ th->th_flags = TH_ACK;
+ }
+
+ switch (id->addr_type) {
+ case 4:
+ th->th_sum = in_cksum(m, len);
+
+ /* finish the ip header */
+ h->ip_v = 4;
+ h->ip_hl = sizeof(*h) >> 2;
+ h->ip_tos = IPTOS_LOWDELAY;
+ h->ip_off = 0;
+ /* ip_len must be in host format for ip_output */
+ h->ip_len = len;
+ h->ip_ttl = V_ip_defttl;
+ h->ip_sum = 0;
+ break;
+#ifdef INET6
+ case 6:
+ th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(*h6),
+ sizeof(struct tcphdr));
+
+ /* finish the ip6 header */
+ h6->ip6_vfc |= IPV6_VERSION;
+ h6->ip6_hlim = IPV6_DEFHLIM;
+ break;
+#endif
+ }
+
+ return (m);
+}
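A minimal sketch of how a caller typically turns a matched TCP segment into a RST with ipfw_send_pkt(); the real caller lives in the "reset" action path of ip_fw2.c, and the local names (args, ip, tcp) are illustrative only:

	struct ip *ip = mtod(args->m, struct ip *);
	struct tcphdr *tcp = (struct tcphdr *)((u_int32_t *)ip + ip->ip_hl);
	struct mbuf *r;

	/* pass the original flags plus TH_RST; seq/ack come from the segment */
	r = ipfw_send_pkt(args->m, &args->f_id, ntohl(tcp->th_seq),
	    ntohl(tcp->th_ack), tcp->th_flags | TH_RST);
	if (r != NULL)
		ip_output(r, NULL, NULL, 0, NULL, NULL);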
+
+/*
+ * This procedure is only used to handle keepalives. It is invoked
+ * every dyn_keepalive_period seconds.
+ */
+static void
+ipfw_tick(void * vnetx)
+{
+ struct mbuf *m0, *m, *mnext, **mtailp;
+#ifdef INET6
+ struct mbuf *m6, **m6_tailp;
+#endif
+ int i;
+ ipfw_dyn_rule *q;
+#ifdef VIMAGE
+ struct vnet *vp = vnetx;
+#endif
+
+ CURVNET_SET(vp);
+ if (V_dyn_keepalive == 0 || V_ipfw_dyn_v == NULL || V_dyn_count == 0)
+ goto done;
+
+ /*
+ * We build a chain of packets here and transmit them only after
+ * the IPFW dynamic rule lock has been dropped; sending while
+ * holding the lock would cause a lock order reversal with the
+ * normal packet input -> ipfw call stack.
+ */
+ m0 = NULL;
+ mtailp = &m0;
+#ifdef INET6
+ m6 = NULL;
+ m6_tailp = &m6;
+#endif
+ IPFW_DYN_LOCK();
+ for (i = 0 ; i < V_curr_dyn_buckets ; i++) {
+ for (q = V_ipfw_dyn_v[i] ; q ; q = q->next ) {
+ if (q->dyn_type == O_LIMIT_PARENT)
+ continue;
+ if (q->id.proto != IPPROTO_TCP)
+ continue;
+ if ( (q->state & BOTH_SYN) != BOTH_SYN)
+ continue;
+ if (TIME_LEQ(time_uptime + V_dyn_keepalive_interval,
+ q->expire))
+ continue; /* too early */
+ if (TIME_LEQ(q->expire, time_uptime))
+ continue; /* too late, rule expired */
+
+ m = (q->state & ACK_REV) ? NULL :
+ ipfw_send_pkt(NULL, &(q->id), q->ack_rev - 1,
+ q->ack_fwd, TH_SYN);
+ mnext = (q->state & ACK_FWD) ? NULL :
+ ipfw_send_pkt(NULL, &(q->id), q->ack_fwd - 1,
+ q->ack_rev, 0);
+
+ switch (q->id.addr_type) {
+ case 4:
+ if (m != NULL) {
+ *mtailp = m;
+ mtailp = &(*mtailp)->m_nextpkt;
+ }
+ if (mnext != NULL) {
+ *mtailp = mnext;
+ mtailp = &(*mtailp)->m_nextpkt;
+ }
+ break;
+#ifdef INET6
+ case 6:
+ if (m != NULL) {
+ *m6_tailp = m;
+ m6_tailp = &(*m6_tailp)->m_nextpkt;
+ }
+ if (mnext != NULL) {
+ *m6_tailp = mnext;
+ m6_tailp = &(*m6_tailp)->m_nextpkt;
+ }
+ break;
+#endif
+ }
+ }
+ }
+ IPFW_DYN_UNLOCK();
+ for (m = m0; m != NULL; m = mnext) {
+ mnext = m->m_nextpkt;
+ m->m_nextpkt = NULL;
+ ip_output(m, NULL, NULL, 0, NULL, NULL);
+ }
+#ifdef INET6
+ for (m = m6; m != NULL; m = mnext) {
+ mnext = m->m_nextpkt;
+ m->m_nextpkt = NULL;
+ ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL);
+ }
+#endif
+done:
+ callout_reset_on(&V_ipfw_timeout, V_dyn_keepalive_period * hz,
+ ipfw_tick, vnetx, 0);
+ CURVNET_RESTORE();
+}
+
+void
+ipfw_dyn_attach(void)
+{
+ ipfw_dyn_rule_zone = uma_zcreate("IPFW dynamic rule",
+ sizeof(ipfw_dyn_rule), NULL, NULL, NULL, NULL,
+ UMA_ALIGN_PTR, 0);
+
+ IPFW_DYN_LOCK_INIT();
+}
+
+void
+ipfw_dyn_detach(void)
+{
+ uma_zdestroy(ipfw_dyn_rule_zone);
+ IPFW_DYN_LOCK_DESTROY();
+}
+
+void
+ipfw_dyn_init(void)
+{
+ V_ipfw_dyn_v = NULL;
+ V_dyn_buckets = 256; /* must be power of 2 */
+ V_curr_dyn_buckets = 256; /* must be power of 2 */
+
+ V_dyn_ack_lifetime = 300;
+ V_dyn_syn_lifetime = 20;
+ V_dyn_fin_lifetime = 1;
+ V_dyn_rst_lifetime = 1;
+ V_dyn_udp_lifetime = 10;
+ V_dyn_short_lifetime = 5;
+
+ V_dyn_keepalive_interval = 20;
+ V_dyn_keepalive_period = 5;
+ V_dyn_keepalive = 1; /* do send keepalives */
+
+ V_dyn_max = 4096; /* max # of dynamic rules */
+ callout_init(&V_ipfw_timeout, CALLOUT_MPSAFE);
+ callout_reset_on(&V_ipfw_timeout, hz, ipfw_tick, curvnet, 0);
+}
+
+void
+ipfw_dyn_uninit(int pass)
+{
+ if (pass == 0)
+ callout_drain(&V_ipfw_timeout);
+ else {
+ if (V_ipfw_dyn_v != NULL)
+ free(V_ipfw_dyn_v, M_IPFW);
+ }
+}
+
+int
+ipfw_dyn_len(void)
+{
+ return (V_ipfw_dyn_v == NULL) ? 0 :
+ (V_dyn_count * sizeof(ipfw_dyn_rule));
+}
+
+void
+ipfw_get_dynamic(char **pbp, const char *ep)
+{
+ ipfw_dyn_rule *p, *last = NULL;
+ char *bp;
+ int i;
+
+ if (V_ipfw_dyn_v == NULL)
+ return;
+ bp = *pbp;
+
+ IPFW_DYN_LOCK();
+ for (i = 0 ; i < V_curr_dyn_buckets; i++)
+ for (p = V_ipfw_dyn_v[i] ; p != NULL; p = p->next) {
+ if (bp + sizeof *p <= ep) {
+ ipfw_dyn_rule *dst =
+ (ipfw_dyn_rule *)bp;
+ bcopy(p, dst, sizeof *p);
+ bcopy(&(p->rule->rulenum), &(dst->rule),
+ sizeof(p->rule->rulenum));
+ /*
+ * store set number into high word of
+ * dst->rule pointer.
+ */
+ bcopy(&(p->rule->set),
+ (char *)&dst->rule +
+ sizeof(p->rule->rulenum),
+ sizeof(p->rule->set));
+ /*
+ * store a non-null value in "next".
+ * The userland code will interpret a
+ * NULL here as a marker
+ * for the last dynamic rule.
+ */
+ bcopy(&dst, &dst->next, sizeof(dst));
+ last = dst;
+ dst->expire =
+ TIME_LEQ(dst->expire, time_uptime) ?
+ 0 : dst->expire - time_uptime ;
+ bp += sizeof(ipfw_dyn_rule);
+ }
+ }
+ IPFW_DYN_UNLOCK();
+ if (last != NULL) /* mark last dynamic rule */
+ bzero(&last->next, sizeof(last));
+ *pbp = bp;
+}
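The packing performed above (rule number and set number copied into the exported 'rule' pointer field, plus a non-NULL 'next' acting as a not-yet-last marker) implies a matching unpacking step in the consumer. A hedged userland-side sketch, assuming the rulenum/set field widths of struct ip_fw and with 'p' standing for a cursor into the copied buffer:

	ipfw_dyn_rule *d = (ipfw_dyn_rule *)p;
	uint16_t rulenum;
	uint8_t set;

	bcopy(&d->rule, &rulenum, sizeof(rulenum));
	bcopy((char *)&d->rule + sizeof(rulenum), &set, sizeof(set));
	/* d->expire is already relative to the time of the dump */
	if (d->next == NULL)
		/* this was the last dynamic rule */ ;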
+/* end of file */
diff --git a/sys/netpfil/ipfw/ip_fw_log.c b/sys/netpfil/ipfw/ip_fw_log.c
new file mode 100644
index 0000000..3cbb5fa
--- /dev/null
+++ b/sys/netpfil/ipfw/ip_fw_log.c
@@ -0,0 +1,553 @@
+/*-
+ * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ * Logging support for ipfw
+ */
+
+#include "opt_ipfw.h"
+#include "opt_inet.h"
+#ifndef INET
+#error IPFIREWALL requires INET.
+#endif /* INET */
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
+#include <net/ethernet.h> /* for ETHERTYPE_IP */
+#include <net/if.h>
+#include <net/if_clone.h>
+#include <net/vnet.h>
+#include <net/if_types.h> /* for IFT_PFLOG */
+#include <net/bpf.h> /* for BPF */
+
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip_fw.h>
+#include <netinet/tcp_var.h>
+#include <netinet/udp.h>
+
+#include <netinet/ip6.h>
+#include <netinet/icmp6.h>
+#ifdef INET6
+#include <netinet6/in6_var.h> /* ip6_sprintf() */
+#endif
+
+#include <netpfil/ipfw/ip_fw_private.h>
+
+#ifdef MAC
+#include <security/mac/mac_framework.h>
+#endif
+
+/*
+ * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T.
+ * The other macros just cast void * into the appropriate type.
+ */
+#define L3HDR(T, ip) ((T *)((u_int32_t *)(ip) + (ip)->ip_hl))
+#define TCP(p) ((struct tcphdr *)(p))
+#define SCTP(p) ((struct sctphdr *)(p))
+#define UDP(p) ((struct udphdr *)(p))
+#define ICMP(p) ((struct icmphdr *)(p))
+#define ICMP6(p) ((struct icmp6_hdr *)(p))
+
+#define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0
+#define SNP(buf) buf, sizeof(buf)
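SNPARGS() turns a (buffer, offset) pair into the (pointer, space left) arguments that snprintf() expects, passing a size of 0 once the offset reaches the end of the buffer; this is what lets the logging code below append fields piecewise without overrunning. A minimal usage sketch with made-up values:

	char proto[128];
	int len;

	len = snprintf(SNPARGS(proto, 0), "TCP %s", "192.0.2.1");
	snprintf(SNPARGS(proto, len), ":%d", 80);	/* appended after the first part */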
+
+#ifdef WITHOUT_BPF
+void
+ipfw_log_bpf(int onoff)
+{
+}
+#else /* !WITHOUT_BPF */
+static struct ifnet *log_if; /* hook to attach to bpf */
+static struct rwlock log_if_lock;
+#define LOGIF_LOCK_INIT(x) rw_init(&log_if_lock, "ipfw log_if lock")
+#define LOGIF_LOCK_DESTROY(x) rw_destroy(&log_if_lock)
+#define LOGIF_RLOCK(x) rw_rlock(&log_if_lock)
+#define LOGIF_RUNLOCK(x) rw_runlock(&log_if_lock)
+#define LOGIF_WLOCK(x) rw_wlock(&log_if_lock)
+#define LOGIF_WUNLOCK(x) rw_wunlock(&log_if_lock)
+
+#define IPFWNAME "ipfw"
+
+/* we use this dummy function for all ifnet callbacks */
+static int
+log_dummy(struct ifnet *ifp, u_long cmd, caddr_t addr)
+{
+ return EINVAL;
+}
+
+static int
+ipfw_log_output(struct ifnet *ifp, struct mbuf *m,
+ struct sockaddr *dst, struct route *ro)
+{
+ if (m != NULL)
+ FREE_PKT(m);
+ return EINVAL;
+}
+
+static void
+ipfw_log_start(struct ifnet* ifp)
+{
+ panic("ipfw_log_start() must not be called");
+}
+
+static const u_char ipfwbroadcastaddr[6] =
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
+
+static int
+ipfw_log_clone_match(struct if_clone *ifc, const char *name)
+{
+
+ return (strncmp(name, IPFWNAME, sizeof(IPFWNAME) - 1) == 0);
+}
+
+static int
+ipfw_log_clone_create(struct if_clone *ifc, char *name, size_t len,
+ caddr_t params)
+{
+ int error;
+ int unit;
+ struct ifnet *ifp;
+
+ error = ifc_name2unit(name, &unit);
+ if (error)
+ return (error);
+
+ error = ifc_alloc_unit(ifc, &unit);
+ if (error)
+ return (error);
+
+ ifp = if_alloc(IFT_PFLOG);
+ if (ifp == NULL) {
+ ifc_free_unit(ifc, unit);
+ return (ENOSPC);
+ }
+ ifp->if_dname = IPFWNAME;
+ ifp->if_dunit = unit;
+ snprintf(ifp->if_xname, IFNAMSIZ, "%s%d", IPFWNAME, unit);
+ strlcpy(name, ifp->if_xname, len);
+ ifp->if_mtu = 65536;
+ ifp->if_flags = IFF_UP | IFF_SIMPLEX | IFF_MULTICAST;
+ ifp->if_init = (void *)log_dummy;
+ ifp->if_ioctl = log_dummy;
+ ifp->if_start = ipfw_log_start;
+ ifp->if_output = ipfw_log_output;
+ ifp->if_addrlen = 6;
+ ifp->if_hdrlen = 14;
+ ifp->if_broadcastaddr = ipfwbroadcastaddr;
+ ifp->if_baudrate = IF_Mbps(10);
+
+ LOGIF_WLOCK();
+ if (log_if == NULL)
+ log_if = ifp;
+ else {
+ LOGIF_WUNLOCK();
+ if_free(ifp);
+ ifc_free_unit(ifc, unit);
+ return (EEXIST);
+ }
+ LOGIF_WUNLOCK();
+ if_attach(ifp);
+ bpfattach(ifp, DLT_EN10MB, 14);
+
+ return (0);
+}
+
+static int
+ipfw_log_clone_destroy(struct if_clone *ifc, struct ifnet *ifp)
+{
+ int unit;
+
+ if (ifp == NULL)
+ return (0);
+
+ LOGIF_WLOCK();
+ if (log_if != NULL && ifp == log_if)
+ log_if = NULL;
+ else {
+ LOGIF_WUNLOCK();
+ return (EINVAL);
+ }
+ LOGIF_WUNLOCK();
+
+ unit = ifp->if_dunit;
+ bpfdetach(ifp);
+ if_detach(ifp);
+ if_free(ifp);
+ ifc_free_unit(ifc, unit);
+
+ return (0);
+}
+
+static struct if_clone ipfw_log_cloner = IFC_CLONE_INITIALIZER(
+ IPFWNAME, NULL, IF_MAXUNIT,
+ NULL, ipfw_log_clone_match, ipfw_log_clone_create, ipfw_log_clone_destroy);
+
+void
+ipfw_log_bpf(int onoff)
+{
+
+ if (onoff) {
+ LOGIF_LOCK_INIT();
+ if_clone_attach(&ipfw_log_cloner);
+ } else {
+ if_clone_detach(&ipfw_log_cloner);
+ LOGIF_LOCK_DESTROY();
+ }
+}
+#endif /* !WITHOUT_BPF */
+
+/*
+ * We enter here when we have a rule with O_LOG.
+ * XXX this function alone takes about 2Kbytes of code!
+ */
+void
+ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args,
+ struct mbuf *m, struct ifnet *oif, u_short offset, uint32_t tablearg,
+ struct ip *ip)
+{
+ char *action;
+ int limit_reached = 0;
+ char action2[92], proto[128], fragment[32];
+
+ if (V_fw_verbose == 0) {
+#ifndef WITHOUT_BPF
+ LOGIF_RLOCK();
+ if (log_if == NULL || log_if->if_bpf == NULL) {
+ LOGIF_RUNLOCK();
+ return;
+ }
+
+ if (args->eh) /* layer2, use orig hdr */
+ BPF_MTAP2(log_if, args->eh, ETHER_HDR_LEN, m);
+ else
+ /* Add a fake Ethernet header (dummy addresses,
+ * ETHERTYPE_IP). Later we will store more info
+ * in the header.
+ */
+ BPF_MTAP2(log_if, "DDDDDDSSSSSS\x08\x00", ETHER_HDR_LEN, m);
+ LOGIF_RUNLOCK();
+#endif /* !WITHOUT_BPF */
+ return;
+ }
+ /* the old 'log' function */
+ fragment[0] = '\0';
+ proto[0] = '\0';
+
+ if (f == NULL) { /* bogus pkt */
+ if (V_verbose_limit != 0 && V_norule_counter >= V_verbose_limit)
+ return;
+ V_norule_counter++;
+ if (V_norule_counter == V_verbose_limit)
+ limit_reached = V_verbose_limit;
+ action = "Refuse";
+ } else { /* O_LOG is the first action, find the real one */
+ ipfw_insn *cmd = ACTION_PTR(f);
+ ipfw_insn_log *l = (ipfw_insn_log *)cmd;
+
+ if (l->max_log != 0 && l->log_left == 0)
+ return;
+ l->log_left--;
+ if (l->log_left == 0)
+ limit_reached = l->max_log;
+ cmd += F_LEN(cmd); /* point to first action */
+ if (cmd->opcode == O_ALTQ) {
+ ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd;
+
+ snprintf(SNPARGS(action2, 0), "Altq %d",
+ altq->qid);
+ cmd += F_LEN(cmd);
+ }
+ if (cmd->opcode == O_PROB)
+ cmd += F_LEN(cmd);
+
+ if (cmd->opcode == O_TAG)
+ cmd += F_LEN(cmd);
+
+ action = action2;
+ switch (cmd->opcode) {
+ case O_DENY:
+ action = "Deny";
+ break;
+
+ case O_REJECT:
+ if (cmd->arg1==ICMP_REJECT_RST)
+ action = "Reset";
+ else if (cmd->arg1==ICMP_UNREACH_HOST)
+ action = "Reject";
+ else
+ snprintf(SNPARGS(action2, 0), "Unreach %d",
+ cmd->arg1);
+ break;
+
+ case O_UNREACH6:
+ if (cmd->arg1==ICMP6_UNREACH_RST)
+ action = "Reset";
+ else
+ snprintf(SNPARGS(action2, 0), "Unreach %d",
+ cmd->arg1);
+ break;
+
+ case O_ACCEPT:
+ action = "Accept";
+ break;
+ case O_COUNT:
+ action = "Count";
+ break;
+ case O_DIVERT:
+ snprintf(SNPARGS(action2, 0), "Divert %d",
+ cmd->arg1);
+ break;
+ case O_TEE:
+ snprintf(SNPARGS(action2, 0), "Tee %d",
+ cmd->arg1);
+ break;
+ case O_SETFIB:
+ snprintf(SNPARGS(action2, 0), "SetFib %d",
+ cmd->arg1);
+ break;
+ case O_SKIPTO:
+ snprintf(SNPARGS(action2, 0), "SkipTo %d",
+ cmd->arg1);
+ break;
+ case O_PIPE:
+ snprintf(SNPARGS(action2, 0), "Pipe %d",
+ cmd->arg1);
+ break;
+ case O_QUEUE:
+ snprintf(SNPARGS(action2, 0), "Queue %d",
+ cmd->arg1);
+ break;
+ case O_FORWARD_IP: {
+ ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd;
+ int len;
+ struct in_addr dummyaddr;
+ if (sa->sa.sin_addr.s_addr == INADDR_ANY)
+ dummyaddr.s_addr = htonl(tablearg);
+ else
+ dummyaddr.s_addr = sa->sa.sin_addr.s_addr;
+
+ len = snprintf(SNPARGS(action2, 0), "Forward to %s",
+ inet_ntoa(dummyaddr));
+
+ if (sa->sa.sin_port)
+ snprintf(SNPARGS(action2, len), ":%d",
+ sa->sa.sin_port);
+ }
+ break;
+#ifdef INET6
+ case O_FORWARD_IP6: {
+ char buf[INET6_ADDRSTRLEN];
+ ipfw_insn_sa6 *sa = (ipfw_insn_sa6 *)cmd;
+ int len;
+
+ len = snprintf(SNPARGS(action2, 0), "Forward to [%s]",
+ ip6_sprintf(buf, &sa->sa.sin6_addr));
+
+ if (sa->sa.sin6_port)
+ snprintf(SNPARGS(action2, len), ":%u",
+ sa->sa.sin6_port);
+ }
+ break;
+#endif
+ case O_NETGRAPH:
+ snprintf(SNPARGS(action2, 0), "Netgraph %d",
+ cmd->arg1);
+ break;
+ case O_NGTEE:
+ snprintf(SNPARGS(action2, 0), "Ngtee %d",
+ cmd->arg1);
+ break;
+ case O_NAT:
+ action = "Nat";
+ break;
+ case O_REASS:
+ action = "Reass";
+ break;
+ case O_CALLRETURN:
+ if (cmd->len & F_NOT)
+ action = "Return";
+ else
+ snprintf(SNPARGS(action2, 0), "Call %d",
+ cmd->arg1);
+ break;
+ default:
+ action = "UNKNOWN";
+ break;
+ }
+ }
+
+ if (hlen == 0) { /* non-ip */
+ snprintf(SNPARGS(proto, 0), "MAC");
+
+ } else {
+ int len;
+#ifdef INET6
+ char src[INET6_ADDRSTRLEN + 2], dst[INET6_ADDRSTRLEN + 2];
+#else
+ char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN];
+#endif
+ struct icmphdr *icmp;
+ struct tcphdr *tcp;
+ struct udphdr *udp;
+#ifdef INET6
+ struct ip6_hdr *ip6 = NULL;
+ struct icmp6_hdr *icmp6;
+ u_short ip6f_mf;
+#endif
+ src[0] = '\0';
+ dst[0] = '\0';
+#ifdef INET6
+ ip6f_mf = offset & IP6F_MORE_FRAG;
+ offset &= IP6F_OFF_MASK;
+
+ if (IS_IP6_FLOW_ID(&(args->f_id))) {
+ char ip6buf[INET6_ADDRSTRLEN];
+ snprintf(src, sizeof(src), "[%s]",
+ ip6_sprintf(ip6buf, &args->f_id.src_ip6));
+ snprintf(dst, sizeof(dst), "[%s]",
+ ip6_sprintf(ip6buf, &args->f_id.dst_ip6));
+
+ ip6 = (struct ip6_hdr *)ip;
+ tcp = (struct tcphdr *)(((char *)ip) + hlen);
+ udp = (struct udphdr *)(((char *)ip) + hlen);
+ } else
+#endif
+ {
+ tcp = L3HDR(struct tcphdr, ip);
+ udp = L3HDR(struct udphdr, ip);
+
+ inet_ntop(AF_INET, &ip->ip_src, src, sizeof(src));
+ inet_ntop(AF_INET, &ip->ip_dst, dst, sizeof(dst));
+ }
+
+ switch (args->f_id.proto) {
+ case IPPROTO_TCP:
+ len = snprintf(SNPARGS(proto, 0), "TCP %s", src);
+ if (offset == 0)
+ snprintf(SNPARGS(proto, len), ":%d %s:%d",
+ ntohs(tcp->th_sport),
+ dst,
+ ntohs(tcp->th_dport));
+ else
+ snprintf(SNPARGS(proto, len), " %s", dst);
+ break;
+
+ case IPPROTO_UDP:
+ len = snprintf(SNPARGS(proto, 0), "UDP %s", src);
+ if (offset == 0)
+ snprintf(SNPARGS(proto, len), ":%d %s:%d",
+ ntohs(udp->uh_sport),
+ dst,
+ ntohs(udp->uh_dport));
+ else
+ snprintf(SNPARGS(proto, len), " %s", dst);
+ break;
+
+ case IPPROTO_ICMP:
+ icmp = L3HDR(struct icmphdr, ip);
+ if (offset == 0)
+ len = snprintf(SNPARGS(proto, 0),
+ "ICMP:%u.%u ",
+ icmp->icmp_type, icmp->icmp_code);
+ else
+ len = snprintf(SNPARGS(proto, 0), "ICMP ");
+ len += snprintf(SNPARGS(proto, len), "%s", src);
+ snprintf(SNPARGS(proto, len), " %s", dst);
+ break;
+#ifdef INET6
+ case IPPROTO_ICMPV6:
+ icmp6 = (struct icmp6_hdr *)(((char *)ip) + hlen);
+ if (offset == 0)
+ len = snprintf(SNPARGS(proto, 0),
+ "ICMPv6:%u.%u ",
+ icmp6->icmp6_type, icmp6->icmp6_code);
+ else
+ len = snprintf(SNPARGS(proto, 0), "ICMPv6 ");
+ len += snprintf(SNPARGS(proto, len), "%s", src);
+ snprintf(SNPARGS(proto, len), " %s", dst);
+ break;
+#endif
+ default:
+ len = snprintf(SNPARGS(proto, 0), "P:%d %s",
+ args->f_id.proto, src);
+ snprintf(SNPARGS(proto, len), " %s", dst);
+ break;
+ }
+
+#ifdef INET6
+ if (IS_IP6_FLOW_ID(&(args->f_id))) {
+ if (offset & (IP6F_OFF_MASK | IP6F_MORE_FRAG))
+ snprintf(SNPARGS(fragment, 0),
+ " (frag %08x:%d@%d%s)",
+ args->f_id.extra,
+ ntohs(ip6->ip6_plen) - hlen,
+ ntohs(offset) << 3, ip6f_mf ? "+" : "");
+ } else
+#endif
+ {
+ int ipoff, iplen;
+ ipoff = ntohs(ip->ip_off);
+ iplen = ntohs(ip->ip_len);
+ if (ipoff & (IP_MF | IP_OFFMASK))
+ snprintf(SNPARGS(fragment, 0),
+ " (frag %d:%d@%d%s)",
+ ntohs(ip->ip_id), iplen - (ip->ip_hl << 2),
+ offset << 3,
+ (ipoff & IP_MF) ? "+" : "");
+ }
+ }
+#ifdef __FreeBSD__
+ if (oif || m->m_pkthdr.rcvif)
+ log(LOG_SECURITY | LOG_INFO,
+ "ipfw: %d %s %s %s via %s%s\n",
+ f ? f->rulenum : -1,
+ action, proto, oif ? "out" : "in",
+ oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname,
+ fragment);
+ else
+#endif
+ log(LOG_SECURITY | LOG_INFO,
+ "ipfw: %d %s %s [no if info]%s\n",
+ f ? f->rulenum : -1,
+ action, proto, fragment);
+ if (limit_reached)
+ log(LOG_SECURITY | LOG_NOTICE,
+ "ipfw: limit %d reached on entry %d\n",
+ limit_reached, f ? f->rulenum : -1);
+}
+/* end of file */
diff --git a/sys/netpfil/ipfw/ip_fw_nat.c b/sys/netpfil/ipfw/ip_fw_nat.c
new file mode 100644
index 0000000..6318633
--- /dev/null
+++ b/sys/netpfil/ipfw/ip_fw_nat.c
@@ -0,0 +1,662 @@
+/*-
+ * Copyright (c) 2008 Paolo Pisati
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/eventhandler.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/rwlock.h>
+
+#define IPFW_INTERNAL /* Access to protected data structures in ip_fw.h. */
+
+#include <netinet/libalias/alias.h>
+#include <netinet/libalias/alias_local.h>
+
+#include <net/if.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip_fw.h>
+#include <netinet/tcp.h>
+#include <netinet/udp.h>
+
+#include <netpfil/ipfw/ip_fw_private.h>
+
+#include <machine/in_cksum.h> /* XXX for in_cksum */
+
+static VNET_DEFINE(eventhandler_tag, ifaddr_event_tag);
+#define V_ifaddr_event_tag VNET(ifaddr_event_tag)
+
+static void
+ifaddr_change(void *arg __unused, struct ifnet *ifp)
+{
+ struct cfg_nat *ptr;
+ struct ifaddr *ifa;
+ struct ip_fw_chain *chain;
+
+ chain = &V_layer3_chain;
+ IPFW_WLOCK(chain);
+ /* Check every nat entry... */
+ LIST_FOREACH(ptr, &chain->nat, _next) {
+ /* ...using nic 'ifp->if_xname' as dynamic alias address. */
+ if (strncmp(ptr->if_name, ifp->if_xname, IF_NAMESIZE) != 0)
+ continue;
+ if_addr_rlock(ifp);
+ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+ if (ifa->ifa_addr == NULL)
+ continue;
+ if (ifa->ifa_addr->sa_family != AF_INET)
+ continue;
+ ptr->ip = ((struct sockaddr_in *)
+ (ifa->ifa_addr))->sin_addr;
+ LibAliasSetAddress(ptr->lib, ptr->ip);
+ }
+ if_addr_runlock(ifp);
+ }
+ IPFW_WUNLOCK(chain);
+}
+
+/*
+ * delete the pointers for nat entry ix, or all of them if ix < 0
+ */
+static void
+flush_nat_ptrs(struct ip_fw_chain *chain, const int ix)
+{
+ int i;
+ ipfw_insn_nat *cmd;
+
+ IPFW_WLOCK_ASSERT(chain);
+ for (i = 0; i < chain->n_rules; i++) {
+ cmd = (ipfw_insn_nat *)ACTION_PTR(chain->map[i]);
+ /* XXX skip log and the like ? */
+ if (cmd->o.opcode == O_NAT && cmd->nat != NULL &&
+ (ix < 0 || cmd->nat->id == ix))
+ cmd->nat = NULL;
+ }
+}
+
+static void
+del_redir_spool_cfg(struct cfg_nat *n, struct redir_chain *head)
+{
+ struct cfg_redir *r, *tmp_r;
+ struct cfg_spool *s, *tmp_s;
+ int i, num;
+
+ LIST_FOREACH_SAFE(r, head, _next, tmp_r) {
+ num = 1; /* Number of alias_link to delete. */
+ switch (r->mode) {
+ case REDIR_PORT:
+ num = r->pport_cnt;
+ /* FALLTHROUGH */
+ case REDIR_ADDR:
+ case REDIR_PROTO:
+ /* Delete all libalias redirect entries. */
+ for (i = 0; i < num; i++)
+ LibAliasRedirectDelete(n->lib, r->alink[i]);
+ /* Del spool cfg if any. */
+ LIST_FOREACH_SAFE(s, &r->spool_chain, _next, tmp_s) {
+ LIST_REMOVE(s, _next);
+ free(s, M_IPFW);
+ }
+ free(r->alink, M_IPFW);
+ LIST_REMOVE(r, _next);
+ free(r, M_IPFW);
+ break;
+ default:
+ printf("unknown redirect mode: %u\n", r->mode);
+ /* XXX - panic?!?!? */
+ break;
+ }
+ }
+}
+
+static void
+add_redir_spool_cfg(char *buf, struct cfg_nat *ptr)
+{
+ struct cfg_redir *r, *ser_r;
+ struct cfg_spool *s, *ser_s;
+ int cnt, off, i;
+
+ for (cnt = 0, off = 0; cnt < ptr->redir_cnt; cnt++) {
+ ser_r = (struct cfg_redir *)&buf[off];
+ r = malloc(SOF_REDIR, M_IPFW, M_WAITOK | M_ZERO);
+ memcpy(r, ser_r, SOF_REDIR);
+ LIST_INIT(&r->spool_chain);
+ off += SOF_REDIR;
+ r->alink = malloc(sizeof(struct alias_link *) * r->pport_cnt,
+ M_IPFW, M_WAITOK | M_ZERO);
+ switch (r->mode) {
+ case REDIR_ADDR:
+ r->alink[0] = LibAliasRedirectAddr(ptr->lib, r->laddr,
+ r->paddr);
+ break;
+ case REDIR_PORT:
+ for (i = 0 ; i < r->pport_cnt; i++) {
+ /* If remotePort is all ports, set it to 0. */
+ u_short remotePortCopy = r->rport + i;
+ if (r->rport_cnt == 1 && r->rport == 0)
+ remotePortCopy = 0;
+ r->alink[i] = LibAliasRedirectPort(ptr->lib,
+ r->laddr, htons(r->lport + i), r->raddr,
+ htons(remotePortCopy), r->paddr,
+ htons(r->pport + i), r->proto);
+ if (r->alink[i] == NULL) {
+ r->alink[0] = NULL;
+ break;
+ }
+ }
+ break;
+ case REDIR_PROTO:
+ r->alink[0] = LibAliasRedirectProto(ptr->lib ,r->laddr,
+ r->raddr, r->paddr, r->proto);
+ break;
+ default:
+ printf("unknown redirect mode: %u\n", r->mode);
+ break;
+ }
+ /* XXX perhaps return an error instead of panic ? */
+ if (r->alink[0] == NULL)
+ panic("LibAliasRedirect* returned NULL");
+ /* LSNAT handling. */
+ for (i = 0; i < r->spool_cnt; i++) {
+ ser_s = (struct cfg_spool *)&buf[off];
+ s = malloc(SOF_REDIR, M_IPFW, M_WAITOK | M_ZERO);
+ memcpy(s, ser_s, SOF_SPOOL);
+ LibAliasAddServer(ptr->lib, r->alink[0],
+ s->addr, htons(s->port));
+ off += SOF_SPOOL;
+ /* Hook spool entry. */
+ LIST_INSERT_HEAD(&r->spool_chain, s, _next);
+ }
+ /* And finally hook this redir entry. */
+ LIST_INSERT_HEAD(&ptr->redir_chain, r, _next);
+ }
+}
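The opaque buffer parsed above (and produced by the matching userland code) is a plain concatenation of structures; a sketch of the layout, inferred from the parsing loop and the redir_cnt/spool_cnt counters:

	/*
	 * struct cfg_nat              fixed header, redir_cnt entries follow
	 *   struct cfg_redir #0       SOF_REDIR bytes, spool_cnt entries follow
	 *     struct cfg_spool ...    SOF_SPOOL bytes each (LSNAT servers)
	 *   struct cfg_redir #1
	 *     ...
	 */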
+
+static int
+ipfw_nat(struct ip_fw_args *args, struct cfg_nat *t, struct mbuf *m)
+{
+ struct mbuf *mcl;
+ struct ip *ip;
+ /* XXX - libalias duct tape */
+ int ldt, retval, found;
+ struct ip_fw_chain *chain;
+ char *c;
+
+ ldt = 0;
+ retval = 0;
+ mcl = m_megapullup(m, m->m_pkthdr.len);
+ if (mcl == NULL) {
+ args->m = NULL;
+ return (IP_FW_DENY);
+ }
+ ip = mtod(mcl, struct ip *);
+
+ /*
+ * XXX - Libalias checksum offload 'duct tape':
+ *
+ * locally generated packets have only pseudo-header checksum
+ * calculated and libalias will break it[1], so mark them for
+ * later fix. Moreover there are cases when libalias modifies
+ * tcp packet data[2], mark them for later fix too.
+ *
+ * [1] libalias was never meant to run in kernel, so it does
+ * not have any knowledge about checksum offloading, and
+ * expects a packet with a full internet checksum.
+ * Unfortunately, packets generated locally will have just the
+ * pseudo header calculated, and when libalias tries to adjust
+ * the checksum it will actually compute a wrong value.
+ *
+ * [2] when libalias modifies tcp's data content, full TCP
+ * checksum has to be recomputed: the problem is that
+ * libalias does not have any idea about checksum offloading.
+ * To work around this, we do not do checksumming in LibAlias,
+ * but only mark the packets in th_x2 field. If we receive a
+ * marked packet, we calculate correct checksum for it
+ * aware of offloading. Why such a terrible hack instead of
+ * recalculating checksum for each packet?
+ * Because the previous checksum was not checked!
+ * Recalculating checksums for EVERY packet will hide ALL
+ * transmission errors. Yes, marked packets still suffer from
+ * this problem. But, sigh, natd(8) has this problem, too.
+ *
+ * TODO: -make libalias mbuf aware (so
+ * it can handle delayed checksum and tso)
+ */
+
+ if (mcl->m_pkthdr.rcvif == NULL &&
+ mcl->m_pkthdr.csum_flags & CSUM_DELAY_DATA)
+ ldt = 1;
+
+ c = mtod(mcl, char *);
+
+ /* Check if this is a 'global' instance */
+ if (t == NULL) {
+ if (args->oif == NULL) {
+ /* Wrong direction, skip processing */
+ args->m = mcl;
+ return (IP_FW_NAT);
+ }
+
+ found = 0;
+ chain = &V_layer3_chain;
+ IPFW_RLOCK(chain);
+ /* Check every nat entry... */
+ LIST_FOREACH(t, &chain->nat, _next) {
+ if ((t->mode & PKT_ALIAS_SKIP_GLOBAL) != 0)
+ continue;
+ retval = LibAliasOutTry(t->lib, c,
+ mcl->m_len + M_TRAILINGSPACE(mcl), 0);
+ if (retval == PKT_ALIAS_OK) {
+ /* Nat instance recognises state */
+ found = 1;
+ break;
+ }
+ }
+ IPFW_RUNLOCK(chain);
+ if (found != 1) {
+ /* No instance found, return ignore */
+ args->m = mcl;
+ return (IP_FW_NAT);
+ }
+ } else {
+ if (args->oif == NULL)
+ retval = LibAliasIn(t->lib, c,
+ mcl->m_len + M_TRAILINGSPACE(mcl));
+ else
+ retval = LibAliasOut(t->lib, c,
+ mcl->m_len + M_TRAILINGSPACE(mcl));
+ }
+
+ /*
+ * We drop packet when:
+ * 1. libalias returns PKT_ALIAS_ERROR;
+ * 2. For incoming packets:
+ * a) for unresolved fragments;
+ * b) libalias returns PKT_ALIAS_IGNORED and
+ * PKT_ALIAS_DENY_INCOMING flag is set.
+ */
+ if (retval == PKT_ALIAS_ERROR ||
+ (args->oif == NULL && (retval == PKT_ALIAS_UNRESOLVED_FRAGMENT ||
+ (retval == PKT_ALIAS_IGNORED &&
+ (t->mode & PKT_ALIAS_DENY_INCOMING) != 0)))) {
+ /* XXX - should i add some logging? */
+ m_free(mcl);
+ args->m = NULL;
+ return (IP_FW_DENY);
+ }
+
+ if (retval == PKT_ALIAS_RESPOND)
+ mcl->m_flags |= M_SKIP_FIREWALL;
+ mcl->m_pkthdr.len = mcl->m_len = ntohs(ip->ip_len);
+
+ /*
+ * XXX - libalias checksum offload
+ * 'duct tape' (see above)
+ */
+
+ if ((ip->ip_off & htons(IP_OFFMASK)) == 0 &&
+ ip->ip_p == IPPROTO_TCP) {
+ struct tcphdr *th;
+
+ th = (struct tcphdr *)(ip + 1);
+ if (th->th_x2)
+ ldt = 1;
+ }
+
+ if (ldt) {
+ struct tcphdr *th;
+ struct udphdr *uh;
+ u_short cksum;
+
+ ip->ip_len = ntohs(ip->ip_len);
+ cksum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
+ htons(ip->ip_p + ip->ip_len - (ip->ip_hl << 2)));
+
+ switch (ip->ip_p) {
+ case IPPROTO_TCP:
+ th = (struct tcphdr *)(ip + 1);
+ /*
+ * Maybe it was set in
+ * libalias...
+ */
+ th->th_x2 = 0;
+ th->th_sum = cksum;
+ mcl->m_pkthdr.csum_data =
+ offsetof(struct tcphdr, th_sum);
+ break;
+ case IPPROTO_UDP:
+ uh = (struct udphdr *)(ip + 1);
+ uh->uh_sum = cksum;
+ mcl->m_pkthdr.csum_data =
+ offsetof(struct udphdr, uh_sum);
+ break;
+ }
+ /* No hw checksum offloading: do it ourselves */
+ if ((mcl->m_pkthdr.csum_flags & CSUM_DELAY_DATA) == 0) {
+ in_delayed_cksum(mcl);
+ mcl->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
+ }
+ ip->ip_len = htons(ip->ip_len);
+ }
+ args->m = mcl;
+ return (IP_FW_NAT);
+}
+
+static struct cfg_nat *
+lookup_nat(struct nat_list *l, int nat_id)
+{
+ struct cfg_nat *res;
+
+ LIST_FOREACH(res, l, _next) {
+ if (res->id == nat_id)
+ break;
+ }
+ return res;
+}
+
+static int
+ipfw_nat_cfg(struct sockopt *sopt)
+{
+ struct cfg_nat *cfg, *ptr;
+ char *buf;
+ struct ip_fw_chain *chain = &V_layer3_chain;
+ size_t len;
+ int gencnt, error = 0;
+
+ len = sopt->sopt_valsize;
+ buf = malloc(len, M_TEMP, M_WAITOK | M_ZERO);
+ if ((error = sooptcopyin(sopt, buf, len, sizeof(struct cfg_nat))) != 0)
+ goto out;
+
+ cfg = (struct cfg_nat *)buf;
+ if (cfg->id < 0) {
+ error = EINVAL;
+ goto out;
+ }
+
+ /*
+ * Find/create nat rule.
+ */
+ IPFW_WLOCK(chain);
+ gencnt = chain->gencnt;
+ ptr = lookup_nat(&chain->nat, cfg->id);
+ if (ptr == NULL) {
+ IPFW_WUNLOCK(chain);
+ /* New rule: allocate and init new instance. */
+ ptr = malloc(sizeof(struct cfg_nat), M_IPFW, M_WAITOK | M_ZERO);
+ ptr->lib = LibAliasInit(NULL);
+ LIST_INIT(&ptr->redir_chain);
+ } else {
+ /* Entry already present: temporarily unhook it. */
+ LIST_REMOVE(ptr, _next);
+ flush_nat_ptrs(chain, cfg->id);
+ IPFW_WUNLOCK(chain);
+ }
+
+ /*
+ * Basic nat configuration.
+ */
+ ptr->id = cfg->id;
+ /*
+ * XXX - what if this rule doesn't nat any ip and just
+ * redirects? Do we set the alias address to 0.0.0.0?
+ */
+ ptr->ip = cfg->ip;
+ ptr->redir_cnt = cfg->redir_cnt;
+ ptr->mode = cfg->mode;
+ LibAliasSetMode(ptr->lib, cfg->mode, cfg->mode);
+ LibAliasSetAddress(ptr->lib, ptr->ip);
+ memcpy(ptr->if_name, cfg->if_name, IF_NAMESIZE);
+
+ /*
+ * Redir and LSNAT configuration.
+ */
+ /* Delete old cfgs. */
+ del_redir_spool_cfg(ptr, &ptr->redir_chain);
+ /* Add new entries. */
+ add_redir_spool_cfg(&buf[(sizeof(struct cfg_nat))], ptr);
+
+ IPFW_WLOCK(chain);
+ /* Extra check to avoid race with another ipfw_nat_cfg() */
+ if (gencnt != chain->gencnt &&
+ ((cfg = lookup_nat(&chain->nat, ptr->id)) != NULL))
+ LIST_REMOVE(cfg, _next);
+ LIST_INSERT_HEAD(&chain->nat, ptr, _next);
+ chain->gencnt++;
+ IPFW_WUNLOCK(chain);
+
+out:
+ free(buf, M_TEMP);
+ return (error);
+}
+
+static int
+ipfw_nat_del(struct sockopt *sopt)
+{
+ struct cfg_nat *ptr;
+ struct ip_fw_chain *chain = &V_layer3_chain;
+ int i;
+
+ sooptcopyin(sopt, &i, sizeof i, sizeof i);
+ /* XXX validate i */
+ IPFW_WLOCK(chain);
+ ptr = lookup_nat(&chain->nat, i);
+ if (ptr == NULL) {
+ IPFW_WUNLOCK(chain);
+ return (EINVAL);
+ }
+ LIST_REMOVE(ptr, _next);
+ flush_nat_ptrs(chain, i);
+ IPFW_WUNLOCK(chain);
+ del_redir_spool_cfg(ptr, &ptr->redir_chain);
+ LibAliasUninit(ptr->lib);
+ free(ptr, M_IPFW);
+ return (0);
+}
+
+static int
+ipfw_nat_get_cfg(struct sockopt *sopt)
+{
+ struct ip_fw_chain *chain = &V_layer3_chain;
+ struct cfg_nat *n;
+ struct cfg_redir *r;
+ struct cfg_spool *s;
+ char *data;
+ int gencnt, nat_cnt, len, error;
+
+ nat_cnt = 0;
+ len = sizeof(nat_cnt);
+
+ IPFW_RLOCK(chain);
+retry:
+ gencnt = chain->gencnt;
+ /* Estimate memory amount */
+ LIST_FOREACH(n, &chain->nat, _next) {
+ nat_cnt++;
+ len += sizeof(struct cfg_nat);
+ LIST_FOREACH(r, &n->redir_chain, _next) {
+ len += sizeof(struct cfg_redir);
+ LIST_FOREACH(s, &r->spool_chain, _next)
+ len += sizeof(struct cfg_spool);
+ }
+ }
+ IPFW_RUNLOCK(chain);
+
+ data = malloc(len, M_TEMP, M_WAITOK | M_ZERO);
+ bcopy(&nat_cnt, data, sizeof(nat_cnt));
+
+ nat_cnt = 0;
+ len = sizeof(nat_cnt);
+
+ IPFW_RLOCK(chain);
+ if (gencnt != chain->gencnt) {
+ free(data, M_TEMP);
+ goto retry;
+ }
+ /* Serialize all the data. */
+ LIST_FOREACH(n, &chain->nat, _next) {
+ bcopy(n, &data[len], sizeof(struct cfg_nat));
+ len += sizeof(struct cfg_nat);
+ LIST_FOREACH(r, &n->redir_chain, _next) {
+ bcopy(r, &data[len], sizeof(struct cfg_redir));
+ len += sizeof(struct cfg_redir);
+ LIST_FOREACH(s, &r->spool_chain, _next) {
+ bcopy(s, &data[len], sizeof(struct cfg_spool));
+ len += sizeof(struct cfg_spool);
+ }
+ }
+ }
+ IPFW_RUNLOCK(chain);
+
+ error = sooptcopyout(sopt, data, len);
+ free(data, M_TEMP);
+
+ return (error);
+}
+
+static int
+ipfw_nat_get_log(struct sockopt *sopt)
+{
+ uint8_t *data;
+ struct cfg_nat *ptr;
+ int i, size;
+ struct ip_fw_chain *chain;
+
+ chain = &V_layer3_chain;
+
+ IPFW_RLOCK(chain);
+ /* one pass to count, one to copy the data */
+ i = 0;
+ LIST_FOREACH(ptr, &chain->nat, _next) {
+ if (ptr->lib->logDesc == NULL)
+ continue;
+ i++;
+ }
+ size = i * (LIBALIAS_BUF_SIZE + sizeof(int));
+ data = malloc(size, M_IPFW, M_NOWAIT | M_ZERO);
+ if (data == NULL) {
+ IPFW_RUNLOCK(chain);
+ return (ENOSPC);
+ }
+ i = 0;
+ LIST_FOREACH(ptr, &chain->nat, _next) {
+ if (ptr->lib->logDesc == NULL)
+ continue;
+ bcopy(&ptr->id, &data[i], sizeof(int));
+ i += sizeof(int);
+ bcopy(ptr->lib->logDesc, &data[i], LIBALIAS_BUF_SIZE);
+ i += LIBALIAS_BUF_SIZE;
+ }
+ IPFW_RUNLOCK(chain);
+ sooptcopyout(sopt, data, size);
+ free(data, M_IPFW);
+ return(0);
+}
+
+static void
+ipfw_nat_init(void)
+{
+
+ IPFW_WLOCK(&V_layer3_chain);
+ /* init ipfw hooks */
+ ipfw_nat_ptr = ipfw_nat;
+ lookup_nat_ptr = lookup_nat;
+ ipfw_nat_cfg_ptr = ipfw_nat_cfg;
+ ipfw_nat_del_ptr = ipfw_nat_del;
+ ipfw_nat_get_cfg_ptr = ipfw_nat_get_cfg;
+ ipfw_nat_get_log_ptr = ipfw_nat_get_log;
+ IPFW_WUNLOCK(&V_layer3_chain);
+ V_ifaddr_event_tag = EVENTHANDLER_REGISTER(
+ ifaddr_event, ifaddr_change,
+ NULL, EVENTHANDLER_PRI_ANY);
+}
+
+static void
+ipfw_nat_destroy(void)
+{
+ struct cfg_nat *ptr, *ptr_temp;
+ struct ip_fw_chain *chain;
+
+ chain = &V_layer3_chain;
+ IPFW_WLOCK(chain);
+ LIST_FOREACH_SAFE(ptr, &chain->nat, _next, ptr_temp) {
+ LIST_REMOVE(ptr, _next);
+ del_redir_spool_cfg(ptr, &ptr->redir_chain);
+ LibAliasUninit(ptr->lib);
+ free(ptr, M_IPFW);
+ }
+ EVENTHANDLER_DEREGISTER(ifaddr_event, V_ifaddr_event_tag);
+ flush_nat_ptrs(chain, -1 /* flush all */);
+ /* deregister ipfw_nat */
+ ipfw_nat_ptr = NULL;
+ lookup_nat_ptr = NULL;
+ ipfw_nat_cfg_ptr = NULL;
+ ipfw_nat_del_ptr = NULL;
+ ipfw_nat_get_cfg_ptr = NULL;
+ ipfw_nat_get_log_ptr = NULL;
+ IPFW_WUNLOCK(chain);
+}
+
+static int
+ipfw_nat_modevent(module_t mod, int type, void *unused)
+{
+ int err = 0;
+
+ switch (type) {
+ case MOD_LOAD:
+ ipfw_nat_init();
+ break;
+
+ case MOD_UNLOAD:
+ ipfw_nat_destroy();
+ break;
+
+ default:
+ return EOPNOTSUPP;
+ break;
+ }
+ return err;
+}
+
+static moduledata_t ipfw_nat_mod = {
+ "ipfw_nat",
+ ipfw_nat_modevent,
+ 0
+};
+
+DECLARE_MODULE(ipfw_nat, ipfw_nat_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY);
+MODULE_DEPEND(ipfw_nat, libalias, 1, 1, 1);
+MODULE_DEPEND(ipfw_nat, ipfw, 2, 2, 2);
+MODULE_VERSION(ipfw_nat, 1);
+/* end of file */
diff --git a/sys/netpfil/ipfw/ip_fw_pfil.c b/sys/netpfil/ipfw/ip_fw_pfil.c
new file mode 100644
index 0000000..4ab9316
--- /dev/null
+++ b/sys/netpfil/ipfw/ip_fw_pfil.c
@@ -0,0 +1,590 @@
+/*-
+ * Copyright (c) 2004 Andre Oppermann, Internet Business Solutions AG
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ipfw.h"
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#ifndef INET
+#error IPFIREWALL requires INET.
+#endif /* INET */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+
+#include <net/if.h>
+#include <net/route.h>
+#include <net/ethernet.h>
+#include <net/pfil.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip_fw.h>
+#ifdef INET6
+#include <netinet/ip6.h>
+#include <netinet6/ip6_var.h>
+#endif
+
+#include <netgraph/ng_ipfw.h>
+
+#include <netpfil/ipfw/ip_fw_private.h>
+
+#include <machine/in_cksum.h>
+
+static VNET_DEFINE(int, fw_enable) = 1;
+#define V_fw_enable VNET(fw_enable)
+
+#ifdef INET6
+static VNET_DEFINE(int, fw6_enable) = 1;
+#define V_fw6_enable VNET(fw6_enable)
+#endif
+
+static VNET_DEFINE(int, fwlink_enable) = 0;
+#define V_fwlink_enable VNET(fwlink_enable)
+
+int ipfw_chg_hook(SYSCTL_HANDLER_ARGS);
+
+/* Forward declarations. */
+static int ipfw_divert(struct mbuf **, int, struct ipfw_rule_ref *, int);
+static int ipfw_check_packet(void *, struct mbuf **, struct ifnet *, int,
+ struct inpcb *);
+static int ipfw_check_frame(void *, struct mbuf **, struct ifnet *, int,
+ struct inpcb *);
+
+#ifdef SYSCTL_NODE
+
+SYSBEGIN(f1)
+
+SYSCTL_DECL(_net_inet_ip_fw);
+SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, enable,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_enable), 0,
+ ipfw_chg_hook, "I", "Enable ipfw");
+#ifdef INET6
+SYSCTL_DECL(_net_inet6_ip6_fw);
+SYSCTL_VNET_PROC(_net_inet6_ip6_fw, OID_AUTO, enable,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw6_enable), 0,
+ ipfw_chg_hook, "I", "Enable ipfw+6");
+#endif /* INET6 */
+
+SYSCTL_DECL(_net_link_ether);
+SYSCTL_VNET_PROC(_net_link_ether, OID_AUTO, ipfw,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fwlink_enable), 0,
+ ipfw_chg_hook, "I", "Pass ether pkts through firewall");
+
+SYSEND
+
+#endif /* SYSCTL_NODE */
+
+/*
+ * The pfilter hook to pass packets to ipfw_chk and then to
+ * dummynet, divert, netgraph or other modules.
+ * The packet may be consumed.
+ */
+static int
+ipfw_check_packet(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir,
+ struct inpcb *inp)
+{
+ struct ip_fw_args args;
+ struct m_tag *tag;
+ int ipfw;
+ int ret;
+
+ /* all the processing now uses ip_len in net format */
+ if (mtod(*m0, struct ip *)->ip_v == 4)
+ SET_NET_IPLEN(mtod(*m0, struct ip *));
+
+ /* convert dir to IPFW values */
+ dir = (dir == PFIL_IN) ? DIR_IN : DIR_OUT;
+ bzero(&args, sizeof(args));
+
+again:
+ /*
+ * extract and remove the tag if present. If we are left
+ * with onepass, optimize the outgoing path.
+ */
+ tag = m_tag_locate(*m0, MTAG_IPFW_RULE, 0, NULL);
+ if (tag != NULL) {
+ args.rule = *((struct ipfw_rule_ref *)(tag+1));
+ m_tag_delete(*m0, tag);
+ if (args.rule.info & IPFW_ONEPASS) {
+ if (mtod(*m0, struct ip *)->ip_v == 4)
+ SET_HOST_IPLEN(mtod(*m0, struct ip *));
+ return (0);
+ }
+ }
+
+ args.m = *m0;
+ args.oif = dir == DIR_OUT ? ifp : NULL;
+ args.inp = inp;
+
+ ipfw = ipfw_chk(&args);
+ *m0 = args.m;
+
+ KASSERT(*m0 != NULL || ipfw == IP_FW_DENY, ("%s: m0 is NULL",
+ __func__));
+
+ /* breaking out of the switch means drop */
+ ret = 0; /* default return value for pass */
+ switch (ipfw) {
+ case IP_FW_PASS:
+ /* next_hop may be set by ipfw_chk */
+ if (args.next_hop == NULL && args.next_hop6 == NULL)
+ break; /* pass */
+#if !defined(IPFIREWALL_FORWARD) || (!defined(INET6) && !defined(INET))
+ ret = EACCES;
+#else
+ {
+ struct m_tag *fwd_tag;
+ size_t len;
+
+ KASSERT(args.next_hop == NULL || args.next_hop6 == NULL,
+ ("%s: both next_hop=%p and next_hop6=%p not NULL", __func__,
+ args.next_hop, args.next_hop6));
+#ifdef INET6
+ if (args.next_hop6 != NULL)
+ len = sizeof(struct sockaddr_in6);
+#endif
+#ifdef INET
+ if (args.next_hop != NULL)
+ len = sizeof(struct sockaddr_in);
+#endif
+
+ /* Incoming packets should not be tagged so we do not
+ * m_tag_find. Outgoing packets may be tagged, so we
+ * reuse the tag if present.
+ */
+ fwd_tag = (dir == DIR_IN) ? NULL :
+ m_tag_find(*m0, PACKET_TAG_IPFORWARD, NULL);
+ if (fwd_tag != NULL) {
+ m_tag_unlink(*m0, fwd_tag);
+ } else {
+ fwd_tag = m_tag_get(PACKET_TAG_IPFORWARD, len,
+ M_NOWAIT);
+ if (fwd_tag == NULL) {
+ ret = EACCES;
+ break; /* i.e. drop */
+ }
+ }
+#ifdef INET6
+ if (args.next_hop6 != NULL) {
+ bcopy(args.next_hop6, (fwd_tag+1), len);
+ if (in6_localip(&args.next_hop6->sin6_addr))
+ (*m0)->m_flags |= M_FASTFWD_OURS;
+ }
+#endif
+#ifdef INET
+ if (args.next_hop != NULL) {
+ bcopy(args.next_hop, (fwd_tag+1), len);
+ if (in_localip(args.next_hop->sin_addr))
+ (*m0)->m_flags |= M_FASTFWD_OURS;
+ }
+#endif
+ m_tag_prepend(*m0, fwd_tag);
+ }
+#endif /* IPFIREWALL_FORWARD */
+ break;
+
+ case IP_FW_DENY:
+ ret = EACCES;
+ break; /* i.e. drop */
+
+ case IP_FW_DUMMYNET:
+ ret = EACCES;
+ if (ip_dn_io_ptr == NULL)
+ break; /* i.e. drop */
+ if (mtod(*m0, struct ip *)->ip_v == 4)
+ ret = ip_dn_io_ptr(m0, dir, &args);
+ else if (mtod(*m0, struct ip *)->ip_v == 6)
+ ret = ip_dn_io_ptr(m0, dir | PROTO_IPV6, &args);
+ else
+ break; /* drop it */
+ /*
+ * XXX should read the return value.
+ * dummynet normally eats the packet and sets *m0=NULL
+ * unless the packet can be sent immediately. In this
+ * case args is updated and we should re-run the
+ * check without clearing args.
+ */
+ if (*m0 != NULL)
+ goto again;
+ break;
+
+ case IP_FW_TEE:
+ case IP_FW_DIVERT:
+ if (ip_divert_ptr == NULL) {
+ ret = EACCES;
+ break; /* i.e. drop */
+ }
+ ret = ipfw_divert(m0, dir, &args.rule,
+ (ipfw == IP_FW_TEE) ? 1 : 0);
+ /* continue processing for the original packet (tee). */
+ if (*m0)
+ goto again;
+ break;
+
+ case IP_FW_NGTEE:
+ case IP_FW_NETGRAPH:
+ if (ng_ipfw_input_p == NULL) {
+ ret = EACCES;
+ break; /* i.e. drop */
+ }
+ ret = ng_ipfw_input_p(m0, dir, &args,
+ (ipfw == IP_FW_NGTEE) ? 1 : 0);
+ if (ipfw == IP_FW_NGTEE) /* ignore errors for NGTEE */
+ goto again; /* continue with packet */
+ break;
+
+ case IP_FW_NAT:
+ /* honor one-pass in case of successful nat */
+ if (V_fw_one_pass)
+ break; /* ret is already 0 */
+ goto again;
+
+ case IP_FW_REASS:
+ goto again; /* continue with packet */
+
+ default:
+ KASSERT(0, ("%s: unknown retval", __func__));
+ }
+
+ if (ret != 0) {
+ if (*m0)
+ FREE_PKT(*m0);
+ *m0 = NULL;
+ }
+ if (*m0 && mtod(*m0, struct ip *)->ip_v == 4)
+ SET_HOST_IPLEN(mtod(*m0, struct ip *));
+ return ret;
+}
+
+/*
+ * ipfw processing for ethernet packets (in and out).
+ * Inteface is NULL from ether_demux, and ifp from
+ * ether_output_frame.
+ */
+static int
+ipfw_check_frame(void *arg, struct mbuf **m0, struct ifnet *dst, int dir,
+ struct inpcb *inp)
+{
+ struct ether_header *eh;
+ struct ether_header save_eh;
+ struct mbuf *m;
+ int i, ret;
+ struct ip_fw_args args;
+ struct m_tag *mtag;
+
+ /* fetch start point from rule, if any */
+ mtag = m_tag_locate(*m0, MTAG_IPFW_RULE, 0, NULL);
+ if (mtag == NULL) {
+ args.rule.slot = 0;
+ } else {
+ /* dummynet packet, already partially processed */
+ struct ipfw_rule_ref *r;
+
+ /* XXX can we free it after use ? */
+ mtag->m_tag_id = PACKET_TAG_NONE;
+ r = (struct ipfw_rule_ref *)(mtag + 1);
+ if (r->info & IPFW_ONEPASS)
+ return (0);
+ args.rule = *r;
+ }
+
+ /* We need some amount of data to be contiguous */
+ m = *m0;
+ i = min(m->m_pkthdr.len, max_protohdr);
+ if (m->m_len < i) {
+ m = m_pullup(m, i);
+ if (m == NULL) {
+ *m0 = m;
+ return (0);
+ }
+ }
+ eh = mtod(m, struct ether_header *);
+ save_eh = *eh; /* save copy for restore below */
+ m_adj(m, ETHER_HDR_LEN); /* strip ethernet header */
+
+ args.m = m; /* the packet we are looking at */
+ args.oif = dst; /* destination, if any */
+ args.next_hop = NULL; /* we do not support forward yet */
+ args.next_hop6 = NULL; /* we do not support forward yet */
+ args.eh = &save_eh; /* MAC header for bridged/MAC packets */
+ args.inp = NULL; /* used by ipfw uid/gid/jail rules */
+ i = ipfw_chk(&args);
+ m = args.m;
+ if (m != NULL) {
+ /*
+ * Restore Ethernet header, as needed, in case the
+ * mbuf chain was replaced by ipfw.
+ */
+ M_PREPEND(m, ETHER_HDR_LEN, M_DONTWAIT);
+ if (m == NULL) {
+ *m0 = NULL;
+ return (0);
+ }
+ if (eh != mtod(m, struct ether_header *))
+ bcopy(&save_eh, mtod(m, struct ether_header *),
+ ETHER_HDR_LEN);
+ }
+ *m0 = m;
+
+ ret = 0;
+ /* Check result of ipfw_chk() */
+ switch (i) {
+ case IP_FW_PASS:
+ break;
+
+ case IP_FW_DENY:
+ ret = EACCES;
+ break; /* i.e. drop */
+
+ case IP_FW_DUMMYNET:
+ ret = EACCES;
+ int dir;
+
+ if (ip_dn_io_ptr == NULL)
+ break; /* i.e. drop */
+
+ *m0 = NULL;
+ dir = PROTO_LAYER2 | (dst ? DIR_OUT : DIR_IN);
+ ip_dn_io_ptr(&m, dir, &args);
+ return 0;
+
+ default:
+ KASSERT(0, ("%s: unknown retval", __func__));
+ }
+
+ if (ret != 0) {
+ if (*m0)
+ FREE_PKT(*m0);
+ *m0 = NULL;
+ }
+
+ return ret;
+}
+
+/* do the divert, return 1 on error 0 on success */
+static int
+ipfw_divert(struct mbuf **m0, int incoming, struct ipfw_rule_ref *rule,
+ int tee)
+{
+ /*
+ * ipfw_chk() has already tagged the packet with the divert tag.
+ * If tee is set, copy packet and return original.
+ * If not tee, consume packet and send it to divert socket.
+ */
+ struct mbuf *clone;
+ struct ip *ip = mtod(*m0, struct ip *);
+ struct m_tag *tag;
+
+ /* Cloning needed for tee? */
+ if (tee == 0) {
+ clone = *m0; /* use the original mbuf */
+ *m0 = NULL;
+ } else {
+ clone = m_dup(*m0, M_DONTWAIT);
+ /* If we cannot duplicate the mbuf, we sacrifice the divert
+ * chain and continue with the tee-ed packet.
+ */
+ if (clone == NULL)
+ return 1;
+ }
+
+ /*
+ * Divert listeners can normally handle non-fragmented packets,
+ * but we can only reass in the non-tee case.
+ * This means that listeners on a tee rule may get fragments,
+ * and have to live with that.
+ * Note that we now have the 'reass' ipfw option so if we care
+ * we can do it before a 'tee'.
+ */
+ if (!tee) switch (ip->ip_v) {
+ case IPVERSION:
+ if (ntohs(ip->ip_off) & (IP_MF | IP_OFFMASK)) {
+ int hlen;
+ struct mbuf *reass;
+
+ SET_HOST_IPLEN(ip); /* ip_reass wants host order */
+ reass = ip_reass(clone); /* Reassemble packet. */
+ if (reass == NULL)
+ return 0; /* not an error: consumed by ip_reass */
+ /*
+ * Fix up the IP header checksum after reassembly and leave
+ * the header in network byte order.
+ */
+ ip = mtod(reass, struct ip *);
+ hlen = ip->ip_hl << 2;
+ SET_NET_IPLEN(ip);
+ ip->ip_sum = 0;
+ if (hlen == sizeof(struct ip))
+ ip->ip_sum = in_cksum_hdr(ip);
+ else
+ ip->ip_sum = in_cksum(reass, hlen);
+ clone = reass;
+ }
+ break;
+#ifdef INET6
+ case IPV6_VERSION >> 4:
+ {
+ struct ip6_hdr *const ip6 = mtod(clone, struct ip6_hdr *);
+
+ if (ip6->ip6_nxt == IPPROTO_FRAGMENT) {
+ int nxt, off;
+
+ off = sizeof(struct ip6_hdr);
+ nxt = frag6_input(&clone, &off, 0);
+ if (nxt == IPPROTO_DONE)
+ return (0);
+ }
+ break;
+ }
+#endif
+ }
+
+ /* attach a tag to the packet with the reinject info */
+ tag = m_tag_alloc(MTAG_IPFW_RULE, 0,
+ sizeof(struct ipfw_rule_ref), M_NOWAIT);
+ if (tag == NULL) {
+ FREE_PKT(clone);
+ return 1;
+ }
+ *((struct ipfw_rule_ref *)(tag+1)) = *rule;
+ m_tag_prepend(clone, tag);
+
+ /* Do the dirty job... */
+ ip_divert_ptr(clone, incoming);
+ return 0;
+}
+
+/*
+ * attach or detach hooks for a given protocol family
+ */
+static int
+ipfw_hook(int onoff, int pf)
+{
+ struct pfil_head *pfh;
+ void *hook_func;
+
+ pfh = pfil_head_get(PFIL_TYPE_AF, pf);
+ if (pfh == NULL)
+ return ENOENT;
+
+ hook_func = (pf == AF_LINK) ? ipfw_check_frame : ipfw_check_packet;
+
+ (void) (onoff ? pfil_add_hook : pfil_remove_hook)
+ (hook_func, NULL, PFIL_IN | PFIL_OUT | PFIL_WAITOK, pfh);
+
+ return 0;
+}
+
+int
+ipfw_attach_hooks(int arg)
+{
+ int error = 0;
+
+ if (arg == 0) /* detach */
+ ipfw_hook(0, AF_INET);
+ else if (V_fw_enable && ipfw_hook(1, AF_INET) != 0) {
+ error = ENOENT; /* see ip_fw_pfil.c::ipfw_hook() */
+ printf("ipfw_hook() error\n");
+ }
+#ifdef INET6
+ if (arg == 0) /* detach */
+ ipfw_hook(0, AF_INET6);
+ else if (V_fw6_enable && ipfw_hook(1, AF_INET6) != 0) {
+ error = ENOENT;
+ printf("ipfw6_hook() error\n");
+ }
+#endif
+ if (arg == 0) /* detach */
+ ipfw_hook(0, AF_LINK);
+ else if (V_fwlink_enable && ipfw_hook(1, AF_LINK) != 0) {
+ error = ENOENT;
+ printf("ipfw_link_hook() error\n");
+ }
+ return error;
+}
+
+int
+ipfw_chg_hook(SYSCTL_HANDLER_ARGS)
+{
+ int *enable;
+ int newval;
+ int error;
+ int af;
+
+ if (arg1 == &VNET_NAME(fw_enable)) {
+ enable = &V_fw_enable;
+ af = AF_INET;
+ }
+#ifdef INET6
+ else if (arg1 == &VNET_NAME(fw6_enable)) {
+ enable = &V_fw6_enable;
+ af = AF_INET6;
+ }
+#endif
+ else if (arg1 == &VNET_NAME(fwlink_enable)) {
+ enable = &V_fwlink_enable;
+ af = AF_LINK;
+ }
+ else
+ return (EINVAL);
+
+ newval = *enable;
+
+ /* Handle sysctl change */
+ error = sysctl_handle_int(oidp, &newval, 0, req);
+
+ if (error)
+ return (error);
+
+ /* Normalize the new value to 0 or 1 */
+ newval = (newval) ? 1 : 0;
+
+ if (*enable == newval)
+ return (0);
+
+ error = ipfw_hook(newval, af);
+ if (error)
+ return (error);
+ *enable = newval;
+
+ return (0);
+}
+/* end of file */
diff --git a/sys/netpfil/ipfw/ip_fw_private.h b/sys/netpfil/ipfw/ip_fw_private.h
new file mode 100644
index 0000000..fb13a72
--- /dev/null
+++ b/sys/netpfil/ipfw/ip_fw_private.h
@@ -0,0 +1,309 @@
+/*-
+ * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IPFW2_PRIVATE_H
+#define _IPFW2_PRIVATE_H
+
+/*
+ * Internal constants and data structures used by ipfw components
+ * and not meant to be exported outside the kernel.
+ */
+
+#ifdef _KERNEL
+
+/*
+ * For platforms that do not have SYSCTL support, we wrap the
+ * SYSCTL_* into a function (one per file) to collect the values
+ * into an array at module initialization. The wrapping macros,
+ * SYSBEGIN() and SYSEND, are empty in the default case.
+ */
+#ifndef SYSBEGIN
+#define SYSBEGIN(x)
+#endif
+#ifndef SYSEND
+#define SYSEND
+#endif
+
+/* Return values from ipfw_chk() */
+enum {
+ IP_FW_PASS = 0,
+ IP_FW_DENY,
+ IP_FW_DIVERT,
+ IP_FW_TEE,
+ IP_FW_DUMMYNET,
+ IP_FW_NETGRAPH,
+ IP_FW_NGTEE,
+ IP_FW_NAT,
+ IP_FW_REASS,
+};
+
+/*
+ * Structure for collecting parameters to dummynet for ip6_output forwarding
+ */
+struct _ip6dn_args {
+ struct ip6_pktopts *opt_or;
+ struct route_in6 ro_or;
+ int flags_or;
+ struct ip6_moptions *im6o_or;
+ struct ifnet *origifp_or;
+ struct ifnet *ifp_or;
+ struct sockaddr_in6 dst_or;
+ u_long mtu_or;
+ struct route_in6 ro_pmtu_or;
+};
+
+
+/*
+ * Arguments for calling ipfw_chk() and dummynet_io(). We put them
+ * all into a structure because this way it is easier and more
+ * efficient to pass variables around and extend the interface.
+ */
+struct ip_fw_args {
+ struct mbuf *m; /* the mbuf chain */
+ struct ifnet *oif; /* output interface */
+ struct sockaddr_in *next_hop; /* forward address */
+ struct sockaddr_in6 *next_hop6; /* ipv6 forward address */
+
+ /*
+ * On return, it points to the matching rule.
+ * On entry, rule.slot > 0 means the info is valid and
+ * contains the starting rule for an ipfw search.
+ * If chain_id == chain->id && slot >0 then jump to that slot.
+ * Otherwise, we locate the first rule >= rulenum:rule_id
+ */
+ struct ipfw_rule_ref rule; /* match/restart info */
+
+ struct ether_header *eh; /* for bridged packets */
+
+ struct ipfw_flow_id f_id; /* grabbed from IP header */
+ //uint32_t cookie; /* a cookie depending on rule action */
+ struct inpcb *inp;
+
+ struct _ip6dn_args dummypar; /* dummynet->ip6_output */
+ struct sockaddr_in hopstore; /* store here if cannot use a pointer */
+};
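+
+/*
+ * Illustrative note on the restart info above (a sketch of the intended
+ * use, based only on the comment in the structure): when a packet is
+ * reinjected (e.g. by dummynet), rule.slot/rulenum/rule_id record where
+ * the previous search stopped; if rule.chain_id still equals the current
+ * chain->id the search resumes directly at that slot, otherwise ipfw
+ * falls back to locating the first rule >= rulenum:rule_id.
+ */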
+
+MALLOC_DECLARE(M_IPFW);
+
+/*
+ * Hooks sometimes need to know the direction of the packet
+ * (divert, dummynet, netgraph, ...).
+ * We use a generic definition here, with bits 0-1 indicating the
+ * direction, bit 2 indicating layer 2 or 3, and bits 3-4 indicating
+ * the specific protocol (if necessary).
+ */
+enum {
+ DIR_MASK = 0x3,
+ DIR_OUT = 0,
+ DIR_IN = 1,
+ DIR_FWD = 2,
+ DIR_DROP = 3,
+ PROTO_LAYER2 = 0x4, /* set for layer 2 */
+ /* PROTO_DEFAULT = 0, */
+ PROTO_IPV4 = 0x08,
+ PROTO_IPV6 = 0x10,
+ PROTO_IFB = 0x0c, /* layer2 + ifbridge */
+ /* PROTO_OLDBDG = 0x14, unused, old bridge */
+};
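+
+/*
+ * Illustrative example (the variable 'tag' below is hypothetical): a
+ * consumer that receives a combined value such as (DIR_IN | PROTO_IPV6)
+ * can recover the pieces with
+ *
+ *   int dir = tag & DIR_MASK;               -> DIR_IN
+ *   int is_l2 = (tag & PROTO_LAYER2) != 0;  -> 0, i.e. layer 3
+ *   int is_v6 = (tag & PROTO_IPV6) != 0;    -> 1
+ *
+ * so direction and protocol can be encoded in a single integer.
+ */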
+
+/* wrapper for freeing a packet, in case we need to do more work */
+#ifndef FREE_PKT
+#if defined(__linux__) || defined(_WIN32)
+#define FREE_PKT(m) netisr_dispatch(-1, m)
+#else
+#define FREE_PKT(m) m_freem(m)
+#endif
+#endif /* !FREE_PKT */
+
+/*
+ * Function definitions.
+ */
+
+/* attach (arg = 1) or detach (arg = 0) hooks */
+int ipfw_attach_hooks(int);
+#ifdef NOTYET
+void ipfw_nat_destroy(void);
+#endif
+
+/* In ip_fw_log.c */
+struct ip;
+void ipfw_log_bpf(int);
+void ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args,
+ struct mbuf *m, struct ifnet *oif, u_short offset, uint32_t tablearg,
+ struct ip *ip);
+VNET_DECLARE(u_int64_t, norule_counter);
+#define V_norule_counter VNET(norule_counter)
+VNET_DECLARE(int, verbose_limit);
+#define V_verbose_limit VNET(verbose_limit)
+
+/* In ip_fw_dynamic.c */
+
+enum { /* result for matching dynamic rules */
+ MATCH_REVERSE = 0,
+ MATCH_FORWARD,
+ MATCH_NONE,
+ MATCH_UNKNOWN,
+};
+
+/*
+ * The lock for dynamic rules is only used once outside the file,
+ * and only to release the result of lookup_dyn_rule().
+ * Eventually we may implement it with a callback on the function.
+ */
+void ipfw_dyn_unlock(void);
+
+struct tcphdr;
+struct mbuf *ipfw_send_pkt(struct mbuf *, struct ipfw_flow_id *,
+ u_int32_t, u_int32_t, int);
+int ipfw_install_state(struct ip_fw *rule, ipfw_insn_limit *cmd,
+ struct ip_fw_args *args, uint32_t tablearg);
+ipfw_dyn_rule *ipfw_lookup_dyn_rule(struct ipfw_flow_id *pkt,
+ int *match_direction, struct tcphdr *tcp);
+void ipfw_remove_dyn_children(struct ip_fw *rule);
+void ipfw_get_dynamic(char **bp, const char *ep);
+
+void ipfw_dyn_attach(void); /* uma_zcreate .... */
+void ipfw_dyn_detach(void); /* uma_zdestroy ... */
+void ipfw_dyn_init(void); /* per-vnet initialization */
+void ipfw_dyn_uninit(int); /* per-vnet deinitialization */
+int ipfw_dyn_len(void);
+
+/* common variables */
+VNET_DECLARE(int, fw_one_pass);
+#define V_fw_one_pass VNET(fw_one_pass)
+
+VNET_DECLARE(int, fw_verbose);
+#define V_fw_verbose VNET(fw_verbose)
+
+VNET_DECLARE(struct ip_fw_chain, layer3_chain);
+#define V_layer3_chain VNET(layer3_chain)
+
+VNET_DECLARE(u_int32_t, set_disable);
+#define V_set_disable VNET(set_disable)
+
+VNET_DECLARE(int, autoinc_step);
+#define V_autoinc_step VNET(autoinc_step)
+
+VNET_DECLARE(unsigned int, fw_tables_max);
+#define V_fw_tables_max VNET(fw_tables_max)
+
+struct ip_fw_chain {
+ struct ip_fw *rules; /* list of rules */
+ struct ip_fw *reap; /* list of rules to reap */
+ struct ip_fw *default_rule;
+ int n_rules; /* number of static rules */
+ int static_len; /* total len of static rules */
+ struct ip_fw **map; /* array of rule ptrs to ease lookup */
+ LIST_HEAD(nat_list, cfg_nat) nat; /* list of nat entries */
+ struct radix_node_head **tables; /* IPv4 tables */
+ struct radix_node_head **xtables; /* extended tables */
+ uint8_t *tabletype; /* Array of table types */
+#if defined( __linux__ ) || defined( _WIN32 )
+ spinlock_t rwmtx;
+ spinlock_t uh_lock;
+#else
+ struct rwlock rwmtx;
+ struct rwlock uh_lock; /* lock for upper half */
+#endif
+ uint32_t id; /* ruleset id */
+ uint32_t gencnt; /* generation count */
+};
+
+struct sockopt; /* used by tcp_var.h */
+
+/*
+ * The lock is heavily used by ip_fw2.c (the main file) and ip_fw_nat.c
+ * so the variable and the macros must be here.
+ */
+
+#define IPFW_LOCK_INIT(_chain) do { \
+ rw_init(&(_chain)->rwmtx, "IPFW static rules"); \
+ rw_init(&(_chain)->uh_lock, "IPFW UH lock"); \
+ } while (0)
+
+#define IPFW_LOCK_DESTROY(_chain) do { \
+ rw_destroy(&(_chain)->rwmtx); \
+ rw_destroy(&(_chain)->uh_lock); \
+ } while (0)
+
+#define IPFW_WLOCK_ASSERT(_chain) rw_assert(&(_chain)->rwmtx, RA_WLOCKED)
+
+#define IPFW_RLOCK(p) rw_rlock(&(p)->rwmtx)
+#define IPFW_RUNLOCK(p) rw_runlock(&(p)->rwmtx)
+#define IPFW_WLOCK(p) rw_wlock(&(p)->rwmtx)
+#define IPFW_WUNLOCK(p) rw_wunlock(&(p)->rwmtx)
+
+#define IPFW_UH_RLOCK(p) rw_rlock(&(p)->uh_lock)
+#define IPFW_UH_RUNLOCK(p) rw_runlock(&(p)->uh_lock)
+#define IPFW_UH_WLOCK(p) rw_wlock(&(p)->uh_lock)
+#define IPFW_UH_WUNLOCK(p) rw_wunlock(&(p)->uh_lock)
+
+/* In ip_fw_sockopt.c */
+int ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id);
+int ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule);
+int ipfw_ctl(struct sockopt *sopt);
+int ipfw_chk(struct ip_fw_args *args);
+void ipfw_reap_rules(struct ip_fw *head);
+
+/* In ip_fw_table.c */
+struct radix_node;
+int ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+ uint32_t *val);
+int ipfw_lookup_table_extended(struct ip_fw_chain *ch, uint16_t tbl, void *paddr,
+ uint32_t *val, int type);
+int ipfw_init_tables(struct ip_fw_chain *ch);
+void ipfw_destroy_tables(struct ip_fw_chain *ch);
+int ipfw_flush_table(struct ip_fw_chain *ch, uint16_t tbl);
+int ipfw_add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, void *paddr,
+ uint8_t plen, uint8_t mlen, uint8_t type, uint32_t value);
+int ipfw_del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, void *paddr,
+ uint8_t plen, uint8_t mlen, uint8_t type);
+int ipfw_count_table(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt);
+int ipfw_dump_table_entry(struct radix_node *rn, void *arg);
+int ipfw_dump_table(struct ip_fw_chain *ch, ipfw_table *tbl);
+int ipfw_count_xtable(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt);
+int ipfw_dump_xtable(struct ip_fw_chain *ch, ipfw_xtable *tbl);
+int ipfw_resize_tables(struct ip_fw_chain *ch, unsigned int ntables);
+
+/* In ip_fw_nat.c -- XXX to be moved to ip_var.h */
+
+extern struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int);
+
+typedef int ipfw_nat_t(struct ip_fw_args *, struct cfg_nat *, struct mbuf *);
+typedef int ipfw_nat_cfg_t(struct sockopt *);
+
+extern ipfw_nat_t *ipfw_nat_ptr;
+#define IPFW_NAT_LOADED (ipfw_nat_ptr != NULL)
+
+extern ipfw_nat_cfg_t *ipfw_nat_cfg_ptr;
+extern ipfw_nat_cfg_t *ipfw_nat_del_ptr;
+extern ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr;
+extern ipfw_nat_cfg_t *ipfw_nat_get_log_ptr;
+
+#endif /* _KERNEL */
+#endif /* _IPFW2_PRIVATE_H */
diff --git a/sys/netpfil/ipfw/ip_fw_sockopt.c b/sys/netpfil/ipfw/ip_fw_sockopt.c
new file mode 100644
index 0000000..a412eb0
--- /dev/null
+++ b/sys/netpfil/ipfw/ip_fw_sockopt.c
@@ -0,0 +1,1449 @@
+/*-
+ * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
+ *
+ * Supported by: Valeria Paoli
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ * Sockopt support for ipfw. The routines here implement
+ * the upper half of the ipfw code.
+ */
+
+#include "opt_ipfw.h"
+#include "opt_inet.h"
+#ifndef INET
+#error IPFIREWALL requires INET.
+#endif /* INET */
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h> /* struct m_tag used by nested headers */
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <net/if.h>
+#include <net/route.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/ip_var.h> /* hooks */
+#include <netinet/ip_fw.h>
+
+#include <netpfil/ipfw/ip_fw_private.h>
+
+#ifdef MAC
+#include <security/mac/mac_framework.h>
+#endif
+
+MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's");
+
+/*
+ * static variables followed by global ones (none in this file)
+ */
+
+/*
+ * Find the smallest rule >= key, id.
+ * We could use bsearch but it is so simple that we code it directly
+ */
+int
+ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id)
+{
+ int i, lo, hi;
+ struct ip_fw *r;
+
+ for (lo = 0, hi = chain->n_rules - 1; lo < hi;) {
+ i = (lo + hi) / 2;
+ r = chain->map[i];
+ if (r->rulenum < key)
+ lo = i + 1; /* continue from the next one */
+ else if (r->rulenum > key)
+ hi = i; /* this might be good */
+ else if (r->id < id)
+ lo = i + 1; /* continue from the next one */
+ else /* r->id >= id */
+ hi = i; /* this might be good */
+ };
+ return hi;
+}
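+
+/*
+ * Illustrative example with a hypothetical map whose (rulenum, id) pairs
+ * are (100,1) (100,3) (200,7) (65535,0): ipfw_find_rule(chain, 100, 2)
+ * returns index 1, the first entry with rulenum:id >= 100:2, while
+ * ipfw_find_rule(chain, 150, 0) returns index 2, the slot of rule 200.
+ */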
+
+/*
+ * Allocate a new map; on success it returns with the chain UH-locked.
+ * 'extra' is the number of entries to add or delete.
+ */
+static struct ip_fw **
+get_map(struct ip_fw_chain *chain, int extra, int locked)
+{
+
+ for (;;) {
+ struct ip_fw **map;
+ int i;
+
+ i = chain->n_rules + extra;
+ map = malloc(i * sizeof(struct ip_fw *), M_IPFW,
+ locked ? M_NOWAIT : M_WAITOK);
+ if (map == NULL) {
+ printf("%s: cannot allocate map\n", __FUNCTION__);
+ return NULL;
+ }
+ if (!locked)
+ IPFW_UH_WLOCK(chain);
+ if (i >= chain->n_rules + extra) /* good */
+ return map;
+ /* otherwise we lost the race, free and retry */
+ if (!locked)
+ IPFW_UH_WUNLOCK(chain);
+ free(map, M_IPFW);
+ }
+}
+
+/*
+ * Swap the maps. It must be called with IPFW_UH_WLOCK held.
+ */
+static struct ip_fw **
+swap_map(struct ip_fw_chain *chain, struct ip_fw **new_map, int new_len)
+{
+ struct ip_fw **old_map;
+
+ IPFW_WLOCK(chain);
+ chain->id++;
+ chain->n_rules = new_len;
+ old_map = chain->map;
+ chain->map = new_map;
+ IPFW_WUNLOCK(chain);
+ return old_map;
+}
+
+/*
+ * Add a new rule to the list. Copy the rule into a malloc'ed area, then
+ * possibly create a rule number and add the rule to the list.
+ * Update the rule_number in the input struct so the caller knows it as well.
+ * XXX DO NOT USE FOR THE DEFAULT RULE.
+ * Must be called without IPFW_UH held
+ */
+int
+ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
+{
+ struct ip_fw *rule;
+ int i, l, insert_before;
+ struct ip_fw **map; /* the new array of pointers */
+
+ if (chain->rules == NULL || input_rule->rulenum > IPFW_DEFAULT_RULE-1)
+ return (EINVAL);
+
+ l = RULESIZE(input_rule);
+ rule = malloc(l, M_IPFW, M_WAITOK | M_ZERO);
+ /* get_map returns with IPFW_UH_WLOCK if successful */
+ map = get_map(chain, 1, 0 /* not locked */);
+ if (map == NULL) {
+ free(rule, M_IPFW);
+ return ENOSPC;
+ }
+
+ bcopy(input_rule, rule, l);
+ /* clear fields not settable from userland */
+ rule->x_next = NULL;
+ rule->next_rule = NULL;
+ rule->pcnt = 0;
+ rule->bcnt = 0;
+ rule->timestamp = 0;
+
+ if (V_autoinc_step < 1)
+ V_autoinc_step = 1;
+ else if (V_autoinc_step > 1000)
+ V_autoinc_step = 1000;
+ /* find the insertion point, we will insert before */
+ insert_before = rule->rulenum ? rule->rulenum + 1 : IPFW_DEFAULT_RULE;
+ i = ipfw_find_rule(chain, insert_before, 0);
+ /* duplicate first part */
+ if (i > 0)
+ bcopy(chain->map, map, i * sizeof(struct ip_fw *));
+ map[i] = rule;
+ /* duplicate remaining part, we always have the default rule */
+ bcopy(chain->map + i, map + i + 1,
+ sizeof(struct ip_fw *) *(chain->n_rules - i));
+ if (rule->rulenum == 0) {
+ /* write back the number */
+ rule->rulenum = i > 0 ? map[i-1]->rulenum : 0;
+ if (rule->rulenum < IPFW_DEFAULT_RULE - V_autoinc_step)
+ rule->rulenum += V_autoinc_step;
+ input_rule->rulenum = rule->rulenum;
+ }
+
+ rule->id = chain->id + 1;
+ map = swap_map(chain, map, chain->n_rules + 1);
+ chain->static_len += l;
+ IPFW_UH_WUNLOCK(chain);
+ if (map)
+ free(map, M_IPFW);
+ return (0);
+}
+
+/*
+ * Reclaim storage associated with a list of rules. This is
+ * typically the list created using remove_rule.
+ * A NULL pointer on input is handled correctly.
+ */
+void
+ipfw_reap_rules(struct ip_fw *head)
+{
+ struct ip_fw *rule;
+
+ while ((rule = head) != NULL) {
+ head = head->x_next;
+ free(rule, M_IPFW);
+ }
+}
+
+/*
+ * Used by del_entry() to check if a rule should be kept.
+ * Returns 1 if the rule must be kept, 0 otherwise.
+ *
+ * Called with cmd = {0,1,5}.
+ * cmd == 0 matches on rule numbers, excludes rules in RESVD_SET if n == 0 ;
+ * cmd == 1 matches on set numbers only, rule numbers are ignored;
+ * cmd == 5 matches on rule and set numbers.
+ *
+ * n == 0 is a wildcard for rule numbers, there is no wildcard for sets.
+ *
+ * Rules to keep are
+ * (default || reserved || !match_set || !match_number)
+ * where
+ * default ::= (rule->rulenum == IPFW_DEFAULT_RULE)
+ * // the default rule is always protected
+ *
+ * reserved ::= (cmd == 0 && n == 0 && rule->set == RESVD_SET)
+ * // RESVD_SET is protected only if cmd == 0 and n == 0 ("ipfw flush")
+ *
+ * match_set ::= (cmd == 0 || rule->set == set)
+ * // set number is ignored for cmd == 0
+ *
+ * match_number ::= (cmd == 1 || n == 0 || n == rule->rulenum)
+ * // number is ignored for cmd == 1 or n == 0
+ *
+ */
+static int
+keep_rule(struct ip_fw *rule, uint8_t cmd, uint8_t set, uint32_t n)
+{
+ return
+ (rule->rulenum == IPFW_DEFAULT_RULE) ||
+ (cmd == 0 && n == 0 && rule->set == RESVD_SET) ||
+ !(cmd == 0 || rule->set == set) ||
+ !(cmd == 1 || n == 0 || n == rule->rulenum);
+}
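+
+/*
+ * Worked example (hypothetical rule numbers): for a plain "ipfw delete 300"
+ * the argument decodes to cmd == 0, set == 0, n == 300.  A rule with
+ * rulenum 300 then fails both the !match_set and !match_number clauses and
+ * is deleted; the default rule is always kept by the first clause, and
+ * rules in RESVD_SET are only protected when n == 0 (a flush).
+ */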
+
+/**
+ * Remove all rules with given number, or do set manipulation.
+ * Assumes chain != NULL && *chain != NULL.
+ *
+ * The argument is a uint32_t. The low 16 bits are the rule or set number;
+ * the next 8 bits are the new set; the top 8 bits indicate the command:
+ *
+ * 0 delete rules numbered "rulenum"
+ * 1 delete rules in set "rulenum"
+ * 2 move rules "rulenum" to set "new_set"
+ * 3 move rules from set "rulenum" to set "new_set"
+ * 4 swap sets "rulenum" and "new_set"
+ * 5 delete rules "rulenum" and set "new_set"
+ */
+static int
+del_entry(struct ip_fw_chain *chain, uint32_t arg)
+{
+ struct ip_fw *rule;
+ uint32_t num; /* rule number or old_set */
+ uint8_t cmd, new_set;
+ int start, end, i, ofs, n;
+ struct ip_fw **map = NULL;
+ int error = 0;
+
+ num = arg & 0xffff;
+ cmd = (arg >> 24) & 0xff;
+ new_set = (arg >> 16) & 0xff;
+
+ if (cmd > 5 || new_set > RESVD_SET)
+ return EINVAL;
+ if (cmd == 0 || cmd == 2 || cmd == 5) {
+ if (num >= IPFW_DEFAULT_RULE)
+ return EINVAL;
+ } else {
+ if (num > RESVD_SET) /* old_set */
+ return EINVAL;
+ }
+
+ IPFW_UH_WLOCK(chain); /* arbitrate writers */
+ chain->reap = NULL; /* prepare for deletions */
+
+ switch (cmd) {
+ case 0: /* delete rules "num" (num == 0 matches all) */
+ case 1: /* delete all rules in set N */
+ case 5: /* delete rules with number N and set "new_set". */
+
+ /*
+ * Locate first rule to delete (start), the rule after
+ * the last one to delete (end), and count how many
+ * rules to delete (n). Always use keep_rule() to
+ * determine which rules to keep.
+ */
+ n = 0;
+ if (cmd == 1) {
+ /* look for a specific set including RESVD_SET.
+ * Must scan the entire range, ignore num.
+ */
+ new_set = num;
+ for (start = -1, end = i = 0; i < chain->n_rules; i++) {
+ if (keep_rule(chain->map[i], cmd, new_set, 0))
+ continue;
+ if (start < 0)
+ start = i;
+ end = i;
+ n++;
+ }
+ end++; /* first non-matching */
+ } else {
+ /* Optimized search on rule numbers */
+ start = ipfw_find_rule(chain, num, 0);
+ for (end = start; end < chain->n_rules; end++) {
+ rule = chain->map[end];
+ if (num > 0 && rule->rulenum != num)
+ break;
+ if (!keep_rule(rule, cmd, new_set, num))
+ n++;
+ }
+ }
+
+ if (n == 0) {
+ /* A flush request (arg == 0 or cmd == 1) on an empty
+ * ruleset returns with no error. Conversely,
+ * if there is no match on a specific request,
+ * we return EINVAL.
+ */
+ if (arg != 0 && cmd != 1)
+ error = EINVAL;
+ break;
+ }
+
+ /* We have something to delete. Allocate the new map */
+ map = get_map(chain, -n, 1 /* locked */);
+ if (map == NULL) {
+ error = EINVAL;
+ break;
+ }
+
+ /* 1. bcopy the initial part of the map */
+ if (start > 0)
+ bcopy(chain->map, map, start * sizeof(struct ip_fw *));
+ /* 2. copy active rules between start and end */
+ for (i = ofs = start; i < end; i++) {
+ rule = chain->map[i];
+ if (keep_rule(rule, cmd, new_set, num))
+ map[ofs++] = rule;
+ }
+ /* 3. copy the final part of the map */
+ bcopy(chain->map + end, map + ofs,
+ (chain->n_rules - end) * sizeof(struct ip_fw *));
+ /* 4. swap the maps (under BH_LOCK) */
+ map = swap_map(chain, map, chain->n_rules - n);
+ /* 5. now remove the rules deleted from the old map */
+ for (i = start; i < end; i++) {
+ int l;
+ rule = map[i];
+ if (keep_rule(rule, cmd, new_set, num))
+ continue;
+ l = RULESIZE(rule);
+ chain->static_len -= l;
+ ipfw_remove_dyn_children(rule);
+ rule->x_next = chain->reap;
+ chain->reap = rule;
+ }
+ break;
+
+ /*
+ * In the next 3 cases the loop stops at (n_rules - 1)
+ * because the default rule is never eligible.
+ */
+
+ case 2: /* move rules with given RULE number to new set */
+ for (i = 0; i < chain->n_rules - 1; i++) {
+ rule = chain->map[i];
+ if (rule->rulenum == num)
+ rule->set = new_set;
+ }
+ break;
+
+ case 3: /* move rules with given SET number to new set */
+ for (i = 0; i < chain->n_rules - 1; i++) {
+ rule = chain->map[i];
+ if (rule->set == num)
+ rule->set = new_set;
+ }
+ break;
+
+ case 4: /* swap two sets */
+ for (i = 0; i < chain->n_rules - 1; i++) {
+ rule = chain->map[i];
+ if (rule->set == num)
+ rule->set = new_set;
+ else if (rule->set == new_set)
+ rule->set = num;
+ }
+ break;
+ }
+
+ rule = chain->reap;
+ chain->reap = NULL;
+ IPFW_UH_WUNLOCK(chain);
+ ipfw_reap_rules(rule);
+ if (map)
+ free(map, M_IPFW);
+ return error;
+}
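+
+/*
+ * Illustrative packing sketch (userland side, not shown in this file):
+ * the 32-bit argument would be built as
+ *
+ *   uint32_t arg = (cmd << 24) | (new_set << 16) | num;
+ *
+ * so "move rules numbered 100 to set 5" (cmd 2) encodes as
+ * (2 << 24) | (5 << 16) | 100, and a full flush is simply arg == 0.
+ */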
+
+/*
+ * Clear counters for a specific rule.
+ * Normally run under IPFW_UH_RLOCK, but these are idempotent ops
+ * so we only care that rules do not disappear.
+ */
+static void
+clear_counters(struct ip_fw *rule, int log_only)
+{
+ ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule);
+
+ if (log_only == 0) {
+ rule->bcnt = rule->pcnt = 0;
+ rule->timestamp = 0;
+ }
+ if (l->o.opcode == O_LOG)
+ l->log_left = l->max_log;
+}
+
+/**
+ * Reset some or all counters on firewall rules.
+ * The argument `arg' is a u_int32_t. The low 16 bits are the rule number,
+ * the next 8 bits are the set number, the top 8 bits are the command:
+ * 0 work with rules from all sets;
+ * 1 work with rules only from the specified set.
+ * The rule number is zero if we want to clear all entries.
+ * log_only is 1 if we only want to reset logs, zero otherwise.
+ */
+static int
+zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int log_only)
+{
+ struct ip_fw *rule;
+ char *msg;
+ int i;
+
+ uint16_t rulenum = arg & 0xffff;
+ uint8_t set = (arg >> 16) & 0xff;
+ uint8_t cmd = (arg >> 24) & 0xff;
+
+ if (cmd > 1)
+ return (EINVAL);
+ if (cmd == 1 && set > RESVD_SET)
+ return (EINVAL);
+
+ IPFW_UH_RLOCK(chain);
+ if (rulenum == 0) {
+ V_norule_counter = 0;
+ for (i = 0; i < chain->n_rules; i++) {
+ rule = chain->map[i];
+ /* Skip rules not in our set. */
+ if (cmd == 1 && rule->set != set)
+ continue;
+ clear_counters(rule, log_only);
+ }
+ msg = log_only ? "All logging counts reset" :
+ "Accounting cleared";
+ } else {
+ int cleared = 0;
+ for (i = 0; i < chain->n_rules; i++) {
+ rule = chain->map[i];
+ if (rule->rulenum == rulenum) {
+ if (cmd == 0 || rule->set == set)
+ clear_counters(rule, log_only);
+ cleared = 1;
+ }
+ if (rule->rulenum > rulenum)
+ break;
+ }
+ if (!cleared) { /* we did not find any matching rules */
+ IPFW_UH_RUNLOCK(chain);
+ return (EINVAL);
+ }
+ msg = log_only ? "logging count reset" : "cleared";
+ }
+ IPFW_UH_RUNLOCK(chain);
+
+ if (V_fw_verbose) {
+ int lev = LOG_SECURITY | LOG_NOTICE;
+
+ if (rulenum)
+ log(lev, "ipfw: Entry %d %s.\n", rulenum, msg);
+ else
+ log(lev, "ipfw: %s.\n", msg);
+ }
+ return (0);
+}
+
+/*
+ * Check validity of the structure before insert.
+ * Rules are simple, so this mostly needs to check rule sizes.
+ */
+static int
+check_ipfw_struct(struct ip_fw *rule, int size)
+{
+ int l, cmdlen = 0;
+ int have_action=0;
+ ipfw_insn *cmd;
+
+ if (size < sizeof(*rule)) {
+ printf("ipfw: rule too short\n");
+ return (EINVAL);
+ }
+ /* first, check for valid size */
+ l = RULESIZE(rule);
+ if (l != size) {
+ printf("ipfw: size mismatch (have %d want %d)\n", size, l);
+ return (EINVAL);
+ }
+ if (rule->act_ofs >= rule->cmd_len) {
+ printf("ipfw: bogus action offset (%u > %u)\n",
+ rule->act_ofs, rule->cmd_len - 1);
+ return (EINVAL);
+ }
+ /*
+ * Now go for the individual checks. Very simple ones, basically only
+ * instruction sizes.
+ */
+ for (l = rule->cmd_len, cmd = rule->cmd ;
+ l > 0 ; l -= cmdlen, cmd += cmdlen) {
+ cmdlen = F_LEN(cmd);
+ if (cmdlen > l) {
+ printf("ipfw: opcode %d size truncated\n",
+ cmd->opcode);
+ return EINVAL;
+ }
+ switch (cmd->opcode) {
+ case O_PROBE_STATE:
+ case O_KEEP_STATE:
+ case O_PROTO:
+ case O_IP_SRC_ME:
+ case O_IP_DST_ME:
+ case O_LAYER2:
+ case O_IN:
+ case O_FRAG:
+ case O_DIVERTED:
+ case O_IPOPT:
+ case O_IPTOS:
+ case O_IPPRECEDENCE:
+ case O_IPVER:
+ case O_SOCKARG:
+ case O_TCPFLAGS:
+ case O_TCPOPTS:
+ case O_ESTAB:
+ case O_VERREVPATH:
+ case O_VERSRCREACH:
+ case O_ANTISPOOF:
+ case O_IPSEC:
+#ifdef INET6
+ case O_IP6_SRC_ME:
+ case O_IP6_DST_ME:
+ case O_EXT_HDR:
+ case O_IP6:
+#endif
+ case O_IP4:
+ case O_TAG:
+ if (cmdlen != F_INSN_SIZE(ipfw_insn))
+ goto bad_size;
+ break;
+
+ case O_FIB:
+ if (cmdlen != F_INSN_SIZE(ipfw_insn))
+ goto bad_size;
+ if (cmd->arg1 >= rt_numfibs) {
+ printf("ipfw: invalid fib number %d\n",
+ cmd->arg1);
+ return EINVAL;
+ }
+ break;
+
+ case O_SETFIB:
+ if (cmdlen != F_INSN_SIZE(ipfw_insn))
+ goto bad_size;
+ if ((cmd->arg1 != IP_FW_TABLEARG) &&
+ (cmd->arg1 >= rt_numfibs)) {
+ printf("ipfw: invalid fib number %d\n",
+ cmd->arg1);
+ return EINVAL;
+ }
+ goto check_action;
+
+ case O_UID:
+ case O_GID:
+ case O_JAIL:
+ case O_IP_SRC:
+ case O_IP_DST:
+ case O_TCPSEQ:
+ case O_TCPACK:
+ case O_PROB:
+ case O_ICMPTYPE:
+ if (cmdlen != F_INSN_SIZE(ipfw_insn_u32))
+ goto bad_size;
+ break;
+
+ case O_LIMIT:
+ if (cmdlen != F_INSN_SIZE(ipfw_insn_limit))
+ goto bad_size;
+ break;
+
+ case O_LOG:
+ if (cmdlen != F_INSN_SIZE(ipfw_insn_log))
+ goto bad_size;
+
+ ((ipfw_insn_log *)cmd)->log_left =
+ ((ipfw_insn_log *)cmd)->max_log;
+
+ break;
+
+ case O_IP_SRC_MASK:
+ case O_IP_DST_MASK:
+ /* only odd command lengths */
+ if ( !(cmdlen & 1) || cmdlen > 31)
+ goto bad_size;
+ break;
+
+ case O_IP_SRC_SET:
+ case O_IP_DST_SET:
+ if (cmd->arg1 == 0 || cmd->arg1 > 256) {
+ printf("ipfw: invalid set size %d\n",
+ cmd->arg1);
+ return EINVAL;
+ }
+ if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) +
+ (cmd->arg1+31)/32 )
+ goto bad_size;
+ break;
+
+ case O_IP_SRC_LOOKUP:
+ case O_IP_DST_LOOKUP:
+ if (cmd->arg1 >= IPFW_TABLES_MAX) {
+ printf("ipfw: invalid table number %d\n",
+ cmd->arg1);
+ return (EINVAL);
+ }
+ if (cmdlen != F_INSN_SIZE(ipfw_insn) &&
+ cmdlen != F_INSN_SIZE(ipfw_insn_u32) + 1 &&
+ cmdlen != F_INSN_SIZE(ipfw_insn_u32))
+ goto bad_size;
+ break;
+ case O_MACADDR2:
+ if (cmdlen != F_INSN_SIZE(ipfw_insn_mac))
+ goto bad_size;
+ break;
+
+ case O_NOP:
+ case O_IPID:
+ case O_IPTTL:
+ case O_IPLEN:
+ case O_TCPDATALEN:
+ case O_TCPWIN:
+ case O_TAGGED:
+ if (cmdlen < 1 || cmdlen > 31)
+ goto bad_size;
+ break;
+
+ case O_MAC_TYPE:
+ case O_IP_SRCPORT:
+ case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */
+ if (cmdlen < 2 || cmdlen > 31)
+ goto bad_size;
+ break;
+
+ case O_RECV:
+ case O_XMIT:
+ case O_VIA:
+ if (cmdlen != F_INSN_SIZE(ipfw_insn_if))
+ goto bad_size;
+ break;
+
+ case O_ALTQ:
+ if (cmdlen != F_INSN_SIZE(ipfw_insn_altq))
+ goto bad_size;
+ break;
+
+ case O_PIPE:
+ case O_QUEUE:
+ if (cmdlen != F_INSN_SIZE(ipfw_insn))
+ goto bad_size;
+ goto check_action;
+
+ case O_FORWARD_IP:
+#ifdef IPFIREWALL_FORWARD
+ if (cmdlen != F_INSN_SIZE(ipfw_insn_sa))
+ goto bad_size;
+ goto check_action;
+#else
+ return EINVAL;
+#endif
+
+#ifdef INET6
+ case O_FORWARD_IP6:
+#ifdef IPFIREWALL_FORWARD
+ if (cmdlen != F_INSN_SIZE(ipfw_insn_sa6))
+ goto bad_size;
+ goto check_action;
+#else
+ return (EINVAL);
+#endif
+#endif /* INET6 */
+
+ case O_DIVERT:
+ case O_TEE:
+ if (ip_divert_ptr == NULL)
+ return EINVAL;
+ else
+ goto check_size;
+ case O_NETGRAPH:
+ case O_NGTEE:
+ if (ng_ipfw_input_p == NULL)
+ return EINVAL;
+ else
+ goto check_size;
+ case O_NAT:
+ if (!IPFW_NAT_LOADED)
+ return EINVAL;
+ if (cmdlen != F_INSN_SIZE(ipfw_insn_nat))
+ goto bad_size;
+ goto check_action;
+ case O_FORWARD_MAC: /* XXX not implemented yet */
+ case O_CHECK_STATE:
+ case O_COUNT:
+ case O_ACCEPT:
+ case O_DENY:
+ case O_REJECT:
+#ifdef INET6
+ case O_UNREACH6:
+#endif
+ case O_SKIPTO:
+ case O_REASS:
+ case O_CALLRETURN:
+check_size:
+ if (cmdlen != F_INSN_SIZE(ipfw_insn))
+ goto bad_size;
+check_action:
+ if (have_action) {
+ printf("ipfw: opcode %d, multiple actions"
+ " not allowed\n",
+ cmd->opcode);
+ return EINVAL;
+ }
+ have_action = 1;
+ if (l != cmdlen) {
+ printf("ipfw: opcode %d, action must be"
+ " last opcode\n",
+ cmd->opcode);
+ return EINVAL;
+ }
+ break;
+#ifdef INET6
+ case O_IP6_SRC:
+ case O_IP6_DST:
+ if (cmdlen != F_INSN_SIZE(struct in6_addr) +
+ F_INSN_SIZE(ipfw_insn))
+ goto bad_size;
+ break;
+
+ case O_FLOW6ID:
+ if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) +
+ ((ipfw_insn_u32 *)cmd)->o.arg1)
+ goto bad_size;
+ break;
+
+ case O_IP6_SRC_MASK:
+ case O_IP6_DST_MASK:
+ if ( !(cmdlen & 1) || cmdlen > 127)
+ goto bad_size;
+ break;
+ case O_ICMP6TYPE:
+ if( cmdlen != F_INSN_SIZE( ipfw_insn_icmp6 ) )
+ goto bad_size;
+ break;
+#endif
+
+ default:
+ switch (cmd->opcode) {
+#ifndef INET6
+ case O_IP6_SRC_ME:
+ case O_IP6_DST_ME:
+ case O_EXT_HDR:
+ case O_IP6:
+ case O_UNREACH6:
+ case O_IP6_SRC:
+ case O_IP6_DST:
+ case O_FLOW6ID:
+ case O_IP6_SRC_MASK:
+ case O_IP6_DST_MASK:
+ case O_ICMP6TYPE:
+ printf("ipfw: no IPv6 support in kernel\n");
+ return EPROTONOSUPPORT;
+#endif
+ default:
+ printf("ipfw: opcode %d, unknown opcode\n",
+ cmd->opcode);
+ return EINVAL;
+ }
+ }
+ }
+ if (have_action == 0) {
+ printf("ipfw: missing action\n");
+ return EINVAL;
+ }
+ return 0;
+
+bad_size:
+ printf("ipfw: opcode %d size %d wrong\n",
+ cmd->opcode, cmdlen);
+ return EINVAL;
+}
+
+
+/*
+ * Translation of requests for compatibility with FreeBSD 7.2/8.
+ * A static variable tells us if we have an old client from userland,
+ * and if necessary we translate requests and responses between the
+ * two formats.
+ */
+static int is7 = 0;
+
+struct ip_fw7 {
+ struct ip_fw7 *next; /* linked list of rules */
+ struct ip_fw7 *next_rule; /* ptr to next [skipto] rule */
+ /* 'next_rule' is used to pass up 'set_disable' status */
+
+ uint16_t act_ofs; /* offset of action in 32-bit units */
+ uint16_t cmd_len; /* # of 32-bit words in cmd */
+ uint16_t rulenum; /* rule number */
+ uint8_t set; /* rule set (0..31) */
+ // #define RESVD_SET 31 /* set for default and persistent rules */
+ uint8_t _pad; /* padding */
+ // uint32_t id; /* rule id, only in v.8 */
+ /* These fields are present in all rules. */
+ uint64_t pcnt; /* Packet counter */
+ uint64_t bcnt; /* Byte counter */
+ uint32_t timestamp; /* tv_sec of last match */
+
+ ipfw_insn cmd[1]; /* storage for commands */
+};
+
+int convert_rule_to_7(struct ip_fw *rule);
+int convert_rule_to_8(struct ip_fw *rule);
+
+#ifndef RULESIZE7
+#define RULESIZE7(rule) (sizeof(struct ip_fw7) + \
+ ((struct ip_fw7 *)(rule))->cmd_len * 4 - 4)
+#endif
+
+
+/*
+ * Copy the static and dynamic rules to the supplied buffer
+ * and return the amount of space actually used.
+ * Must be run under IPFW_UH_RLOCK
+ */
+static size_t
+ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space)
+{
+ char *bp = buf;
+ char *ep = bp + space;
+ struct ip_fw *rule, *dst;
+ int l, i;
+ time_t boot_seconds;
+
+ boot_seconds = boottime.tv_sec;
+ for (i = 0; i < chain->n_rules; i++) {
+ rule = chain->map[i];
+
+ if (is7) {
+ /* Convert rule to FreeBSD 7.2 format */
+ l = RULESIZE7(rule);
+ if (bp + l + sizeof(uint32_t) <= ep) {
+ int error;
+ bcopy(rule, bp, l + sizeof(uint32_t));
+ error = convert_rule_to_7((struct ip_fw *) bp);
+ if (error)
+ return 0; /*XXX correct? */
+ /*
+ * XXX HACK. Store the disable mask in the "next"
+ * pointer in a wild attempt to keep the ABI the same.
+ * Why do we do this on EVERY rule?
+ */
+ bcopy(&V_set_disable,
+ &(((struct ip_fw7 *)bp)->next_rule),
+ sizeof(V_set_disable));
+ if (((struct ip_fw7 *)bp)->timestamp)
+ ((struct ip_fw7 *)bp)->timestamp += boot_seconds;
+ bp += l;
+ }
+ continue; /* go to next rule */
+ }
+
+ /* normal mode, don't touch rules */
+ l = RULESIZE(rule);
+ if (bp + l > ep) { /* should not happen */
+ printf("overflow dumping static rules\n");
+ break;
+ }
+ dst = (struct ip_fw *)bp;
+ bcopy(rule, dst, l);
+ /*
+ * XXX HACK. Store the disable mask in the "next"
+ * pointer in a wild attempt to keep the ABI the same.
+ * Why do we do this on EVERY rule?
+ */
+ bcopy(&V_set_disable, &dst->next_rule, sizeof(V_set_disable));
+ if (dst->timestamp)
+ dst->timestamp += boot_seconds;
+ bp += l;
+ }
+ ipfw_get_dynamic(&bp, ep); /* protected by the dynamic lock */
+ return (bp - (char *)buf);
+}
+
+
+#define IP_FW3_OPLENGTH(x) ((x)->sopt_valsize - sizeof(ip_fw3_opheader))
+/**
+ * {set|get}sockopt parser.
+ */
+int
+ipfw_ctl(struct sockopt *sopt)
+{
+#define RULE_MAXSIZE (256*sizeof(u_int32_t))
+ int error;
+ size_t size, len, valsize;
+ struct ip_fw *buf, *rule;
+ struct ip_fw_chain *chain;
+ u_int32_t rulenum[2];
+ uint32_t opt;
+ char xbuf[128];
+ ip_fw3_opheader *op3 = NULL;
+
+ error = priv_check(sopt->sopt_td, PRIV_NETINET_IPFW);
+ if (error)
+ return (error);
+
+ /*
+ * Disallow modifications in really-really secure mode, but still allow
+ * the logging counters to be reset.
+ */
+ if (sopt->sopt_name == IP_FW_ADD ||
+ (sopt->sopt_dir == SOPT_SET && sopt->sopt_name != IP_FW_RESETLOG)) {
+ error = securelevel_ge(sopt->sopt_td->td_ucred, 3);
+ if (error)
+ return (error);
+ }
+
+ chain = &V_layer3_chain;
+ error = 0;
+
+ /* Save original valsize before it is altered via sooptcopyin() */
+ valsize = sopt->sopt_valsize;
+ if ((opt = sopt->sopt_name) == IP_FW3) {
+ /*
+ * Copy not less than sizeof(ip_fw3_opheader).
+ * We hope any IP_FW3 command will fit into a 128-byte buffer.
+ */
+ if ((error = sooptcopyin(sopt, xbuf, sizeof(xbuf),
+ sizeof(ip_fw3_opheader))) != 0)
+ return (error);
+ op3 = (ip_fw3_opheader *)xbuf;
+ opt = op3->opcode;
+ }
+
+ switch (opt) {
+ case IP_FW_GET:
+ /*
+ * pass up a copy of the current rules. Static rules
+ * come first (the last of which has number IPFW_DEFAULT_RULE),
+ * followed by a possibly empty list of dynamic rules.
+ * The last dynamic rule has NULL in the "next" field.
+ *
+ * Note that the calculated size is used to bound the
+ * amount of data returned to the user. The rule set may
+ * change between calculating the size and returning the
+ * data in which case we'll just return what fits.
+ */
+ for (;;) {
+ int len = 0, want;
+
+ size = chain->static_len;
+ size += ipfw_dyn_len();
+ if (size >= sopt->sopt_valsize)
+ break;
+ buf = malloc(size, M_TEMP, M_WAITOK);
+ IPFW_UH_RLOCK(chain);
+ /* check again how much space we need */
+ want = chain->static_len + ipfw_dyn_len();
+ if (size >= want)
+ len = ipfw_getrules(chain, buf, size);
+ IPFW_UH_RUNLOCK(chain);
+ if (size >= want)
+ error = sooptcopyout(sopt, buf, len);
+ free(buf, M_TEMP);
+ if (size >= want)
+ break;
+ }
+ break;
+
+ case IP_FW_FLUSH:
+ /* locking is done within del_entry() */
+ error = del_entry(chain, 0); /* special case, rule=0, cmd=0 means all */
+ break;
+
+ case IP_FW_ADD:
+ rule = malloc(RULE_MAXSIZE, M_TEMP, M_WAITOK);
+ error = sooptcopyin(sopt, rule, RULE_MAXSIZE,
+ sizeof(struct ip_fw7) );
+
+ /*
+ * If the size of commands equals RULESIZE7 then we assume
+ * a FreeBSD7.2 binary is talking to us (set is7=1).
+ * is7 is persistent so the next 'ipfw list' command
+ * will use this format.
+ * NOTE: If the wrong version is guessed (this can happen if
+ * the first ipfw command is 'ipfw [pipe] list')
+ * the ipfw binary may crash or loop infinitely...
+ */
+ if (sopt->sopt_valsize == RULESIZE7(rule)) {
+ is7 = 1;
+ error = convert_rule_to_8(rule);
+ if (error)
+ return error;
+ if (error == 0)
+ error = check_ipfw_struct(rule, RULESIZE(rule));
+ } else {
+ is7 = 0;
+ if (error == 0)
+ error = check_ipfw_struct(rule, sopt->sopt_valsize);
+ }
+ if (error == 0) {
+ /* locking is done within ipfw_add_rule() */
+ error = ipfw_add_rule(chain, rule);
+ size = RULESIZE(rule);
+ if (!error && sopt->sopt_dir == SOPT_GET) {
+ if (is7) {
+ error = convert_rule_to_7(rule);
+ size = RULESIZE7(rule);
+ if (error)
+ return error;
+ }
+ error = sooptcopyout(sopt, rule, size);
+ }
+ }
+ free(rule, M_TEMP);
+ break;
+
+ case IP_FW_DEL:
+ /*
+ * IP_FW_DEL is used for deleting single rules or sets,
+ * and (ab)used to atomically manipulate sets. Argument size
+ * is used to distinguish between the two:
+ * sizeof(u_int32_t)
+ * delete single rule or set of rules,
+ * or reassign rules (or sets) to a different set.
+ * 2*sizeof(u_int32_t)
+ * atomic disable/enable sets.
+ * first u_int32_t contains sets to be disabled,
+ * second u_int32_t contains sets to be enabled.
+ */
+ error = sooptcopyin(sopt, rulenum,
+ 2*sizeof(u_int32_t), sizeof(u_int32_t));
+ if (error)
+ break;
+ size = sopt->sopt_valsize;
+ if (size == sizeof(u_int32_t) && rulenum[0] != 0) {
+ /* delete or reassign, locking done in del_entry() */
+ error = del_entry(chain, rulenum[0]);
+ } else if (size == 2*sizeof(u_int32_t)) { /* set enable/disable */
+ IPFW_UH_WLOCK(chain);
+ V_set_disable =
+ (V_set_disable | rulenum[0]) & ~rulenum[1] &
+ ~(1<<RESVD_SET); /* set RESVD_SET always enabled */
+ IPFW_UH_WUNLOCK(chain);
+ } else
+ error = EINVAL;
+ break;
+
+ case IP_FW_ZERO:
+ case IP_FW_RESETLOG: /* argument is a u_int32_t, the rule number */
+ rulenum[0] = 0;
+ if (sopt->sopt_val != 0) {
+ error = sooptcopyin(sopt, rulenum,
+ sizeof(u_int32_t), sizeof(u_int32_t));
+ if (error)
+ break;
+ }
+ error = zero_entry(chain, rulenum[0],
+ sopt->sopt_name == IP_FW_RESETLOG);
+ break;
+
+ /*--- TABLE manipulations are protected by the IPFW_LOCK ---*/
+ case IP_FW_TABLE_ADD:
+ {
+ ipfw_table_entry ent;
+
+ error = sooptcopyin(sopt, &ent,
+ sizeof(ent), sizeof(ent));
+ if (error)
+ break;
+ error = ipfw_add_table_entry(chain, ent.tbl,
+ &ent.addr, sizeof(ent.addr), ent.masklen,
+ IPFW_TABLE_CIDR, ent.value);
+ }
+ break;
+
+ case IP_FW_TABLE_DEL:
+ {
+ ipfw_table_entry ent;
+
+ error = sooptcopyin(sopt, &ent,
+ sizeof(ent), sizeof(ent));
+ if (error)
+ break;
+ error = ipfw_del_table_entry(chain, ent.tbl,
+ &ent.addr, sizeof(ent.addr), ent.masklen, IPFW_TABLE_CIDR);
+ }
+ break;
+
+ case IP_FW_TABLE_XADD: /* IP_FW3 */
+ case IP_FW_TABLE_XDEL: /* IP_FW3 */
+ {
+ ipfw_table_xentry *xent = (ipfw_table_xentry *)(op3 + 1);
+
+ /* Check minimum header size */
+ if (IP_FW3_OPLENGTH(sopt) < offsetof(ipfw_table_xentry, k)) {
+ error = EINVAL;
+ break;
+ }
+
+ /* Check if len field is valid */
+ if (xent->len > sizeof(ipfw_table_xentry)) {
+ error = EINVAL;
+ break;
+ }
+
+ len = xent->len - offsetof(ipfw_table_xentry, k);
+
+ error = (opt == IP_FW_TABLE_XADD) ?
+ ipfw_add_table_entry(chain, xent->tbl, &xent->k,
+ len, xent->masklen, xent->type, xent->value) :
+ ipfw_del_table_entry(chain, xent->tbl, &xent->k,
+ len, xent->masklen, xent->type);
+ }
+ break;
+
+ case IP_FW_TABLE_FLUSH:
+ {
+ u_int16_t tbl;
+
+ error = sooptcopyin(sopt, &tbl,
+ sizeof(tbl), sizeof(tbl));
+ if (error)
+ break;
+ error = ipfw_flush_table(chain, tbl);
+ }
+ break;
+
+ case IP_FW_TABLE_GETSIZE:
+ {
+ u_int32_t tbl, cnt;
+
+ if ((error = sooptcopyin(sopt, &tbl, sizeof(tbl),
+ sizeof(tbl))))
+ break;
+ IPFW_RLOCK(chain);
+ error = ipfw_count_table(chain, tbl, &cnt);
+ IPFW_RUNLOCK(chain);
+ if (error)
+ break;
+ error = sooptcopyout(sopt, &cnt, sizeof(cnt));
+ }
+ break;
+
+ case IP_FW_TABLE_LIST:
+ {
+ ipfw_table *tbl;
+
+ if (sopt->sopt_valsize < sizeof(*tbl)) {
+ error = EINVAL;
+ break;
+ }
+ size = sopt->sopt_valsize;
+ tbl = malloc(size, M_TEMP, M_WAITOK);
+ error = sooptcopyin(sopt, tbl, size, sizeof(*tbl));
+ if (error) {
+ free(tbl, M_TEMP);
+ break;
+ }
+ tbl->size = (size - sizeof(*tbl)) /
+ sizeof(ipfw_table_entry);
+ IPFW_RLOCK(chain);
+ error = ipfw_dump_table(chain, tbl);
+ IPFW_RUNLOCK(chain);
+ if (error) {
+ free(tbl, M_TEMP);
+ break;
+ }
+ error = sooptcopyout(sopt, tbl, size);
+ free(tbl, M_TEMP);
+ }
+ break;
+
+ case IP_FW_TABLE_XGETSIZE: /* IP_FW3 */
+ {
+ uint32_t *tbl;
+
+ if (IP_FW3_OPLENGTH(sopt) < sizeof(uint32_t)) {
+ error = EINVAL;
+ break;
+ }
+
+ tbl = (uint32_t *)(op3 + 1);
+
+ IPFW_RLOCK(chain);
+ error = ipfw_count_xtable(chain, *tbl, tbl);
+ IPFW_RUNLOCK(chain);
+ if (error)
+ break;
+ error = sooptcopyout(sopt, op3, sopt->sopt_valsize);
+ }
+ break;
+
+ case IP_FW_TABLE_XLIST: /* IP_FW3 */
+ {
+ ipfw_xtable *tbl;
+
+ if ((size = valsize) < sizeof(ipfw_xtable)) {
+ error = EINVAL;
+ break;
+ }
+
+ tbl = malloc(size, M_TEMP, M_ZERO | M_WAITOK);
+ memcpy(tbl, op3, sizeof(ipfw_xtable));
+
+ /* Get maximum number of entries we can store */
+ tbl->size = (size - sizeof(ipfw_xtable)) /
+ sizeof(ipfw_table_xentry);
+ IPFW_RLOCK(chain);
+ error = ipfw_dump_xtable(chain, tbl);
+ IPFW_RUNLOCK(chain);
+ if (error) {
+ free(tbl, M_TEMP);
+ break;
+ }
+
+ /* Revert size field back to bytes */
+ tbl->size = tbl->size * sizeof(ipfw_table_xentry) +
+ sizeof(ipfw_table);
+ /*
+ * Since we call sooptcopyin() with a small buffer, sopt_valsize is
+ * decreased to reflect the supplied buffer size. Set it back to the
+ * original value.
+ */
+ sopt->sopt_valsize = valsize;
+ error = sooptcopyout(sopt, tbl, size);
+ free(tbl, M_TEMP);
+ }
+ break;
+
+ /*--- NAT operations are protected by the IPFW_LOCK ---*/
+ case IP_FW_NAT_CFG:
+ if (IPFW_NAT_LOADED)
+ error = ipfw_nat_cfg_ptr(sopt);
+ else {
+ printf("IP_FW_NAT_CFG: %s\n",
+ "ipfw_nat not present, please load it");
+ error = EINVAL;
+ }
+ break;
+
+ case IP_FW_NAT_DEL:
+ if (IPFW_NAT_LOADED)
+ error = ipfw_nat_del_ptr(sopt);
+ else {
+ printf("IP_FW_NAT_DEL: %s\n",
+ "ipfw_nat not present, please load it");
+ error = EINVAL;
+ }
+ break;
+
+ case IP_FW_NAT_GET_CONFIG:
+ if (IPFW_NAT_LOADED)
+ error = ipfw_nat_get_cfg_ptr(sopt);
+ else {
+ printf("IP_FW_NAT_GET_CFG: %s\n",
+ "ipfw_nat not present, please load it");
+ error = EINVAL;
+ }
+ break;
+
+ case IP_FW_NAT_GET_LOG:
+ if (IPFW_NAT_LOADED)
+ error = ipfw_nat_get_log_ptr(sopt);
+ else {
+ printf("IP_FW_NAT_GET_LOG: %s\n",
+ "ipfw_nat not present, please load it");
+ error = EINVAL;
+ }
+ break;
+
+ default:
+ printf("ipfw: ipfw_ctl invalid option %d\n", sopt->sopt_name);
+ error = EINVAL;
+ }
+
+ return (error);
+#undef RULE_MAXSIZE
+}
+
+
+#define RULE_MAXSIZE (256*sizeof(u_int32_t))
+
+/* Functions to convert rules 7.2 <==> 8.0 */
+int
+convert_rule_to_7(struct ip_fw *rule)
+{
+ /* Used to modify original rule */
+ struct ip_fw7 *rule7 = (struct ip_fw7 *)rule;
+ /* copy of original rule, version 8 */
+ struct ip_fw *tmp;
+
+ /* Used to copy commands */
+ ipfw_insn *ccmd, *dst;
+ int ll = 0, ccmdlen = 0;
+
+ tmp = malloc(RULE_MAXSIZE, M_TEMP, M_NOWAIT | M_ZERO);
+ if (tmp == NULL) {
+ return 1; //XXX error
+ }
+ bcopy(rule, tmp, RULE_MAXSIZE);
+
+ /* Copy fields */
+ rule7->_pad = tmp->_pad;
+ rule7->set = tmp->set;
+ rule7->rulenum = tmp->rulenum;
+ rule7->cmd_len = tmp->cmd_len;
+ rule7->act_ofs = tmp->act_ofs;
+ rule7->next_rule = (struct ip_fw7 *)tmp->next_rule;
+ rule7->next = (struct ip_fw7 *)tmp->x_next;
+ rule7->cmd_len = tmp->cmd_len;
+ rule7->pcnt = tmp->pcnt;
+ rule7->bcnt = tmp->bcnt;
+ rule7->timestamp = tmp->timestamp;
+
+ /* Copy commands */
+ for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule7->cmd ;
+ ll > 0 ; ll -= ccmdlen, ccmd += ccmdlen, dst += ccmdlen) {
+ ccmdlen = F_LEN(ccmd);
+
+ bcopy(ccmd, dst, F_LEN(ccmd)*sizeof(uint32_t));
+
+ if (dst->opcode > O_NAT)
+ /* O_REASS doesn't exist in the 7.2 version, so
+ * decrement the opcode if it is after O_REASS
+ */
+ dst->opcode--;
+
+ if (ccmdlen > ll) {
+ printf("ipfw: opcode %d size truncated\n",
+ ccmd->opcode);
+ return EINVAL;
+ }
+ }
+ free(tmp, M_TEMP);
+
+ return 0;
+}
+
+int
+convert_rule_to_8(struct ip_fw *rule)
+{
+ /* Used to modify original rule */
+ struct ip_fw7 *rule7 = (struct ip_fw7 *) rule;
+
+ /* Used to copy commands */
+ ipfw_insn *ccmd, *dst;
+ int ll = 0, ccmdlen = 0;
+
+ /* Copy of original rule */
+ struct ip_fw7 *tmp = malloc(RULE_MAXSIZE, M_TEMP, M_NOWAIT | M_ZERO);
+ if (tmp == NULL) {
+ return 1; //XXX error
+ }
+
+ bcopy(rule7, tmp, RULE_MAXSIZE);
+
+ for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule->cmd ;
+ ll > 0 ; ll -= ccmdlen, ccmd += ccmdlen, dst += ccmdlen) {
+ ccmdlen = F_LEN(ccmd);
+
+ bcopy(ccmd, dst, F_LEN(ccmd)*sizeof(uint32_t));
+
+ if (dst->opcode > O_NAT)
+ /* O_REASS doesn't exist in the 7.2 version, so
+ * increment the opcode if it is after O_REASS
+ */
+ dst->opcode++;
+
+ if (ccmdlen > ll) {
+ printf("ipfw: opcode %d size truncated\n",
+ ccmd->opcode);
+ return EINVAL;
+ }
+ }
+
+ rule->_pad = tmp->_pad;
+ rule->set = tmp->set;
+ rule->rulenum = tmp->rulenum;
+ rule->cmd_len = tmp->cmd_len;
+ rule->act_ofs = tmp->act_ofs;
+ rule->next_rule = (struct ip_fw *)tmp->next_rule;
+ rule->x_next = (struct ip_fw *)tmp->next;
+ rule->cmd_len = tmp->cmd_len;
+ rule->id = 0; /* XXX see if is ok = 0 */
+ rule->pcnt = tmp->pcnt;
+ rule->bcnt = tmp->bcnt;
+ rule->timestamp = tmp->timestamp;
+
+ free (tmp, M_TEMP);
+ return 0;
+}
+
+/* end of file */
diff --git a/sys/netpfil/ipfw/ip_fw_table.c b/sys/netpfil/ipfw/ip_fw_table.c
new file mode 100644
index 0000000..a22fff9
--- /dev/null
+++ b/sys/netpfil/ipfw/ip_fw_table.c
@@ -0,0 +1,762 @@
+/*-
+ * Copyright (c) 2004 Ruslan Ermilov and Vsevolod Lobko.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ * Lookup table support for ipfw
+ *
+ * Lookup tables are implemented (at the moment) using the radix
+ * tree used for routing tables. Tables store key-value entries, where
+ * keys are network prefixes (addr/masklen), and values are integers.
+ * As a degenerate case we can interpret keys as 32-bit integers
+ * (with a /32 mask).
+ *
+ * The table is protected by the IPFW lock even for manipulation coming
+ * from userland, because operations are typically fast.
+ */
+
+#include "opt_ipfw.h"
+#include "opt_inet.h"
+#ifndef INET
+#error IPFIREWALL requires INET.
+#endif /* INET */
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/queue.h>
+#include <net/if.h> /* ip_fw.h requires IFNAMSIZ */
+#include <net/radix.h>
+#include <net/route.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/ip_var.h> /* struct ipfw_rule_ref */
+#include <netinet/ip_fw.h>
+
+#include <netpfil/ipfw/ip_fw_private.h>
+
+#ifdef MAC
+#include <security/mac/mac_framework.h>
+#endif
+
+static MALLOC_DEFINE(M_IPFW_TBL, "ipfw_tbl", "IpFw tables");
+
+struct table_entry {
+ struct radix_node rn[2];
+ struct sockaddr_in addr, mask;
+ u_int32_t value;
+};
+
+struct xaddr_iface {
+ uint8_t if_len; /* length of this struct */
+ uint8_t pad[7]; /* Align name */
+ char ifname[IF_NAMESIZE]; /* Interface name */
+};
+
+struct table_xentry {
+ struct radix_node rn[2];
+ union {
+#ifdef INET6
+ struct sockaddr_in6 addr6;
+#endif
+ struct xaddr_iface iface;
+ } a;
+ union {
+#ifdef INET6
+ struct sockaddr_in6 mask6;
+#endif
+ struct xaddr_iface ifmask;
+ } m;
+ u_int32_t value;
+};
+
+/*
+ * The radix code expects addr and mask to be array of bytes,
+ * with the first byte being the length of the array. rn_inithead
+ * is called with the offset in bits of the lookup key within the
+ * array. If we use a sockaddr_in as the underlying type,
+ * sin_len is conveniently located at offset 0, sin_addr is at
+ * offset 4 and normally aligned.
+ * But for portability, let's avoid assumptions and make the code explicit.
+ */
+#define KEY_LEN(v) *((uint8_t *)&(v))
+#define KEY_OFS (8*offsetof(struct sockaddr_in, sin_addr))
+/*
+ * Do not require radix to compare more than actual IPv4/IPv6 address
+ */
+#define KEY_LEN_INET (offsetof(struct sockaddr_in, sin_addr) + sizeof(in_addr_t))
+#define KEY_LEN_INET6 (offsetof(struct sockaddr_in6, sin6_addr) + sizeof(struct in6_addr))
+#define KEY_LEN_IFACE (offsetof(struct xaddr_iface, ifname))
+
+#define OFF_LEN_INET (8 * offsetof(struct sockaddr_in, sin_addr))
+#define OFF_LEN_INET6 (8 * offsetof(struct sockaddr_in6, sin6_addr))
+#define OFF_LEN_IFACE (8 * offsetof(struct xaddr_iface, ifname))
+
+
+static inline void
+ipv6_writemask(struct in6_addr *addr6, uint8_t mask)
+{
+ uint32_t *cp;
+
+ for (cp = (uint32_t *)addr6; mask >= 32; mask -= 32)
+ *cp++ = 0xFFFFFFFF;
+ *cp = htonl(mask ? ~((1 << (32 - mask)) - 1) : 0);
+}
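+
+/*
+ * Illustrative note: the same trick is used inline for IPv4 below; for a
+ * /24 prefix (mlen == 24),
+ *
+ *   htonl(~((1 << (32 - 24)) - 1)) == htonl(0xffffff00)
+ *
+ * i.e. a 255.255.255.0 netmask, while mlen == 0 yields an all-zero mask
+ * so a 0/0 entry matches every address.  ipv6_writemask() builds the
+ * equivalent 128-bit mask 32 bits at a time.
+ */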
+
+int
+ipfw_add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, void *paddr,
+ uint8_t plen, uint8_t mlen, uint8_t type, uint32_t value)
+{
+ struct radix_node_head *rnh, **rnh_ptr;
+ struct table_entry *ent;
+ struct table_xentry *xent;
+ struct radix_node *rn;
+ in_addr_t addr;
+ int offset;
+ void *ent_ptr;
+ struct sockaddr *addr_ptr, *mask_ptr;
+ char c;
+
+ if (tbl >= V_fw_tables_max)
+ return (EINVAL);
+
+ switch (type) {
+ case IPFW_TABLE_CIDR:
+ if (plen == sizeof(in_addr_t)) {
+#ifdef INET
+ /* IPv4 case */
+ if (mlen > 32)
+ return (EINVAL);
+ ent = malloc(sizeof(*ent), M_IPFW_TBL, M_WAITOK | M_ZERO);
+ ent->value = value;
+ /* Set 'total' structure length */
+ KEY_LEN(ent->addr) = KEY_LEN_INET;
+ KEY_LEN(ent->mask) = KEY_LEN_INET;
+ /* Set offset of IPv4 address in bits */
+ offset = OFF_LEN_INET;
+ ent->mask.sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0);
+ addr = *((in_addr_t *)paddr);
+ ent->addr.sin_addr.s_addr = addr & ent->mask.sin_addr.s_addr;
+ /* Set pointers */
+ rnh_ptr = &ch->tables[tbl];
+ ent_ptr = ent;
+ addr_ptr = (struct sockaddr *)&ent->addr;
+ mask_ptr = (struct sockaddr *)&ent->mask;
+#endif
+#ifdef INET6
+ } else if (plen == sizeof(struct in6_addr)) {
+ /* IPv6 case */
+ if (mlen > 128)
+ return (EINVAL);
+ xent = malloc(sizeof(*xent), M_IPFW_TBL, M_WAITOK | M_ZERO);
+ xent->value = value;
+ /* Set 'total' structure length */
+ KEY_LEN(xent->a.addr6) = KEY_LEN_INET6;
+ KEY_LEN(xent->m.mask6) = KEY_LEN_INET6;
+ /* Set offset of IPv6 address in bits */
+ offset = OFF_LEN_INET6;
+ ipv6_writemask(&xent->m.mask6.sin6_addr, mlen);
+ memcpy(&xent->a.addr6.sin6_addr, paddr, sizeof(struct in6_addr));
+ APPLY_MASK(&xent->a.addr6.sin6_addr, &xent->m.mask6.sin6_addr);
+ /* Set pointers */
+ rnh_ptr = &ch->xtables[tbl];
+ ent_ptr = xent;
+ addr_ptr = (struct sockaddr *)&xent->a.addr6;
+ mask_ptr = (struct sockaddr *)&xent->m.mask6;
+#endif
+ } else {
+ /* Unknown CIDR type */
+ return (EINVAL);
+ }
+ break;
+
+ case IPFW_TABLE_INTERFACE:
+ /* Check if string is terminated */
+ c = ((char *)paddr)[IF_NAMESIZE - 1];
+ ((char *)paddr)[IF_NAMESIZE - 1] = '\0';
+ if (((mlen = strlen((char *)paddr)) == IF_NAMESIZE - 1) && (c != '\0'))
+ return (EINVAL);
+
+ /* Include last \0 into comparison */
+ mlen++;
+
+ xent = malloc(sizeof(*xent), M_IPFW_TBL, M_WAITOK | M_ZERO);
+ xent->value = value;
+ /* Set 'total' structure length */
+ KEY_LEN(xent->a.iface) = KEY_LEN_IFACE + mlen;
+ KEY_LEN(xent->m.ifmask) = KEY_LEN_IFACE + mlen;
+ /* Set offset of interface name in bits */
+ offset = OFF_LEN_IFACE;
+ memcpy(xent->a.iface.ifname, paddr, mlen);
+ /* Assume direct match */
+ /* TODO: Add interface pattern matching */
+#if 0
+ memset(xent->m.ifmask.ifname, 0xFF, IF_NAMESIZE);
+ mask_ptr = (struct sockaddr *)&xent->m.ifmask;
+#endif
+ /* Set pointers */
+ rnh_ptr = &ch->xtables[tbl];
+ ent_ptr = xent;
+ addr_ptr = (struct sockaddr *)&xent->a.iface;
+ mask_ptr = NULL;
+ break;
+
+ default:
+ return (EINVAL);
+ }
+
+ IPFW_WLOCK(ch);
+
+ /* Check if tabletype is valid */
+ if ((ch->tabletype[tbl] != 0) && (ch->tabletype[tbl] != type)) {
+ IPFW_WUNLOCK(ch);
+ free(ent_ptr, M_IPFW_TBL);
+ return (EINVAL);
+ }
+
+ /* Check if radix tree exists */
+ if ((rnh = *rnh_ptr) == NULL) {
+ IPFW_WUNLOCK(ch);
+ /* Create radix for a new table */
+ if (!rn_inithead((void **)&rnh, offset)) {
+ free(ent_ptr, M_IPFW_TBL);
+ return (ENOMEM);
+ }
+
+ IPFW_WLOCK(ch);
+ if (*rnh_ptr != NULL) {
+ /* Tree is already attached by other thread */
+ rn_detachhead((void **)&rnh);
+ rnh = *rnh_ptr;
+ /* Check the table type again */
+ if (ch->tabletype[tbl] != type) {
+ IPFW_WUNLOCK(ch);
+ free(ent_ptr, M_IPFW_TBL);
+ return (EINVAL);
+ }
+ } else {
+ *rnh_ptr = rnh;
+ /*
+ * Set the table type. It may be set already
+ * (if we have an IPv6-only table), but setting
+ * it again does not hurt.
+ */
+ ch->tabletype[tbl] = type;
+ }
+ }
+
+ rn = rnh->rnh_addaddr(addr_ptr, mask_ptr, rnh, ent_ptr);
+ IPFW_WUNLOCK(ch);
+
+ if (rn == NULL) {
+ free(ent_ptr, M_IPFW_TBL);
+ return (EEXIST);
+ }
+ return (0);
+}
+
+int
+ipfw_del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, void *paddr,
+ uint8_t plen, uint8_t mlen, uint8_t type)
+{
+ struct radix_node_head *rnh, **rnh_ptr;
+ struct table_entry *ent;
+ in_addr_t addr;
+ struct sockaddr_in sa, mask;
+ struct sockaddr *sa_ptr, *mask_ptr;
+ char c;
+
+ if (tbl >= V_fw_tables_max)
+ return (EINVAL);
+
+ switch (type) {
+ case IPFW_TABLE_CIDR:
+ if (plen == sizeof(in_addr_t)) {
+ /* Set 'total' structure length */
+ KEY_LEN(sa) = KEY_LEN_INET;
+ KEY_LEN(mask) = KEY_LEN_INET;
+ mask.sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0);
+ addr = *((in_addr_t *)paddr);
+ sa.sin_addr.s_addr = addr & mask.sin_addr.s_addr;
+ rnh_ptr = &ch->tables[tbl];
+ sa_ptr = (struct sockaddr *)&sa;
+ mask_ptr = (struct sockaddr *)&mask;
+#ifdef INET6
+ } else if (plen == sizeof(struct in6_addr)) {
+ /* IPv6 case */
+ if (mlen > 128)
+ return (EINVAL);
+ struct sockaddr_in6 sa6, mask6;
+ memset(&sa6, 0, sizeof(struct sockaddr_in6));
+ memset(&mask6, 0, sizeof(struct sockaddr_in6));
+ /* Set 'total' structure length */
+ KEY_LEN(sa6) = KEY_LEN_INET6;
+ KEY_LEN(mask6) = KEY_LEN_INET6;
+ ipv6_writemask(&mask6.sin6_addr, mlen);
+ memcpy(&sa6.sin6_addr, paddr, sizeof(struct in6_addr));
+ APPLY_MASK(&sa6.sin6_addr, &mask6.sin6_addr);
+ rnh_ptr = &ch->xtables[tbl];
+ sa_ptr = (struct sockaddr *)&sa6;
+ mask_ptr = (struct sockaddr *)&mask6;
+#endif
+ } else {
+ /* Unknown CIDR type */
+ return (EINVAL);
+ }
+ break;
+
+ case IPFW_TABLE_INTERFACE:
+ /* Check if string is terminated */
+ c = ((char *)paddr)[IF_NAMESIZE - 1];
+ ((char *)paddr)[IF_NAMESIZE - 1] = '\0';
+ if (((mlen = strlen((char *)paddr)) == IF_NAMESIZE - 1) && (c != '\0'))
+ return (EINVAL);
+
+ struct xaddr_iface ifname, ifmask;
+ memset(&ifname, 0, sizeof(ifname));
+
+ /* Include last \0 into comparison */
+ mlen++;
+
+ /* Set 'total' structure length */
+ KEY_LEN(ifname) = KEY_LEN_IFACE + mlen;
+ KEY_LEN(ifmask) = KEY_LEN_IFACE + mlen;
+ /* Assume direct match */
+ /* FIXME: Add interface pattern matching */
+#if 0
+ memset(ifmask.ifname, 0xFF, IF_NAMESIZE);
+ mask_ptr = (struct sockaddr *)&ifmask;
+#endif
+ mask_ptr = NULL;
+ memcpy(ifname.ifname, paddr, mlen);
+ /* Set pointers */
+ rnh_ptr = &ch->xtables[tbl];
+ sa_ptr = (struct sockaddr *)&ifname;
+
+ break;
+
+ default:
+ return (EINVAL);
+ }
+
+ IPFW_WLOCK(ch);
+ if ((rnh = *rnh_ptr) == NULL) {
+ IPFW_WUNLOCK(ch);
+ return (ESRCH);
+ }
+
+ if (ch->tabletype[tbl] != type) {
+ IPFW_WUNLOCK(ch);
+ return (EINVAL);
+ }
+
+ ent = (struct table_entry *)rnh->rnh_deladdr(sa_ptr, mask_ptr, rnh);
+ IPFW_WUNLOCK(ch);
+
+ if (ent == NULL)
+ return (ESRCH);
+
+ free(ent, M_IPFW_TBL);
+ return (0);
+}
+
+static int
+flush_table_entry(struct radix_node *rn, void *arg)
+{
+ struct radix_node_head * const rnh = arg;
+ struct table_entry *ent;
+
+ ent = (struct table_entry *)
+ rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, rnh);
+ if (ent != NULL)
+ free(ent, M_IPFW_TBL);
+ return (0);
+}
+
+int
+ipfw_flush_table(struct ip_fw_chain *ch, uint16_t tbl)
+{
+ struct radix_node_head *rnh, *xrnh;
+
+ if (tbl >= V_fw_tables_max)
+ return (EINVAL);
+
+ /*
+ * We free both (IPv4 and extended) radix trees and
+ * clear the table type here so the table can be reused
+ * for a different type without a module reload.
+ */
+
+ IPFW_WLOCK(ch);
+ /* Set IPv4 table pointer to zero */
+ if ((rnh = ch->tables[tbl]) != NULL)
+ ch->tables[tbl] = NULL;
+ /* Set extended table pointer to zero */
+ if ((xrnh = ch->xtables[tbl]) != NULL)
+ ch->xtables[tbl] = NULL;
+ /* Zero table type */
+ ch->tabletype[tbl] = 0;
+ IPFW_WUNLOCK(ch);
+
+ if (rnh != NULL) {
+ rnh->rnh_walktree(rnh, flush_table_entry, rnh);
+ rn_detachhead((void **)&rnh);
+ }
+
+ if (xrnh != NULL) {
+ xrnh->rnh_walktree(xrnh, flush_table_entry, xrnh);
+ rn_detachhead((void **)&xrnh);
+ }
+
+ return (0);
+}
+
+void
+ipfw_destroy_tables(struct ip_fw_chain *ch)
+{
+ uint16_t tbl;
+
+ /* Flush all tables */
+ for (tbl = 0; tbl < V_fw_tables_max; tbl++)
+ ipfw_flush_table(ch, tbl);
+
+	/* Free the pointer arrays themselves */
+ free(ch->tables, M_IPFW);
+ free(ch->xtables, M_IPFW);
+ free(ch->tabletype, M_IPFW);
+}
+
+int
+ipfw_init_tables(struct ip_fw_chain *ch)
+{
+ /* Allocate pointers */
+ ch->tables = malloc(V_fw_tables_max * sizeof(void *), M_IPFW, M_WAITOK | M_ZERO);
+ ch->xtables = malloc(V_fw_tables_max * sizeof(void *), M_IPFW, M_WAITOK | M_ZERO);
+ ch->tabletype = malloc(V_fw_tables_max * sizeof(uint8_t), M_IPFW, M_WAITOK | M_ZERO);
+ return (0);
+}
+
+int
+ipfw_resize_tables(struct ip_fw_chain *ch, unsigned int ntables)
+{
+ struct radix_node_head **tables, **xtables, *rnh;
+ struct radix_node_head **tables_old, **xtables_old;
+ uint8_t *tabletype, *tabletype_old;
+ unsigned int ntables_old, tbl;
+
+ /* Check new value for validity */
+ if (ntables > IPFW_TABLES_MAX)
+ ntables = IPFW_TABLES_MAX;
+
+ /* Allocate new pointers */
+ tables = malloc(ntables * sizeof(void *), M_IPFW, M_WAITOK | M_ZERO);
+ xtables = malloc(ntables * sizeof(void *), M_IPFW, M_WAITOK | M_ZERO);
+ tabletype = malloc(ntables * sizeof(uint8_t), M_IPFW, M_WAITOK | M_ZERO);
+
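+	/*
+	 * The M_WAITOK allocations above may sleep, so they are done before
+	 * taking the chain write lock; only the pointer swap needs the lock.
+	 */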
+ IPFW_WLOCK(ch);
+
+ tbl = (ntables >= V_fw_tables_max) ? V_fw_tables_max : ntables;
+
+ /* Copy old table pointers */
+ memcpy(tables, ch->tables, sizeof(void *) * tbl);
+ memcpy(xtables, ch->xtables, sizeof(void *) * tbl);
+ memcpy(tabletype, ch->tabletype, sizeof(uint8_t) * tbl);
+
+ /* Change pointers and number of tables */
+ tables_old = ch->tables;
+ xtables_old = ch->xtables;
+ tabletype_old = ch->tabletype;
+ ch->tables = tables;
+ ch->xtables = xtables;
+ ch->tabletype = tabletype;
+
+ ntables_old = V_fw_tables_max;
+ V_fw_tables_max = ntables;
+
+ IPFW_WUNLOCK(ch);
+
+ /* Check if we need to destroy radix trees */
+ if (ntables < ntables_old) {
+ for (tbl = ntables; tbl < ntables_old; tbl++) {
+ if ((rnh = tables_old[tbl]) != NULL) {
+ rnh->rnh_walktree(rnh, flush_table_entry, rnh);
+ rn_detachhead((void **)&rnh);
+ }
+
+ if ((rnh = xtables_old[tbl]) != NULL) {
+ rnh->rnh_walktree(rnh, flush_table_entry, rnh);
+ rn_detachhead((void **)&rnh);
+ }
+ }
+ }
+
+ /* Free old pointers */
+ free(tables_old, M_IPFW);
+ free(xtables_old, M_IPFW);
+ free(tabletype_old, M_IPFW);
+
+ return (0);
+}
+
+int
+ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+ uint32_t *val)
+{
+ struct radix_node_head *rnh;
+ struct table_entry *ent;
+ struct sockaddr_in sa;
+
+ if (tbl >= V_fw_tables_max)
+ return (0);
+ if ((rnh = ch->tables[tbl]) == NULL)
+ return (0);
+ KEY_LEN(sa) = KEY_LEN_INET;
+ sa.sin_addr.s_addr = addr;
+ ent = (struct table_entry *)(rnh->rnh_lookup(&sa, NULL, rnh));
+ if (ent != NULL) {
+ *val = ent->value;
+ return (1);
+ }
+ return (0);
+}
+
+int
+ipfw_lookup_table_extended(struct ip_fw_chain *ch, uint16_t tbl, void *paddr,
+ uint32_t *val, int type)
+{
+ struct radix_node_head *rnh;
+ struct table_xentry *xent;
+ struct sockaddr_in6 sa6;
+ struct xaddr_iface iface;
+
+ if (tbl >= V_fw_tables_max)
+ return (0);
+ if ((rnh = ch->xtables[tbl]) == NULL)
+ return (0);
+
+ switch (type) {
+ case IPFW_TABLE_CIDR:
+ KEY_LEN(sa6) = KEY_LEN_INET6;
+ memcpy(&sa6.sin6_addr, paddr, sizeof(struct in6_addr));
+ xent = (struct table_xentry *)(rnh->rnh_lookup(&sa6, NULL, rnh));
+ break;
+
+ case IPFW_TABLE_INTERFACE:
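+		/*
+		 * The radix key covers the interface name including the
+		 * terminating \0: strlcpy() returns strlen(src), hence the +1.
+		 */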
+ KEY_LEN(iface) = KEY_LEN_IFACE +
+ strlcpy(iface.ifname, (char *)paddr, IF_NAMESIZE) + 1;
+ /* Assume direct match */
+ /* FIXME: Add interface pattern matching */
+ xent = (struct table_xentry *)(rnh->rnh_lookup(&iface, NULL, rnh));
+ break;
+
+ default:
+ return (0);
+ }
+
+ if (xent != NULL) {
+ *val = xent->value;
+ return (1);
+ }
+ return (0);
+}
+
+static int
+count_table_entry(struct radix_node *rn, void *arg)
+{
+ u_int32_t * const cnt = arg;
+
+ (*cnt)++;
+ return (0);
+}
+
+int
+ipfw_count_table(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt)
+{
+ struct radix_node_head *rnh;
+
+ if (tbl >= V_fw_tables_max)
+ return (EINVAL);
+ *cnt = 0;
+ if ((rnh = ch->tables[tbl]) == NULL)
+ return (0);
+ rnh->rnh_walktree(rnh, count_table_entry, cnt);
+ return (0);
+}
+
+static int
+dump_table_entry(struct radix_node *rn, void *arg)
+{
+ struct table_entry * const n = (struct table_entry *)rn;
+ ipfw_table * const tbl = arg;
+ ipfw_table_entry *ent;
+
+ if (tbl->cnt == tbl->size)
+ return (1);
+ ent = &tbl->ent[tbl->cnt];
+ ent->tbl = tbl->tbl;
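+	/*
+	 * Recover the prefix length from the stored netmask: ffs() on the
+	 * host-order mask returns the lowest set bit, e.g. a /24 mask
+	 * (0xffffff00) gives ffs() == 9 and masklen == 33 - 9 == 24.
+	 */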
+ if (in_nullhost(n->mask.sin_addr))
+ ent->masklen = 0;
+ else
+ ent->masklen = 33 - ffs(ntohl(n->mask.sin_addr.s_addr));
+ ent->addr = n->addr.sin_addr.s_addr;
+ ent->value = n->value;
+ tbl->cnt++;
+ return (0);
+}
+
+int
+ipfw_dump_table(struct ip_fw_chain *ch, ipfw_table *tbl)
+{
+ struct radix_node_head *rnh;
+
+ if (tbl->tbl >= V_fw_tables_max)
+ return (EINVAL);
+ tbl->cnt = 0;
+ if ((rnh = ch->tables[tbl->tbl]) == NULL)
+ return (0);
+ rnh->rnh_walktree(rnh, dump_table_entry, tbl);
+ return (0);
+}
+
+static int
+count_table_xentry(struct radix_node *rn, void *arg)
+{
+ uint32_t * const cnt = arg;
+
+ (*cnt) += sizeof(ipfw_table_xentry);
+ return (0);
+}
+
+int
+ipfw_count_xtable(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt)
+{
+ struct radix_node_head *rnh;
+
+ if (tbl >= V_fw_tables_max)
+ return (EINVAL);
+ *cnt = 0;
+ if ((rnh = ch->tables[tbl]) != NULL)
+ rnh->rnh_walktree(rnh, count_table_xentry, cnt);
+ if ((rnh = ch->xtables[tbl]) != NULL)
+ rnh->rnh_walktree(rnh, count_table_xentry, cnt);
+	/* Add the xtable header size unless the table is empty */
+ if (*cnt > 0)
+ (*cnt) += sizeof(ipfw_xtable);
+ return (0);
+}
+
+
+static int
+dump_table_xentry_base(struct radix_node *rn, void *arg)
+{
+ struct table_entry * const n = (struct table_entry *)rn;
+ ipfw_xtable * const tbl = arg;
+ ipfw_table_xentry *xent;
+
+ /* Out of memory, returning */
+ if (tbl->cnt == tbl->size)
+ return (1);
+ xent = &tbl->xent[tbl->cnt];
+ xent->len = sizeof(ipfw_table_xentry);
+ xent->tbl = tbl->tbl;
+ if (in_nullhost(n->mask.sin_addr))
+ xent->masklen = 0;
+ else
+ xent->masklen = 33 - ffs(ntohl(n->mask.sin_addr.s_addr));
+ /* Save IPv4 address as deprecated IPv6 compatible */
+ xent->k.addr6.s6_addr32[3] = n->addr.sin_addr.s_addr;
+ xent->value = n->value;
+ tbl->cnt++;
+ return (0);
+}
+
+static int
+dump_table_xentry_extended(struct radix_node *rn, void *arg)
+{
+ struct table_xentry * const n = (struct table_xentry *)rn;
+ ipfw_xtable * const tbl = arg;
+ ipfw_table_xentry *xent;
+#ifdef INET6
+ int i;
+ uint32_t *v;
+#endif
+ /* Out of memory, returning */
+ if (tbl->cnt == tbl->size)
+ return (1);
+ xent = &tbl->xent[tbl->cnt];
+ xent->len = sizeof(ipfw_table_xentry);
+ xent->tbl = tbl->tbl;
+
+ switch (tbl->type) {
+#ifdef INET6
+ case IPFW_TABLE_CIDR:
+ /* Count IPv6 mask */
+ v = (uint32_t *)&n->m.mask6.sin6_addr;
+ for (i = 0; i < sizeof(struct in6_addr) / 4; i++, v++)
+ xent->masklen += bitcount32(*v);
+ memcpy(&xent->k, &n->a.addr6.sin6_addr, sizeof(struct in6_addr));
+ break;
+#endif
+ case IPFW_TABLE_INTERFACE:
+ /* Assume exact mask */
+ xent->masklen = 8 * IF_NAMESIZE;
+ memcpy(&xent->k, &n->a.iface.ifname, IF_NAMESIZE);
+ break;
+
+ default:
+ /* unknown, skip entry */
+ return (0);
+ }
+
+ xent->value = n->value;
+ tbl->cnt++;
+ return (0);
+}
+
+int
+ipfw_dump_xtable(struct ip_fw_chain *ch, ipfw_xtable *tbl)
+{
+ struct radix_node_head *rnh;
+
+ if (tbl->tbl >= V_fw_tables_max)
+ return (EINVAL);
+ tbl->cnt = 0;
+ tbl->type = ch->tabletype[tbl->tbl];
+ if ((rnh = ch->tables[tbl->tbl]) != NULL)
+ rnh->rnh_walktree(rnh, dump_table_xentry_base, tbl);
+ if ((rnh = ch->xtables[tbl->tbl]) != NULL)
+ rnh->rnh_walktree(rnh, dump_table_xentry_extended, tbl);
+ return (0);
+}
+
+/* end of file */
diff --git a/sys/netpfil/ipfw/test/Makefile b/sys/netpfil/ipfw/test/Makefile
new file mode 100644
index 0000000..c556a4b
--- /dev/null
+++ b/sys/netpfil/ipfw/test/Makefile
@@ -0,0 +1,51 @@
+#
+# $FreeBSD$
+#
+# Makefile for building userland tests
+# this is written in a form compatible with gmake
+
+SCHED_SRCS = test_dn_sched.c
+SCHED_SRCS += dn_sched_fifo.c
+SCHED_SRCS += dn_sched_prio.c
+SCHED_SRCS += dn_sched_qfq.c
+SCHED_SRCS += dn_sched_rr.c
+SCHED_SRCS += dn_sched_wf2q.c
+SCHED_SRCS += dn_heap.c
+SCHED_SRCS += main.c
+
+SCHED_OBJS=$(SCHED_SRCS:.c=.o)
+
+HEAP_SRCS = dn_heap.c test_dn_heap.c
+HEAP_OBJS=$(HEAP_SRCS:.c=.o)
+
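+# sources missing from this directory are picked up from .. via VPATH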
+VPATH= .:..
+
+CFLAGS = -I.. -I. -Wall -Werror -O3 -DIPFW
+TARGETS= test_sched # no test_heap by default
+
+all: $(TARGETS)
+
+test_heap : $(HEAP_OBJS)
+ $(CC) -o $@ $(HEAP_OBJS)
+
+test_sched : $(SCHED_OBJS)
+ $(CC) -o $@ $(SCHED_OBJS)
+
+$(SCHED_OBJS): dn_test.h
+main.o: mylist.h
+
+clean:
+ - rm *.o $(TARGETS) *.core
+
+ALLSRCS = $(SCHED_SRCS) dn_test.h mylist.h \
+ dn_sched.h dn_heap.h ip_dn_private.h Makefile
+TMPBASE = /tmp/testXYZ
+TMPDIR = $(TMPBASE)/test
+
+tgz:
+ -rm -rf $(TMPDIR)
+ mkdir -p $(TMPDIR)
+ -cp -p $(ALLSRCS) $(TMPDIR)
+ -(cd ..; cp -p $(ALLSRCS) $(TMPDIR))
+ ls -la $(TMPDIR)
+ (cd $(TMPBASE); tar cvzf /tmp/test.tgz test)
diff --git a/sys/netpfil/ipfw/test/dn_test.h b/sys/netpfil/ipfw/test/dn_test.h
new file mode 100644
index 0000000..4e079bc
--- /dev/null
+++ b/sys/netpfil/ipfw/test/dn_test.h
@@ -0,0 +1,175 @@
+/*
+ * $FreeBSD$
+ *
+ * userspace compatibility code for dummynet schedulers
+ */
+
+#ifndef _DN_TEST_H
+#define _DN_TEST_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h> /* bzero, ffs, ... */
+#include <string.h> /* strcmp */
+#include <errno.h>
+#include <sys/queue.h>
+#include <sys/time.h>
+
+extern int debug;
+#define ND(fmt, args...) do {} while (0)
+#define D1(fmt, args...) do {} while (0)
+#define D(fmt, args...) fprintf(stderr, "%-8s " fmt "\n", \
+ __FUNCTION__, ## args)
+#define DX(lev, fmt, args...) do { \
+ if (debug > lev) D(fmt, ## args); } while (0)
+
+
+#ifndef offsetof
+#define offsetof(t,m) (int)((&((t *)0L)->m))
+#endif
+
+#include <mylist.h>
+
+/* prevent include of other system headers */
+#define _NETINET_IP_VAR_H_ /* ip_fw_args */
+#define _IPFW2_H
+#define _SYS_MBUF_H_
+
+enum {
+ DN_QUEUE,
+};
+
+enum {
+ DN_SCHED_FIFO,
+ DN_SCHED_WF2QP,
+};
+
+struct dn_id {
+ int type, subtype, len, id;
+};
+
+struct dn_fs {
+ int par[4]; /* flowset parameters */
+
+	/* simulation entries.
+	 * 'index' is not strictly necessary;
+	 * y is used for the inverse mapping.
+	 */
+ int index;
+ int y; /* inverse mapping */
+ int base_y; /* inverse mapping */
+ int next_y; /* inverse mapping */
+ int n_flows;
+ int first_flow;
+ int next_flow; /* first_flow + n_flows */
+ /*
+ * when generating, let 'cur' go from 0 to n_flows-1,
+ * then point to flow first_flow + cur
+ */
+ int cur;
+};
+
+struct dn_sch {
+};
+
+struct dn_flow {
+ struct dn_id oid;
+ int length;
+ int len_bytes;
+ int drops;
+ uint64_t tot_bytes;
+ uint32_t flow_id;
+ struct list_head h; /* used by the generator */
+};
+
+struct dn_link {
+};
+
+struct ip_fw_args {
+};
+
+struct mbuf {
+ struct {
+ int len;
+ } m_pkthdr;
+ struct mbuf *m_nextpkt;
+ int flow_id; /* for testing, index of a flow */
+ //int flowset_id; /* for testing, index of a flowset */
+ void *cfg; /* config args */
+};
+
+#define MALLOC_DECLARE(x)
+#define KASSERT(x, y) do { if (!(x)) printf y ; exit(0); } while (0)
+struct ipfw_flow_id {
+};
+
+typedef void * module_t;
+
+struct _md_t {
+ const char *name;
+ int (*f)(module_t, int, void *);
+ void *p;
+};
+
+typedef struct _md_t moduledata_t;
+
+#define DECLARE_MODULE(name, b, c, d) \
+ moduledata_t *_g_##name = & b
+#define MODULE_DEPEND(a, b, c, d, e)
+
+#ifdef IPFW
+#include <dn_heap.h>
+#include <ip_dn_private.h>
+#include <dn_sched.h>
+#else
+struct dn_queue {
+ struct dn_fsk *fs; /* parent flowset. */
+ struct dn_sch_inst *_si; /* parent sched instance. */
+};
+struct dn_schk {
+};
+struct dn_fsk {
+ struct dn_fs fs;
+ struct dn_schk *sched;
+};
+struct dn_sch_inst {
+ struct dn_schk *sched;
+};
+struct dn_alg {
+ int type;
+ const char *name;
+ void *enqueue, *dequeue;
+ int q_datalen, si_datalen, schk_datalen;
+ int (*config)(struct dn_schk *);
+ int (*new_sched)(struct dn_sch_inst *);
+ int (*new_fsk)(struct dn_fsk *);
+ int (*new_queue)(struct dn_queue *q);
+};
+
+#endif
+
+#ifndef __FreeBSD__
+int fls(int);
+#endif
+
+static inline void
+mq_append(struct mq *q, struct mbuf *m)
+{
+ if (q->head == NULL)
+ q->head = m;
+ else
+ q->tail->m_nextpkt = m;
+ q->tail = m;
+ m->m_nextpkt = NULL;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _DN_TEST_H */
diff --git a/sys/netpfil/ipfw/test/main.c b/sys/netpfil/ipfw/test/main.c
new file mode 100644
index 0000000..be9fdf5
--- /dev/null
+++ b/sys/netpfil/ipfw/test/main.c
@@ -0,0 +1,636 @@
+/*
+ * $FreeBSD$
+ *
+ * Testing program for schedulers
+ *
+ * The framework includes a simple controller which, at each
+ * iteration, decides whether we can enqueue and/or dequeue.
+ * Then the mainloop runs the required number of tests,
+ * keeping track of statistics.
+ */
+
+#include "dn_test.h"
+
+struct q_list {
+ struct list_head h;
+};
+
+struct cfg_s {
+ int ac;
+ char * const *av;
+
+ const char *name;
+ int loops;
+ struct timeval time;
+
+ /* running counters */
+ uint32_t _enqueue;
+ uint32_t drop;
+ uint32_t pending;
+ uint32_t dequeue;
+
+ /* generator parameters */
+ int th_min, th_max;
+ int maxburst;
+ int lmin, lmax; /* packet len */
+ int flows; /* number of flows */
+ int flowsets; /* number of flowsets */
+ int wsum; /* sum of weights of all flows */
+ int max_y; /* max random number in the generation */
+ int cur_y, cur_fs; /* used in generation, between 0 and max_y - 1 */
+ const char *fs_config; /* flowset config */
+ int can_dequeue;
+ int burst; /* count of packets sent in a burst */
+ struct mbuf *tosend; /* packet to send -- also flag to enqueue */
+
+ struct mbuf *freelist;
+
+ struct mbuf *head, *tail; /* a simple tailq */
+
+ /* scheduler hooks */
+ int (*enq)(struct dn_sch_inst *, struct dn_queue *,
+ struct mbuf *);
+ struct mbuf * (*deq)(struct dn_sch_inst *);
+ /* size of the three fields including sched-specific areas */
+ int schk_len;
+ int q_len; /* size of a queue including sched-fields */
+ int si_len; /* size of a sch_inst including sched-fields */
+ char *q; /* array of flow queues */
+ /* use a char* because size is variable */
+ struct dn_fsk *fs; /* array of flowsets */
+ struct dn_sch_inst *si;
+ struct dn_schk *sched;
+
+ /* generator state */
+ int state; /* 0 = going up, 1: going down */
+
+ /*
+ * We keep lists for each backlog level, and always serve
+ * the one with shortest backlog. llmask contains a bitmap
+ * of lists, and ll are the heads of the lists. The last
+ * entry (BACKLOG) contains all entries considered 'full'
+ * XXX to optimize things, entry i could contain queues with
+ * 2^{i-1}+1 .. 2^i entries.
+ */
+#define BACKLOG 30
+ uint32_t llmask;
+ struct list_head ll[BACKLOG + 10];
+};
+
+/* FI2Q and Q2FI converts from flow_id to dn_queue and back.
+ * We cannot easily use pointer arithmetic because it is variable size.
+ */
+#define FI2Q(c, i) ((struct dn_queue *)((c)->q + (c)->q_len * (i)))
+#define Q2FI(c, q) (((char *)(q) - (c)->q)/(c)->q_len)
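+/* e.g. with q_len == 128 bytes, flow_id 3 maps to c->q + 384 */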
+
+int debug = 0;
+
+struct dn_parms dn_cfg;
+
+static void controller(struct cfg_s *c);
+
+/* release a packet: put the mbuf in the freelist, and the queue in
+ * the bucket.
+ */
+int
+drop(struct cfg_s *c, struct mbuf *m)
+{
+ struct dn_queue *q;
+ int i;
+
+ c->drop++;
+ q = FI2Q(c, m->flow_id);
+ i = q->ni.length; // XXX or ffs...
+
+ ND("q %p id %d current length %d", q, m->flow_id, i);
+ if (i < BACKLOG) {
+ struct list_head *h = &q->ni.h;
+ c->llmask &= ~(1<<(i+1));
+ c->llmask |= (1<<(i));
+ list_del(h);
+ list_add_tail(h, &c->ll[i]);
+ }
+ m->m_nextpkt = c->freelist;
+ c->freelist = m;
+ return 0;
+}
+
+/* enqueue returns non-zero when the packet is dropped */
+static int
+enqueue(struct cfg_s *c, void *_m)
+{
+ struct mbuf *m = _m;
+ if (c->enq)
+ return c->enq(c->si, FI2Q(c, m->flow_id), m);
+ if (c->head == NULL)
+ c->head = m;
+ else
+ c->tail->m_nextpkt = m;
+ c->tail = m;
+ return 0; /* default - success */
+}
+
+/* dequeue returns NON-NULL when a packet is available */
+static void *
+dequeue(struct cfg_s *c)
+{
+ struct mbuf *m;
+ if (c->deq)
+ return c->deq(c->si);
+ if ((m = c->head)) {
+ m = c->head;
+ c->head = m->m_nextpkt;
+ m->m_nextpkt = NULL;
+ }
+ return m;
+}
+
+static int
+mainloop(struct cfg_s *c)
+{
+ int i;
+ struct mbuf *m;
+
+ for (i=0; i < c->loops; i++) {
+		/* implement hysteresis */
+ controller(c);
+ DX(3, "loop %d enq %d send %p rx %d",
+ i, c->_enqueue, c->tosend, c->can_dequeue);
+ if ( (m = c->tosend) ) {
+ c->_enqueue++;
+ if (enqueue(c, m)) {
+ drop(c, m);
+ ND("loop %d enqueue fail", i );
+ } else {
+ ND("enqueue ok");
+ c->pending++;
+ }
+ }
+ if (c->can_dequeue) {
+ c->dequeue++;
+ if ((m = dequeue(c))) {
+ c->pending--;
+ drop(c, m);
+ c->drop--; /* compensate */
+ }
+ }
+ }
+ DX(1, "mainloop ends %d", i);
+ return 0;
+}
+
+int
+dump(struct cfg_s *c)
+{
+ int i;
+ struct dn_queue *q;
+
+ for (i=0; i < c->flows; i++) {
+ q = FI2Q(c, i);
+ DX(1, "queue %4d tot %10lld", i, q->ni.tot_bytes);
+ }
+ DX(1, "done %d loops\n", c->loops);
+ return 0;
+}
+
+/* interpret a number in human form */
+static long
+getnum(const char *s, char **next, const char *key)
+{
+ char *end = NULL;
+ long l;
+
+ if (next) /* default */
+ *next = NULL;
+ if (s && *s) {
+ DX(3, "token is <%s> %s", s, key ? key : "-");
+ l = strtol(s, &end, 0);
+ } else {
+ DX(3, "empty string");
+ l = -1;
+ }
+ if (l < 0) {
+ DX(2, "invalid %s for %s", s ? s : "NULL", (key ? key : "") );
+ return 0; // invalid
+ }
+ if (!end || !*end)
+ return l;
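+	/*
+	 * Recognized suffixes: 'n' negates the value so init() can later
+	 * scale it by the number of flows, K/M are decimal and k/m binary
+	 * multipliers, 'w' is accepted and ignored.
+	 */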
+ if (*end == 'n')
+ l = -l; /* multiply by n */
+ else if (*end == 'K')
+ l = l*1000;
+ else if (*end == 'M')
+ l = l*1000000;
+ else if (*end == 'k')
+ l = l*1024;
+ else if (*end == 'm')
+ l = l*1024*1024;
+ else if (*end == 'w')
+ ;
+ else {/* not recognized */
+ D("suffix %s for %s, next %p", end, key, next);
+ end--;
+ }
+ end++;
+ DX(3, "suffix now %s for %s, next %p", end, key, next);
+ if (next && *end) {
+ DX(3, "setting next to %s for %s", end, key);
+ *next = end;
+ }
+ return l;
+}
+
+/*
+ * flowsets are a comma-separated list of
+ * weight:maxlen:flows
+ * indicating how many flows are hooked to that fs.
+ * Both weight and len can be given as ranges of the form min-max-steps.
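+ * For instance, "2:1000:5,4-8-2:1500:3" describes one flowset with
+ * weight 2, pktlen 1000 and 5 flows, plus two flowsets (weights 4
+ * and 8, two steps) with pktlen 1500 and 3 flows each.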
+ * In a first pass we just count the number of flowsets and flows,
+ * in a second pass we complete the setup.
+ */
+static void
+parse_flowsets(struct cfg_s *c, const char *fs, int pass)
+{
+ char *s, *cur, *next;
+ int n_flows = 0, n_fs = 0, wsum = 0;
+ int i, j;
+ struct dn_fs *prev = NULL;
+
+ DX(3, "--- pass %d flows %d flowsets %d", pass, c->flows, c->flowsets);
+ if (pass == 0)
+ c->fs_config = fs;
+ s = c->fs_config ? strdup(c->fs_config) : NULL;
+ if (s == NULL) {
+ if (pass == 0)
+ D("no fsconfig");
+ return;
+ }
+ for (next = s; (cur = strsep(&next, ","));) {
+ char *p = NULL;
+ int w, w_h, w_steps, wi;
+ int len, len_h, l_steps, li;
+ int flows;
+
+ w = getnum(strsep(&cur, ":"), &p, "weight");
+ if (w <= 0)
+ w = 1;
+ w_h = p ? getnum(p+1, &p, "weight_max") : w;
+ w_steps = p ? getnum(p+1, &p, "w_steps") : (w_h == w ?1:2);
+ len = getnum(strsep(&cur, ":"), &p, "len");
+ if (len <= 0)
+ len = 1000;
+ len_h = p ? getnum(p+1, &p, "len_max") : len;
+ l_steps = p ? getnum(p+1, &p, "l_steps") : (len_h == len ? 1 : 2);
+ flows = getnum(strsep(&cur, ":"), NULL, "flows");
+ if (flows == 0)
+ flows = 1;
+ DX(4, "weight %d..%d (%d) len %d..%d (%d) flows %d",
+ w, w_h, w_steps, len, len_h, l_steps, flows);
+ if (w == 0 || w_h < w || len == 0 || len_h < len ||
+ flows == 0) {
+ DX(4,"wrong parameters %s", fs);
+ return;
+ }
+ n_flows += flows * w_steps * l_steps;
+ for (i = 0; i < w_steps; i++) {
+ wi = w + ((w_h - w)* i)/(w_steps == 1 ? 1 : (w_steps-1));
+ for (j = 0; j < l_steps; j++, n_fs++) {
+ struct dn_fs *fs = &c->fs[n_fs].fs; // tentative
+ int x;
+
+ li = len + ((len_h - len)* j)/(l_steps == 1 ? 1 : (l_steps-1));
+ x = (wi*2048)/li;
+ DX(3, "----- fs %4d weight %4d lmax %4d X %4d flows %d",
+ n_fs, wi, li, x, flows);
+ if (pass == 0)
+ continue;
+ if (c->fs == NULL || c->flowsets <= n_fs) {
+ D("error in number of flowsets");
+ return;
+ }
+ wsum += wi * flows;
+ fs->par[0] = wi;
+ fs->par[1] = li;
+ fs->index = n_fs;
+ fs->n_flows = flows;
+ fs->cur = fs->first_flow = prev==NULL ? 0 : prev->next_flow;
+ fs->next_flow = fs->first_flow + fs->n_flows;
+ fs->y = x * flows;
+ fs->base_y = (prev == NULL) ? 0 : prev->next_y;
+ fs->next_y = fs->base_y + fs->y;
+ prev = fs;
+ }
+ }
+ }
+ c->max_y = prev ? prev->base_y + prev->y : 0;
+ c->flows = n_flows;
+ c->flowsets = n_fs;
+ c->wsum = wsum;
+ if (pass == 0)
+ return;
+
+ /* now link all flows to their parent flowsets */
+ DX(1,"%d flows on %d flowsets max_y %d", c->flows, c->flowsets, c->max_y);
+ for (i=0; i < c->flowsets; i++) {
+ struct dn_fs *fs = &c->fs[i].fs;
+ DX(1, "fs %3d w %5d l %4d flow %5d .. %5d y %6d .. %6d",
+ i, fs->par[0], fs->par[1],
+ fs->first_flow, fs->next_flow,
+ fs->base_y, fs->next_y);
+ for (j = fs->first_flow; j < fs->next_flow; j++) {
+ struct dn_queue *q = FI2Q(c, j);
+ q->fs = &c->fs[i];
+ }
+ }
+}
+
+static int
+init(struct cfg_s *c)
+{
+ int i;
+ int ac = c->ac;
+ char * const *av = c->av;
+
+ c->si_len = sizeof(struct dn_sch_inst);
+ c->q_len = sizeof(struct dn_queue);
+ moduledata_t *mod = NULL;
+ struct dn_alg *p = NULL;
+
+ c->th_min = 0;
+ c->th_max = -20;/* 20 packets per flow */
+ c->lmin = c->lmax = 1280; /* packet len */
+ c->flows = 1;
+ c->flowsets = 1;
+ c->name = "null";
+ ac--; av++;
+ while (ac > 1) {
+ if (!strcmp(*av, "-n")) {
+ c->loops = getnum(av[1], NULL, av[0]);
+ } else if (!strcmp(*av, "-d")) {
+ debug = atoi(av[1]);
+ } else if (!strcmp(*av, "-alg")) {
+ extern moduledata_t *_g_dn_fifo;
+ extern moduledata_t *_g_dn_wf2qp;
+ extern moduledata_t *_g_dn_rr;
+ extern moduledata_t *_g_dn_qfq;
+#ifdef WITH_KPS
+ extern moduledata_t *_g_dn_kps;
+#endif
+ if (!strcmp(av[1], "rr"))
+ mod = _g_dn_rr;
+ else if (!strcmp(av[1], "wf2qp"))
+ mod = _g_dn_wf2qp;
+ else if (!strcmp(av[1], "fifo"))
+ mod = _g_dn_fifo;
+ else if (!strcmp(av[1], "qfq"))
+ mod = _g_dn_qfq;
+#ifdef WITH_KPS
+ else if (!strcmp(av[1], "kps"))
+ mod = _g_dn_kps;
+#endif
+ else
+ mod = NULL;
+ c->name = mod ? mod->name : "NULL";
+ DX(3, "using scheduler %s", c->name);
+ } else if (!strcmp(*av, "-len")) {
+ c->lmin = getnum(av[1], NULL, av[0]);
+ c->lmax = c->lmin;
+ DX(3, "setting max to %d", c->th_max);
+ } else if (!strcmp(*av, "-burst")) {
+ c->maxburst = getnum(av[1], NULL, av[0]);
+ DX(3, "setting max to %d", c->th_max);
+ } else if (!strcmp(*av, "-qmax")) {
+ c->th_max = getnum(av[1], NULL, av[0]);
+ DX(3, "setting max to %d", c->th_max);
+ } else if (!strcmp(*av, "-qmin")) {
+ c->th_min = getnum(av[1], NULL, av[0]);
+ DX(3, "setting min to %d", c->th_min);
+ } else if (!strcmp(*av, "-flows")) {
+ c->flows = getnum(av[1], NULL, av[0]);
+ DX(3, "setting flows to %d", c->flows);
+ } else if (!strcmp(*av, "-flowsets")) {
+ parse_flowsets(c, av[1], 0);
+ DX(3, "setting flowsets to %d", c->flowsets);
+ } else {
+ D("option %s not recognised, ignore", *av);
+ }
+ ac -= 2; av += 2;
+ }
+ if (c->maxburst <= 0)
+ c->maxburst = 1;
+ if (c->loops <= 0)
+ c->loops = 1;
+ if (c->flows <= 0)
+ c->flows = 1;
+ if (c->flowsets <= 0)
+ c->flowsets = 1;
+ if (c->lmin <= 0)
+ c->lmin = 1;
+ if (c->lmax <= 0)
+ c->lmax = 1;
+ /* multiply by N */
+ if (c->th_min < 0)
+ c->th_min = c->flows * -c->th_min;
+ if (c->th_max < 0)
+ c->th_max = c->flows * -c->th_max;
+ if (c->th_max <= c->th_min)
+ c->th_max = c->th_min + 1;
+ if (mod) {
+ p = mod->p;
+ DX(3, "using module %s f %p p %p", mod->name, mod->f, mod->p);
+ DX(3, "modname %s ty %d", p->name, p->type);
+ c->enq = p->enqueue;
+ c->deq = p->dequeue;
+ c->si_len += p->si_datalen;
+ c->q_len += p->q_datalen;
+ c->schk_len += p->schk_datalen;
+ }
+ /* allocate queues, flowsets and one scheduler */
+ c->q = calloc(c->flows, c->q_len);
+ c->fs = calloc(c->flowsets, sizeof(struct dn_fsk));
+ c->si = calloc(1, c->si_len);
+ c->sched = calloc(c->flows, c->schk_len);
+ if (c->q == NULL || c->fs == NULL) {
+ D("error allocating memory for flows");
+ exit(1);
+ }
+ c->si->sched = c->sched;
+ if (p) {
+ if (p->config)
+ p->config(c->sched);
+ if (p->new_sched)
+ p->new_sched(c->si);
+ }
+ /* parse_flowsets links queues to their flowsets */
+ parse_flowsets(c, av[1], 1);
+ /* complete the work calling new_fsk */
+ for (i = 0; i < c->flowsets; i++) {
+ if (c->fs[i].fs.par[1] == 0)
+ c->fs[i].fs.par[1] = 1000; /* default pkt len */
+ c->fs[i].sched = c->sched;
+ if (p && p->new_fsk)
+ p->new_fsk(&c->fs[i]);
+ }
+
+ /* initialize the lists for the generator, and put
+ * all flows in the list for backlog = 0
+ */
+ for (i=0; i <= BACKLOG+5; i++)
+ INIT_LIST_HEAD(&c->ll[i]);
+
+ for (i = 0; i < c->flows; i++) {
+ struct dn_queue *q = FI2Q(c, i);
+ if (q->fs == NULL)
+ q->fs = &c->fs[0]; /* XXX */
+ q->_si = c->si;
+ if (p && p->new_queue)
+ p->new_queue(q);
+ INIT_LIST_HEAD(&q->ni.h);
+ list_add_tail(&q->ni.h, &c->ll[0]);
+ }
+ c->llmask = 1;
+ return 0;
+}
+
+
+int
+main(int ac, char *av[])
+{
+ struct cfg_s c;
+ struct timeval end;
+ double ll;
+ int i;
+ char msg[40];
+
+ bzero(&c, sizeof(c));
+ c.ac = ac;
+ c.av = av;
+ init(&c);
+ gettimeofday(&c.time, NULL);
+ mainloop(&c);
+ gettimeofday(&end, NULL);
+ end.tv_sec -= c.time.tv_sec;
+ end.tv_usec -= c.time.tv_usec;
+ if (end.tv_usec < 0) {
+ end.tv_usec += 1000000;
+ end.tv_sec--;
+ }
+ c.time = end;
+ ll = end.tv_sec*1000000 + end.tv_usec;
+ ll *= 1000; /* convert to nanoseconds */
+ ll /= c._enqueue;
+ sprintf(msg, "1::%d", c.flows);
+ D("%-8s n %d %d time %d.%06d %8.3f qlen %d %d flows %s drops %d",
+ c.name, c._enqueue, c.loops,
+ (int)c.time.tv_sec, (int)c.time.tv_usec, ll,
+ c.th_min, c.th_max,
+ c.fs_config ? c.fs_config : msg, c.drop);
+ dump(&c);
+ DX(1, "done ac %d av %p", ac, av);
+ for (i=0; i < ac; i++)
+ DX(1, "arg %d %s", i, av[i]);
+ return 0;
+}
+
+/*
+ * The controller decides whether in this iteration we should send
+ * (the packet is in c->tosend) and/or receive (flag c->can_dequeue)
+ */
+static void
+controller(struct cfg_s *c)
+{
+ struct mbuf *m;
+ struct dn_fs *fs;
+ int flow_id;
+
+	/* hysteresis between max and min */
+ if (c->state == 0 && c->pending >= c->th_max)
+ c->state = 1;
+ else if (c->state == 1 && c->pending <= c->th_min)
+ c->state = 0;
+ ND(1, "state %d pending %2d", c->state, c->pending);
+ c->can_dequeue = c->state;
+ c->tosend = NULL;
+ if (c->state)
+ return;
+
+ if (1) {
+ int i;
+ struct dn_queue *q;
+ struct list_head *h;
+
+ i = ffs(c->llmask) - 1;
+ if (i < 0) {
+ DX(2, "no candidate");
+ c->can_dequeue = 1;
+ return;
+ }
+ h = &c->ll[i];
+ ND(1, "backlog %d p %p prev %p next %p", i, h, h->prev, h->next);
+ q = list_first_entry(h, struct dn_queue, ni.h);
+ list_del(&q->ni.h);
+ flow_id = Q2FI(c, q);
+ DX(2, "extracted flow %p %d backlog %d", q, flow_id, i);
+ if (list_empty(h)) {
+ ND(2, "backlog %d empty", i);
+ c->llmask &= ~(1<<i);
+ }
+ ND(1, "before %d p %p prev %p next %p", i+1, h+1, h[1].prev, h[1].next);
+ list_add_tail(&q->ni.h, h+1);
+ ND(1, " after %d p %p prev %p next %p", i+1, h+1, h[1].prev, h[1].next);
+ if (i < BACKLOG) {
+ ND(2, "backlog %d full", i+1);
+ c->llmask |= 1<<(1+i);
+ }
+ fs = &q->fs->fs;
+ c->cur_fs = q->fs - c->fs;
+ fs->cur = flow_id;
+ } else {
+ /* XXX this does not work ? */
+ /* now decide whom to send the packet, and the length */
+ /* lookup in the flow table */
+ if (c->cur_y >= c->max_y) { /* handle wraparound */
+ c->cur_y = 0;
+ c->cur_fs = 0;
+ }
+ fs = &c->fs[c->cur_fs].fs;
+ flow_id = fs->cur++;
+ if (fs->cur >= fs->next_flow)
+ fs->cur = fs->first_flow;
+ c->cur_y++;
+ if (c->cur_y >= fs->next_y)
+ c->cur_fs++;
+ }
+
+ /* construct a packet */
+ if (c->freelist) {
+ m = c->tosend = c->freelist;
+ c->freelist = c->freelist->m_nextpkt;
+ } else {
+ m = c->tosend = calloc(1, sizeof(struct mbuf));
+ }
+ if (m == NULL)
+ return;
+
+ m->cfg = c;
+ m->m_nextpkt = NULL;
+ m->m_pkthdr.len = fs->par[1]; // XXX maxlen
+ m->flow_id = flow_id;
+
+ ND(2,"y %6d flow %5d fs %3d weight %4d len %4d",
+ c->cur_y, m->flow_id, c->cur_fs,
+ fs->par[0], m->m_pkthdr.len);
+
+}
+
+/*
+Packet allocation:
+to achieve a distribution that matches weights, for each X=w/lmax class
+we should generate a number of packets proportional to Y = X times the number
+of flows in the class.
+So we construct an array with the cumulative distribution of Y's,
+and use it to identify the flow via inverse mapping (if the Y's are
+not too many we can use an array for the lookup). In practice,
+each flow will have X entries [virtually] pointing to it.
+
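+For example, two classes with X=3 (2 flows) and X=1 (4 flows) give
+Y values 6 and 4; a uniform draw in [0,10) mapped back through the
+cumulative Y's then selects each heavy flow three times as often as
+each light one.
+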
+*/
diff --git a/sys/netpfil/ipfw/test/mylist.h b/sys/netpfil/ipfw/test/mylist.h
new file mode 100644
index 0000000..6247f32
--- /dev/null
+++ b/sys/netpfil/ipfw/test/mylist.h
@@ -0,0 +1,49 @@
+/*
+ * $FreeBSD$
+ *
+ * Linux-like doubly linked lists
+ */
+
+#ifndef _MYLIST_H
+#define _MYLIST_H
+struct list_head {
+ struct list_head *prev, *next;
+};
+
+#define INIT_LIST_HEAD(l) do { (l)->prev = (l)->next = (l); } while (0)
+#define list_empty(l) ( (l)->next == l )
+static inline void
+__list_add(struct list_head *o, struct list_head *prev,
+ struct list_head *next)
+{
+ next->prev = o;
+ o->next = next;
+ o->prev = prev;
+ prev->next = o;
+}
+
+static inline void
+list_add_tail(struct list_head *o, struct list_head *head)
+{
+ __list_add(o, head->prev, head);
+}
+
+#define list_first_entry(pL, ty, member) \
+ (ty *)((char *)((pL)->next) - offsetof(ty, member))
+
+static inline void
+__list_del(struct list_head *prev, struct list_head *next)
+{
+ next->prev = prev;
+ prev->next = next;
+}
+
+static inline void
+list_del(struct list_head *entry)
+{
+ ND("called on %p", entry);
+ __list_del(entry->prev, entry->next);
+ entry->next = entry->prev = NULL;
+}
+
+#endif /* _MYLIST_H */
diff --git a/sys/netpfil/ipfw/test/test_dn_heap.c b/sys/netpfil/ipfw/test/test_dn_heap.c
new file mode 100644
index 0000000..d460cf2
--- /dev/null
+++ b/sys/netpfil/ipfw/test/test_dn_heap.c
@@ -0,0 +1,162 @@
+/*-
+ * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Userland code for testing binary heaps and hash tables
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+
+#include <stdio.h>
+#include <strings.h>
+#include <stdlib.h>
+
+#include "dn_heap.h"
+#define log(x, arg...) fprintf(stderr, ## arg)
+#define panic(x...) fprintf(stderr, ## x), exit(1)
+
+#include <string.h>
+
+struct x {
+ struct x *ht_link;
+ char buf[0];
+};
+
+uint32_t hf(uintptr_t key, int flags, void *arg)
+{
+ return (flags & DNHT_KEY_IS_OBJ) ?
+ ((struct x *)key)->buf[0] : *(char *)key;
+}
+
+int matchf(void *obj, uintptr_t key, int flags, void *arg)
+{
+ char *s = (flags & DNHT_KEY_IS_OBJ) ?
+ ((struct x *)key)->buf : (char *)key;
+ return (strcmp(((struct x *)obj)->buf, s) == 0);
+}
+
+void *newfn(uintptr_t key, int flags, void *arg)
+{
+ char *s = (char *)key;
+ struct x *p = malloc(sizeof(*p) + 1 + strlen(s));
+ if (p)
+ strcpy(p->buf, s);
+ return p;
+}
+
+char *strings[] = {
+ "undici", "unico", "doppio", "devoto",
+ "uno", "due", "tre", "quattro", "cinque", "sei",
+ "uno", "due", "tre", "quattro", "cinque", "sei",
+ NULL,
+};
+
+int doprint(void *_x, void *arg)
+{
+ struct x *x = _x;
+ printf("found element <%s>\n", x->buf);
+ return (int)arg;
+}
+
+static void
+test_hash()
+{
+ char **p;
+ struct dn_ht *h;
+ uintptr_t x = 0;
+ uintptr_t x1 = 0;
+
+ /* first, find and allocate */
+ h = dn_ht_init(NULL, 10, 0, hf, matchf, newfn);
+
+ for (p = strings; *p; p++) {
+ dn_ht_find(h, (uintptr_t)*p, DNHT_INSERT, NULL);
+ }
+ dn_ht_scan(h, doprint, 0);
+ printf("/* second -- find without allocate */\n");
+ h = dn_ht_init(NULL, 10, 0, hf, matchf, NULL);
+ for (p = strings; *p; p++) {
+ void **y = newfn((uintptr_t)*p, 0, NULL);
+ if (x == 0)
+ x = (uintptr_t)y;
+ else {
+ if (x1 == 0)
+ x1 = (uintptr_t)*p;
+ }
+ dn_ht_find(h, (uintptr_t)y, DNHT_INSERT | DNHT_KEY_IS_OBJ, NULL);
+ }
+ dn_ht_scan(h, doprint, 0);
+ printf("remove %p gives %p\n", (void *)x,
+ dn_ht_find(h, x, DNHT_KEY_IS_OBJ | DNHT_REMOVE, NULL));
+ printf("remove %p gives %p\n", (void *)x,
+ dn_ht_find(h, x, DNHT_KEY_IS_OBJ | DNHT_REMOVE, NULL));
+ printf("remove %p gives %p\n", (void *)x,
+ dn_ht_find(h, x1, DNHT_REMOVE, NULL));
+ printf("remove %p gives %p\n", (void *)x,
+ dn_ht_find(h, x1, DNHT_REMOVE, NULL));
+ dn_ht_scan(h, doprint, 0);
+}
+
+int
+main(int argc, char *argv[])
+{
+ struct dn_heap h;
+ int i, n, n2, n3;
+
+ test_hash();
+ return 0;
+
+ /* n = elements, n2 = cycles */
+ n = (argc > 1) ? atoi(argv[1]) : 0;
+ if (n <= 0 || n > 1000000)
+ n = 100;
+ n2 = (argc > 2) ? atoi(argv[2]) : 0;
+ if (n2 <= 0)
+		n2 = 1000000;
+ n3 = (argc > 3) ? atoi(argv[3]) : 0;
+ bzero(&h, sizeof(h));
+ heap_init(&h, n, -1);
+ while (n2-- > 0) {
+ uint64_t prevk = 0;
+ for (i=0; i < n; i++)
+ heap_insert(&h, n3 ? n-i: random(), (void *)(100+i));
+
+ for (i=0; h.elements > 0; i++) {
+ uint64_t k = h.p[0].key;
+ if (k < prevk)
+ panic("wrong sequence\n");
+ prevk = k;
+ if (0)
+ printf("%d key %llu, val %p\n",
+ i, h.p[0].key, h.p[0].object);
+ heap_extract(&h, NULL);
+ }
+ }
+ return 0;
+}
diff --git a/sys/netpfil/ipfw/test/test_dn_sched.c b/sys/netpfil/ipfw/test/test_dn_sched.c
new file mode 100644
index 0000000..ee46c95
--- /dev/null
+++ b/sys/netpfil/ipfw/test/test_dn_sched.c
@@ -0,0 +1,89 @@
+/*
+ * $FreeBSD$
+ *
+ * library functions for userland testing of dummynet schedulers
+ */
+
+#include "dn_test.h"
+
+void
+m_freem(struct mbuf *m)
+{
+ printf("free %p\n", m);
+}
+
+int
+dn_sched_modevent(module_t mod, int cmd, void *arg)
+{
+ return 0;
+}
+
+void
+dn_free_pkts(struct mbuf *m)
+{
+ struct mbuf *x;
+ while ( (x = m) ) {
+ m = m->m_nextpkt;
+ m_freem(x);
+ }
+}
+
+int
+dn_delete_queue(void *_q, void *do_free)
+{
+ struct dn_queue *q = _q;
+ if (q->mq.head)
+ dn_free_pkts(q->mq.head);
+ free(q);
+ return 0;
+}
+
+/*
+ * This is a simplified function for testing purposes which does
+ * not implement random loss and keeps only minimal statistics.
+ * Enqueue a packet in q, subject to space and queue management policy
+ * (whose parameters are in q->fs), and update the queue counters.
+ * Return 0 on success, 1 on drop. The packet is consumed anyway.
+ */
+int
+dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop)
+{
+ if (drop)
+ goto drop;
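+	/* hard-coded queue limit for the test harness */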
+ if (q->ni.length >= 200)
+ goto drop;
+ mq_append(&q->mq, m);
+ q->ni.length++;
+ q->ni.tot_bytes += m->m_pkthdr.len;
+ return 0;
+
+drop:
+ q->ni.drops++;
+ return 1;
+}
+
+int
+ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg)
+{
+ if (*v < lo) {
+ *v = dflt;
+ } else if (*v > hi) {
+ *v = hi;
+ }
+ return *v;
+}
+
+#ifndef __FreeBSD__
+int
+fls(int mask)
+{
+ int bit;
+
+ if (mask == 0)
+ return (0);
+ for (bit = 1; mask != 1; bit++)
+ mask = (unsigned int)mask >> 1;
+ return (bit);
+}
+#endif
diff --git a/sys/netpfil/pf/if_pflog.c b/sys/netpfil/pf/if_pflog.c
new file mode 100644
index 0000000..20feea2
--- /dev/null
+++ b/sys/netpfil/pf/if_pflog.c
@@ -0,0 +1,290 @@
+/* $OpenBSD: if_pflog.c,v 1.26 2007/10/18 21:58:18 mpf Exp $ */
+/*
+ * The authors of this code are John Ioannidis (ji@tla.org),
+ * Angelos D. Keromytis (kermit@csd.uch.gr) and
+ * Niels Provos (provos@physnet.uni-hamburg.de).
+ *
+ * This code was written by John Ioannidis for BSD/OS in Athens, Greece,
+ * in November 1995.
+ *
+ * Ported to OpenBSD and NetBSD, with additional transforms, in December 1996,
+ * by Angelos D. Keromytis.
+ *
+ * Additional transforms and features in 1997 and 1998 by Angelos D. Keromytis
+ * and Niels Provos.
+ *
+ * Copyright (C) 1995, 1996, 1997, 1998 by John Ioannidis, Angelos D. Keromytis
+ * and Niels Provos.
+ * Copyright (c) 2001, Angelos D. Keromytis, Niels Provos.
+ *
+ * Permission to use, copy, and modify this software with or without fee
+ * is hereby granted, provided that this entire notice is included in
+ * all copies of any software which is or includes a copy or
+ * modification of this software.
+ * You may use this code under the GNU public license if you so wish. Please
+ * contribute changes back to the authors under this freer than GPL license
+ * so that we may further the use of strong encryption without limitations to
+ * all.
+ *
+ * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR
+ * IMPLIED WARRANTY. IN PARTICULAR, NONE OF THE AUTHORS MAKES ANY
+ * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE
+ * MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR
+ * PURPOSE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_bpf.h"
+#include "opt_pf.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/proc.h>
+#include <sys/socket.h>
+#include <sys/sockio.h>
+
+#include <net/bpf.h>
+#include <net/if.h>
+#include <net/if_clone.h>
+#include <net/if_pflog.h>
+#include <net/if_types.h>
+#include <net/pfvar.h>
+
+#if defined(INET) || defined(INET6)
+#include <netinet/in.h>
+#endif
+#ifdef INET
+#include <netinet/in_var.h>
+#include <netinet/ip.h>
+#endif
+
+#ifdef INET6
+#include <netinet6/in6_var.h>
+#include <netinet6/nd6.h>
+#endif /* INET6 */
+
+#ifdef INET
+#include <machine/in_cksum.h>
+#endif /* INET */
+
+#define PFLOGMTU (32768 + MHLEN + MLEN)
+
+#ifdef PFLOGDEBUG
+#define DPRINTF(x) do { if (pflogdebug) printf x ; } while (0)
+#else
+#define DPRINTF(x)
+#endif
+
+static int pflogoutput(struct ifnet *, struct mbuf *, struct sockaddr *,
+ struct route *);
+static void pflogattach(int);
+static int pflogioctl(struct ifnet *, u_long, caddr_t);
+static void pflogstart(struct ifnet *);
+static int pflog_clone_create(struct if_clone *, int, caddr_t);
+static void pflog_clone_destroy(struct ifnet *);
+
+IFC_SIMPLE_DECLARE(pflog, 1);
+
+struct ifnet *pflogifs[PFLOGIFS_MAX]; /* for fast access */
+
+static void
+pflogattach(int npflog)
+{
+ int i;
+ for (i = 0; i < PFLOGIFS_MAX; i++)
+ pflogifs[i] = NULL;
+ if_clone_attach(&pflog_cloner);
+}
+
+static int
+pflog_clone_create(struct if_clone *ifc, int unit, caddr_t param)
+{
+ struct ifnet *ifp;
+
+ if (unit >= PFLOGIFS_MAX)
+ return (EINVAL);
+
+ ifp = if_alloc(IFT_PFLOG);
+ if (ifp == NULL) {
+ return (ENOSPC);
+ }
+ if_initname(ifp, ifc->ifc_name, unit);
+ ifp->if_mtu = PFLOGMTU;
+ ifp->if_ioctl = pflogioctl;
+ ifp->if_output = pflogoutput;
+ ifp->if_start = pflogstart;
+ ifp->if_snd.ifq_maxlen = ifqmaxlen;
+ ifp->if_hdrlen = PFLOG_HDRLEN;
+ if_attach(ifp);
+
+ bpfattach(ifp, DLT_PFLOG, PFLOG_HDRLEN);
+
+ pflogifs[unit] = ifp;
+
+ return (0);
+}
+
+static void
+pflog_clone_destroy(struct ifnet *ifp)
+{
+ int i;
+
+ for (i = 0; i < PFLOGIFS_MAX; i++)
+ if (pflogifs[i] == ifp)
+ pflogifs[i] = NULL;
+
+ bpfdetach(ifp);
+ if_detach(ifp);
+ if_free(ifp);
+}
+
+/*
+ * Start output on the pflog interface.
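+ * The send queue is simply drained: pflog only mirrors packets to BPF
+ * from pflog_packet(), so anything queued here is dropped and freed.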
+ */
+static void
+pflogstart(struct ifnet *ifp)
+{
+ struct mbuf *m;
+
+ for (;;) {
+ IF_LOCK(&ifp->if_snd);
+ _IF_DROP(&ifp->if_snd);
+ _IF_DEQUEUE(&ifp->if_snd, m);
+ IF_UNLOCK(&ifp->if_snd);
+
+ if (m == NULL)
+ return;
+ else
+ m_freem(m);
+ }
+}
+
+static int
+pflogoutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
+ struct route *rt)
+{
+ m_freem(m);
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+pflogioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
+{
+ switch (cmd) {
+ case SIOCSIFFLAGS:
+ if (ifp->if_flags & IFF_UP)
+ ifp->if_drv_flags |= IFF_DRV_RUNNING;
+ else
+ ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
+ break;
+ default:
+ return (ENOTTY);
+ }
+
+ return (0);
+}
+
+static int
+pflog_packet(struct pfi_kif *kif, struct mbuf *m, sa_family_t af, u_int8_t dir,
+ u_int8_t reason, struct pf_rule *rm, struct pf_rule *am,
+ struct pf_ruleset *ruleset, struct pf_pdesc *pd, int lookupsafe)
+{
+ struct ifnet *ifn;
+ struct pfloghdr hdr;
+
+ if (kif == NULL || m == NULL || rm == NULL || pd == NULL)
+		return (1);
+
+ if ((ifn = pflogifs[rm->logif]) == NULL || !ifn->if_bpf)
+ return (0);
+
+ bzero(&hdr, sizeof(hdr));
+ hdr.length = PFLOG_REAL_HDRLEN;
+ hdr.af = af;
+ hdr.action = rm->action;
+ hdr.reason = reason;
+ memcpy(hdr.ifname, kif->pfik_name, sizeof(hdr.ifname));
+
+ if (am == NULL) {
+ hdr.rulenr = htonl(rm->nr);
+ hdr.subrulenr = 1;
+ } else {
+ hdr.rulenr = htonl(am->nr);
+ hdr.subrulenr = htonl(rm->nr);
+ if (ruleset != NULL && ruleset->anchor != NULL)
+ strlcpy(hdr.ruleset, ruleset->anchor->name,
+ sizeof(hdr.ruleset));
+ }
+ /*
+ * XXXGL: we avoid pf_socket_lookup() when we are holding
+ * state lock, since this leads to unsafe LOR.
+ * These conditions are very very rare, however.
+ */
+ if (rm->log & PF_LOG_SOCKET_LOOKUP && !pd->lookup.done && lookupsafe)
+ pd->lookup.done = pf_socket_lookup(dir, pd, m);
+ if (pd->lookup.done > 0)
+ hdr.uid = pd->lookup.uid;
+ else
+ hdr.uid = UID_MAX;
+ hdr.pid = NO_PID;
+ hdr.rule_uid = rm->cuid;
+ hdr.rule_pid = rm->cpid;
+ hdr.dir = dir;
+
+#ifdef INET
+ if (af == AF_INET && dir == PF_OUT) {
+ struct ip *ip;
+
+ ip = mtod(m, struct ip *);
+ ip->ip_sum = 0;
+ ip->ip_sum = in_cksum(m, ip->ip_hl << 2);
+ }
+#endif /* INET */
+
+ ifn->if_opackets++;
+ ifn->if_obytes += m->m_pkthdr.len;
+ BPF_MTAP2(ifn, &hdr, PFLOG_HDRLEN, m);
+
+ return (0);
+}
+
+static int
+pflog_modevent(module_t mod, int type, void *data)
+{
+ int error = 0;
+
+ switch (type) {
+ case MOD_LOAD:
+ pflogattach(1);
+ PF_RULES_WLOCK();
+ pflog_packet_ptr = pflog_packet;
+ PF_RULES_WUNLOCK();
+ break;
+ case MOD_UNLOAD:
+ PF_RULES_WLOCK();
+ pflog_packet_ptr = NULL;
+ PF_RULES_WUNLOCK();
+ if_clone_detach(&pflog_cloner);
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ return error;
+}
+
+static moduledata_t pflog_mod = { "pflog", pflog_modevent, 0 };
+
+#define PFLOG_MODVER 1
+
+DECLARE_MODULE(pflog, pflog_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
+MODULE_VERSION(pflog, PFLOG_MODVER);
+MODULE_DEPEND(pflog, pf, PF_MODVER, PF_MODVER, PF_MODVER);
diff --git a/sys/netpfil/pf/if_pfsync.c b/sys/netpfil/pf/if_pfsync.c
new file mode 100644
index 0000000..28af641
--- /dev/null
+++ b/sys/netpfil/pf/if_pfsync.c
@@ -0,0 +1,2397 @@
+/* $OpenBSD: if_pfsync.c,v 1.110 2009/02/24 05:39:19 dlg Exp $ */
+
+/*
+ * Copyright (c) 2002 Michael Shalayeff
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 2009 David Gwynne <dlg@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/*
+ * Revisions picked from OpenBSD after revision 1.110 import:
+ * 1.118, 1.124, 1.148, 1.149, 1.151, 1.171 - fixes to bulk updates
+ * 1.120, 1.175 - use monotonic time_uptime
+ * 1.122 - reduce number of updates for non-TCP sessions
+ * 1.128 - cleanups
+ * 1.146 - bzero() mbuf before sparsely filling it with data
+ * 1.170 - SIOCSIFMTU checks
+ * 1.126, 1.142 - deferred packets processing
+ * 1.173 - correct expire time processing
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_pf.h"
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/endian.h>
+#include <sys/interrupt.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/priv.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/sockio.h>
+#include <sys/sysctl.h>
+
+#include <net/bpf.h>
+#include <net/if.h>
+#include <net/if_clone.h>
+#include <net/if_types.h>
+#include <net/pfvar.h>
+#include <net/if_pfsync.h>
+
+#include <netinet/if_ether.h>
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/ip.h>
+#include <netinet/ip_carp.h>
+#include <netinet/ip_var.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+
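+/* Size of an empty pfsync packet: IP header, pfsync header, one
+ * subheader and the trailing EOF subheader. */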
+#define PFSYNC_MINPKT ( \
+ sizeof(struct ip) + \
+ sizeof(struct pfsync_header) + \
+ sizeof(struct pfsync_subheader) + \
+ sizeof(struct pfsync_eof))
+
+struct pfsync_pkt {
+ struct ip *ip;
+ struct in_addr src;
+ u_int8_t flags;
+};
+
+static int pfsync_upd_tcp(struct pf_state *, struct pfsync_state_peer *,
+ struct pfsync_state_peer *);
+static int pfsync_in_clr(struct pfsync_pkt *, struct mbuf *, int, int);
+static int pfsync_in_ins(struct pfsync_pkt *, struct mbuf *, int, int);
+static int pfsync_in_iack(struct pfsync_pkt *, struct mbuf *, int, int);
+static int pfsync_in_upd(struct pfsync_pkt *, struct mbuf *, int, int);
+static int pfsync_in_upd_c(struct pfsync_pkt *, struct mbuf *, int, int);
+static int pfsync_in_ureq(struct pfsync_pkt *, struct mbuf *, int, int);
+static int pfsync_in_del(struct pfsync_pkt *, struct mbuf *, int, int);
+static int pfsync_in_del_c(struct pfsync_pkt *, struct mbuf *, int, int);
+static int pfsync_in_bus(struct pfsync_pkt *, struct mbuf *, int, int);
+static int pfsync_in_tdb(struct pfsync_pkt *, struct mbuf *, int, int);
+static int pfsync_in_eof(struct pfsync_pkt *, struct mbuf *, int, int);
+static int pfsync_in_error(struct pfsync_pkt *, struct mbuf *, int, int);
+
+static int (*pfsync_acts[])(struct pfsync_pkt *, struct mbuf *, int, int) = {
+ pfsync_in_clr, /* PFSYNC_ACT_CLR */
+ pfsync_in_ins, /* PFSYNC_ACT_INS */
+ pfsync_in_iack, /* PFSYNC_ACT_INS_ACK */
+ pfsync_in_upd, /* PFSYNC_ACT_UPD */
+ pfsync_in_upd_c, /* PFSYNC_ACT_UPD_C */
+ pfsync_in_ureq, /* PFSYNC_ACT_UPD_REQ */
+ pfsync_in_del, /* PFSYNC_ACT_DEL */
+ pfsync_in_del_c, /* PFSYNC_ACT_DEL_C */
+ pfsync_in_error, /* PFSYNC_ACT_INS_F */
+ pfsync_in_error, /* PFSYNC_ACT_DEL_F */
+ pfsync_in_bus, /* PFSYNC_ACT_BUS */
+ pfsync_in_tdb, /* PFSYNC_ACT_TDB */
+ pfsync_in_eof /* PFSYNC_ACT_EOF */
+};
+
+struct pfsync_q {
+ int (*write)(struct pf_state *, struct mbuf *, int);
+ size_t len;
+ u_int8_t action;
+};
+
+/* we have one of these for every PFSYNC_S_ */
+static int pfsync_out_state(struct pf_state *, struct mbuf *, int);
+static int pfsync_out_iack(struct pf_state *, struct mbuf *, int);
+static int pfsync_out_upd_c(struct pf_state *, struct mbuf *, int);
+static int pfsync_out_del(struct pf_state *, struct mbuf *, int);
+
+static struct pfsync_q pfsync_qs[] = {
+ { pfsync_out_state, sizeof(struct pfsync_state), PFSYNC_ACT_INS },
+ { pfsync_out_iack, sizeof(struct pfsync_ins_ack), PFSYNC_ACT_INS_ACK },
+ { pfsync_out_state, sizeof(struct pfsync_state), PFSYNC_ACT_UPD },
+ { pfsync_out_upd_c, sizeof(struct pfsync_upd_c), PFSYNC_ACT_UPD_C },
+ { pfsync_out_del, sizeof(struct pfsync_del_c), PFSYNC_ACT_DEL_C }
+};
+
+static void pfsync_q_ins(struct pf_state *, int);
+static void pfsync_q_del(struct pf_state *);
+
+static void pfsync_update_state(struct pf_state *);
+
+struct pfsync_upd_req_item {
+ TAILQ_ENTRY(pfsync_upd_req_item) ur_entry;
+ struct pfsync_upd_req ur_msg;
+};
+
+struct pfsync_deferral {
+ struct pfsync_softc *pd_sc;
+ TAILQ_ENTRY(pfsync_deferral) pd_entry;
+ u_int pd_refs;
+ struct callout pd_tmo;
+
+ struct pf_state *pd_st;
+ struct mbuf *pd_m;
+};
+
+struct pfsync_softc {
+ /* Configuration */
+ struct ifnet *sc_ifp;
+ struct ifnet *sc_sync_if;
+ struct ip_moptions sc_imo;
+ struct in_addr sc_sync_peer;
+ uint32_t sc_flags;
+#define PFSYNCF_OK 0x00000001
+#define PFSYNCF_DEFER 0x00000002
+#define PFSYNCF_PUSH 0x00000004
+ uint8_t sc_maxupdates;
+ struct ip sc_template;
+ struct callout sc_tmo;
+ struct mtx sc_mtx;
+
+ /* Queued data */
+ size_t sc_len;
+ TAILQ_HEAD(, pf_state) sc_qs[PFSYNC_S_COUNT];
+ TAILQ_HEAD(, pfsync_upd_req_item) sc_upd_req_list;
+ TAILQ_HEAD(, pfsync_deferral) sc_deferrals;
+ u_int sc_deferred;
+ void *sc_plus;
+ size_t sc_pluslen;
+
+ /* Bulk update info */
+ struct mtx sc_bulk_mtx;
+ uint32_t sc_ureq_sent;
+ int sc_bulk_tries;
+ uint32_t sc_ureq_received;
+ int sc_bulk_hashid;
+ uint64_t sc_bulk_stateid;
+ uint32_t sc_bulk_creatorid;
+ struct callout sc_bulk_tmo;
+ struct callout sc_bulkfail_tmo;
+};
+
+#define PFSYNC_LOCK(sc) mtx_lock(&(sc)->sc_mtx)
+#define PFSYNC_UNLOCK(sc) mtx_unlock(&(sc)->sc_mtx)
+#define PFSYNC_LOCK_ASSERT(sc) mtx_assert(&(sc)->sc_mtx, MA_OWNED)
+
+#define PFSYNC_BLOCK(sc) mtx_lock(&(sc)->sc_bulk_mtx)
+#define PFSYNC_BUNLOCK(sc) mtx_unlock(&(sc)->sc_bulk_mtx)
+#define PFSYNC_BLOCK_ASSERT(sc) mtx_assert(&(sc)->sc_bulk_mtx, MA_OWNED)
+
+static MALLOC_DEFINE(M_PFSYNC, "pfsync", "pfsync(4) data");
+static VNET_DEFINE(struct pfsync_softc *, pfsyncif) = NULL;
+#define V_pfsyncif VNET(pfsyncif)
+static VNET_DEFINE(void *, pfsync_swi_cookie) = NULL;
+#define V_pfsync_swi_cookie VNET(pfsync_swi_cookie)
+static VNET_DEFINE(struct pfsyncstats, pfsyncstats);
+#define V_pfsyncstats VNET(pfsyncstats)
+static VNET_DEFINE(int, pfsync_carp_adj) = CARP_MAXSKEW;
+#define V_pfsync_carp_adj VNET(pfsync_carp_adj)
+
+static void pfsync_timeout(void *);
+static void pfsync_push(struct pfsync_softc *);
+static void pfsyncintr(void *);
+static int pfsync_multicast_setup(struct pfsync_softc *, struct ifnet *,
+ void *);
+static void pfsync_multicast_cleanup(struct pfsync_softc *);
+static int pfsync_init(void);
+static void pfsync_uninit(void);
+
+SYSCTL_NODE(_net, OID_AUTO, pfsync, CTLFLAG_RW, 0, "PFSYNC");
+SYSCTL_VNET_STRUCT(_net_pfsync, OID_AUTO, stats, CTLFLAG_RW,
+ &VNET_NAME(pfsyncstats), pfsyncstats,
+ "PFSYNC statistics (struct pfsyncstats, net/if_pfsync.h)");
+SYSCTL_INT(_net_pfsync, OID_AUTO, carp_demotion_factor, CTLFLAG_RW,
+ &VNET_NAME(pfsync_carp_adj), 0, "pfsync's CARP demotion factor adjustment");
+
+static int pfsync_clone_create(struct if_clone *, int, caddr_t);
+static void pfsync_clone_destroy(struct ifnet *);
+static int pfsync_alloc_scrub_memory(struct pfsync_state_peer *,
+ struct pf_state_peer *);
+static int pfsyncoutput(struct ifnet *, struct mbuf *, struct sockaddr *,
+ struct route *);
+static int pfsyncioctl(struct ifnet *, u_long, caddr_t);
+
+static int pfsync_defer(struct pf_state *, struct mbuf *);
+static void pfsync_undefer(struct pfsync_deferral *, int);
+static void pfsync_undefer_state(struct pf_state *, int);
+static void pfsync_defer_tmo(void *);
+
+static void pfsync_request_update(u_int32_t, u_int64_t);
+static void pfsync_update_state_req(struct pf_state *);
+
+static void pfsync_drop(struct pfsync_softc *);
+static void pfsync_sendout(int);
+static void pfsync_send_plus(void *, size_t);
+
+static void pfsync_bulk_start(void);
+static void pfsync_bulk_status(u_int8_t);
+static void pfsync_bulk_update(void *);
+static void pfsync_bulk_fail(void *);
+
+#ifdef IPSEC
+static void pfsync_update_net_tdb(struct pfsync_tdb *);
+#endif
+
+#define PFSYNC_MAX_BULKTRIES 12
+
+VNET_DEFINE(struct ifc_simple_data, pfsync_cloner_data);
+VNET_DEFINE(struct if_clone, pfsync_cloner);
+#define V_pfsync_cloner_data VNET(pfsync_cloner_data)
+#define V_pfsync_cloner VNET(pfsync_cloner)
+IFC_SIMPLE_DECLARE(pfsync, 1);
+
+static int
+pfsync_clone_create(struct if_clone *ifc, int unit, caddr_t param)
+{
+ struct pfsync_softc *sc;
+ struct ifnet *ifp;
+ int q;
+
+ if (unit != 0)
+ return (EINVAL);
+
+ sc = malloc(sizeof(struct pfsync_softc), M_PFSYNC, M_WAITOK | M_ZERO);
+ sc->sc_flags |= PFSYNCF_OK;
+
+ for (q = 0; q < PFSYNC_S_COUNT; q++)
+ TAILQ_INIT(&sc->sc_qs[q]);
+
+ TAILQ_INIT(&sc->sc_upd_req_list);
+ TAILQ_INIT(&sc->sc_deferrals);
+
+ sc->sc_len = PFSYNC_MINPKT;
+ sc->sc_maxupdates = 128;
+
+ ifp = sc->sc_ifp = if_alloc(IFT_PFSYNC);
+ if (ifp == NULL) {
+ free(sc, M_PFSYNC);
+ return (ENOSPC);
+ }
+ if_initname(ifp, ifc->ifc_name, unit);
+ ifp->if_softc = sc;
+ ifp->if_ioctl = pfsyncioctl;
+ ifp->if_output = pfsyncoutput;
+ ifp->if_type = IFT_PFSYNC;
+ ifp->if_snd.ifq_maxlen = ifqmaxlen;
+ ifp->if_hdrlen = sizeof(struct pfsync_header);
+ ifp->if_mtu = ETHERMTU;
+ mtx_init(&sc->sc_mtx, "pfsync", NULL, MTX_DEF);
+ mtx_init(&sc->sc_bulk_mtx, "pfsync bulk", NULL, MTX_DEF);
+ callout_init(&sc->sc_tmo, CALLOUT_MPSAFE);
+ callout_init_mtx(&sc->sc_bulk_tmo, &sc->sc_bulk_mtx, 0);
+ callout_init_mtx(&sc->sc_bulkfail_tmo, &sc->sc_bulk_mtx, 0);
+
+ if_attach(ifp);
+
+ bpfattach(ifp, DLT_PFSYNC, PFSYNC_HDRLEN);
+
+ V_pfsyncif = sc;
+
+ return (0);
+}
+
+static void
+pfsync_clone_destroy(struct ifnet *ifp)
+{
+ struct pfsync_softc *sc = ifp->if_softc;
+
+ /*
+ * At this stage, everything should have already been
+ * cleared by pfsync_uninit(), and we have only to
+ * drain callouts.
+ */
+ while (sc->sc_deferred > 0) {
+ struct pfsync_deferral *pd = TAILQ_FIRST(&sc->sc_deferrals);
+
+ TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
+ sc->sc_deferred--;
+ if (callout_stop(&pd->pd_tmo)) {
+ pf_release_state(pd->pd_st);
+ m_freem(pd->pd_m);
+ free(pd, M_PFSYNC);
+ } else {
+ pd->pd_refs++;
+ callout_drain(&pd->pd_tmo);
+ free(pd, M_PFSYNC);
+ }
+ }
+
+ callout_drain(&sc->sc_tmo);
+ callout_drain(&sc->sc_bulkfail_tmo);
+ callout_drain(&sc->sc_bulk_tmo);
+
+ if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
+ (*carp_demote_adj_p)(-V_pfsync_carp_adj, "pfsync destroy");
+ bpfdetach(ifp);
+ if_detach(ifp);
+
+ pfsync_drop(sc);
+
+ if_free(ifp);
+ if (sc->sc_imo.imo_membership)
+ pfsync_multicast_cleanup(sc);
+ mtx_destroy(&sc->sc_mtx);
+ mtx_destroy(&sc->sc_bulk_mtx);
+ free(sc, M_PFSYNC);
+
+ V_pfsyncif = NULL;
+}
+
+static int
+pfsync_alloc_scrub_memory(struct pfsync_state_peer *s,
+ struct pf_state_peer *d)
+{
+ if (s->scrub.scrub_flag && d->scrub == NULL) {
+ d->scrub = uma_zalloc(V_pf_state_scrub_z, M_NOWAIT | M_ZERO);
+ if (d->scrub == NULL)
+ return (ENOMEM);
+ }
+
+ return (0);
+}
+
+
+static int
+pfsync_state_import(struct pfsync_state *sp, u_int8_t flags)
+{
+ struct pfsync_softc *sc = V_pfsyncif;
+ struct pf_state *st = NULL;
+ struct pf_state_key *skw = NULL, *sks = NULL;
+ struct pf_rule *r = NULL;
+ struct pfi_kif *kif;
+ int error;
+
+ PF_RULES_RASSERT();
+
+ if (sp->creatorid == 0 && V_pf_status.debug >= PF_DEBUG_MISC) {
+ printf("%s: invalid creator id: %08x\n", __func__,
+ ntohl(sp->creatorid));
+ return (EINVAL);
+ }
+
+ if ((kif = pfi_kif_find(sp->ifname)) == NULL) {
+ if (V_pf_status.debug >= PF_DEBUG_MISC)
+ printf("%s: unknown interface: %s\n", __func__,
+ sp->ifname);
+ if (flags & PFSYNC_SI_IOCTL)
+ return (EINVAL);
+ return (0); /* skip this state */
+ }
+
+ /*
+ * If the ruleset checksums match or the state is coming from the ioctl,
+ * it's safe to associate the state with the rule of that number.
+ */
+ if (sp->rule != htonl(-1) && sp->anchor == htonl(-1) &&
+ (flags & (PFSYNC_SI_IOCTL | PFSYNC_SI_CKSUM)) && ntohl(sp->rule) <
+ pf_main_ruleset.rules[PF_RULESET_FILTER].active.rcount)
+ r = pf_main_ruleset.rules[
+ PF_RULESET_FILTER].active.ptr_array[ntohl(sp->rule)];
+ else
+ r = &V_pf_default_rule;
+
+ if ((r->max_states && r->states_cur >= r->max_states))
+ goto cleanup;
+
+ /*
+ * XXXGL: consider M_WAITOK in ioctl path after.
+ */
+ if ((st = uma_zalloc(V_pf_state_z, M_NOWAIT | M_ZERO)) == NULL)
+ goto cleanup;
+
+ if ((skw = uma_zalloc(V_pf_state_key_z, M_NOWAIT)) == NULL)
+ goto cleanup;
+
+ if (PF_ANEQ(&sp->key[PF_SK_WIRE].addr[0],
+ &sp->key[PF_SK_STACK].addr[0], sp->af) ||
+ PF_ANEQ(&sp->key[PF_SK_WIRE].addr[1],
+ &sp->key[PF_SK_STACK].addr[1], sp->af) ||
+ sp->key[PF_SK_WIRE].port[0] != sp->key[PF_SK_STACK].port[0] ||
+ sp->key[PF_SK_WIRE].port[1] != sp->key[PF_SK_STACK].port[1]) {
+ sks = uma_zalloc(V_pf_state_key_z, M_NOWAIT);
+ if (sks == NULL)
+ goto cleanup;
+ } else
+ sks = skw;
+
+ /* allocate memory for scrub info */
+ if (pfsync_alloc_scrub_memory(&sp->src, &st->src) ||
+ pfsync_alloc_scrub_memory(&sp->dst, &st->dst))
+ goto cleanup;
+
+ /* copy to state key(s) */
+ skw->addr[0] = sp->key[PF_SK_WIRE].addr[0];
+ skw->addr[1] = sp->key[PF_SK_WIRE].addr[1];
+ skw->port[0] = sp->key[PF_SK_WIRE].port[0];
+ skw->port[1] = sp->key[PF_SK_WIRE].port[1];
+ skw->proto = sp->proto;
+ skw->af = sp->af;
+ if (sks != skw) {
+ sks->addr[0] = sp->key[PF_SK_STACK].addr[0];
+ sks->addr[1] = sp->key[PF_SK_STACK].addr[1];
+ sks->port[0] = sp->key[PF_SK_STACK].port[0];
+ sks->port[1] = sp->key[PF_SK_STACK].port[1];
+ sks->proto = sp->proto;
+ sks->af = sp->af;
+ }
+
+ /* copy to state */
+ bcopy(&sp->rt_addr, &st->rt_addr, sizeof(st->rt_addr));
+ st->creation = time_uptime - ntohl(sp->creation);
+ st->expire = time_uptime;
+ if (sp->expire) {
+ uint32_t timeout;
+
+ timeout = r->timeout[sp->timeout];
+ if (!timeout)
+ timeout = V_pf_default_rule.timeout[sp->timeout];
+
+ /* sp->expire may have been adaptively scaled by export. */
+ st->expire -= timeout - ntohl(sp->expire);
+ }
+
+ st->direction = sp->direction;
+ st->log = sp->log;
+ st->timeout = sp->timeout;
+ st->state_flags = sp->state_flags;
+
+ st->id = sp->id;
+ st->creatorid = sp->creatorid;
+ pf_state_peer_ntoh(&sp->src, &st->src);
+ pf_state_peer_ntoh(&sp->dst, &st->dst);
+
+ st->rule.ptr = r;
+ st->nat_rule.ptr = NULL;
+ st->anchor.ptr = NULL;
+ st->rt_kif = NULL;
+
+ st->pfsync_time = time_uptime;
+ st->sync_state = PFSYNC_S_NONE;
+
+ /* XXX when we have nat_rule/anchors, use STATE_INC_COUNTERS */
+ r->states_cur++;
+ r->states_tot++;
+
+ if (!(flags & PFSYNC_SI_IOCTL))
+ st->state_flags |= PFSTATE_NOSYNC;
+
+ if ((error = pf_state_insert(kif, skw, sks, st)) != 0) {
+ /* XXX when we have nat_rule/anchors, use STATE_DEC_COUNTERS */
+ r->states_cur--;
+ goto cleanup_state;
+ }
+
+ if (!(flags & PFSYNC_SI_IOCTL)) {
+ st->state_flags &= ~PFSTATE_NOSYNC;
+ if (st->state_flags & PFSTATE_ACK) {
+ pfsync_q_ins(st, PFSYNC_S_IACK);
+ pfsync_push(sc);
+ }
+ }
+ st->state_flags &= ~PFSTATE_ACK;
+ PF_STATE_UNLOCK(st);
+
+ return (0);
+
+cleanup:
+ error = ENOMEM;
+ if (skw == sks)
+ sks = NULL;
+ if (skw != NULL)
+ uma_zfree(V_pf_state_key_z, skw);
+ if (sks != NULL)
+ uma_zfree(V_pf_state_key_z, sks);
+
+cleanup_state: /* pf_state_insert() frees the state keys. */
+ if (st) {
+ if (st->dst.scrub)
+ uma_zfree(V_pf_state_scrub_z, st->dst.scrub);
+ if (st->src.scrub)
+ uma_zfree(V_pf_state_scrub_z, st->src.scrub);
+ uma_zfree(V_pf_state_z, st);
+ }
+ return (error);
+}
+
+static void
+pfsync_input(struct mbuf *m, __unused int off)
+{
+ struct pfsync_softc *sc = V_pfsyncif;
+ struct pfsync_pkt pkt;
+ struct ip *ip = mtod(m, struct ip *);
+ struct pfsync_header *ph;
+ struct pfsync_subheader subh;
+
+ int offset;
+ int rv;
+ uint16_t count;
+
+ V_pfsyncstats.pfsyncs_ipackets++;
+
+ /* Verify that we have a sync interface configured. */
+ if (!sc || !sc->sc_sync_if || !V_pf_status.running ||
+ (sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
+ goto done;
+
+ /* verify that the packet came in on the right interface */
+ if (sc->sc_sync_if != m->m_pkthdr.rcvif) {
+ V_pfsyncstats.pfsyncs_badif++;
+ goto done;
+ }
+
+ sc->sc_ifp->if_ipackets++;
+ sc->sc_ifp->if_ibytes += m->m_pkthdr.len;
+ /* verify that the IP TTL is 255. */
+ if (ip->ip_ttl != PFSYNC_DFLTTL) {
+ V_pfsyncstats.pfsyncs_badttl++;
+ goto done;
+ }
+
+ offset = ip->ip_hl << 2;
+ if (m->m_pkthdr.len < offset + sizeof(*ph)) {
+ V_pfsyncstats.pfsyncs_hdrops++;
+ goto done;
+ }
+
+ if (offset + sizeof(*ph) > m->m_len) {
+ if (m_pullup(m, offset + sizeof(*ph)) == NULL) {
+ V_pfsyncstats.pfsyncs_hdrops++;
+ return;
+ }
+ ip = mtod(m, struct ip *);
+ }
+ ph = (struct pfsync_header *)((char *)ip + offset);
+
+ /* verify the version */
+ if (ph->version != PFSYNC_VERSION) {
+ V_pfsyncstats.pfsyncs_badver++;
+ goto done;
+ }
+
+ /* Cheaper to grab this now than having to mess with mbufs later */
+ pkt.ip = ip;
+ pkt.src = ip->ip_src;
+ pkt.flags = 0;
+
+	/*
+	 * Trusting pf_chksum during packet processing, as well as seeking
+	 * in the interface name tree, requires holding PF_RULES_RLOCK().
+	 */
+ PF_RULES_RLOCK();
+ if (!bcmp(&ph->pfcksum, &V_pf_status.pf_chksum, PF_MD5_DIGEST_LENGTH))
+ pkt.flags |= PFSYNC_SI_CKSUM;
+
+ offset += sizeof(*ph);
+ for (;;) {
+ m_copydata(m, offset, sizeof(subh), (caddr_t)&subh);
+ offset += sizeof(subh);
+
+ if (subh.action >= PFSYNC_ACT_MAX) {
+ V_pfsyncstats.pfsyncs_badact++;
+ PF_RULES_RUNLOCK();
+ goto done;
+ }
+
+ count = ntohs(subh.count);
+ V_pfsyncstats.pfsyncs_iacts[subh.action] += count;
+ rv = (*pfsync_acts[subh.action])(&pkt, m, offset, count);
+ if (rv == -1) {
+ PF_RULES_RUNLOCK();
+ return;
+ }
+
+ offset += rv;
+ }
+ PF_RULES_RUNLOCK();
+
+done:
+ m_freem(m);
+}
+
+static int
+pfsync_in_clr(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
+{
+ struct pfsync_clr *clr;
+ struct mbuf *mp;
+ int len = sizeof(*clr) * count;
+ int i, offp;
+ u_int32_t creatorid;
+
+ mp = m_pulldown(m, offset, len, &offp);
+ if (mp == NULL) {
+ V_pfsyncstats.pfsyncs_badlen++;
+ return (-1);
+ }
+ clr = (struct pfsync_clr *)(mp->m_data + offp);
+
+ for (i = 0; i < count; i++) {
+ creatorid = clr[i].creatorid;
+
+ if (clr[i].ifname[0] != '\0' &&
+ pfi_kif_find(clr[i].ifname) == NULL)
+ continue;
+
+ for (int i = 0; i <= V_pf_hashmask; i++) {
+ struct pf_idhash *ih = &V_pf_idhash[i];
+ struct pf_state *s;
+relock:
+ PF_HASHROW_LOCK(ih);
+ LIST_FOREACH(s, &ih->states, entry) {
+ if (s->creatorid == creatorid) {
+ s->state_flags |= PFSTATE_NOSYNC;
+ pf_unlink_state(s, PF_ENTER_LOCKED);
+ goto relock;
+ }
+ }
+ PF_HASHROW_UNLOCK(ih);
+ }
+ }
+
+ return (len);
+}
+
+static int
+pfsync_in_ins(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
+{
+ struct mbuf *mp;
+ struct pfsync_state *sa, *sp;
+ int len = sizeof(*sp) * count;
+ int i, offp;
+
+ mp = m_pulldown(m, offset, len, &offp);
+ if (mp == NULL) {
+ V_pfsyncstats.pfsyncs_badlen++;
+ return (-1);
+ }
+ sa = (struct pfsync_state *)(mp->m_data + offp);
+
+ for (i = 0; i < count; i++) {
+ sp = &sa[i];
+
+ /* Check for invalid values. */
+ if (sp->timeout >= PFTM_MAX ||
+ sp->src.state > PF_TCPS_PROXY_DST ||
+ sp->dst.state > PF_TCPS_PROXY_DST ||
+ sp->direction > PF_OUT ||
+ (sp->af != AF_INET && sp->af != AF_INET6)) {
+ if (V_pf_status.debug >= PF_DEBUG_MISC)
+ printf("%s: invalid value\n", __func__);
+ V_pfsyncstats.pfsyncs_badval++;
+ continue;
+ }
+
+ if (pfsync_state_import(sp, pkt->flags) == ENOMEM)
+ /* Drop out, but process the rest of the actions. */
+ break;
+ }
+
+ return (len);
+}
+
+static int
+pfsync_in_iack(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
+{
+ struct pfsync_ins_ack *ia, *iaa;
+ struct pf_state *st;
+
+ struct mbuf *mp;
+ int len = count * sizeof(*ia);
+ int offp, i;
+
+ mp = m_pulldown(m, offset, len, &offp);
+ if (mp == NULL) {
+ V_pfsyncstats.pfsyncs_badlen++;
+ return (-1);
+ }
+ iaa = (struct pfsync_ins_ack *)(mp->m_data + offp);
+
+ for (i = 0; i < count; i++) {
+ ia = &iaa[i];
+
+ st = pf_find_state_byid(ia->id, ia->creatorid);
+ if (st == NULL)
+ continue;
+
+ if (st->state_flags & PFSTATE_ACK) {
+ PFSYNC_LOCK(V_pfsyncif);
+ pfsync_undefer_state(st, 0);
+ PFSYNC_UNLOCK(V_pfsyncif);
+ }
+ PF_STATE_UNLOCK(st);
+ }
+ /*
+ * XXX this is not yet implemented, but we know the size of the
+ * message so we can skip it.
+ */
+
+ return (count * sizeof(struct pfsync_ins_ack));
+}
+
+static int
+pfsync_upd_tcp(struct pf_state *st, struct pfsync_state_peer *src,
+ struct pfsync_state_peer *dst)
+{
+ int sfail = 0;
+
+ PF_STATE_LOCK_ASSERT(st);
+
+ /*
+ * The state should never go backwards except
+ * for syn-proxy states. Neither should the
+ * sequence window slide backwards.
+ */
+ if (st->src.state > src->state &&
+ (st->src.state < PF_TCPS_PROXY_SRC ||
+ src->state >= PF_TCPS_PROXY_SRC))
+ sfail = 1;
+ else if (SEQ_GT(st->src.seqlo, ntohl(src->seqlo)))
+ sfail = 3;
+ else if (st->dst.state > dst->state) {
+		/* There might still be useful
+		 * information about the src state here,
+		 * so import that part of the update,
+		 * then "fail" so we send the updated
+		 * state back to the peer who is missing
+		 * what we know. */
+ pf_state_peer_ntoh(src, &st->src);
+ /* XXX do anything with timeouts? */
+ sfail = 7;
+ } else if (st->dst.state >= TCPS_SYN_SENT &&
+ SEQ_GT(st->dst.seqlo, ntohl(dst->seqlo)))
+ sfail = 4;
+
+ return (sfail);
+}
+
+static int
+pfsync_in_upd(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
+{
+ struct pfsync_softc *sc = V_pfsyncif;
+ struct pfsync_state *sa, *sp;
+ struct pf_state_key *sk;
+ struct pf_state *st;
+ int sfail;
+
+ struct mbuf *mp;
+ int len = count * sizeof(*sp);
+ int offp, i;
+
+ mp = m_pulldown(m, offset, len, &offp);
+ if (mp == NULL) {
+ V_pfsyncstats.pfsyncs_badlen++;
+ return (-1);
+ }
+ sa = (struct pfsync_state *)(mp->m_data + offp);
+
+ for (i = 0; i < count; i++) {
+ sp = &sa[i];
+
+ /* check for invalid values */
+ if (sp->timeout >= PFTM_MAX ||
+ sp->src.state > PF_TCPS_PROXY_DST ||
+ sp->dst.state > PF_TCPS_PROXY_DST) {
+ if (V_pf_status.debug >= PF_DEBUG_MISC) {
+ printf("pfsync_input: PFSYNC_ACT_UPD: "
+ "invalid value\n");
+ }
+ V_pfsyncstats.pfsyncs_badval++;
+ continue;
+ }
+
+ st = pf_find_state_byid(sp->id, sp->creatorid);
+ if (st == NULL) {
+ /* insert the update */
+ if (pfsync_state_import(sp, 0))
+ V_pfsyncstats.pfsyncs_badstate++;
+ continue;
+ }
+
+ if (st->state_flags & PFSTATE_ACK) {
+ PFSYNC_LOCK(sc);
+ pfsync_undefer_state(st, 1);
+ PFSYNC_UNLOCK(sc);
+ }
+
+ sk = st->key[PF_SK_WIRE]; /* XXX right one? */
+ sfail = 0;
+ if (sk->proto == IPPROTO_TCP)
+ sfail = pfsync_upd_tcp(st, &sp->src, &sp->dst);
+ else {
+			/*
+			 * Non-TCP protocol state machines always go
+			 * forwards.
+			 */
+ if (st->src.state > sp->src.state)
+ sfail = 5;
+ else if (st->dst.state > sp->dst.state)
+ sfail = 6;
+ }
+
+ if (sfail) {
+ if (V_pf_status.debug >= PF_DEBUG_MISC) {
+ printf("pfsync: %s stale update (%d)"
+ " id: %016llx creatorid: %08x\n",
+ (sfail < 7 ? "ignoring" : "partial"),
+ sfail, (unsigned long long)be64toh(st->id),
+ ntohl(st->creatorid));
+ }
+ V_pfsyncstats.pfsyncs_stale++;
+
+ pfsync_update_state(st);
+ PF_STATE_UNLOCK(st);
+ PFSYNC_LOCK(sc);
+ pfsync_push(sc);
+ PFSYNC_UNLOCK(sc);
+ continue;
+ }
+ pfsync_alloc_scrub_memory(&sp->dst, &st->dst);
+ pf_state_peer_ntoh(&sp->src, &st->src);
+ pf_state_peer_ntoh(&sp->dst, &st->dst);
+ st->expire = time_uptime;
+ st->timeout = sp->timeout;
+ st->pfsync_time = time_uptime;
+ PF_STATE_UNLOCK(st);
+ }
+
+ return (len);
+}
+
+static int
+pfsync_in_upd_c(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
+{
+ struct pfsync_softc *sc = V_pfsyncif;
+ struct pfsync_upd_c *ua, *up;
+ struct pf_state_key *sk;
+ struct pf_state *st;
+
+ int len = count * sizeof(*up);
+ int sfail;
+
+ struct mbuf *mp;
+ int offp, i;
+
+ mp = m_pulldown(m, offset, len, &offp);
+ if (mp == NULL) {
+ V_pfsyncstats.pfsyncs_badlen++;
+ return (-1);
+ }
+ ua = (struct pfsync_upd_c *)(mp->m_data + offp);
+
+ for (i = 0; i < count; i++) {
+ up = &ua[i];
+
+ /* check for invalid values */
+ if (up->timeout >= PFTM_MAX ||
+ up->src.state > PF_TCPS_PROXY_DST ||
+ up->dst.state > PF_TCPS_PROXY_DST) {
+ if (V_pf_status.debug >= PF_DEBUG_MISC) {
+ printf("pfsync_input: "
+ "PFSYNC_ACT_UPD_C: "
+ "invalid value\n");
+ }
+ V_pfsyncstats.pfsyncs_badval++;
+ continue;
+ }
+
+ st = pf_find_state_byid(up->id, up->creatorid);
+ if (st == NULL) {
+ /* We don't have this state. Ask for it. */
+ PFSYNC_LOCK(sc);
+ pfsync_request_update(up->creatorid, up->id);
+ PFSYNC_UNLOCK(sc);
+ continue;
+ }
+
+ if (st->state_flags & PFSTATE_ACK) {
+ PFSYNC_LOCK(sc);
+ pfsync_undefer_state(st, 1);
+ PFSYNC_UNLOCK(sc);
+ }
+
+ sk = st->key[PF_SK_WIRE]; /* XXX right one? */
+ sfail = 0;
+ if (sk->proto == IPPROTO_TCP)
+ sfail = pfsync_upd_tcp(st, &up->src, &up->dst);
+ else {
+			/*
+			 * Non-TCP protocol state machines always go forwards.
+			 */
+ if (st->src.state > up->src.state)
+ sfail = 5;
+ else if (st->dst.state > up->dst.state)
+ sfail = 6;
+ }
+
+ if (sfail) {
+ if (V_pf_status.debug >= PF_DEBUG_MISC) {
+ printf("pfsync: ignoring stale update "
+ "(%d) id: %016llx "
+ "creatorid: %08x\n", sfail,
+ (unsigned long long)be64toh(st->id),
+ ntohl(st->creatorid));
+ }
+ V_pfsyncstats.pfsyncs_stale++;
+
+ pfsync_update_state(st);
+ PF_STATE_UNLOCK(st);
+ PFSYNC_LOCK(sc);
+ pfsync_push(sc);
+ PFSYNC_UNLOCK(sc);
+ continue;
+ }
+ pfsync_alloc_scrub_memory(&up->dst, &st->dst);
+ pf_state_peer_ntoh(&up->src, &st->src);
+ pf_state_peer_ntoh(&up->dst, &st->dst);
+ st->expire = time_uptime;
+ st->timeout = up->timeout;
+ st->pfsync_time = time_uptime;
+ PF_STATE_UNLOCK(st);
+ }
+
+ return (len);
+}
+
+static int
+pfsync_in_ureq(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
+{
+ struct pfsync_upd_req *ur, *ura;
+ struct mbuf *mp;
+ int len = count * sizeof(*ur);
+ int i, offp;
+
+ struct pf_state *st;
+
+ mp = m_pulldown(m, offset, len, &offp);
+ if (mp == NULL) {
+ V_pfsyncstats.pfsyncs_badlen++;
+ return (-1);
+ }
+ ura = (struct pfsync_upd_req *)(mp->m_data + offp);
+
+ for (i = 0; i < count; i++) {
+ ur = &ura[i];
+
+ if (ur->id == 0 && ur->creatorid == 0)
+ pfsync_bulk_start();
+ else {
+ st = pf_find_state_byid(ur->id, ur->creatorid);
+ if (st == NULL) {
+ V_pfsyncstats.pfsyncs_badstate++;
+ continue;
+ }
+ if (st->state_flags & PFSTATE_NOSYNC) {
+ PF_STATE_UNLOCK(st);
+ continue;
+ }
+
+ pfsync_update_state_req(st);
+ PF_STATE_UNLOCK(st);
+ }
+ }
+
+ return (len);
+}
+
+static int
+pfsync_in_del(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
+{
+ struct mbuf *mp;
+ struct pfsync_state *sa, *sp;
+ struct pf_state *st;
+ int len = count * sizeof(*sp);
+ int offp, i;
+
+ mp = m_pulldown(m, offset, len, &offp);
+ if (mp == NULL) {
+ V_pfsyncstats.pfsyncs_badlen++;
+ return (-1);
+ }
+ sa = (struct pfsync_state *)(mp->m_data + offp);
+
+ for (i = 0; i < count; i++) {
+ sp = &sa[i];
+
+ st = pf_find_state_byid(sp->id, sp->creatorid);
+ if (st == NULL) {
+ V_pfsyncstats.pfsyncs_badstate++;
+ continue;
+ }
+ st->state_flags |= PFSTATE_NOSYNC;
+ pf_unlink_state(st, PF_ENTER_LOCKED);
+ }
+
+ return (len);
+}
+
+static int
+pfsync_in_del_c(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
+{
+ struct mbuf *mp;
+ struct pfsync_del_c *sa, *sp;
+ struct pf_state *st;
+ int len = count * sizeof(*sp);
+ int offp, i;
+
+ mp = m_pulldown(m, offset, len, &offp);
+ if (mp == NULL) {
+ V_pfsyncstats.pfsyncs_badlen++;
+ return (-1);
+ }
+ sa = (struct pfsync_del_c *)(mp->m_data + offp);
+
+ for (i = 0; i < count; i++) {
+ sp = &sa[i];
+
+ st = pf_find_state_byid(sp->id, sp->creatorid);
+ if (st == NULL) {
+ V_pfsyncstats.pfsyncs_badstate++;
+ continue;
+ }
+
+ st->state_flags |= PFSTATE_NOSYNC;
+ pf_unlink_state(st, PF_ENTER_LOCKED);
+ }
+
+ return (len);
+}
+
+static int
+pfsync_in_bus(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
+{
+ struct pfsync_softc *sc = V_pfsyncif;
+ struct pfsync_bus *bus;
+ struct mbuf *mp;
+ int len = count * sizeof(*bus);
+ int offp;
+
+ PFSYNC_BLOCK(sc);
+
+ /* If we're not waiting for a bulk update, who cares. */
+ if (sc->sc_ureq_sent == 0) {
+ PFSYNC_BUNLOCK(sc);
+ return (len);
+ }
+
+ mp = m_pulldown(m, offset, len, &offp);
+ if (mp == NULL) {
+ PFSYNC_BUNLOCK(sc);
+ V_pfsyncstats.pfsyncs_badlen++;
+ return (-1);
+ }
+ bus = (struct pfsync_bus *)(mp->m_data + offp);
+
+ switch (bus->status) {
+ case PFSYNC_BUS_START:
+ callout_reset(&sc->sc_bulkfail_tmo, 4 * hz +
+ V_pf_limits[PF_LIMIT_STATES].limit /
+ ((sc->sc_ifp->if_mtu - PFSYNC_MINPKT) /
+ sizeof(struct pfsync_state)),
+ pfsync_bulk_fail, sc);
+ if (V_pf_status.debug >= PF_DEBUG_MISC)
+ printf("pfsync: received bulk update start\n");
+ break;
+
+ case PFSYNC_BUS_END:
+ if (time_uptime - ntohl(bus->endtime) >=
+ sc->sc_ureq_sent) {
+ /* that's it, we're happy */
+ sc->sc_ureq_sent = 0;
+ sc->sc_bulk_tries = 0;
+ callout_stop(&sc->sc_bulkfail_tmo);
+ if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
+ (*carp_demote_adj_p)(-V_pfsync_carp_adj,
+ "pfsync bulk done");
+ sc->sc_flags |= PFSYNCF_OK;
+ if (V_pf_status.debug >= PF_DEBUG_MISC)
+ printf("pfsync: received valid "
+ "bulk update end\n");
+ } else {
+ if (V_pf_status.debug >= PF_DEBUG_MISC)
+ printf("pfsync: received invalid "
+ "bulk update end: bad timestamp\n");
+ }
+ break;
+ }
+ PFSYNC_BUNLOCK(sc);
+
+ return (len);
+}
+
+static int
+pfsync_in_tdb(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
+{
+ int len = count * sizeof(struct pfsync_tdb);
+
+#if defined(IPSEC)
+ struct pfsync_tdb *tp;
+ struct mbuf *mp;
+ int offp;
+ int i;
+ int s;
+
+ mp = m_pulldown(m, offset, len, &offp);
+ if (mp == NULL) {
+ V_pfsyncstats.pfsyncs_badlen++;
+ return (-1);
+ }
+ tp = (struct pfsync_tdb *)(mp->m_data + offp);
+
+ for (i = 0; i < count; i++)
+ pfsync_update_net_tdb(&tp[i]);
+#endif
+
+ return (len);
+}
+
+#if defined(IPSEC)
+/* Update an in-kernel tdb. Silently fail if no tdb is found. */
+static void
+pfsync_update_net_tdb(struct pfsync_tdb *pt)
+{
+ struct tdb *tdb;
+ int s;
+
+ /* check for invalid values */
+ if (ntohl(pt->spi) <= SPI_RESERVED_MAX ||
+ (pt->dst.sa.sa_family != AF_INET &&
+ pt->dst.sa.sa_family != AF_INET6))
+ goto bad;
+
+ tdb = gettdb(pt->spi, &pt->dst, pt->sproto);
+ if (tdb) {
+ pt->rpl = ntohl(pt->rpl);
+ pt->cur_bytes = (unsigned long long)be64toh(pt->cur_bytes);
+
+ /* Neither replay nor byte counter should ever decrease. */
+ if (pt->rpl < tdb->tdb_rpl ||
+ pt->cur_bytes < tdb->tdb_cur_bytes) {
+ goto bad;
+ }
+
+ tdb->tdb_rpl = pt->rpl;
+ tdb->tdb_cur_bytes = pt->cur_bytes;
+ }
+ return;
+
+bad:
+ if (V_pf_status.debug >= PF_DEBUG_MISC)
+ printf("pfsync_insert: PFSYNC_ACT_TDB_UPD: "
+ "invalid value\n");
+ V_pfsyncstats.pfsyncs_badstate++;
+ return;
+}
+#endif
+
+
+static int
+pfsync_in_eof(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
+{
+ /* check if we are at the right place in the packet */
+ if (offset != m->m_pkthdr.len - sizeof(struct pfsync_eof))
+ V_pfsyncstats.pfsyncs_badact++;
+
+ /* we're done. free and let the caller return */
+ m_freem(m);
+ return (-1);
+}
+
+static int
+pfsync_in_error(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
+{
+ V_pfsyncstats.pfsyncs_badact++;
+
+ m_freem(m);
+ return (-1);
+}
+
+static int
+pfsyncoutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
+ struct route *rt)
+{
+ m_freem(m);
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+pfsyncioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
+{
+ struct pfsync_softc *sc = ifp->if_softc;
+ struct ifreq *ifr = (struct ifreq *)data;
+ struct pfsyncreq pfsyncr;
+ int error;
+
+ switch (cmd) {
+ case SIOCSIFFLAGS:
+ PFSYNC_LOCK(sc);
+ if (ifp->if_flags & IFF_UP)
+ ifp->if_drv_flags |= IFF_DRV_RUNNING;
+ else
+ ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
+ PFSYNC_UNLOCK(sc);
+ break;
+ case SIOCSIFMTU:
+ if (!sc->sc_sync_if ||
+ ifr->ifr_mtu <= PFSYNC_MINPKT ||
+ ifr->ifr_mtu > sc->sc_sync_if->if_mtu)
+ return (EINVAL);
+ if (ifr->ifr_mtu < ifp->if_mtu) {
+ PFSYNC_LOCK(sc);
+ if (sc->sc_len > PFSYNC_MINPKT)
+ pfsync_sendout(1);
+ PFSYNC_UNLOCK(sc);
+ }
+ ifp->if_mtu = ifr->ifr_mtu;
+ break;
+ case SIOCGETPFSYNC:
+ bzero(&pfsyncr, sizeof(pfsyncr));
+ PFSYNC_LOCK(sc);
+ if (sc->sc_sync_if) {
+ strlcpy(pfsyncr.pfsyncr_syncdev,
+ sc->sc_sync_if->if_xname, IFNAMSIZ);
+ }
+ pfsyncr.pfsyncr_syncpeer = sc->sc_sync_peer;
+ pfsyncr.pfsyncr_maxupdates = sc->sc_maxupdates;
+ pfsyncr.pfsyncr_defer = (PFSYNCF_DEFER ==
+ (sc->sc_flags & PFSYNCF_DEFER));
+ PFSYNC_UNLOCK(sc);
+ return (copyout(&pfsyncr, ifr->ifr_data, sizeof(pfsyncr)));
+
+ case SIOCSETPFSYNC:
+ {
+ struct ip_moptions *imo = &sc->sc_imo;
+ struct ifnet *sifp;
+ struct ip *ip;
+ void *mship = NULL;
+
+ if ((error = priv_check(curthread, PRIV_NETINET_PF)) != 0)
+ return (error);
+ if ((error = copyin(ifr->ifr_data, &pfsyncr, sizeof(pfsyncr))))
+ return (error);
+
+ if (pfsyncr.pfsyncr_maxupdates > 255)
+ return (EINVAL);
+
+ if (pfsyncr.pfsyncr_syncdev[0] == 0)
+ sifp = NULL;
+ else if ((sifp = ifunit_ref(pfsyncr.pfsyncr_syncdev)) == NULL)
+ return (EINVAL);
+
+ if (pfsyncr.pfsyncr_syncpeer.s_addr == 0 && sifp != NULL)
+ mship = malloc((sizeof(struct in_multi *) *
+ IP_MIN_MEMBERSHIPS), M_PFSYNC, M_WAITOK | M_ZERO);
+
+ PFSYNC_LOCK(sc);
+ if (pfsyncr.pfsyncr_syncpeer.s_addr == 0)
+ sc->sc_sync_peer.s_addr = htonl(INADDR_PFSYNC_GROUP);
+ else
+ sc->sc_sync_peer.s_addr =
+ pfsyncr.pfsyncr_syncpeer.s_addr;
+
+ sc->sc_maxupdates = pfsyncr.pfsyncr_maxupdates;
+ if (pfsyncr.pfsyncr_defer) {
+ sc->sc_flags |= PFSYNCF_DEFER;
+ pfsync_defer_ptr = pfsync_defer;
+ } else {
+ sc->sc_flags &= ~PFSYNCF_DEFER;
+ pfsync_defer_ptr = NULL;
+ }
+
+ if (sifp == NULL) {
+ if (sc->sc_sync_if)
+ if_rele(sc->sc_sync_if);
+ sc->sc_sync_if = NULL;
+ if (imo->imo_membership)
+ pfsync_multicast_cleanup(sc);
+ PFSYNC_UNLOCK(sc);
+ break;
+ }
+
+ if (sc->sc_len > PFSYNC_MINPKT &&
+ (sifp->if_mtu < sc->sc_ifp->if_mtu ||
+ (sc->sc_sync_if != NULL &&
+ sifp->if_mtu < sc->sc_sync_if->if_mtu) ||
+ sifp->if_mtu < MCLBYTES - sizeof(struct ip)))
+ pfsync_sendout(1);
+
+ if (imo->imo_membership)
+ pfsync_multicast_cleanup(sc);
+
+ if (sc->sc_sync_peer.s_addr == htonl(INADDR_PFSYNC_GROUP)) {
+ error = pfsync_multicast_setup(sc, sifp, mship);
+ if (error) {
+ if_rele(sifp);
+ free(mship, M_PFSYNC);
+ return (error);
+ }
+ }
+ if (sc->sc_sync_if)
+ if_rele(sc->sc_sync_if);
+ sc->sc_sync_if = sifp;
+
+ ip = &sc->sc_template;
+ bzero(ip, sizeof(*ip));
+ ip->ip_v = IPVERSION;
+ ip->ip_hl = sizeof(sc->sc_template) >> 2;
+ ip->ip_tos = IPTOS_LOWDELAY;
+ /* len and id are set later. */
+ ip->ip_off = IP_DF;
+ ip->ip_ttl = PFSYNC_DFLTTL;
+ ip->ip_p = IPPROTO_PFSYNC;
+ ip->ip_src.s_addr = INADDR_ANY;
+ ip->ip_dst.s_addr = sc->sc_sync_peer.s_addr;
+
+ /* Request a full state table update. */
+ if ((sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
+ (*carp_demote_adj_p)(V_pfsync_carp_adj,
+ "pfsync bulk start");
+ sc->sc_flags &= ~PFSYNCF_OK;
+ if (V_pf_status.debug >= PF_DEBUG_MISC)
+ printf("pfsync: requesting bulk update\n");
+ pfsync_request_update(0, 0);
+ PFSYNC_UNLOCK(sc);
+ PFSYNC_BLOCK(sc);
+ sc->sc_ureq_sent = time_uptime;
+ callout_reset(&sc->sc_bulkfail_tmo, 5 * hz, pfsync_bulk_fail,
+ sc);
+ PFSYNC_BUNLOCK(sc);
+
+ break;
+ }
+ default:
+ return (ENOTTY);
+ }
+
+ return (0);
+}
+
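+/*
+ * Per-queue writers: each serializes one state into the outgoing mbuf at
+ * the given offset and returns the number of bytes written.
+ */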
+static int
+pfsync_out_state(struct pf_state *st, struct mbuf *m, int offset)
+{
+ struct pfsync_state *sp = (struct pfsync_state *)(m->m_data + offset);
+
+ pfsync_state_export(sp, st);
+
+ return (sizeof(*sp));
+}
+
+static int
+pfsync_out_iack(struct pf_state *st, struct mbuf *m, int offset)
+{
+ struct pfsync_ins_ack *iack =
+ (struct pfsync_ins_ack *)(m->m_data + offset);
+
+ iack->id = st->id;
+ iack->creatorid = st->creatorid;
+
+ return (sizeof(*iack));
+}
+
+static int
+pfsync_out_upd_c(struct pf_state *st, struct mbuf *m, int offset)
+{
+ struct pfsync_upd_c *up = (struct pfsync_upd_c *)(m->m_data + offset);
+
+ bzero(up, sizeof(*up));
+ up->id = st->id;
+ pf_state_peer_hton(&st->src, &up->src);
+ pf_state_peer_hton(&st->dst, &up->dst);
+ up->creatorid = st->creatorid;
+ up->timeout = st->timeout;
+
+ return (sizeof(*up));
+}
+
+static int
+pfsync_out_del(struct pf_state *st, struct mbuf *m, int offset)
+{
+ struct pfsync_del_c *dp = (struct pfsync_del_c *)(m->m_data + offset);
+
+ dp->id = st->id;
+ dp->creatorid = st->creatorid;
+
+ st->state_flags |= PFSTATE_NOSYNC;
+
+ return (sizeof(*dp));
+}
+
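+/*
+ * Discard everything queued for transmission, dropping the state
+ * references we hold, and reset the pending packet length.
+ */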
+static void
+pfsync_drop(struct pfsync_softc *sc)
+{
+ struct pf_state *st, *next;
+ struct pfsync_upd_req_item *ur;
+ int q;
+
+ for (q = 0; q < PFSYNC_S_COUNT; q++) {
+ if (TAILQ_EMPTY(&sc->sc_qs[q]))
+ continue;
+
+ TAILQ_FOREACH_SAFE(st, &sc->sc_qs[q], sync_list, next) {
+ KASSERT(st->sync_state == q,
+ ("%s: st->sync_state == q",
+ __func__));
+ st->sync_state = PFSYNC_S_NONE;
+ pf_release_state(st);
+ }
+ TAILQ_INIT(&sc->sc_qs[q]);
+ }
+
+ while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) {
+ TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry);
+ free(ur, M_PFSYNC);
+ }
+
+ sc->sc_plus = NULL;
+ sc->sc_len = PFSYNC_MINPKT;
+}
+
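+/*
+ * Build a single pfsync packet from the queued states, update requests
+ * and any "plus" region, then hand it to the interface send queue (or
+ * only to bpf(4) when no sync interface is configured).
+ */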
+static void
+pfsync_sendout(int schedswi)
+{
+ struct pfsync_softc *sc = V_pfsyncif;
+ struct ifnet *ifp = sc->sc_ifp;
+ struct mbuf *m;
+ struct ip *ip;
+ struct pfsync_header *ph;
+ struct pfsync_subheader *subh;
+ struct pf_state *st, *next;
+ struct pfsync_upd_req_item *ur;
+ int offset;
+ int q, count = 0;
+
+ KASSERT(sc != NULL, ("%s: null sc", __func__));
+ KASSERT(sc->sc_len > PFSYNC_MINPKT,
+ ("%s: sc_len %zu", __func__, sc->sc_len));
+ PFSYNC_LOCK_ASSERT(sc);
+
+ if (ifp->if_bpf == NULL && sc->sc_sync_if == NULL) {
+ pfsync_drop(sc);
+ return;
+ }
+
+ m = m_get2(M_NOWAIT, MT_DATA, M_PKTHDR, max_linkhdr + sc->sc_len);
+ if (m == NULL) {
+ sc->sc_ifp->if_oerrors++;
+ V_pfsyncstats.pfsyncs_onomem++;
+ return;
+ }
+ m->m_data += max_linkhdr;
+ m->m_len = m->m_pkthdr.len = sc->sc_len;
+
+ /* build the ip header */
+ ip = (struct ip *)m->m_data;
+ bcopy(&sc->sc_template, ip, sizeof(*ip));
+ offset = sizeof(*ip);
+
+ ip->ip_len = m->m_pkthdr.len;
+ ip->ip_id = htons(ip_randomid());
+
+ /* build the pfsync header */
+ ph = (struct pfsync_header *)(m->m_data + offset);
+ bzero(ph, sizeof(*ph));
+ offset += sizeof(*ph);
+
+ ph->version = PFSYNC_VERSION;
+ ph->len = htons(sc->sc_len - sizeof(*ip));
+ bcopy(V_pf_status.pf_chksum, ph->pfcksum, PF_MD5_DIGEST_LENGTH);
+
+ /* walk the queues */
+ for (q = 0; q < PFSYNC_S_COUNT; q++) {
+ if (TAILQ_EMPTY(&sc->sc_qs[q]))
+ continue;
+
+ subh = (struct pfsync_subheader *)(m->m_data + offset);
+ offset += sizeof(*subh);
+
+ count = 0;
+ TAILQ_FOREACH_SAFE(st, &sc->sc_qs[q], sync_list, next) {
+ KASSERT(st->sync_state == q,
+ ("%s: st->sync_state == q",
+ __func__));
+			/*
+			 * XXXGL: some of the write methods do unlocked reads
+			 * of state data :(
+			 */
+ offset += pfsync_qs[q].write(st, m, offset);
+ st->sync_state = PFSYNC_S_NONE;
+ pf_release_state(st);
+ count++;
+ }
+ TAILQ_INIT(&sc->sc_qs[q]);
+
+ bzero(subh, sizeof(*subh));
+ subh->action = pfsync_qs[q].action;
+ subh->count = htons(count);
+ V_pfsyncstats.pfsyncs_oacts[pfsync_qs[q].action] += count;
+ }
+
+ if (!TAILQ_EMPTY(&sc->sc_upd_req_list)) {
+ subh = (struct pfsync_subheader *)(m->m_data + offset);
+ offset += sizeof(*subh);
+
+ count = 0;
+ while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) {
+ TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry);
+
+ bcopy(&ur->ur_msg, m->m_data + offset,
+ sizeof(ur->ur_msg));
+ offset += sizeof(ur->ur_msg);
+ free(ur, M_PFSYNC);
+ count++;
+ }
+
+ bzero(subh, sizeof(*subh));
+ subh->action = PFSYNC_ACT_UPD_REQ;
+ subh->count = htons(count);
+ V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_UPD_REQ] += count;
+ }
+
+ /* has someone built a custom region for us to add? */
+ if (sc->sc_plus != NULL) {
+ bcopy(sc->sc_plus, m->m_data + offset, sc->sc_pluslen);
+ offset += sc->sc_pluslen;
+
+ sc->sc_plus = NULL;
+ }
+
+ subh = (struct pfsync_subheader *)(m->m_data + offset);
+ offset += sizeof(*subh);
+
+ bzero(subh, sizeof(*subh));
+ subh->action = PFSYNC_ACT_EOF;
+ subh->count = htons(1);
+ V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_EOF]++;
+
+ /* XXX write checksum in EOF here */
+
+ /* we're done, let's put it on the wire */
+ if (ifp->if_bpf) {
+ m->m_data += sizeof(*ip);
+ m->m_len = m->m_pkthdr.len = sc->sc_len - sizeof(*ip);
+ BPF_MTAP(ifp, m);
+ m->m_data -= sizeof(*ip);
+ m->m_len = m->m_pkthdr.len = sc->sc_len;
+ }
+
+ if (sc->sc_sync_if == NULL) {
+ sc->sc_len = PFSYNC_MINPKT;
+ m_freem(m);
+ return;
+ }
+
+ sc->sc_ifp->if_opackets++;
+ sc->sc_ifp->if_obytes += m->m_pkthdr.len;
+ sc->sc_len = PFSYNC_MINPKT;
+
+ if (!_IF_QFULL(&sc->sc_ifp->if_snd))
+ _IF_ENQUEUE(&sc->sc_ifp->if_snd, m);
+ else {
+ m_freem(m);
+ sc->sc_ifp->if_snd.ifq_drops++;
+ }
+ if (schedswi)
+ swi_sched(V_pfsync_swi_cookie, 0);
+}
+
+static void
+pfsync_insert_state(struct pf_state *st)
+{
+ struct pfsync_softc *sc = V_pfsyncif;
+
+ if (st->state_flags & PFSTATE_NOSYNC)
+ return;
+
+ if ((st->rule.ptr->rule_flag & PFRULE_NOSYNC) ||
+ st->key[PF_SK_WIRE]->proto == IPPROTO_PFSYNC) {
+ st->state_flags |= PFSTATE_NOSYNC;
+ return;
+ }
+
+ KASSERT(st->sync_state == PFSYNC_S_NONE,
+ ("%s: st->sync_state == PFSYNC_S_NONE", __func__));
+
+ PFSYNC_LOCK(sc);
+ if (sc->sc_len == PFSYNC_MINPKT)
+ callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif);
+
+ pfsync_q_ins(st, PFSYNC_S_INS);
+ PFSYNC_UNLOCK(sc);
+
+ st->sync_updates = 0;
+}
+
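+/*
+ * Defer transmission of the packet that created a new state until the
+ * peer acknowledges the state insert or a short timeout fires, so the
+ * peer learns about the state before traffic flows through it.
+ */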
+static int
+pfsync_defer(struct pf_state *st, struct mbuf *m)
+{
+ struct pfsync_softc *sc = V_pfsyncif;
+ struct pfsync_deferral *pd;
+
+ if (m->m_flags & (M_BCAST|M_MCAST))
+ return (0);
+
+ PFSYNC_LOCK(sc);
+
+ if (sc == NULL || !(sc->sc_ifp->if_flags & IFF_DRV_RUNNING) ||
+ !(sc->sc_flags & PFSYNCF_DEFER)) {
+ PFSYNC_UNLOCK(sc);
+ return (0);
+ }
+
+ if (sc->sc_deferred >= 128)
+ pfsync_undefer(TAILQ_FIRST(&sc->sc_deferrals), 0);
+
+ pd = malloc(sizeof(*pd), M_PFSYNC, M_NOWAIT);
+ if (pd == NULL)
+ return (0);
+ sc->sc_deferred++;
+
+ m->m_flags |= M_SKIP_FIREWALL;
+ st->state_flags |= PFSTATE_ACK;
+
+ pd->pd_sc = sc;
+ pd->pd_refs = 0;
+ pd->pd_st = st;
+ pf_ref_state(st);
+ pd->pd_m = m;
+
+ TAILQ_INSERT_TAIL(&sc->sc_deferrals, pd, pd_entry);
+ callout_init_mtx(&pd->pd_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED);
+ callout_reset(&pd->pd_tmo, 10, pfsync_defer_tmo, pd);
+
+ pfsync_push(sc);
+
+ return (1);
+}
+
+static void
+pfsync_undefer(struct pfsync_deferral *pd, int drop)
+{
+ struct pfsync_softc *sc = pd->pd_sc;
+ struct mbuf *m = pd->pd_m;
+ struct pf_state *st = pd->pd_st;
+
+ PFSYNC_LOCK_ASSERT(sc);
+
+ TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
+ sc->sc_deferred--;
+ pd->pd_st->state_flags &= ~PFSTATE_ACK; /* XXX: locking! */
+ free(pd, M_PFSYNC);
+ pf_release_state(st);
+
+ if (drop)
+ m_freem(m);
+ else {
+ _IF_ENQUEUE(&sc->sc_ifp->if_snd, m);
+ pfsync_push(sc);
+ }
+}
+
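+/*
+ * Deferral timeout: the peer did not acknowledge the state in time, so
+ * transmit the deferred packet ourselves and drop our state reference.
+ */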
+static void
+pfsync_defer_tmo(void *arg)
+{
+ struct pfsync_deferral *pd = arg;
+ struct pfsync_softc *sc = pd->pd_sc;
+ struct mbuf *m = pd->pd_m;
+ struct pf_state *st = pd->pd_st;
+
+ PFSYNC_LOCK_ASSERT(sc);
+
+ CURVNET_SET(m->m_pkthdr.rcvif->if_vnet);
+
+ TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
+ sc->sc_deferred--;
+ pd->pd_st->state_flags &= ~PFSTATE_ACK; /* XXX: locking! */
+ if (pd->pd_refs == 0)
+ free(pd, M_PFSYNC);
+ PFSYNC_UNLOCK(sc);
+
+ ip_output(m, NULL, NULL, 0, NULL, NULL);
+
+ pf_release_state(st);
+
+ CURVNET_RESTORE();
+}
+
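+/*
+ * Cancel the deferral for the given state, either dropping the deferred
+ * packet or sending it on immediately.
+ */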
+static void
+pfsync_undefer_state(struct pf_state *st, int drop)
+{
+ struct pfsync_softc *sc = V_pfsyncif;
+ struct pfsync_deferral *pd;
+
+ PFSYNC_LOCK_ASSERT(sc);
+
+ TAILQ_FOREACH(pd, &sc->sc_deferrals, pd_entry) {
+ if (pd->pd_st == st) {
+ if (callout_stop(&pd->pd_tmo))
+ pfsync_undefer(pd, drop);
+ return;
+ }
+ }
+
+ panic("%s: unable to find deferred state", __func__);
+}
+
+static void
+pfsync_update_state(struct pf_state *st)
+{
+ struct pfsync_softc *sc = V_pfsyncif;
+ int sync = 0;
+
+ PF_STATE_LOCK_ASSERT(st);
+ PFSYNC_LOCK(sc);
+
+ if (st->state_flags & PFSTATE_ACK)
+ pfsync_undefer_state(st, 0);
+ if (st->state_flags & PFSTATE_NOSYNC) {
+ if (st->sync_state != PFSYNC_S_NONE)
+ pfsync_q_del(st);
+ PFSYNC_UNLOCK(sc);
+ return;
+ }
+
+ if (sc->sc_len == PFSYNC_MINPKT)
+ callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif);
+
+ switch (st->sync_state) {
+ case PFSYNC_S_UPD_C:
+ case PFSYNC_S_UPD:
+ case PFSYNC_S_INS:
+ /* we're already handling it */
+
+ if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) {
+ st->sync_updates++;
+ if (st->sync_updates >= sc->sc_maxupdates)
+ sync = 1;
+ }
+ break;
+
+ case PFSYNC_S_IACK:
+ pfsync_q_del(st);
+ case PFSYNC_S_NONE:
+ pfsync_q_ins(st, PFSYNC_S_UPD_C);
+ st->sync_updates = 0;
+ break;
+
+ default:
+ panic("%s: unexpected sync state %d", __func__, st->sync_state);
+ }
+
+ if (sync || (time_uptime - st->pfsync_time) < 2)
+ pfsync_push(sc);
+
+ PFSYNC_UNLOCK(sc);
+}
+
+static void
+pfsync_request_update(u_int32_t creatorid, u_int64_t id)
+{
+ struct pfsync_softc *sc = V_pfsyncif;
+ struct pfsync_upd_req_item *item;
+ size_t nlen = sizeof(struct pfsync_upd_req);
+
+ PFSYNC_LOCK_ASSERT(sc);
+
+	/*
+	 * This code does nothing to prevent multiple update requests for the
+	 * same state from being generated.
+	 */
+ item = malloc(sizeof(*item), M_PFSYNC, M_NOWAIT);
+ if (item == NULL)
+ return; /* XXX stats */
+
+ item->ur_msg.id = id;
+ item->ur_msg.creatorid = creatorid;
+
+ if (TAILQ_EMPTY(&sc->sc_upd_req_list))
+ nlen += sizeof(struct pfsync_subheader);
+
+ if (sc->sc_len + nlen > sc->sc_ifp->if_mtu) {
+ pfsync_sendout(1);
+
+ nlen = sizeof(struct pfsync_subheader) +
+ sizeof(struct pfsync_upd_req);
+ }
+
+ TAILQ_INSERT_TAIL(&sc->sc_upd_req_list, item, ur_entry);
+ sc->sc_len += nlen;
+
+ pfsync_push(sc);
+}
+
+static void
+pfsync_update_state_req(struct pf_state *st)
+{
+ struct pfsync_softc *sc = V_pfsyncif;
+
+ PF_STATE_LOCK_ASSERT(st);
+ PFSYNC_LOCK(sc);
+
+ if (st->state_flags & PFSTATE_NOSYNC) {
+ if (st->sync_state != PFSYNC_S_NONE)
+ pfsync_q_del(st);
+ PFSYNC_UNLOCK(sc);
+ return;
+ }
+
+ switch (st->sync_state) {
+ case PFSYNC_S_UPD_C:
+ case PFSYNC_S_IACK:
+ pfsync_q_del(st);
+ case PFSYNC_S_NONE:
+ pfsync_q_ins(st, PFSYNC_S_UPD);
+ pfsync_push(sc);
+ break;
+
+ case PFSYNC_S_INS:
+ case PFSYNC_S_UPD:
+ case PFSYNC_S_DEL:
+ /* we're already handling it */
+ break;
+
+ default:
+ panic("%s: unexpected sync state %d", __func__, st->sync_state);
+ }
+
+ PFSYNC_UNLOCK(sc);
+}
+
+static void
+pfsync_delete_state(struct pf_state *st)
+{
+ struct pfsync_softc *sc = V_pfsyncif;
+
+ PFSYNC_LOCK(sc);
+ if (st->state_flags & PFSTATE_ACK)
+ pfsync_undefer_state(st, 1);
+ if (st->state_flags & PFSTATE_NOSYNC) {
+ if (st->sync_state != PFSYNC_S_NONE)
+ pfsync_q_del(st);
+ PFSYNC_UNLOCK(sc);
+ return;
+ }
+
+ if (sc->sc_len == PFSYNC_MINPKT)
+ callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif);
+
+ switch (st->sync_state) {
+ case PFSYNC_S_INS:
+ /* We never got to tell the world so just forget about it. */
+ pfsync_q_del(st);
+ break;
+
+ case PFSYNC_S_UPD_C:
+ case PFSYNC_S_UPD:
+ case PFSYNC_S_IACK:
+ pfsync_q_del(st);
+ /* FALLTHROUGH to putting it on the del list */
+
+ case PFSYNC_S_NONE:
+ pfsync_q_ins(st, PFSYNC_S_DEL);
+ break;
+
+ default:
+ panic("%s: unexpected sync state %d", __func__, st->sync_state);
+ }
+ PFSYNC_UNLOCK(sc);
+}
+
+static void
+pfsync_clear_states(u_int32_t creatorid, const char *ifname)
+{
+ struct pfsync_softc *sc = V_pfsyncif;
+ struct {
+ struct pfsync_subheader subh;
+ struct pfsync_clr clr;
+ } __packed r;
+
+ bzero(&r, sizeof(r));
+
+ r.subh.action = PFSYNC_ACT_CLR;
+ r.subh.count = htons(1);
+ V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_CLR]++;
+
+ strlcpy(r.clr.ifname, ifname, sizeof(r.clr.ifname));
+ r.clr.creatorid = creatorid;
+
+ PFSYNC_LOCK(sc);
+ pfsync_send_plus(&r, sizeof(r));
+ PFSYNC_UNLOCK(sc);
+}
+
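+/*
+ * Put a state on one of the per-action transmit queues, growing the
+ * pending packet length (plus a subheader when the queue was empty) and
+ * holding a reference on the state until it is sent.
+ */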
+static void
+pfsync_q_ins(struct pf_state *st, int q)
+{
+ struct pfsync_softc *sc = V_pfsyncif;
+ size_t nlen = pfsync_qs[q].len;
+
+ PFSYNC_LOCK_ASSERT(sc);
+
+ KASSERT(st->sync_state == PFSYNC_S_NONE,
+ ("%s: st->sync_state == PFSYNC_S_NONE", __func__));
+ KASSERT(sc->sc_len >= PFSYNC_MINPKT, ("pfsync pkt len is too low %zu",
+ sc->sc_len));
+
+ if (TAILQ_EMPTY(&sc->sc_qs[q]))
+ nlen += sizeof(struct pfsync_subheader);
+
+ if (sc->sc_len + nlen > sc->sc_ifp->if_mtu) {
+ pfsync_sendout(1);
+
+ nlen = sizeof(struct pfsync_subheader) + pfsync_qs[q].len;
+ }
+
+ sc->sc_len += nlen;
+ TAILQ_INSERT_TAIL(&sc->sc_qs[q], st, sync_list);
+ st->sync_state = q;
+ pf_ref_state(st);
+}
+
+static void
+pfsync_q_del(struct pf_state *st)
+{
+ struct pfsync_softc *sc = V_pfsyncif;
+ int q = st->sync_state;
+
+ PFSYNC_LOCK_ASSERT(sc);
+ KASSERT(st->sync_state != PFSYNC_S_NONE,
+ ("%s: st->sync_state != PFSYNC_S_NONE", __func__));
+
+ sc->sc_len -= pfsync_qs[q].len;
+ TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list);
+ st->sync_state = PFSYNC_S_NONE;
+ pf_release_state(st);
+
+ if (TAILQ_EMPTY(&sc->sc_qs[q]))
+ sc->sc_len -= sizeof(struct pfsync_subheader);
+}
+
+static void
+pfsync_bulk_start(void)
+{
+ struct pfsync_softc *sc = V_pfsyncif;
+
+ if (V_pf_status.debug >= PF_DEBUG_MISC)
+ printf("pfsync: received bulk update request\n");
+
+ PFSYNC_BLOCK(sc);
+
+ sc->sc_ureq_received = time_uptime;
+ sc->sc_bulk_hashid = 0;
+ sc->sc_bulk_stateid = 0;
+ pfsync_bulk_status(PFSYNC_BUS_START);
+ callout_reset(&sc->sc_bulk_tmo, 1, pfsync_bulk_update, sc);
+ PFSYNC_BUNLOCK(sc);
+}
+
+static void
+pfsync_bulk_update(void *arg)
+{
+ struct pfsync_softc *sc = arg;
+ struct pf_state *s;
+ int i, sent = 0;
+
+ PFSYNC_BLOCK_ASSERT(sc);
+ CURVNET_SET(sc->sc_ifp->if_vnet);
+
+	/*
+	 * Start with the last state from the previous invocation.
+	 * It may have gone away, in which case start from the
+	 * hash slot.
+	 */
+ s = pf_find_state_byid(sc->sc_bulk_stateid, sc->sc_bulk_creatorid);
+
+ if (s != NULL)
+ i = PF_IDHASH(s);
+ else
+ i = sc->sc_bulk_hashid;
+
+ for (; i <= V_pf_hashmask; i++) {
+ struct pf_idhash *ih = &V_pf_idhash[i];
+
+ if (s != NULL)
+ PF_HASHROW_ASSERT(ih);
+ else {
+ PF_HASHROW_LOCK(ih);
+ s = LIST_FIRST(&ih->states);
+ }
+
+ for (; s; s = LIST_NEXT(s, entry)) {
+
+ if (sent > 1 && (sc->sc_ifp->if_mtu - sc->sc_len) <
+ sizeof(struct pfsync_state)) {
+ /* We've filled a packet. */
+ sc->sc_bulk_hashid = i;
+ sc->sc_bulk_stateid = s->id;
+ sc->sc_bulk_creatorid = s->creatorid;
+ PF_HASHROW_UNLOCK(ih);
+ callout_reset(&sc->sc_bulk_tmo, 1,
+ pfsync_bulk_update, sc);
+ goto full;
+ }
+
+ if (s->sync_state == PFSYNC_S_NONE &&
+ s->timeout < PFTM_MAX &&
+ s->pfsync_time <= sc->sc_ureq_received) {
+ PFSYNC_LOCK(sc);
+ pfsync_update_state_req(s);
+ PFSYNC_UNLOCK(sc);
+ sent++;
+ }
+ }
+ PF_HASHROW_UNLOCK(ih);
+ }
+
+ /* We're done. */
+ pfsync_bulk_status(PFSYNC_BUS_END);
+
+full:
+ CURVNET_RESTORE();
+}
+
+static void
+pfsync_bulk_status(u_int8_t status)
+{
+ struct {
+ struct pfsync_subheader subh;
+ struct pfsync_bus bus;
+ } __packed r;
+
+ struct pfsync_softc *sc = V_pfsyncif;
+
+ bzero(&r, sizeof(r));
+
+ r.subh.action = PFSYNC_ACT_BUS;
+ r.subh.count = htons(1);
+ V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_BUS]++;
+
+ r.bus.creatorid = V_pf_status.hostid;
+ r.bus.endtime = htonl(time_uptime - sc->sc_ureq_received);
+ r.bus.status = status;
+
+ PFSYNC_LOCK(sc);
+ pfsync_send_plus(&r, sizeof(r));
+ PFSYNC_UNLOCK(sc);
+}
+
+static void
+pfsync_bulk_fail(void *arg)
+{
+ struct pfsync_softc *sc = arg;
+
+ CURVNET_SET(sc->sc_ifp->if_vnet);
+
+ PFSYNC_BLOCK_ASSERT(sc);
+
+ if (sc->sc_bulk_tries++ < PFSYNC_MAX_BULKTRIES) {
+ /* Try again */
+ callout_reset(&sc->sc_bulkfail_tmo, 5 * hz,
+ pfsync_bulk_fail, V_pfsyncif);
+ PFSYNC_LOCK(sc);
+ pfsync_request_update(0, 0);
+ PFSYNC_UNLOCK(sc);
+ } else {
+ /* Pretend like the transfer was ok. */
+ sc->sc_ureq_sent = 0;
+ sc->sc_bulk_tries = 0;
+ PFSYNC_LOCK(sc);
+ if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
+ (*carp_demote_adj_p)(-V_pfsync_carp_adj,
+ "pfsync bulk fail");
+ sc->sc_flags |= PFSYNCF_OK;
+ PFSYNC_UNLOCK(sc);
+ if (V_pf_status.debug >= PF_DEBUG_MISC)
+ printf("pfsync: failed to receive bulk update\n");
+ }
+
+ CURVNET_RESTORE();
+}
+
+static void
+pfsync_send_plus(void *plus, size_t pluslen)
+{
+ struct pfsync_softc *sc = V_pfsyncif;
+
+ PFSYNC_LOCK_ASSERT(sc);
+
+ if (sc->sc_len + pluslen > sc->sc_ifp->if_mtu)
+ pfsync_sendout(1);
+
+ sc->sc_plus = plus;
+ sc->sc_len += (sc->sc_pluslen = pluslen);
+
+ pfsync_sendout(1);
+}
+
+static void
+pfsync_timeout(void *arg)
+{
+ struct pfsync_softc *sc = arg;
+
+ CURVNET_SET(sc->sc_ifp->if_vnet);
+ PFSYNC_LOCK(sc);
+ pfsync_push(sc);
+ PFSYNC_UNLOCK(sc);
+ CURVNET_RESTORE();
+}
+
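+/* Ask the pfsync software interrupt to flush whatever is queued. */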
+static void
+pfsync_push(struct pfsync_softc *sc)
+{
+
+ PFSYNC_LOCK_ASSERT(sc);
+
+ sc->sc_flags |= PFSYNCF_PUSH;
+ swi_sched(V_pfsync_swi_cookie, 0);
+}
+
+static void
+pfsyncintr(void *arg)
+{
+ struct pfsync_softc *sc = arg;
+ struct mbuf *m, *n;
+
+ CURVNET_SET(sc->sc_ifp->if_vnet);
+
+ PFSYNC_LOCK(sc);
+ if ((sc->sc_flags & PFSYNCF_PUSH) && sc->sc_len > PFSYNC_MINPKT) {
+ pfsync_sendout(0);
+ sc->sc_flags &= ~PFSYNCF_PUSH;
+ }
+ _IF_DEQUEUE_ALL(&sc->sc_ifp->if_snd, m);
+ PFSYNC_UNLOCK(sc);
+
+ for (; m != NULL; m = n) {
+
+ n = m->m_nextpkt;
+ m->m_nextpkt = NULL;
+
+		/*
+		 * We distinguish between a deferral packet and our
+		 * own pfsync packet based on the M_SKIP_FIREWALL
+		 * flag. This is XXX.
+		 */
+ if (m->m_flags & M_SKIP_FIREWALL)
+ ip_output(m, NULL, NULL, 0, NULL, NULL);
+ else if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_imo,
+ NULL) == 0)
+ V_pfsyncstats.pfsyncs_opackets++;
+ else
+ V_pfsyncstats.pfsyncs_oerrors++;
+ }
+ CURVNET_RESTORE();
+}
+
+static int
+pfsync_multicast_setup(struct pfsync_softc *sc, struct ifnet *ifp, void *mship)
+{
+ struct ip_moptions *imo = &sc->sc_imo;
+ int error;
+
+ if (!(ifp->if_flags & IFF_MULTICAST))
+ return (EADDRNOTAVAIL);
+
+ imo->imo_membership = (struct in_multi **)mship;
+ imo->imo_max_memberships = IP_MIN_MEMBERSHIPS;
+ imo->imo_multicast_vif = -1;
+
+ if ((error = in_joingroup(ifp, &sc->sc_sync_peer, NULL,
+ &imo->imo_membership[0])) != 0) {
+ imo->imo_membership = NULL;
+ return (error);
+ }
+ imo->imo_num_memberships++;
+ imo->imo_multicast_ifp = ifp;
+ imo->imo_multicast_ttl = PFSYNC_DFLTTL;
+ imo->imo_multicast_loop = 0;
+
+ return (0);
+}
+
+static void
+pfsync_multicast_cleanup(struct pfsync_softc *sc)
+{
+ struct ip_moptions *imo = &sc->sc_imo;
+
+ in_leavegroup(imo->imo_membership[0], NULL);
+ free(imo->imo_membership, M_PFSYNC);
+ imo->imo_membership = NULL;
+ imo->imo_multicast_ifp = NULL;
+}
+
+#ifdef INET
+extern struct domain inetdomain;
+static struct protosw in_pfsync_protosw = {
+ .pr_type = SOCK_RAW,
+ .pr_domain = &inetdomain,
+ .pr_protocol = IPPROTO_PFSYNC,
+ .pr_flags = PR_ATOMIC|PR_ADDR,
+ .pr_input = pfsync_input,
+ .pr_output = (pr_output_t *)rip_output,
+ .pr_ctloutput = rip_ctloutput,
+ .pr_usrreqs = &rip_usrreqs
+};
+#endif
+
+static int
+pfsync_init()
+{
+ VNET_ITERATOR_DECL(vnet_iter);
+ int error = 0;
+
+ VNET_LIST_RLOCK();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET(vnet_iter);
+ V_pfsync_cloner = pfsync_cloner;
+ V_pfsync_cloner_data = pfsync_cloner_data;
+ V_pfsync_cloner.ifc_data = &V_pfsync_cloner_data;
+ if_clone_attach(&V_pfsync_cloner);
+ error = swi_add(NULL, "pfsync", pfsyncintr, V_pfsyncif,
+ SWI_NET, INTR_MPSAFE, &V_pfsync_swi_cookie);
+ CURVNET_RESTORE();
+ if (error)
+ goto fail_locked;
+ }
+ VNET_LIST_RUNLOCK();
+#ifdef INET
+ error = pf_proto_register(PF_INET, &in_pfsync_protosw);
+ if (error)
+ goto fail;
+ error = ipproto_register(IPPROTO_PFSYNC);
+ if (error) {
+ pf_proto_unregister(PF_INET, IPPROTO_PFSYNC, SOCK_RAW);
+ goto fail;
+ }
+#endif
+ PF_RULES_WLOCK();
+ pfsync_state_import_ptr = pfsync_state_import;
+ pfsync_insert_state_ptr = pfsync_insert_state;
+ pfsync_update_state_ptr = pfsync_update_state;
+ pfsync_delete_state_ptr = pfsync_delete_state;
+ pfsync_clear_states_ptr = pfsync_clear_states;
+ pfsync_defer_ptr = pfsync_defer;
+ PF_RULES_WUNLOCK();
+
+ return (0);
+
+fail:
+ VNET_LIST_RLOCK();
+fail_locked:
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET(vnet_iter);
+ if (V_pfsync_swi_cookie) {
+ swi_remove(V_pfsync_swi_cookie);
+ if_clone_detach(&V_pfsync_cloner);
+ }
+ CURVNET_RESTORE();
+ }
+ VNET_LIST_RUNLOCK();
+
+ return (error);
+}
+
+static void
+pfsync_uninit()
+{
+ VNET_ITERATOR_DECL(vnet_iter);
+
+ PF_RULES_WLOCK();
+ pfsync_state_import_ptr = NULL;
+ pfsync_insert_state_ptr = NULL;
+ pfsync_update_state_ptr = NULL;
+ pfsync_delete_state_ptr = NULL;
+ pfsync_clear_states_ptr = NULL;
+ pfsync_defer_ptr = NULL;
+ PF_RULES_WUNLOCK();
+
+ ipproto_unregister(IPPROTO_PFSYNC);
+ pf_proto_unregister(PF_INET, IPPROTO_PFSYNC, SOCK_RAW);
+ VNET_LIST_RLOCK();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET(vnet_iter);
+ if_clone_detach(&V_pfsync_cloner);
+ swi_remove(V_pfsync_swi_cookie);
+ CURVNET_RESTORE();
+ }
+ VNET_LIST_RUNLOCK();
+}
+
+static int
+pfsync_modevent(module_t mod, int type, void *data)
+{
+ int error = 0;
+
+ switch (type) {
+ case MOD_LOAD:
+ error = pfsync_init();
+ break;
+ case MOD_QUIESCE:
+ /*
+ * Module should not be unloaded due to race conditions.
+ */
+ error = EPERM;
+ break;
+ case MOD_UNLOAD:
+ pfsync_uninit();
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ return (error);
+}
+
+static moduledata_t pfsync_mod = {
+ "pfsync",
+ pfsync_modevent,
+ 0
+};
+
+#define PFSYNC_MODVER 1
+
+DECLARE_MODULE(pfsync, pfsync_mod, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
+MODULE_VERSION(pfsync, PFSYNC_MODVER);
+MODULE_DEPEND(pfsync, pf, PF_MODVER, PF_MODVER, PF_MODVER);
diff --git a/sys/netpfil/pf/in4_cksum.c b/sys/netpfil/pf/in4_cksum.c
new file mode 100644
index 0000000..bf25baf
--- /dev/null
+++ b/sys/netpfil/pf/in4_cksum.c
@@ -0,0 +1,120 @@
+/* $FreeBSD$ */
+/* $OpenBSD: in4_cksum.c,v 1.7 2003/06/02 23:28:13 millert Exp $ */
+/* $KAME: in4_cksum.c,v 1.10 2001/11/30 10:06:15 itojun Exp $ */
+/* $NetBSD: in_cksum.c,v 1.13 1996/10/13 02:03:03 christos Exp $ */
+
+/*
+ * Copyright (C) 1999 WIDE Project.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the project nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 1988, 1992, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)in_cksum.c 8.1 (Berkeley) 6/10/93
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/mbuf.h>
+
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+
+#include <machine/in_cksum.h>
+
+#define ADDCARRY(x) (x > 65535 ? x -= 65535 : x)
+#define REDUCE {l_util.l = sum; sum = l_util.s[0] + l_util.s[1]; (void)ADDCARRY(sum);}
+
+int in4_cksum(struct mbuf *, u_int8_t, int, int);
+
+int
+in4_cksum(struct mbuf *m, u_int8_t nxt, int off, int len)
+{
+ union {
+ struct ipovly ipov;
+ u_int16_t w[10];
+ } u;
+ union {
+ u_int16_t s[2];
+ u_int32_t l;
+ } l_util;
+
+ u_int16_t *w;
+ int psum;
+ int sum = 0;
+
+ if (nxt != 0) {
+ /* pseudo header */
+ if (off < sizeof(struct ipovly))
+ panic("in4_cksum: offset too short");
+ if (m->m_len < sizeof(struct ip))
+ panic("in4_cksum: bad mbuf chain");
+ bzero(&u.ipov, sizeof(u.ipov));
+ u.ipov.ih_len = htons(len);
+ u.ipov.ih_pr = nxt;
+ u.ipov.ih_src = mtod(m, struct ip *)->ip_src;
+ u.ipov.ih_dst = mtod(m, struct ip *)->ip_dst;
+ w = u.w;
+ /* assumes sizeof(ipov) == 20 */
+ sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3]; sum += w[4];
+ sum += w[5]; sum += w[6]; sum += w[7]; sum += w[8]; sum += w[9];
+ }
+
+ psum = in_cksum_skip(m, len + off, off);
+ psum = ~psum & 0xffff;
+ sum += psum;
+ REDUCE;
+ return (~sum & 0xffff);
+}
diff --git a/sys/netpfil/pf/pf.c b/sys/netpfil/pf/pf.c
new file mode 100644
index 0000000..a61b87b
--- /dev/null
+++ b/sys/netpfil/pf/pf.c
@@ -0,0 +1,6271 @@
+/* $OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $ */
+
+/*
+ * Copyright (c) 2001 Daniel Hartmeier
+ * Copyright (c) 2002 - 2008 Henning Brauer
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Effort sponsored in part by the Defense Advanced Research Projects
+ * Agency (DARPA) and Air Force Research Laboratory, Air Force
+ * Materiel Command, USAF, under agreement number F30602-01-2-0537.
+ *
+ */
+
+#include <sys/cdefs.h>
+
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_bpf.h"
+#include "opt_pf.h"
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/endian.h>
+#include <sys/hash.h>
+#include <sys/interrupt.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/limits.h>
+#include <sys/mbuf.h>
+#include <sys/md5.h>
+#include <sys/random.h>
+#include <sys/refcount.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/taskqueue.h>
+#include <sys/ucred.h>
+
+#include <net/if.h>
+#include <net/if_types.h>
+#include <net/route.h>
+#include <net/radix_mpath.h>
+#include <net/vnet.h>
+
+#include <net/pfvar.h>
+#include <net/pf_mtag.h>
+#include <net/if_pflog.h>
+#include <net/if_pfsync.h>
+
+#include <netinet/in_pcb.h>
+#include <netinet/in_var.h>
+#include <netinet/ip.h>
+#include <netinet/ip_fw.h>
+#include <netinet/ip_icmp.h>
+#include <netinet/icmp_var.h>
+#include <netinet/ip_var.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#include <netinet/udp.h>
+#include <netinet/udp_var.h>
+
+#include <netpfil/ipfw/ip_fw_private.h> /* XXX: only for DIR_IN/DIR_OUT */
+
+#ifdef INET6
+#include <netinet/ip6.h>
+#include <netinet/icmp6.h>
+#include <netinet6/nd6.h>
+#include <netinet6/ip6_var.h>
+#include <netinet6/in6_pcb.h>
+#endif /* INET6 */
+
+#include <machine/in_cksum.h>
+#include <security/mac/mac_framework.h>
+
+#define DPFPRINTF(n, x) if (V_pf_status.debug >= (n)) printf x
+
+/*
+ * Global variables
+ */
+
+/* state tables */
+VNET_DEFINE(struct pf_altqqueue, pf_altqs[2]);
+VNET_DEFINE(struct pf_palist, pf_pabuf);
+VNET_DEFINE(struct pf_altqqueue *, pf_altqs_active);
+VNET_DEFINE(struct pf_altqqueue *, pf_altqs_inactive);
+VNET_DEFINE(struct pf_status, pf_status);
+
+VNET_DEFINE(u_int32_t, ticket_altqs_active);
+VNET_DEFINE(u_int32_t, ticket_altqs_inactive);
+VNET_DEFINE(int, altqs_inactive_open);
+VNET_DEFINE(u_int32_t, ticket_pabuf);
+
+VNET_DEFINE(MD5_CTX, pf_tcp_secret_ctx);
+#define V_pf_tcp_secret_ctx VNET(pf_tcp_secret_ctx)
+VNET_DEFINE(u_char, pf_tcp_secret[16]);
+#define V_pf_tcp_secret VNET(pf_tcp_secret)
+VNET_DEFINE(int, pf_tcp_secret_init);
+#define V_pf_tcp_secret_init VNET(pf_tcp_secret_init)
+VNET_DEFINE(int, pf_tcp_iss_off);
+#define V_pf_tcp_iss_off VNET(pf_tcp_iss_off)
+
+struct pf_anchor_stackframe {
+ struct pf_ruleset *rs;
+ struct pf_rule *r;
+ struct pf_anchor_node *parent;
+ struct pf_anchor *child;
+};
+VNET_DEFINE(struct pf_anchor_stackframe, pf_anchor_stack[64]);
+#define V_pf_anchor_stack VNET(pf_anchor_stack)
+
+/*
+ * Queue for pf_intr() sends.
+ */
+static MALLOC_DEFINE(M_PFTEMP, "pf_temp", "pf(4) temporary allocations");
+struct pf_send_entry {
+ STAILQ_ENTRY(pf_send_entry) pfse_next;
+ struct mbuf *pfse_m;
+ enum {
+ PFSE_IP,
+ PFSE_IP6,
+ PFSE_ICMP,
+ PFSE_ICMP6,
+ } pfse_type;
+ union {
+ struct route ro;
+ struct {
+ int type;
+ int code;
+ int mtu;
+ } icmpopts;
+ } u;
+#define pfse_ro u.ro
+#define pfse_icmp_type u.icmpopts.type
+#define pfse_icmp_code u.icmpopts.code
+#define pfse_icmp_mtu u.icmpopts.mtu
+};
+
+STAILQ_HEAD(pf_send_head, pf_send_entry);
+static VNET_DEFINE(struct pf_send_head, pf_sendqueue);
+#define V_pf_sendqueue VNET(pf_sendqueue)
+
+static struct mtx pf_sendqueue_mtx;
+#define PF_SENDQ_LOCK() mtx_lock(&pf_sendqueue_mtx)
+#define PF_SENDQ_UNLOCK() mtx_unlock(&pf_sendqueue_mtx)
+
+/*
+ * Queue for pf_flush_task() tasks.
+ */
+struct pf_flush_entry {
+ SLIST_ENTRY(pf_flush_entry) next;
+ struct pf_addr addr;
+ sa_family_t af;
+ uint8_t dir;
+ struct pf_rule *rule; /* never dereferenced */
+};
+
+SLIST_HEAD(pf_flush_head, pf_flush_entry);
+static VNET_DEFINE(struct pf_flush_head, pf_flushqueue);
+#define V_pf_flushqueue VNET(pf_flushqueue)
+static VNET_DEFINE(struct task, pf_flushtask);
+#define V_pf_flushtask VNET(pf_flushtask)
+
+static struct mtx pf_flushqueue_mtx;
+#define PF_FLUSHQ_LOCK() mtx_lock(&pf_flushqueue_mtx)
+#define PF_FLUSHQ_UNLOCK() mtx_unlock(&pf_flushqueue_mtx)
+
+VNET_DEFINE(struct pf_rulequeue, pf_unlinked_rules);
+struct mtx pf_unlnkdrules_mtx;
+
+static VNET_DEFINE(uma_zone_t, pf_sources_z);
+#define V_pf_sources_z VNET(pf_sources_z)
+static VNET_DEFINE(uma_zone_t, pf_mtag_z);
+#define V_pf_mtag_z VNET(pf_mtag_z)
+VNET_DEFINE(uma_zone_t, pf_state_z);
+VNET_DEFINE(uma_zone_t, pf_state_key_z);
+
+VNET_DEFINE(uint64_t, pf_stateid[MAXCPU]);
+#define PFID_CPUBITS 8
+#define PFID_CPUSHIFT (sizeof(uint64_t) * NBBY - PFID_CPUBITS)
+#define PFID_CPUMASK ((uint64_t)((1 << PFID_CPUBITS) - 1) << PFID_CPUSHIFT)
+#define PFID_MAXID (~PFID_CPUMASK)
+CTASSERT((1 << PFID_CPUBITS) > MAXCPU);
+
+static void pf_src_tree_remove_state(struct pf_state *);
+static void pf_init_threshold(struct pf_threshold *, u_int32_t,
+ u_int32_t);
+static void pf_add_threshold(struct pf_threshold *);
+static int pf_check_threshold(struct pf_threshold *);
+
+static void pf_change_ap(struct pf_addr *, u_int16_t *,
+ u_int16_t *, u_int16_t *, struct pf_addr *,
+ u_int16_t, u_int8_t, sa_family_t);
+static int pf_modulate_sack(struct mbuf *, int, struct pf_pdesc *,
+ struct tcphdr *, struct pf_state_peer *);
+static void pf_change_icmp(struct pf_addr *, u_int16_t *,
+ struct pf_addr *, struct pf_addr *, u_int16_t,
+ u_int16_t *, u_int16_t *, u_int16_t *,
+ u_int16_t *, u_int8_t, sa_family_t);
+static void pf_send_tcp(struct mbuf *,
+ const struct pf_rule *, sa_family_t,
+ const struct pf_addr *, const struct pf_addr *,
+ u_int16_t, u_int16_t, u_int32_t, u_int32_t,
+ u_int8_t, u_int16_t, u_int16_t, u_int8_t, int,
+ u_int16_t, struct ifnet *);
+static void pf_send_icmp(struct mbuf *, u_int8_t, u_int8_t,
+ sa_family_t, struct pf_rule *);
+static void pf_detach_state(struct pf_state *);
+static int pf_state_key_attach(struct pf_state_key *,
+ struct pf_state_key *, struct pf_state *);
+static void pf_state_key_detach(struct pf_state *, int);
+static int pf_state_key_ctor(void *, int, void *, int);
+static u_int32_t pf_tcp_iss(struct pf_pdesc *);
+static int pf_test_rule(struct pf_rule **, struct pf_state **,
+ int, struct pfi_kif *, struct mbuf *, int,
+ struct pf_pdesc *, struct pf_rule **,
+ struct pf_ruleset **, struct inpcb *);
+static int pf_create_state(struct pf_rule *, struct pf_rule *,
+ struct pf_rule *, struct pf_pdesc *,
+ struct pf_src_node *, struct pf_state_key *,
+ struct pf_state_key *, struct mbuf *, int,
+ u_int16_t, u_int16_t, int *, struct pfi_kif *,
+ struct pf_state **, int, u_int16_t, u_int16_t,
+ int);
+static int pf_test_fragment(struct pf_rule **, int,
+ struct pfi_kif *, struct mbuf *, void *,
+ struct pf_pdesc *, struct pf_rule **,
+ struct pf_ruleset **);
+static int pf_tcp_track_full(struct pf_state_peer *,
+ struct pf_state_peer *, struct pf_state **,
+ struct pfi_kif *, struct mbuf *, int,
+ struct pf_pdesc *, u_short *, int *);
+static int pf_tcp_track_sloppy(struct pf_state_peer *,
+ struct pf_state_peer *, struct pf_state **,
+ struct pf_pdesc *, u_short *);
+static int pf_test_state_tcp(struct pf_state **, int,
+ struct pfi_kif *, struct mbuf *, int,
+ void *, struct pf_pdesc *, u_short *);
+static int pf_test_state_udp(struct pf_state **, int,
+ struct pfi_kif *, struct mbuf *, int,
+ void *, struct pf_pdesc *);
+static int pf_test_state_icmp(struct pf_state **, int,
+ struct pfi_kif *, struct mbuf *, int,
+ void *, struct pf_pdesc *, u_short *);
+static int pf_test_state_other(struct pf_state **, int,
+ struct pfi_kif *, struct mbuf *, struct pf_pdesc *);
+static u_int8_t pf_get_wscale(struct mbuf *, int, u_int16_t,
+ sa_family_t);
+static u_int16_t pf_get_mss(struct mbuf *, int, u_int16_t,
+ sa_family_t);
+static u_int16_t pf_calc_mss(struct pf_addr *, sa_family_t,
+ int, u_int16_t);
+static void pf_set_rt_ifp(struct pf_state *,
+ struct pf_addr *);
+static int pf_check_proto_cksum(struct mbuf *, int, int,
+ u_int8_t, sa_family_t);
+static void pf_print_state_parts(struct pf_state *,
+ struct pf_state_key *, struct pf_state_key *);
+static int pf_addr_wrap_neq(struct pf_addr_wrap *,
+ struct pf_addr_wrap *);
+static struct pf_state *pf_find_state(struct pfi_kif *,
+ struct pf_state_key_cmp *, u_int);
+static int pf_src_connlimit(struct pf_state **);
+static void pf_flush_task(void *c, int pending);
+static int pf_insert_src_node(struct pf_src_node **,
+ struct pf_rule *, struct pf_addr *, sa_family_t);
+static int pf_purge_expired_states(int);
+static void pf_purge_unlinked_rules(void);
+static int pf_mtag_init(void *, int, int);
+static void pf_mtag_free(struct m_tag *);
+#ifdef INET
+static void pf_route(struct mbuf **, struct pf_rule *, int,
+ struct ifnet *, struct pf_state *,
+ struct pf_pdesc *);
+#endif /* INET */
+#ifdef INET6
+static void pf_change_a6(struct pf_addr *, u_int16_t *,
+ struct pf_addr *, u_int8_t);
+static void pf_route6(struct mbuf **, struct pf_rule *, int,
+ struct ifnet *, struct pf_state *,
+ struct pf_pdesc *);
+#endif /* INET6 */
+
+int in4_cksum(struct mbuf *m, u_int8_t nxt, int off, int len);
+
+VNET_DECLARE(int, pf_end_threads);
+
+VNET_DEFINE(struct pf_limit, pf_limits[PF_LIMIT_MAX]);
+
+#define PACKET_LOOPED(pd) ((pd)->pf_mtag && \
+ (pd)->pf_mtag->flags & PF_PACKET_LOOPED)
+
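+/*
+ * Look up the state for a packet and return from the calling function:
+ * PF_DROP if no (or only a purged) state is found, PF_PASS if the packet
+ * was already looped through pf, or if an outbound packet belongs to a
+ * route-to/reply-to state that will leave through another interface.
+ */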
+#define STATE_LOOKUP(i, k, d, s, pd) \
+ do { \
+ (s) = pf_find_state((i), (k), (d)); \
+ if ((s) == NULL || (s)->timeout == PFTM_PURGE) \
+ return (PF_DROP); \
+ if (PACKET_LOOPED(pd)) \
+ return (PF_PASS); \
+ if ((d) == PF_OUT && \
+ (((s)->rule.ptr->rt == PF_ROUTETO && \
+ (s)->rule.ptr->direction == PF_OUT) || \
+ ((s)->rule.ptr->rt == PF_REPLYTO && \
+ (s)->rule.ptr->direction == PF_IN)) && \
+ (s)->rt_kif != NULL && \
+ (s)->rt_kif != (i)) \
+ return (PF_PASS); \
+ } while (0)
+
+#define BOUND_IFACE(r, k) \
+ ((r)->rule_flag & PFRULE_IFBOUND) ? (k) : V_pfi_all
+
+#define STATE_INC_COUNTERS(s) \
+ do { \
+ s->rule.ptr->states_cur++; \
+ s->rule.ptr->states_tot++; \
+ if (s->anchor.ptr != NULL) { \
+ s->anchor.ptr->states_cur++; \
+ s->anchor.ptr->states_tot++; \
+ } \
+ if (s->nat_rule.ptr != NULL) { \
+ s->nat_rule.ptr->states_cur++; \
+ s->nat_rule.ptr->states_tot++; \
+ } \
+ } while (0)
+
+#define STATE_DEC_COUNTERS(s) \
+ do { \
+ if (s->nat_rule.ptr != NULL) \
+ s->nat_rule.ptr->states_cur--; \
+ if (s->anchor.ptr != NULL) \
+ s->anchor.ptr->states_cur--; \
+ s->rule.ptr->states_cur--; \
+ } while (0)
+
+static MALLOC_DEFINE(M_PFHASH, "pf_hash", "pf(4) hash header structures");
+VNET_DEFINE(struct pf_keyhash *, pf_keyhash);
+VNET_DEFINE(struct pf_idhash *, pf_idhash);
+VNET_DEFINE(u_long, pf_hashmask);
+VNET_DEFINE(struct pf_srchash *, pf_srchash);
+VNET_DEFINE(u_long, pf_srchashmask);
+
+SYSCTL_NODE(_net, OID_AUTO, pf, CTLFLAG_RW, 0, "pf(4)");
+
+VNET_DEFINE(u_long, pf_hashsize);
+#define V_pf_hashsize VNET(pf_hashsize)
+SYSCTL_VNET_UINT(_net_pf, OID_AUTO, states_hashsize, CTLFLAG_RDTUN,
+ &VNET_NAME(pf_hashsize), 0, "Size of pf(4) states hashtable");
+
+VNET_DEFINE(u_long, pf_srchashsize);
+#define V_pf_srchashsize VNET(pf_srchashsize)
+SYSCTL_VNET_UINT(_net_pf, OID_AUTO, source_nodes_hashsize, CTLFLAG_RDTUN,
+ &VNET_NAME(pf_srchashsize), 0, "Size of pf(4) source nodes hashtable");
+
+VNET_DEFINE(void *, pf_swi_cookie);
+
+VNET_DEFINE(uint32_t, pf_hashseed);
+#define V_pf_hashseed VNET(pf_hashseed)
+
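+/*
+ * Hash the comparable part of a state key into an index into the
+ * state key hash table.
+ */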
+static __inline uint32_t
+pf_hashkey(struct pf_state_key *sk)
+{
+ uint32_t h;
+
+ h = jenkins_hash32((uint32_t *)sk,
+ sizeof(struct pf_state_key_cmp)/sizeof(uint32_t),
+ V_pf_hashseed);
+
+ return (h & V_pf_hashmask);
+}
+
+#ifdef INET6
+void
+pf_addrcpy(struct pf_addr *dst, struct pf_addr *src, sa_family_t af)
+{
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ dst->addr32[0] = src->addr32[0];
+ break;
+#endif /* INET */
+ case AF_INET6:
+ dst->addr32[0] = src->addr32[0];
+ dst->addr32[1] = src->addr32[1];
+ dst->addr32[2] = src->addr32[2];
+ dst->addr32[3] = src->addr32[3];
+ break;
+ }
+}
+#endif /* INET6 */
+
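+/*
+ * Source tracking rate thresholds (max-src-conn-rate): the counter is
+ * kept scaled by PF_THRESHOLD_MULT and decays linearly over the
+ * configured interval, so pf_check_threshold() compares it against
+ * limit * PF_THRESHOLD_MULT.
+ */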
+static void
+pf_init_threshold(struct pf_threshold *threshold,
+ u_int32_t limit, u_int32_t seconds)
+{
+ threshold->limit = limit * PF_THRESHOLD_MULT;
+ threshold->seconds = seconds;
+ threshold->count = 0;
+ threshold->last = time_uptime;
+}
+
+static void
+pf_add_threshold(struct pf_threshold *threshold)
+{
+ u_int32_t t = time_uptime, diff = t - threshold->last;
+
+ if (diff >= threshold->seconds)
+ threshold->count = 0;
+ else
+ threshold->count -= threshold->count * diff /
+ threshold->seconds;
+ threshold->count += PF_THRESHOLD_MULT;
+ threshold->last = t;
+}
+
+static int
+pf_check_threshold(struct pf_threshold *threshold)
+{
+ return (threshold->count > threshold->limit);
+}
+
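+/*
+ * Enforce max-src-conn and max-src-conn-rate for the state's source node.
+ * When a limit is exceeded the state is marked for purge and, if an
+ * overload table is configured, the offending address is inserted into it
+ * and a flush of related states may be scheduled. Returns non-zero when
+ * the state had to be killed.
+ */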
+static int
+pf_src_connlimit(struct pf_state **state)
+{
+ struct pfr_addr p;
+ struct pf_flush_entry *pffe;
+ int bad = 0;
+
+ PF_STATE_LOCK_ASSERT(*state);
+
+ (*state)->src_node->conn++;
+ (*state)->src.tcp_est = 1;
+ pf_add_threshold(&(*state)->src_node->conn_rate);
+
+ if ((*state)->rule.ptr->max_src_conn &&
+ (*state)->rule.ptr->max_src_conn <
+ (*state)->src_node->conn) {
+ V_pf_status.lcounters[LCNT_SRCCONN]++;
+ bad++;
+ }
+
+ if ((*state)->rule.ptr->max_src_conn_rate.limit &&
+ pf_check_threshold(&(*state)->src_node->conn_rate)) {
+ V_pf_status.lcounters[LCNT_SRCCONNRATE]++;
+ bad++;
+ }
+
+ if (!bad)
+ return (0);
+
+ /* Kill this state. */
+ (*state)->timeout = PFTM_PURGE;
+ (*state)->src.state = (*state)->dst.state = TCPS_CLOSED;
+
+ if ((*state)->rule.ptr->overload_tbl == NULL)
+ return (1);
+
+ V_pf_status.lcounters[LCNT_OVERLOAD_TABLE]++;
+ if (V_pf_status.debug >= PF_DEBUG_MISC) {
+ printf("%s: blocking address ", __func__);
+ pf_print_host(&(*state)->src_node->addr, 0,
+ (*state)->key[PF_SK_WIRE]->af);
+ printf("\n");
+ }
+
+ bzero(&p, sizeof(p));
+ p.pfra_af = (*state)->key[PF_SK_WIRE]->af;
+ switch ((*state)->key[PF_SK_WIRE]->af) {
+#ifdef INET
+ case AF_INET:
+ p.pfra_net = 32;
+ p.pfra_ip4addr = (*state)->src_node->addr.v4;
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ p.pfra_net = 128;
+ p.pfra_ip6addr = (*state)->src_node->addr.v6;
+ break;
+#endif /* INET6 */
+ }
+
+ pfr_insert_kentry((*state)->rule.ptr->overload_tbl, &p, time_second);
+
+ if ((*state)->rule.ptr->flush == 0)
+ return (1);
+
+ /* Schedule flushing task. */
+ pffe = malloc(sizeof(*pffe), M_PFTEMP, M_NOWAIT);
+ if (pffe == NULL)
+ return (1); /* too bad :( */
+
+ bcopy(&(*state)->src_node->addr, &pffe->addr, sizeof(pffe->addr));
+ pffe->af = (*state)->key[PF_SK_WIRE]->af;
+ pffe->dir = (*state)->direction;
+ if ((*state)->rule.ptr->flush & PF_FLUSH_GLOBAL)
+ pffe->rule = NULL;
+ else
+ pffe->rule = (*state)->rule.ptr;
+ PF_FLUSHQ_LOCK();
+ SLIST_INSERT_HEAD(&V_pf_flushqueue, pffe, next);
+ PF_FLUSHQ_UNLOCK();
+ taskqueue_enqueue(taskqueue_swi, &V_pf_flushtask);
+
+ return (1);
+}
+
+static void
+pf_flush_task(void *c, int pending)
+{
+ struct pf_flush_head queue;
+ struct pf_flush_entry *pffe, *pffe1;
+ uint32_t killed = 0;
+
+ PF_FLUSHQ_LOCK();
+ queue = *(struct pf_flush_head *)c;
+ SLIST_INIT((struct pf_flush_head *)c);
+ PF_FLUSHQ_UNLOCK();
+
+ V_pf_status.lcounters[LCNT_OVERLOAD_FLUSH]++;
+
+ for (int i = 0; i <= V_pf_hashmask; i++) {
+ struct pf_idhash *ih = &V_pf_idhash[i];
+ struct pf_state_key *sk;
+ struct pf_state *s;
+
+ PF_HASHROW_LOCK(ih);
+ LIST_FOREACH(s, &ih->states, entry) {
+ sk = s->key[PF_SK_WIRE];
+ SLIST_FOREACH(pffe, &queue, next)
+ if (sk->af == pffe->af && (pffe->rule == NULL ||
+ pffe->rule == s->rule.ptr) &&
+ ((pffe->dir == PF_OUT &&
+ PF_AEQ(&pffe->addr, &sk->addr[1], sk->af)) ||
+ (pffe->dir == PF_IN &&
+ PF_AEQ(&pffe->addr, &sk->addr[0], sk->af)))) {
+ s->timeout = PFTM_PURGE;
+ s->src.state = s->dst.state = TCPS_CLOSED;
+ killed++;
+ }
+ }
+ PF_HASHROW_UNLOCK(ih);
+ }
+ SLIST_FOREACH_SAFE(pffe, &queue, next, pffe1)
+ free(pffe, M_PFTEMP);
+ if (V_pf_status.debug >= PF_DEBUG_MISC)
+		printf("%s: %u states killed\n", __func__, killed);
+}
+
+/*
+ * Find a source node by address and rule. When "returnlocked" is set and
+ * no node is found, the hash row is left locked so that the caller can
+ * consistently allocate and insert a new one.
+ */
+struct pf_src_node *
+pf_find_src_node(struct pf_addr *src, struct pf_rule *rule, sa_family_t af,
+ int returnlocked)
+{
+ struct pf_srchash *sh;
+ struct pf_src_node *n;
+
+ V_pf_status.scounters[SCNT_SRC_NODE_SEARCH]++;
+
+ sh = &V_pf_srchash[pf_hashsrc(src, af)];
+ PF_HASHROW_LOCK(sh);
+ LIST_FOREACH(n, &sh->nodes, entry)
+ if (n->rule.ptr == rule && n->af == af &&
+ ((af == AF_INET && n->addr.v4.s_addr == src->v4.s_addr) ||
+ (af == AF_INET6 && bcmp(&n->addr, src, sizeof(*src)) == 0)))
+ break;
+ if (n != NULL || returnlocked == 0)
+ PF_HASHROW_UNLOCK(sh);
+
+ return (n);
+}
+
+static int
+pf_insert_src_node(struct pf_src_node **sn, struct pf_rule *rule,
+ struct pf_addr *src, sa_family_t af)
+{
+
+ KASSERT((rule->rule_flag & PFRULE_RULESRCTRACK ||
+ rule->rpool.opts & PF_POOL_STICKYADDR),
+ ("%s for non-tracking rule %p", __func__, rule));
+
+ if (*sn == NULL)
+ *sn = pf_find_src_node(src, rule, af, 1);
+
+ if (*sn == NULL) {
+ struct pf_srchash *sh = &V_pf_srchash[pf_hashsrc(src, af)];
+
+ PF_HASHROW_ASSERT(sh);
+
+ if (!rule->max_src_nodes ||
+ rule->src_nodes < rule->max_src_nodes)
+ (*sn) = uma_zalloc(V_pf_sources_z, M_NOWAIT | M_ZERO);
+ else
+ V_pf_status.lcounters[LCNT_SRCNODES]++;
+ if ((*sn) == NULL) {
+ PF_HASHROW_UNLOCK(sh);
+ return (-1);
+ }
+
+ pf_init_threshold(&(*sn)->conn_rate,
+ rule->max_src_conn_rate.limit,
+ rule->max_src_conn_rate.seconds);
+
+ (*sn)->af = af;
+ (*sn)->rule.ptr = rule;
+ PF_ACPY(&(*sn)->addr, src, af);
+ LIST_INSERT_HEAD(&sh->nodes, *sn, entry);
+ (*sn)->creation = time_uptime;
+ (*sn)->ruletype = rule->action;
+ if ((*sn)->rule.ptr != NULL)
+ (*sn)->rule.ptr->src_nodes++;
+ PF_HASHROW_UNLOCK(sh);
+ V_pf_status.scounters[SCNT_SRC_NODE_INSERT]++;
+ V_pf_status.src_nodes++;
+ } else {
+ if (rule->max_src_states &&
+ (*sn)->states >= rule->max_src_states) {
+ V_pf_status.lcounters[LCNT_SRCSTATES]++;
+ return (-1);
+ }
+ }
+ return (0);
+}
+
+static void
+pf_remove_src_node(struct pf_src_node *src)
+{
+ struct pf_srchash *sh;
+
+ sh = &V_pf_srchash[pf_hashsrc(&src->addr, src->af)];
+ PF_HASHROW_LOCK(sh);
+ LIST_REMOVE(src, entry);
+ PF_HASHROW_UNLOCK(sh);
+}
+
+/* Data storage structures initialization. */
+void
+pf_initialize()
+{
+ struct pf_keyhash *kh;
+ struct pf_idhash *ih;
+ struct pf_srchash *sh;
+ u_int i;
+
+ TUNABLE_ULONG_FETCH("net.pf.states_hashsize", &V_pf_hashsize);
+ if (V_pf_hashsize == 0 || !powerof2(V_pf_hashsize))
+ V_pf_hashsize = PF_HASHSIZ;
+ TUNABLE_ULONG_FETCH("net.pf.source_nodes_hashsize", &V_pf_srchashsize);
+ if (V_pf_srchashsize == 0 || !powerof2(V_pf_srchashsize))
+ V_pf_srchashsize = PF_HASHSIZ / 4;
+
+ V_pf_hashseed = arc4random();
+
+ /* States and state keys storage. */
+ V_pf_state_z = uma_zcreate("pf states", sizeof(struct pf_state),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+ V_pf_limits[PF_LIMIT_STATES].zone = V_pf_state_z;
+ uma_zone_set_max(V_pf_state_z, PFSTATE_HIWAT);
+
+ V_pf_state_key_z = uma_zcreate("pf state keys",
+ sizeof(struct pf_state_key), pf_state_key_ctor, NULL, NULL, NULL,
+ UMA_ALIGN_PTR, 0);
+ V_pf_keyhash = malloc(V_pf_hashsize * sizeof(struct pf_keyhash),
+ M_PFHASH, M_WAITOK | M_ZERO);
+ V_pf_idhash = malloc(V_pf_hashsize * sizeof(struct pf_idhash),
+ M_PFHASH, M_WAITOK | M_ZERO);
+ V_pf_hashmask = V_pf_hashsize - 1;
+ for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash; i <= V_pf_hashmask;
+ i++, kh++, ih++) {
+ mtx_init(&kh->lock, "pf_keyhash", NULL, MTX_DEF);
+ mtx_init(&ih->lock, "pf_idhash", NULL, MTX_DEF);
+ }
+
+ /* Source nodes. */
+ V_pf_sources_z = uma_zcreate("pf source nodes",
+ sizeof(struct pf_src_node), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
+ 0);
+ V_pf_limits[PF_LIMIT_SRC_NODES].zone = V_pf_sources_z;
+ uma_zone_set_max(V_pf_sources_z, PFSNODE_HIWAT);
+ V_pf_srchash = malloc(V_pf_srchashsize * sizeof(struct pf_srchash),
+ M_PFHASH, M_WAITOK|M_ZERO);
+ V_pf_srchashmask = V_pf_srchashsize - 1;
+ for (i = 0, sh = V_pf_srchash; i <= V_pf_srchashmask; i++, sh++)
+ mtx_init(&sh->lock, "pf_srchash", NULL, MTX_DEF);
+
+ /* ALTQ */
+ TAILQ_INIT(&V_pf_altqs[0]);
+ TAILQ_INIT(&V_pf_altqs[1]);
+ TAILQ_INIT(&V_pf_pabuf);
+ V_pf_altqs_active = &V_pf_altqs[0];
+ V_pf_altqs_inactive = &V_pf_altqs[1];
+
+ /* Mbuf tags */
+ V_pf_mtag_z = uma_zcreate("pf mtags", sizeof(struct m_tag) +
+ sizeof(struct pf_mtag), NULL, NULL, pf_mtag_init, NULL,
+ UMA_ALIGN_PTR, 0);
+
+ /* Send & flush queues. */
+ STAILQ_INIT(&V_pf_sendqueue);
+ SLIST_INIT(&V_pf_flushqueue);
+ TASK_INIT(&V_pf_flushtask, 0, pf_flush_task, &V_pf_flushqueue);
+ mtx_init(&pf_sendqueue_mtx, "pf send queue", NULL, MTX_DEF);
+ mtx_init(&pf_flushqueue_mtx, "pf flush queue", NULL, MTX_DEF);
+
+	/* Rules that are unlinked but may still be referenced. */
+ TAILQ_INIT(&V_pf_unlinked_rules);
+ mtx_init(&pf_unlnkdrules_mtx, "pf unlinked rules", NULL, MTX_DEF);
+}
+
+void
+pf_cleanup()
+{
+ struct pf_keyhash *kh;
+ struct pf_idhash *ih;
+ struct pf_srchash *sh;
+ struct pf_send_entry *pfse, *next;
+ u_int i;
+
+ for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash; i <= V_pf_hashmask;
+ i++, kh++, ih++) {
+ KASSERT(LIST_EMPTY(&kh->keys), ("%s: key hash not empty",
+ __func__));
+ KASSERT(LIST_EMPTY(&ih->states), ("%s: id hash not empty",
+ __func__));
+ mtx_destroy(&kh->lock);
+ mtx_destroy(&ih->lock);
+ }
+ free(V_pf_keyhash, M_PFHASH);
+ free(V_pf_idhash, M_PFHASH);
+
+ for (i = 0, sh = V_pf_srchash; i <= V_pf_srchashmask; i++, sh++) {
+ KASSERT(LIST_EMPTY(&sh->nodes),
+ ("%s: source node hash not empty", __func__));
+ mtx_destroy(&sh->lock);
+ }
+ free(V_pf_srchash, M_PFHASH);
+
+ STAILQ_FOREACH_SAFE(pfse, &V_pf_sendqueue, pfse_next, next) {
+ m_freem(pfse->pfse_m);
+ free(pfse, M_PFTEMP);
+ }
+
+ mtx_destroy(&pf_sendqueue_mtx);
+ mtx_destroy(&pf_flushqueue_mtx);
+ mtx_destroy(&pf_unlnkdrules_mtx);
+
+ uma_zdestroy(V_pf_mtag_z);
+ uma_zdestroy(V_pf_sources_z);
+ uma_zdestroy(V_pf_state_z);
+ uma_zdestroy(V_pf_state_key_z);
+}
+
+static int
+pf_mtag_init(void *mem, int size, int how)
+{
+ struct m_tag *t;
+
+ t = (struct m_tag *)mem;
+ t->m_tag_cookie = MTAG_ABI_COMPAT;
+ t->m_tag_id = PACKET_TAG_PF;
+ t->m_tag_len = sizeof(struct pf_mtag);
+ t->m_tag_free = pf_mtag_free;
+
+ return (0);
+}
+
+static void
+pf_mtag_free(struct m_tag *t)
+{
+
+ uma_zfree(V_pf_mtag_z, t);
+}
+
+struct pf_mtag *
+pf_get_mtag(struct mbuf *m)
+{
+ struct m_tag *mtag;
+
+ if ((mtag = m_tag_find(m, PACKET_TAG_PF, NULL)) != NULL)
+ return ((struct pf_mtag *)(mtag + 1));
+
+ mtag = uma_zalloc(V_pf_mtag_z, M_NOWAIT);
+ if (mtag == NULL)
+ return (NULL);
+ bzero(mtag + 1, sizeof(struct pf_mtag));
+ m_tag_prepend(m, mtag);
+
+ return ((struct pf_mtag *)(mtag + 1));
+}
+
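+/*
+ * Attach the wire and stack keys to a state and link it into the per-key
+ * state lists in the key hash. A conflicting state on the same kif makes
+ * the attach fail with -1, unless it is a TCP state with both peers in
+ * FIN_WAIT_2 or later, in which case the stale state is unlinked instead.
+ */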
+static int
+pf_state_key_attach(struct pf_state_key *skw, struct pf_state_key *sks,
+ struct pf_state *s)
+{
+ struct pf_keyhash *kh;
+ struct pf_state_key *sk, *cur;
+ struct pf_state *si, *olds = NULL;
+ int idx;
+
+ KASSERT(s->refs == 0, ("%s: state not pristine", __func__));
+ KASSERT(s->key[PF_SK_WIRE] == NULL, ("%s: state has key", __func__));
+ KASSERT(s->key[PF_SK_STACK] == NULL, ("%s: state has key", __func__));
+
+ /*
+ * First run: start with wire key.
+ */
+ sk = skw;
+ idx = PF_SK_WIRE;
+
+keyattach:
+ kh = &V_pf_keyhash[pf_hashkey(sk)];
+
+ PF_HASHROW_LOCK(kh);
+ LIST_FOREACH(cur, &kh->keys, entry)
+ if (bcmp(cur, sk, sizeof(struct pf_state_key_cmp)) == 0)
+ break;
+
+ if (cur != NULL) {
+		/*
+		 * Key exists. Check for a conflicting state on the same
+		 * kif; if there is none, attach the state to the key.
+		 */
+ TAILQ_FOREACH(si, &cur->states[idx], key_list[idx]) {
+ struct pf_idhash *ih = &V_pf_idhash[PF_IDHASH(si)];
+
+ PF_HASHROW_LOCK(ih);
+ if (si->kif == s->kif &&
+ si->direction == s->direction) {
+ if (sk->proto == IPPROTO_TCP &&
+ si->src.state >= TCPS_FIN_WAIT_2 &&
+ si->dst.state >= TCPS_FIN_WAIT_2) {
+ si->src.state = si->dst.state =
+ TCPS_CLOSED;
+ /* Unlink later or cur can go away. */
+ pf_ref_state(si);
+ olds = si;
+ } else {
+ if (V_pf_status.debug >= PF_DEBUG_MISC) {
+ printf("pf: %s key attach "
+ "failed on %s: ",
+ (idx == PF_SK_WIRE) ?
+ "wire" : "stack",
+ s->kif->pfik_name);
+ pf_print_state_parts(s,
+ (idx == PF_SK_WIRE) ?
+ sk : NULL,
+ (idx == PF_SK_STACK) ?
+ sk : NULL);
+ printf(", existing: ");
+ pf_print_state_parts(si,
+ (idx == PF_SK_WIRE) ?
+ sk : NULL,
+ (idx == PF_SK_STACK) ?
+ sk : NULL);
+ printf("\n");
+ }
+ PF_HASHROW_UNLOCK(ih);
+ PF_HASHROW_UNLOCK(kh);
+ uma_zfree(V_pf_state_key_z, sk);
+ if (idx == PF_SK_STACK)
+ pf_detach_state(s);
+ return (-1); /* collision! */
+ }
+ }
+ PF_HASHROW_UNLOCK(ih);
+ }
+ uma_zfree(V_pf_state_key_z, sk);
+ s->key[idx] = cur;
+ } else {
+ LIST_INSERT_HEAD(&kh->keys, sk, entry);
+ s->key[idx] = sk;
+ }
+
+stateattach:
+ /* List is sorted, if-bound states before floating. */
+ if (s->kif == V_pfi_all)
+ TAILQ_INSERT_TAIL(&s->key[idx]->states[idx], s, key_list[idx]);
+ else
+ TAILQ_INSERT_HEAD(&s->key[idx]->states[idx], s, key_list[idx]);
+
+ /*
+	 * Attach done. Now decide whether (and how) a second
+	 * key needs to be attached.
+ */
+ if (sks == skw) {
+ s->key[PF_SK_STACK] = s->key[PF_SK_WIRE];
+ idx = PF_SK_STACK;
+ sks = NULL;
+ goto stateattach;
+ } else if (sks != NULL) {
+ PF_HASHROW_UNLOCK(kh);
+ if (olds) {
+ pf_unlink_state(olds, 0);
+ pf_release_state(olds);
+ olds = NULL;
+ }
+ /*
+ * Continue attaching with stack key.
+ */
+ sk = sks;
+ idx = PF_SK_STACK;
+ sks = NULL;
+ goto keyattach;
+ } else
+ PF_HASHROW_UNLOCK(kh);
+
+ if (olds) {
+ pf_unlink_state(olds, 0);
+ pf_release_state(olds);
+ }
+
+ KASSERT(s->key[PF_SK_WIRE] != NULL && s->key[PF_SK_STACK] != NULL,
+ ("%s failure", __func__));
+
+ return (0);
+}
+
+static void
+pf_detach_state(struct pf_state *s)
+{
+ struct pf_state_key *sks = s->key[PF_SK_STACK];
+ struct pf_keyhash *kh;
+
+ if (sks != NULL) {
+ kh = &V_pf_keyhash[pf_hashkey(sks)];
+ PF_HASHROW_LOCK(kh);
+ if (s->key[PF_SK_STACK] != NULL)
+ pf_state_key_detach(s, PF_SK_STACK);
+ /*
+		 * If both point to the same key, then we are done.
+ */
+ if (sks == s->key[PF_SK_WIRE]) {
+ pf_state_key_detach(s, PF_SK_WIRE);
+ PF_HASHROW_UNLOCK(kh);
+ return;
+ }
+ PF_HASHROW_UNLOCK(kh);
+ }
+
+ if (s->key[PF_SK_WIRE] != NULL) {
+ kh = &V_pf_keyhash[pf_hashkey(s->key[PF_SK_WIRE])];
+ PF_HASHROW_LOCK(kh);
+ if (s->key[PF_SK_WIRE] != NULL)
+ pf_state_key_detach(s, PF_SK_WIRE);
+ PF_HASHROW_UNLOCK(kh);
+ }
+}
+
+static void
+pf_state_key_detach(struct pf_state *s, int idx)
+{
+ struct pf_state_key *sk = s->key[idx];
+#ifdef INVARIANTS
+ struct pf_keyhash *kh = &V_pf_keyhash[pf_hashkey(sk)];
+
+ PF_HASHROW_ASSERT(kh);
+#endif
+ TAILQ_REMOVE(&sk->states[idx], s, key_list[idx]);
+ s->key[idx] = NULL;
+
+ if (TAILQ_EMPTY(&sk->states[0]) && TAILQ_EMPTY(&sk->states[1])) {
+ LIST_REMOVE(sk, entry);
+ uma_zfree(V_pf_state_key_z, sk);
+ }
+}
+
+static int
+pf_state_key_ctor(void *mem, int size, void *arg, int flags)
+{
+ struct pf_state_key *sk = mem;
+
+ bzero(sk, sizeof(struct pf_state_key_cmp));
+ TAILQ_INIT(&sk->states[PF_SK_WIRE]);
+ TAILQ_INIT(&sk->states[PF_SK_STACK]);
+
+ return (0);
+}
+
+struct pf_state_key *
+pf_state_key_setup(struct pf_pdesc *pd, struct pf_addr *saddr,
+ struct pf_addr *daddr, u_int16_t sport, u_int16_t dport)
+{
+ struct pf_state_key *sk;
+
+ sk = uma_zalloc(V_pf_state_key_z, M_NOWAIT);
+ if (sk == NULL)
+ return (NULL);
+
+ PF_ACPY(&sk->addr[pd->sidx], saddr, pd->af);
+ PF_ACPY(&sk->addr[pd->didx], daddr, pd->af);
+ sk->port[pd->sidx] = sport;
+ sk->port[pd->didx] = dport;
+ sk->proto = pd->proto;
+ sk->af = pd->af;
+
+ return (sk);
+}
+
+struct pf_state_key *
+pf_state_key_clone(struct pf_state_key *orig)
+{
+ struct pf_state_key *sk;
+
+ sk = uma_zalloc(V_pf_state_key_z, M_NOWAIT);
+ if (sk == NULL)
+ return (NULL);
+
+ bcopy(orig, sk, sizeof(struct pf_state_key_cmp));
+
+ return (sk);
+}
+
+int
+pf_state_insert(struct pfi_kif *kif, struct pf_state_key *skw,
+ struct pf_state_key *sks, struct pf_state *s)
+{
+ struct pf_idhash *ih;
+ struct pf_state *cur;
+
+ KASSERT(TAILQ_EMPTY(&sks->states[0]) && TAILQ_EMPTY(&sks->states[1]),
+ ("%s: sks not pristine", __func__));
+ KASSERT(TAILQ_EMPTY(&skw->states[0]) && TAILQ_EMPTY(&skw->states[1]),
+ ("%s: skw not pristine", __func__));
+ KASSERT(s->refs == 0, ("%s: state not pristine", __func__));
+
+ s->kif = kif;
+
+ if (pf_state_key_attach(skw, sks, s))
+ return (-1);
+
+ if (s->id == 0 && s->creatorid == 0) {
+		/* XXX: should be atomic, but probability of collision is low */
+ if ((s->id = V_pf_stateid[curcpu]++) == PFID_MAXID)
+ V_pf_stateid[curcpu] = 1;
+ s->id |= (uint64_t )curcpu << PFID_CPUSHIFT;
+ s->id = htobe64(s->id);
+ s->creatorid = V_pf_status.hostid;
+ }
+
+ ih = &V_pf_idhash[PF_IDHASH(s)];
+ PF_HASHROW_LOCK(ih);
+ LIST_FOREACH(cur, &ih->states, entry)
+ if (cur->id == s->id && cur->creatorid == s->creatorid)
+ break;
+
+ if (cur != NULL) {
+ PF_HASHROW_UNLOCK(ih);
+ if (V_pf_status.debug >= PF_DEBUG_MISC) {
+ printf("pf: state insert failed: "
+ "id: %016llx creatorid: %08x",
+ (unsigned long long)be64toh(s->id),
+ ntohl(s->creatorid));
+ printf("\n");
+ }
+ pf_detach_state(s);
+ return (-1);
+ }
+ LIST_INSERT_HEAD(&ih->states, s, entry);
+ /* One for keys, one for ID hash. */
+ refcount_init(&s->refs, 2);
+
+ V_pf_status.fcounters[FCNT_STATE_INSERT]++;
+ if (pfsync_insert_state_ptr != NULL)
+ pfsync_insert_state_ptr(s);
+
+ /* Returns locked. */
+ return (0);
+}
+
+/*
+ * Find state by ID: returns with locked row on success.
+ */
+struct pf_state *
+pf_find_state_byid(uint64_t id, uint32_t creatorid)
+{
+ struct pf_idhash *ih;
+ struct pf_state *s;
+
+ V_pf_status.fcounters[FCNT_STATE_SEARCH]++;
+
+ ih = &V_pf_idhash[(be64toh(id) % (V_pf_hashmask + 1))];
+
+ PF_HASHROW_LOCK(ih);
+ LIST_FOREACH(s, &ih->states, entry)
+ if (s->id == id && s->creatorid == creatorid)
+ break;
+
+ if (s == NULL)
+ PF_HASHROW_UNLOCK(ih);
+
+ return (s);
+}
+
+/*
+ * Find state by key.
+ * Returns with ID hash slot locked on success.
+ */
+static struct pf_state *
+pf_find_state(struct pfi_kif *kif, struct pf_state_key_cmp *key, u_int dir)
+{
+ struct pf_keyhash *kh;
+ struct pf_state_key *sk;
+ struct pf_state *s;
+ int idx;
+
+ V_pf_status.fcounters[FCNT_STATE_SEARCH]++;
+
+ kh = &V_pf_keyhash[pf_hashkey((struct pf_state_key *)key)];
+
+ PF_HASHROW_LOCK(kh);
+ LIST_FOREACH(sk, &kh->keys, entry)
+ if (bcmp(sk, key, sizeof(struct pf_state_key_cmp)) == 0)
+ break;
+ if (sk == NULL) {
+ PF_HASHROW_UNLOCK(kh);
+ return (NULL);
+ }
+
+ idx = (dir == PF_IN ? PF_SK_WIRE : PF_SK_STACK);
+
+ /* List is sorted, if-bound states before floating ones. */
+ TAILQ_FOREACH(s, &sk->states[idx], key_list[idx])
+ if (s->kif == V_pfi_all || s->kif == kif) {
+ PF_STATE_LOCK(s);
+ PF_HASHROW_UNLOCK(kh);
+ if (s->timeout == PFTM_UNLINKED) {
+ /*
+ * State is being processed
+ * by pf_unlink_state() in
+				 * another thread.
+ */
+ PF_STATE_UNLOCK(s);
+ return (NULL);
+ }
+ return (s);
+ }
+ PF_HASHROW_UNLOCK(kh);
+
+ return (NULL);
+}
+
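+/*
+ * Find a state by key regardless of the interface. With PF_INOUT both
+ * the wire and the stack lists are searched; if "more" is non-NULL it is
+ * incremented for every additional matching state.
+ */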
+struct pf_state *
+pf_find_state_all(struct pf_state_key_cmp *key, u_int dir, int *more)
+{
+ struct pf_keyhash *kh;
+ struct pf_state_key *sk;
+ struct pf_state *s, *ret = NULL;
+ int idx, inout = 0;
+
+ V_pf_status.fcounters[FCNT_STATE_SEARCH]++;
+
+ kh = &V_pf_keyhash[pf_hashkey((struct pf_state_key *)key)];
+
+ PF_HASHROW_LOCK(kh);
+ LIST_FOREACH(sk, &kh->keys, entry)
+ if (bcmp(sk, key, sizeof(struct pf_state_key_cmp)) == 0)
+ break;
+ if (sk == NULL) {
+ PF_HASHROW_UNLOCK(kh);
+ return (NULL);
+ }
+ switch (dir) {
+ case PF_IN:
+ idx = PF_SK_WIRE;
+ break;
+ case PF_OUT:
+ idx = PF_SK_STACK;
+ break;
+ case PF_INOUT:
+ idx = PF_SK_WIRE;
+ inout = 1;
+ break;
+ default:
+ panic("%s: dir %u", __func__, dir);
+ }
+second_run:
+ TAILQ_FOREACH(s, &sk->states[idx], key_list[idx]) {
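+/*
+ * State IDs are 64 bit: the top PFID_CPUBITS bits hold the CPU that
+ * created the state and the remaining bits hold a per-CPU counter, so
+ * new IDs can be generated without a global lock.
+ */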
+ if (more == NULL) {
+ PF_HASHROW_UNLOCK(kh);
+ return (s);
+ }
+
+ if (ret)
+ (*more)++;
+ else
+ ret = s;
+ }
+ if (inout == 1) {
+ inout = 0;
+ idx = PF_SK_STACK;
+ goto second_run;
+ }
+ PF_HASHROW_UNLOCK(kh);
+
+ return (ret);
+}
+
+/* END state table stuff */
+
+static void
+pf_send(struct pf_send_entry *pfse)
+{
+
+ PF_SENDQ_LOCK();
+ STAILQ_INSERT_TAIL(&V_pf_sendqueue, pfse, pfse_next);
+ PF_SENDQ_UNLOCK();
+ swi_sched(V_pf_swi_cookie, 0);
+}
+
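+/*
+ * Software interrupt handler: drain the pf send queue and transmit the
+ * deferred IP/IPv6 packets and ICMP errors outside of the pf(4) packet
+ * processing path.
+ */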
+void
+pf_intr(void *v)
+{
+ struct pf_send_head queue;
+ struct pf_send_entry *pfse, *next;
+
+ CURVNET_SET((struct vnet *)v);
+
+ PF_SENDQ_LOCK();
+ queue = V_pf_sendqueue;
+ STAILQ_INIT(&V_pf_sendqueue);
+ PF_SENDQ_UNLOCK();
+
+ STAILQ_FOREACH_SAFE(pfse, &queue, pfse_next, next) {
+ switch (pfse->pfse_type) {
+#ifdef INET
+ case PFSE_IP:
+ ip_output(pfse->pfse_m, NULL, NULL, 0, NULL, NULL);
+ break;
+ case PFSE_ICMP:
+ icmp_error(pfse->pfse_m, pfse->pfse_icmp_type,
+ pfse->pfse_icmp_code, 0, pfse->pfse_icmp_mtu);
+ break;
+#endif /* INET */
+#ifdef INET6
+ case PFSE_IP6:
+ ip6_output(pfse->pfse_m, NULL, NULL, 0, NULL, NULL,
+ NULL);
+ break;
+ case PFSE_ICMP6:
+ icmp6_error(pfse->pfse_m, pfse->pfse_icmp_type,
+ pfse->pfse_icmp_code, pfse->pfse_icmp_mtu);
+ break;
+#endif /* INET6 */
+ default:
+ panic("%s: unknown type", __func__);
+ }
+ free(pfse, M_PFTEMP);
+ }
+ CURVNET_RESTORE();
+}
+
+void
+pf_purge_thread(void *v)
+{
+ int fullrun;
+
+ CURVNET_SET((struct vnet *)v);
+
+ for (;;) {
+ PF_RULES_RLOCK();
+ rw_sleep(pf_purge_thread, &pf_rules_lock, 0, "pftm", hz / 10);
+
+ if (V_pf_end_threads) {
+ /*
+			 * To clean up all kifs and rules we need
+			 * two runs: the first one clears reference
+			 * flags, which pf_purge_expired_states() then
+			 * no longer raises, and the second run frees.
+ */
+ PF_RULES_RUNLOCK();
+ pf_purge_unlinked_rules();
+ pfi_kif_purge();
+
+ /*
+ * Now purge everything.
+ */
+ pf_purge_expired_states(V_pf_hashmask + 1);
+ pf_purge_expired_fragments();
+ pf_purge_expired_src_nodes();
+
+ /*
+ * Now all kifs & rules should be unreferenced,
+ * thus should be successfully freed.
+ */
+ pf_purge_unlinked_rules();
+ pfi_kif_purge();
+
+ /*
+ * Announce success and exit.
+ */
+ PF_RULES_RLOCK();
+ V_pf_end_threads++;
+ PF_RULES_RUNLOCK();
+ wakeup(pf_purge_thread);
+ kproc_exit(0);
+ }
+ PF_RULES_RUNLOCK();
+
+ /* Process 1/interval fraction of the state table every run. */
+ fullrun = pf_purge_expired_states(V_pf_hashmask /
+ (V_pf_default_rule.timeout[PFTM_INTERVAL] * 10));
+
+ /* Purge other expired types every PFTM_INTERVAL seconds. */
+ if (fullrun) {
+ /*
+ * Order is important:
+ * - states and src nodes reference rules
+ * - states and rules reference kifs
+ */
+ pf_purge_expired_fragments();
+ pf_purge_expired_src_nodes();
+ pf_purge_unlinked_rules();
+ pfi_kif_purge();
+ }
+ }
+ /* not reached */
+ CURVNET_RESTORE();
+}
+
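+/*
+ * Return the absolute time at which a state expires. With adaptive
+ * timeouts configured, the timeout shrinks linearly as the number of
+ * states grows from the adaptive start threshold towards the end
+ * threshold.
+ */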
+u_int32_t
+pf_state_expires(const struct pf_state *state)
+{
+ u_int32_t timeout;
+ u_int32_t start;
+ u_int32_t end;
+ u_int32_t states;
+
+ /* handle all PFTM_* > PFTM_MAX here */
+ if (state->timeout == PFTM_PURGE)
+ return (time_uptime);
+ if (state->timeout == PFTM_UNTIL_PACKET)
+ return (0);
+ KASSERT(state->timeout != PFTM_UNLINKED,
+ ("pf_state_expires: timeout == PFTM_UNLINKED"));
+ KASSERT((state->timeout < PFTM_MAX),
+ ("pf_state_expires: timeout > PFTM_MAX"));
+ timeout = state->rule.ptr->timeout[state->timeout];
+ if (!timeout)
+ timeout = V_pf_default_rule.timeout[state->timeout];
+ start = state->rule.ptr->timeout[PFTM_ADAPTIVE_START];
+ if (start) {
+ end = state->rule.ptr->timeout[PFTM_ADAPTIVE_END];
+ states = state->rule.ptr->states_cur; /* XXXGL */
+ } else {
+ start = V_pf_default_rule.timeout[PFTM_ADAPTIVE_START];
+ end = V_pf_default_rule.timeout[PFTM_ADAPTIVE_END];
+ states = V_pf_status.states;
+ }
+ if (end && states > start && start < end) {
+ if (states < end)
+ return (state->expire + timeout * (end - states) /
+ (end - start));
+ else
+ return (time_uptime);
+ }
+ return (state->expire + timeout);
+}
+
+void
+pf_purge_expired_src_nodes()
+{
+ struct pf_srchash *sh;
+ struct pf_src_node *cur, *next;
+ int i;
+
+ for (i = 0, sh = V_pf_srchash; i <= V_pf_srchashmask; i++, sh++) {
+ PF_HASHROW_LOCK(sh);
+ LIST_FOREACH_SAFE(cur, &sh->nodes, entry, next)
+ if (cur->states <= 0 && cur->expire <= time_uptime) {
+ if (cur->rule.ptr != NULL)
+ cur->rule.ptr->src_nodes--;
+ LIST_REMOVE(cur, entry);
+ V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++;
+ V_pf_status.src_nodes--;
+ uma_zfree(V_pf_sources_z, cur);
+ } else if (cur->rule.ptr != NULL)
+ cur->rule.ptr->rule_flag |= PFRULE_REFS;
+ PF_HASHROW_UNLOCK(sh);
+ }
+}
+
+static void
+pf_src_tree_remove_state(struct pf_state *s)
+{
+ u_int32_t timeout;
+
+ if (s->src_node != NULL) {
+ if (s->src.tcp_est)
+ --s->src_node->conn;
+ if (--s->src_node->states <= 0) {
+ timeout = s->rule.ptr->timeout[PFTM_SRC_NODE];
+ if (!timeout)
+ timeout =
+ V_pf_default_rule.timeout[PFTM_SRC_NODE];
+ s->src_node->expire = time_uptime + timeout;
+ }
+ }
+ if (s->nat_src_node != s->src_node && s->nat_src_node != NULL) {
+ if (--s->nat_src_node->states <= 0) {
+ timeout = s->rule.ptr->timeout[PFTM_SRC_NODE];
+ if (!timeout)
+ timeout =
+ V_pf_default_rule.timeout[PFTM_SRC_NODE];
+ s->nat_src_node->expire = time_uptime + timeout;
+ }
+ }
+ s->src_node = s->nat_src_node = NULL;
+}
+
+/*
+ * Unlink and potentially free a state. The function may be
+ * called with ID hash row locked, but always returns
+ * unlocked, since it needs to go through key hash locking.
+ */
+int
+pf_unlink_state(struct pf_state *s, u_int flags)
+{
+ struct pf_idhash *ih = &V_pf_idhash[PF_IDHASH(s)];
+
+ if ((flags & PF_ENTER_LOCKED) == 0)
+ PF_HASHROW_LOCK(ih);
+ else
+ PF_HASHROW_ASSERT(ih);
+
+ if (s->timeout == PFTM_UNLINKED) {
+ /*
+ * State is being processed
+ * by pf_unlink_state() in
+		 * another thread.
+ */
+ PF_HASHROW_UNLOCK(ih);
+ return (0); /* XXXGL: undefined actually */
+ }
+
+ s->timeout = PFTM_UNLINKED;
+
+ if (s->src.state == PF_TCPS_PROXY_DST) {
+ /* XXX wire key the right one? */
+ pf_send_tcp(NULL, s->rule.ptr, s->key[PF_SK_WIRE]->af,
+ &s->key[PF_SK_WIRE]->addr[1],
+ &s->key[PF_SK_WIRE]->addr[0],
+ s->key[PF_SK_WIRE]->port[1],
+ s->key[PF_SK_WIRE]->port[0],
+ s->src.seqhi, s->src.seqlo + 1,
+ TH_RST|TH_ACK, 0, 0, 0, 1, s->tag, NULL);
+ }
+
+ LIST_REMOVE(s, entry);
+ pf_src_tree_remove_state(s);
+ PF_HASHROW_UNLOCK(ih);
+
+ if (pfsync_delete_state_ptr != NULL)
+ pfsync_delete_state_ptr(s);
+
+ pf_detach_state(s);
+ refcount_release(&s->refs);
+
+ return (pf_release_state(s));
+}
+
+void
+pf_free_state(struct pf_state *cur)
+{
+
+ KASSERT(cur->refs == 0, ("%s: %p has refs", __func__, cur));
+ KASSERT(cur->timeout == PFTM_UNLINKED, ("%s: timeout %u", __func__,
+ cur->timeout));
+ --cur->rule.ptr->states_cur;
+ if (cur->nat_rule.ptr != NULL)
+ --cur->nat_rule.ptr->states_cur;
+ if (cur->anchor.ptr != NULL)
+ --cur->anchor.ptr->states_cur;
+ pf_normalize_tcp_cleanup(cur);
+ uma_zfree(V_pf_state_z, cur);
+ V_pf_status.fcounters[FCNT_STATE_REMOVALS]++;
+}
+
+/*
+ * Called only from pf_purge_thread(), thus serialized.
+ */
+static int
+pf_purge_expired_states(int maxcheck)
+{
+ static u_int i = 0;
+
+ struct pf_idhash *ih;
+ struct pf_state *s;
+ int rv = 0;
+
+ V_pf_status.states = uma_zone_get_cur(V_pf_state_z);
+
+ /*
+ * Go through hash and unlink states that expire now.
+ */
+ while (maxcheck > 0) {
+
+ /* Wrap to start of hash when we hit the end. */
+ if (i > V_pf_hashmask) {
+ i = 0;
+ rv = 1;
+ }
+
+ ih = &V_pf_idhash[i];
+relock:
+ PF_HASHROW_LOCK(ih);
+ LIST_FOREACH(s, &ih->states, entry) {
+ if (pf_state_expires(s) <= time_uptime) {
+ V_pf_status.states -=
+ pf_unlink_state(s, PF_ENTER_LOCKED);
+ goto relock;
+ }
+ s->rule.ptr->rule_flag |= PFRULE_REFS;
+ if (s->nat_rule.ptr != NULL)
+ s->nat_rule.ptr->rule_flag |= PFRULE_REFS;
+ if (s->anchor.ptr != NULL)
+ s->anchor.ptr->rule_flag |= PFRULE_REFS;
+ s->kif->pfik_flags |= PFI_IFLAG_REFS;
+ if (s->rt_kif)
+ s->rt_kif->pfik_flags |= PFI_IFLAG_REFS;
+ }
+ PF_HASHROW_UNLOCK(ih);
+ i++;
+ maxcheck--;
+ }
+
+ V_pf_status.states = uma_zone_get_cur(V_pf_state_z);
+
+ return (rv);
+}
+
+static void
+pf_purge_unlinked_rules()
+{
+ struct pf_rulequeue tmpq;
+ struct pf_rule *r, *r1;
+
+ /*
+ * Do naive mark-and-sweep garbage collecting of old rules.
+ * Reference flag is raised by pf_purge_expired_states()
+ * and pf_purge_expired_src_nodes().
+ *
+ * To avoid LOR between PF_UNLNKDRULES_LOCK/PF_RULES_WLOCK,
+ * use a temporary queue.
+ */
+ TAILQ_INIT(&tmpq);
+ PF_UNLNKDRULES_LOCK();
+ TAILQ_FOREACH_SAFE(r, &V_pf_unlinked_rules, entries, r1) {
+ if (!(r->rule_flag & PFRULE_REFS)) {
+ TAILQ_REMOVE(&V_pf_unlinked_rules, r, entries);
+ TAILQ_INSERT_TAIL(&tmpq, r, entries);
+ } else
+ r->rule_flag &= ~PFRULE_REFS;
+ }
+ PF_UNLNKDRULES_UNLOCK();
+
+ if (!TAILQ_EMPTY(&tmpq)) {
+ PF_RULES_WLOCK();
+ TAILQ_FOREACH_SAFE(r, &tmpq, entries, r1) {
+ TAILQ_REMOVE(&tmpq, r, entries);
+ pf_free_rule(r);
+ }
+ PF_RULES_WUNLOCK();
+ }
+}
+
+void
+pf_print_host(struct pf_addr *addr, u_int16_t p, sa_family_t af)
+{
+ switch (af) {
+#ifdef INET
+ case AF_INET: {
+ u_int32_t a = ntohl(addr->addr32[0]);
+ printf("%u.%u.%u.%u", (a>>24)&255, (a>>16)&255,
+ (a>>8)&255, a&255);
+ if (p) {
+ p = ntohs(p);
+ printf(":%u", p);
+ }
+ break;
+ }
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6: {
+ u_int16_t b;
+ u_int8_t i, curstart, curend, maxstart, maxend;
+ curstart = curend = maxstart = maxend = 255;
+ for (i = 0; i < 8; i++) {
+ if (!addr->addr16[i]) {
+ if (curstart == 255)
+ curstart = i;
+ curend = i;
+ } else {
+ if ((curend - curstart) >
+ (maxend - maxstart)) {
+ maxstart = curstart;
+ maxend = curend;
+ }
+ curstart = curend = 255;
+ }
+ }
+ if ((curend - curstart) >
+ (maxend - maxstart)) {
+ maxstart = curstart;
+ maxend = curend;
+ }
+ for (i = 0; i < 8; i++) {
+ if (i >= maxstart && i <= maxend) {
+ if (i == 0)
+ printf(":");
+ if (i == maxend)
+ printf(":");
+ } else {
+ b = ntohs(addr->addr16[i]);
+ printf("%x", b);
+ if (i < 7)
+ printf(":");
+ }
+ }
+ if (p) {
+ p = ntohs(p);
+ printf("[%u]", p);
+ }
+ break;
+ }
+#endif /* INET6 */
+ }
+}
+
+void
+pf_print_state(struct pf_state *s)
+{
+ pf_print_state_parts(s, NULL, NULL);
+}
+
+static void
+pf_print_state_parts(struct pf_state *s,
+ struct pf_state_key *skwp, struct pf_state_key *sksp)
+{
+ struct pf_state_key *skw, *sks;
+ u_int8_t proto, dir;
+
+ /* Do our best to fill these, but they're skipped if NULL */
+ skw = skwp ? skwp : (s ? s->key[PF_SK_WIRE] : NULL);
+ sks = sksp ? sksp : (s ? s->key[PF_SK_STACK] : NULL);
+ proto = skw ? skw->proto : (sks ? sks->proto : 0);
+ dir = s ? s->direction : 0;
+
+ switch (proto) {
+ case IPPROTO_IPV4:
+ printf("IPv4");
+ break;
+ case IPPROTO_IPV6:
+ printf("IPv6");
+ break;
+ case IPPROTO_TCP:
+ printf("TCP");
+ break;
+ case IPPROTO_UDP:
+ printf("UDP");
+ break;
+ case IPPROTO_ICMP:
+ printf("ICMP");
+ break;
+ case IPPROTO_ICMPV6:
+ printf("ICMPv6");
+ break;
+ default:
+ printf("%u", skw->proto);
+ break;
+ }
+ switch (dir) {
+ case PF_IN:
+ printf(" in");
+ break;
+ case PF_OUT:
+ printf(" out");
+ break;
+ }
+ if (skw) {
+ printf(" wire: ");
+ pf_print_host(&skw->addr[0], skw->port[0], skw->af);
+ printf(" ");
+ pf_print_host(&skw->addr[1], skw->port[1], skw->af);
+ }
+ if (sks) {
+ printf(" stack: ");
+ if (sks != skw) {
+ pf_print_host(&sks->addr[0], sks->port[0], sks->af);
+ printf(" ");
+ pf_print_host(&sks->addr[1], sks->port[1], sks->af);
+ } else
+ printf("-");
+ }
+ if (s) {
+ if (proto == IPPROTO_TCP) {
+ printf(" [lo=%u high=%u win=%u modulator=%u",
+ s->src.seqlo, s->src.seqhi,
+ s->src.max_win, s->src.seqdiff);
+ if (s->src.wscale && s->dst.wscale)
+ printf(" wscale=%u",
+ s->src.wscale & PF_WSCALE_MASK);
+ printf("]");
+ printf(" [lo=%u high=%u win=%u modulator=%u",
+ s->dst.seqlo, s->dst.seqhi,
+ s->dst.max_win, s->dst.seqdiff);
+ if (s->src.wscale && s->dst.wscale)
+ printf(" wscale=%u",
+ s->dst.wscale & PF_WSCALE_MASK);
+ printf("]");
+ }
+ printf(" %u:%u", s->src.state, s->dst.state);
+ }
+}
+
+void
+pf_print_flags(u_int8_t f)
+{
+ if (f)
+ printf(" ");
+ if (f & TH_FIN)
+ printf("F");
+ if (f & TH_SYN)
+ printf("S");
+ if (f & TH_RST)
+ printf("R");
+ if (f & TH_PUSH)
+ printf("P");
+ if (f & TH_ACK)
+ printf("A");
+ if (f & TH_URG)
+ printf("U");
+ if (f & TH_ECE)
+ printf("E");
+ if (f & TH_CWR)
+ printf("W");
+}
+
+#define PF_SET_SKIP_STEPS(i) \
+ do { \
+ while (head[i] != cur) { \
+ head[i]->skip[i].ptr = cur; \
+ head[i] = TAILQ_NEXT(head[i], entries); \
+ } \
+ } while (0)
+
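+/*
+ * Compute the skip steps for a rule list: for every rule and every
+ * criterion (interface, direction, af, proto, source/destination address
+ * and port) skip[] points to the next rule that differs in that
+ * criterion, so evaluation can jump over runs of rules that cannot match.
+ */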
+void
+pf_calc_skip_steps(struct pf_rulequeue *rules)
+{
+ struct pf_rule *cur, *prev, *head[PF_SKIP_COUNT];
+ int i;
+
+ cur = TAILQ_FIRST(rules);
+ prev = cur;
+ for (i = 0; i < PF_SKIP_COUNT; ++i)
+ head[i] = cur;
+ while (cur != NULL) {
+
+ if (cur->kif != prev->kif || cur->ifnot != prev->ifnot)
+ PF_SET_SKIP_STEPS(PF_SKIP_IFP);
+ if (cur->direction != prev->direction)
+ PF_SET_SKIP_STEPS(PF_SKIP_DIR);
+ if (cur->af != prev->af)
+ PF_SET_SKIP_STEPS(PF_SKIP_AF);
+ if (cur->proto != prev->proto)
+ PF_SET_SKIP_STEPS(PF_SKIP_PROTO);
+ if (cur->src.neg != prev->src.neg ||
+ pf_addr_wrap_neq(&cur->src.addr, &prev->src.addr))
+ PF_SET_SKIP_STEPS(PF_SKIP_SRC_ADDR);
+ if (cur->src.port[0] != prev->src.port[0] ||
+ cur->src.port[1] != prev->src.port[1] ||
+ cur->src.port_op != prev->src.port_op)
+ PF_SET_SKIP_STEPS(PF_SKIP_SRC_PORT);
+ if (cur->dst.neg != prev->dst.neg ||
+ pf_addr_wrap_neq(&cur->dst.addr, &prev->dst.addr))
+ PF_SET_SKIP_STEPS(PF_SKIP_DST_ADDR);
+ if (cur->dst.port[0] != prev->dst.port[0] ||
+ cur->dst.port[1] != prev->dst.port[1] ||
+ cur->dst.port_op != prev->dst.port_op)
+ PF_SET_SKIP_STEPS(PF_SKIP_DST_PORT);
+
+ prev = cur;
+ cur = TAILQ_NEXT(cur, entries);
+ }
+ for (i = 0; i < PF_SKIP_COUNT; ++i)
+ PF_SET_SKIP_STEPS(i);
+}
+
+static int
+pf_addr_wrap_neq(struct pf_addr_wrap *aw1, struct pf_addr_wrap *aw2)
+{
+ if (aw1->type != aw2->type)
+ return (1);
+ switch (aw1->type) {
+ case PF_ADDR_ADDRMASK:
+ case PF_ADDR_RANGE:
+ if (PF_ANEQ(&aw1->v.a.addr, &aw2->v.a.addr, 0))
+ return (1);
+ if (PF_ANEQ(&aw1->v.a.mask, &aw2->v.a.mask, 0))
+ return (1);
+ return (0);
+ case PF_ADDR_DYNIFTL:
+ return (aw1->p.dyn->pfid_kt != aw2->p.dyn->pfid_kt);
+ case PF_ADDR_NOROUTE:
+ case PF_ADDR_URPFFAILED:
+ return (0);
+ case PF_ADDR_TABLE:
+ return (aw1->p.tbl != aw2->p.tbl);
+ default:
+ printf("invalid address type: %d\n", aw1->type);
+ return (1);
+ }
+}
+
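+/*
+ * Incrementally update a 16-bit Internet checksum after a 16-bit word
+ * changed from "old" to "new". For UDP a zero checksum means "no
+ * checksum", so it is left untouched and never produced by the update.
+ */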
+u_int16_t
+pf_cksum_fixup(u_int16_t cksum, u_int16_t old, u_int16_t new, u_int8_t udp)
+{
+ u_int32_t l;
+
+ if (udp && !cksum)
+ return (0x0000);
+ l = cksum + old - new;
+ l = (l >> 16) + (l & 65535);
+ l = l & 65535;
+ if (udp && !l)
+ return (0xFFFF);
+ return (l);
+}
+
+static void
+pf_change_ap(struct pf_addr *a, u_int16_t *p, u_int16_t *ic, u_int16_t *pc,
+ struct pf_addr *an, u_int16_t pn, u_int8_t u, sa_family_t af)
+{
+ struct pf_addr ao;
+ u_int16_t po = *p;
+
+ PF_ACPY(&ao, a, af);
+ PF_ACPY(a, an, af);
+
+ *p = pn;
+
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ *ic = pf_cksum_fixup(pf_cksum_fixup(*ic,
+ ao.addr16[0], an->addr16[0], 0),
+ ao.addr16[1], an->addr16[1], 0);
+ *p = pn;
+ *pc = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(*pc,
+ ao.addr16[0], an->addr16[0], u),
+ ao.addr16[1], an->addr16[1], u),
+ po, pn, u);
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ *pc = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
+ pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
+ pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(*pc,
+ ao.addr16[0], an->addr16[0], u),
+ ao.addr16[1], an->addr16[1], u),
+ ao.addr16[2], an->addr16[2], u),
+ ao.addr16[3], an->addr16[3], u),
+ ao.addr16[4], an->addr16[4], u),
+ ao.addr16[5], an->addr16[5], u),
+ ao.addr16[6], an->addr16[6], u),
+ ao.addr16[7], an->addr16[7], u),
+ po, pn, u);
+ break;
+#endif /* INET6 */
+ }
+}
+
+
+/* Changes a u_int32_t. Uses a void * so there are no alignment restrictions. */
+void
+pf_change_a(void *a, u_int16_t *c, u_int32_t an, u_int8_t u)
+{
+ u_int32_t ao;
+
+ memcpy(&ao, a, sizeof(ao));
+ memcpy(a, &an, sizeof(u_int32_t));
+ *c = pf_cksum_fixup(pf_cksum_fixup(*c, ao / 65536, an / 65536, u),
+ ao % 65536, an % 65536, u);
+}
+
+#ifdef INET6
+static void
+pf_change_a6(struct pf_addr *a, u_int16_t *c, struct pf_addr *an, u_int8_t u)
+{
+ struct pf_addr ao;
+
+ PF_ACPY(&ao, a, AF_INET6);
+ PF_ACPY(a, an, AF_INET6);
+
+ *c = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
+ pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
+ pf_cksum_fixup(pf_cksum_fixup(*c,
+ ao.addr16[0], an->addr16[0], u),
+ ao.addr16[1], an->addr16[1], u),
+ ao.addr16[2], an->addr16[2], u),
+ ao.addr16[3], an->addr16[3], u),
+ ao.addr16[4], an->addr16[4], u),
+ ao.addr16[5], an->addr16[5], u),
+ ao.addr16[6], an->addr16[6], u),
+ ao.addr16[7], an->addr16[7], u);
+}
+#endif /* INET6 */
+
+static void
+pf_change_icmp(struct pf_addr *ia, u_int16_t *ip, struct pf_addr *oa,
+ struct pf_addr *na, u_int16_t np, u_int16_t *pc, u_int16_t *h2c,
+ u_int16_t *ic, u_int16_t *hc, u_int8_t u, sa_family_t af)
+{
+ struct pf_addr oia, ooa;
+
+ PF_ACPY(&oia, ia, af);
+ if (oa)
+ PF_ACPY(&ooa, oa, af);
+
+ /* Change inner protocol port, fix inner protocol checksum. */
+ if (ip != NULL) {
+ u_int16_t oip = *ip;
+ u_int32_t opc;
+
+ if (pc != NULL)
+ opc = *pc;
+ *ip = np;
+ if (pc != NULL)
+ *pc = pf_cksum_fixup(*pc, oip, *ip, u);
+ *ic = pf_cksum_fixup(*ic, oip, *ip, 0);
+ if (pc != NULL)
+ *ic = pf_cksum_fixup(*ic, opc, *pc, 0);
+ }
+ /* Change inner ip address, fix inner ip and icmp checksums. */
+ PF_ACPY(ia, na, af);
+ switch (af) {
+#ifdef INET
+ case AF_INET: {
+ u_int32_t oh2c = *h2c;
+
+ *h2c = pf_cksum_fixup(pf_cksum_fixup(*h2c,
+ oia.addr16[0], ia->addr16[0], 0),
+ oia.addr16[1], ia->addr16[1], 0);
+ *ic = pf_cksum_fixup(pf_cksum_fixup(*ic,
+ oia.addr16[0], ia->addr16[0], 0),
+ oia.addr16[1], ia->addr16[1], 0);
+ *ic = pf_cksum_fixup(*ic, oh2c, *h2c, 0);
+ break;
+ }
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ *ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
+ pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
+ pf_cksum_fixup(pf_cksum_fixup(*ic,
+ oia.addr16[0], ia->addr16[0], u),
+ oia.addr16[1], ia->addr16[1], u),
+ oia.addr16[2], ia->addr16[2], u),
+ oia.addr16[3], ia->addr16[3], u),
+ oia.addr16[4], ia->addr16[4], u),
+ oia.addr16[5], ia->addr16[5], u),
+ oia.addr16[6], ia->addr16[6], u),
+ oia.addr16[7], ia->addr16[7], u);
+ break;
+#endif /* INET6 */
+ }
+ /* Outer ip address, fix outer ip or icmpv6 checksum, if necessary. */
+ if (oa) {
+ PF_ACPY(oa, na, af);
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ *hc = pf_cksum_fixup(pf_cksum_fixup(*hc,
+ ooa.addr16[0], oa->addr16[0], 0),
+ ooa.addr16[1], oa->addr16[1], 0);
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ *ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
+ pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
+ pf_cksum_fixup(pf_cksum_fixup(*ic,
+ ooa.addr16[0], oa->addr16[0], u),
+ ooa.addr16[1], oa->addr16[1], u),
+ ooa.addr16[2], oa->addr16[2], u),
+ ooa.addr16[3], oa->addr16[3], u),
+ ooa.addr16[4], oa->addr16[4], u),
+ ooa.addr16[5], oa->addr16[5], u),
+ ooa.addr16[6], oa->addr16[6], u),
+ ooa.addr16[7], oa->addr16[7], u);
+ break;
+#endif /* INET6 */
+ }
+ }
+}
+
+
+/*
+ * Need to modulate the sequence numbers in the TCP SACK option
+ * (credits to Krzysztof Pfaff for report and patch)
+ */
+static int
+pf_modulate_sack(struct mbuf *m, int off, struct pf_pdesc *pd,
+ struct tcphdr *th, struct pf_state_peer *dst)
+{
+ int hlen = (th->th_off << 2) - sizeof(*th), thoptlen = hlen;
+ u_int8_t opts[TCP_MAXOLEN], *opt = opts;
+ int copyback = 0, i, olen;
+ struct sackblk sack;
+
+#define TCPOLEN_SACKLEN (TCPOLEN_SACK + 2)
+ if (hlen < TCPOLEN_SACKLEN ||
+ !pf_pull_hdr(m, off + sizeof(*th), opts, hlen, NULL, NULL, pd->af))
+ return 0;
+
+ while (hlen >= TCPOLEN_SACKLEN) {
+ olen = opt[1];
+ switch (*opt) {
+ case TCPOPT_EOL: /* FALLTHROUGH */
+ case TCPOPT_NOP:
+ opt++;
+ hlen--;
+ break;
+ case TCPOPT_SACK:
+ if (olen > hlen)
+ olen = hlen;
+ if (olen >= TCPOLEN_SACKLEN) {
+ for (i = 2; i + TCPOLEN_SACK <= olen;
+ i += TCPOLEN_SACK) {
+ memcpy(&sack, &opt[i], sizeof(sack));
+ pf_change_a(&sack.start, &th->th_sum,
+ htonl(ntohl(sack.start) -
+ dst->seqdiff), 0);
+ pf_change_a(&sack.end, &th->th_sum,
+ htonl(ntohl(sack.end) -
+ dst->seqdiff), 0);
+ memcpy(&opt[i], &sack, sizeof(sack));
+ }
+ copyback = 1;
+ }
+ /* FALLTHROUGH */
+ default:
+ if (olen < 2)
+ olen = 2;
+ hlen -= olen;
+ opt += olen;
+ }
+ }
+
+ if (copyback)
+ m_copyback(m, off + sizeof(*th), thoptlen, (caddr_t)opts);
+ return (copyback);
+}
+
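+/*
+ * Construct a TCP segment (e.g. the RST|ACK sent when a PF_TCPS_PROXY_DST
+ * state is torn down) from scratch and queue it through pf_send() for
+ * transmission from the software interrupt handler.
+ */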
+static void
+pf_send_tcp(struct mbuf *replyto, const struct pf_rule *r, sa_family_t af,
+ const struct pf_addr *saddr, const struct pf_addr *daddr,
+ u_int16_t sport, u_int16_t dport, u_int32_t seq, u_int32_t ack,
+ u_int8_t flags, u_int16_t win, u_int16_t mss, u_int8_t ttl, int tag,
+ u_int16_t rtag, struct ifnet *ifp)
+{
+ struct pf_send_entry *pfse;
+ struct mbuf *m;
+ int len, tlen;
+#ifdef INET
+ struct ip *h = NULL;
+#endif /* INET */
+#ifdef INET6
+ struct ip6_hdr *h6 = NULL;
+#endif /* INET6 */
+ struct tcphdr *th;
+ char *opt;
+ struct pf_mtag *pf_mtag;
+
+ len = 0;
+ th = NULL;
+
+ /* maximum segment size tcp option */
+ tlen = sizeof(struct tcphdr);
+ if (mss)
+ tlen += 4;
+
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ len = sizeof(struct ip) + tlen;
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ len = sizeof(struct ip6_hdr) + tlen;
+ break;
+#endif /* INET6 */
+ default:
+ panic("%s: unsupported af %d", __func__, af);
+ }
+
+ /* Allocate outgoing queue entry, mbuf and mbuf tag. */
+ pfse = malloc(sizeof(*pfse), M_PFTEMP, M_NOWAIT);
+ if (pfse == NULL)
+ return;
+ m = m_gethdr(M_NOWAIT, MT_HEADER);
+ if (m == NULL) {
+ free(pfse, M_PFTEMP);
+ return;
+ }
+#ifdef MAC
+ mac_netinet_firewall_send(m);
+#endif
+ if ((pf_mtag = pf_get_mtag(m)) == NULL) {
+ free(pfse, M_PFTEMP);
+ m_freem(m);
+ return;
+ }
+ if (tag)
+ m->m_flags |= M_SKIP_FIREWALL;
+ pf_mtag->tag = rtag;
+
+ if (r != NULL && r->rtableid >= 0)
+ M_SETFIB(m, r->rtableid);
+
+#ifdef ALTQ
+ if (r != NULL && r->qid) {
+ pf_mtag->qid = r->qid;
+
+ /* add hints for ecn */
+ pf_mtag->hdr = mtod(m, struct ip *);
+ }
+#endif /* ALTQ */
+ m->m_data += max_linkhdr;
+ m->m_pkthdr.len = m->m_len = len;
+ m->m_pkthdr.rcvif = NULL;
+ bzero(m->m_data, len);
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ h = mtod(m, struct ip *);
+
+ /* IP header fields included in the TCP checksum */
+ h->ip_p = IPPROTO_TCP;
+ h->ip_len = htons(tlen);
+ h->ip_src.s_addr = saddr->v4.s_addr;
+ h->ip_dst.s_addr = daddr->v4.s_addr;
+
+ th = (struct tcphdr *)((caddr_t)h + sizeof(struct ip));
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ h6 = mtod(m, struct ip6_hdr *);
+
+ /* IP header fields included in the TCP checksum */
+ h6->ip6_nxt = IPPROTO_TCP;
+ h6->ip6_plen = htons(tlen);
+ memcpy(&h6->ip6_src, &saddr->v6, sizeof(struct in6_addr));
+ memcpy(&h6->ip6_dst, &daddr->v6, sizeof(struct in6_addr));
+
+ th = (struct tcphdr *)((caddr_t)h6 + sizeof(struct ip6_hdr));
+ break;
+#endif /* INET6 */
+ }
+
+ /* TCP header */
+ th->th_sport = sport;
+ th->th_dport = dport;
+ th->th_seq = htonl(seq);
+ th->th_ack = htonl(ack);
+ th->th_off = tlen >> 2;
+ th->th_flags = flags;
+ th->th_win = htons(win);
+
+ if (mss) {
+ opt = (char *)(th + 1);
+ opt[0] = TCPOPT_MAXSEG;
+ opt[1] = 4;
+ HTONS(mss);
+ bcopy((caddr_t)&mss, (caddr_t)(opt + 2), 2);
+ }
+
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ /* TCP checksum */
+ th->th_sum = in_cksum(m, len);
+
+ /* Finish the IP header */
+ h->ip_v = 4;
+ h->ip_hl = sizeof(*h) >> 2;
+ h->ip_tos = IPTOS_LOWDELAY;
+ h->ip_off = V_path_mtu_discovery ? IP_DF : 0;
+ h->ip_len = len;
+ h->ip_ttl = ttl ? ttl : V_ip_defttl;
+ h->ip_sum = 0;
+
+ pfse->pfse_type = PFSE_IP;
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ /* TCP checksum */
+ th->th_sum = in6_cksum(m, IPPROTO_TCP,
+ sizeof(struct ip6_hdr), tlen);
+
+ h6->ip6_vfc |= IPV6_VERSION;
+ h6->ip6_hlim = IPV6_DEFHLIM;
+
+ pfse->pfse_type = PFSE_IP6;
+ break;
+#endif /* INET6 */
+ }
+ pfse->pfse_m = m;
+ pf_send(pfse);
+}
+
+static void
+pf_send_icmp(struct mbuf *m, u_int8_t type, u_int8_t code, sa_family_t af,
+ struct pf_rule *r)
+{
+ struct pf_send_entry *pfse;
+ struct mbuf *m0;
+ struct pf_mtag *pf_mtag;
+
+ /* Allocate outgoing queue entry, mbuf and mbuf tag. */
+ pfse = malloc(sizeof(*pfse), M_PFTEMP, M_NOWAIT);
+ if (pfse == NULL)
+ return;
+
+ if ((m0 = m_copypacket(m, M_NOWAIT)) == NULL) {
+ free(pfse, M_PFTEMP);
+ return;
+ }
+
+ if ((pf_mtag = pf_get_mtag(m0)) == NULL) {
+ free(pfse, M_PFTEMP);
+ return;
+ }
+ /* XXX: revisit */
+ m0->m_flags |= M_SKIP_FIREWALL;
+
+ if (r->rtableid >= 0)
+ M_SETFIB(m0, r->rtableid);
+
+#ifdef ALTQ
+ if (r->qid) {
+ pf_mtag->qid = r->qid;
+ /* add hints for ecn */
+ pf_mtag->hdr = mtod(m0, struct ip *);
+ }
+#endif /* ALTQ */
+
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ {
+ struct ip *ip;
+
+ /* icmp_error() expects host byte ordering */
+ ip = mtod(m0, struct ip *);
+ NTOHS(ip->ip_len);
+ NTOHS(ip->ip_off);
+
+ pfse->pfse_type = PFSE_ICMP;
+ break;
+ }
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ pfse->pfse_type = PFSE_ICMP6;
+ break;
+#endif /* INET6 */
+ }
+ pfse->pfse_m = m0;
+ pfse->pfse_icmp_type = type;
+ pfse->pfse_icmp_code = code;
+ pf_send(pfse);
+}
+
+/*
+ * Compare the addresses a and b under mask m. If n is 0, return 1 when
+ * they are equal and 0 otherwise; if n is non-zero the sense is inverted
+ * and 1 is returned when they differ.
+ */
+int
+pf_match_addr(u_int8_t n, struct pf_addr *a, struct pf_addr *m,
+ struct pf_addr *b, sa_family_t af)
+{
+ int match = 0;
+
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ if ((a->addr32[0] & m->addr32[0]) ==
+ (b->addr32[0] & m->addr32[0]))
+ match++;
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ if (((a->addr32[0] & m->addr32[0]) ==
+ (b->addr32[0] & m->addr32[0])) &&
+ ((a->addr32[1] & m->addr32[1]) ==
+ (b->addr32[1] & m->addr32[1])) &&
+ ((a->addr32[2] & m->addr32[2]) ==
+ (b->addr32[2] & m->addr32[2])) &&
+ ((a->addr32[3] & m->addr32[3]) ==
+ (b->addr32[3] & m->addr32[3])))
+ match++;
+ break;
+#endif /* INET6 */
+ }
+ if (match) {
+ if (n)
+ return (0);
+ else
+ return (1);
+ } else {
+ if (n)
+ return (1);
+ else
+ return (0);
+ }
+}
+
+/*
+ * Return 1 if b <= a <= e, otherwise return 0.
+ */
+int
+pf_match_addr_range(struct pf_addr *b, struct pf_addr *e,
+ struct pf_addr *a, sa_family_t af)
+{
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ if ((a->addr32[0] < b->addr32[0]) ||
+ (a->addr32[0] > e->addr32[0]))
+ return (0);
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6: {
+ int i;
+
+ /* check a >= b */
+ for (i = 0; i < 4; ++i)
+ if (a->addr32[i] > b->addr32[i])
+ break;
+ else if (a->addr32[i] < b->addr32[i])
+ return (0);
+ /* check a <= e */
+ for (i = 0; i < 4; ++i)
+ if (a->addr32[i] < e->addr32[i])
+ break;
+ else if (a->addr32[i] > e->addr32[i])
+ return (0);
+ break;
+ }
+#endif /* INET6 */
+ }
+ return (1);
+}
+
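+/*
+ * Evaluate a rule operator (ranges, equality, inequality, ...) against a
+ * value; used for port, uid and gid matching below.
+ */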
+static int
+pf_match(u_int8_t op, u_int32_t a1, u_int32_t a2, u_int32_t p)
+{
+ switch (op) {
+ case PF_OP_IRG:
+ return ((p > a1) && (p < a2));
+ case PF_OP_XRG:
+ return ((p < a1) || (p > a2));
+ case PF_OP_RRG:
+ return ((p >= a1) && (p <= a2));
+ case PF_OP_EQ:
+ return (p == a1);
+ case PF_OP_NE:
+ return (p != a1);
+ case PF_OP_LT:
+ return (p < a1);
+ case PF_OP_LE:
+ return (p <= a1);
+ case PF_OP_GT:
+ return (p > a1);
+ case PF_OP_GE:
+ return (p >= a1);
+ }
+ return (0); /* never reached */
+}
+
+int
+pf_match_port(u_int8_t op, u_int16_t a1, u_int16_t a2, u_int16_t p)
+{
+ NTOHS(a1);
+ NTOHS(a2);
+ NTOHS(p);
+ return (pf_match(op, a1, a2, p));
+}
+
+static int
+pf_match_uid(u_int8_t op, uid_t a1, uid_t a2, uid_t u)
+{
+ if (u == UID_MAX && op != PF_OP_EQ && op != PF_OP_NE)
+ return (0);
+ return (pf_match(op, a1, a2, u));
+}
+
+static int
+pf_match_gid(u_int8_t op, gid_t a1, gid_t a2, gid_t g)
+{
+ if (g == GID_MAX && op != PF_OP_EQ && op != PF_OP_NE)
+ return (0);
+ return (pf_match(op, a1, a2, g));
+}
+
+int
+pf_match_tag(struct mbuf *m, struct pf_rule *r, int *tag, int mtag)
+{
+ if (*tag == -1)
+ *tag = mtag;
+
+ return ((!r->match_tag_not && r->match_tag == *tag) ||
+ (r->match_tag_not && r->match_tag != *tag));
+}
+
+int
+pf_tag_packet(struct mbuf *m, struct pf_pdesc *pd, int tag)
+{
+
+ KASSERT(tag > 0, ("%s: tag %d", __func__, tag));
+
+ if (pd->pf_mtag == NULL && ((pd->pf_mtag = pf_get_mtag(m)) == NULL))
+ return (ENOMEM);
+
+ pd->pf_mtag->tag = tag;
+
+ return (0);
+}
+
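+/*
+ * Descend into an anchor, pushing the current ruleset and rule onto the
+ * anchor stack. For a wildcard anchor the children are iterated starting
+ * at the first one; pf_step_out_of_anchor() advances and eventually pops.
+ */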
+void
+pf_step_into_anchor(int *depth, struct pf_ruleset **rs, int n,
+ struct pf_rule **r, struct pf_rule **a, int *match)
+{
+ struct pf_anchor_stackframe *f;
+
+ PF_RULES_RASSERT();
+
+ (*r)->anchor->match = 0;
+ if (match)
+ *match = 0;
+ if (*depth >= sizeof(V_pf_anchor_stack) /
+ sizeof(V_pf_anchor_stack[0])) {
+ printf("pf_step_into_anchor: stack overflow\n");
+ *r = TAILQ_NEXT(*r, entries);
+ return;
+ } else if (*depth == 0 && a != NULL)
+ *a = *r;
+ f = V_pf_anchor_stack + (*depth)++;
+ f->rs = *rs;
+ f->r = *r;
+ if ((*r)->anchor_wildcard) {
+ f->parent = &(*r)->anchor->children;
+ if ((f->child = RB_MIN(pf_anchor_node, f->parent)) ==
+ NULL) {
+ *r = NULL;
+ return;
+ }
+ *rs = &f->child->ruleset;
+ } else {
+ f->parent = NULL;
+ f->child = NULL;
+ *rs = &(*r)->anchor->ruleset;
+ }
+ *r = TAILQ_FIRST((*rs)->rules[n].active.ptr);
+}
+
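+/*
+ * Return from an anchor once its ruleset is exhausted: advance to the
+ * next wildcard child if any, otherwise pop the stack frame, propagate
+ * the match result to the enclosing rule and resume evaluation after it.
+ * Returns the anchor rule's "quick" flag if a rule inside it matched.
+ */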
+int
+pf_step_out_of_anchor(int *depth, struct pf_ruleset **rs, int n,
+ struct pf_rule **r, struct pf_rule **a, int *match)
+{
+ struct pf_anchor_stackframe *f;
+ int quick = 0;
+
+ PF_RULES_RASSERT();
+
+ do {
+ if (*depth <= 0)
+ break;
+ f = V_pf_anchor_stack + *depth - 1;
+ if (f->parent != NULL && f->child != NULL) {
+ if (f->child->match ||
+ (match != NULL && *match)) {
+ f->r->anchor->match = 1;
+ *match = 0;
+ }
+ f->child = RB_NEXT(pf_anchor_node, f->parent, f->child);
+ if (f->child != NULL) {
+ *rs = &f->child->ruleset;
+ *r = TAILQ_FIRST((*rs)->rules[n].active.ptr);
+ if (*r == NULL)
+ continue;
+ else
+ break;
+ }
+ }
+ (*depth)--;
+ if (*depth == 0 && a != NULL)
+ *a = NULL;
+ *rs = f->rs;
+ if (f->r->anchor->match || (match != NULL && *match))
+ quick = f->r->quick;
+ *r = TAILQ_NEXT(f->r, entries);
+ } while (*r == NULL);
+
+ return (quick);
+}
+
+#ifdef INET6
+void
+pf_poolmask(struct pf_addr *naddr, struct pf_addr *raddr,
+ struct pf_addr *rmask, struct pf_addr *saddr, sa_family_t af)
+{
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) |
+ ((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]);
+ break;
+#endif /* INET */
+ case AF_INET6:
+ naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) |
+ ((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]);
+ naddr->addr32[1] = (raddr->addr32[1] & rmask->addr32[1]) |
+ ((rmask->addr32[1] ^ 0xffffffff ) & saddr->addr32[1]);
+ naddr->addr32[2] = (raddr->addr32[2] & rmask->addr32[2]) |
+ ((rmask->addr32[2] ^ 0xffffffff ) & saddr->addr32[2]);
+ naddr->addr32[3] = (raddr->addr32[3] & rmask->addr32[3]) |
+ ((rmask->addr32[3] ^ 0xffffffff ) & saddr->addr32[3]);
+ break;
+ }
+}
+
+void
+pf_addr_inc(struct pf_addr *addr, sa_family_t af)
+{
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ addr->addr32[0] = htonl(ntohl(addr->addr32[0]) + 1);
+ break;
+#endif /* INET */
+ case AF_INET6:
+ if (addr->addr32[3] == 0xffffffff) {
+ addr->addr32[3] = 0;
+ if (addr->addr32[2] == 0xffffffff) {
+ addr->addr32[2] = 0;
+ if (addr->addr32[1] == 0xffffffff) {
+ addr->addr32[1] = 0;
+ addr->addr32[0] =
+ htonl(ntohl(addr->addr32[0]) + 1);
+ } else
+ addr->addr32[1] =
+ htonl(ntohl(addr->addr32[1]) + 1);
+ } else
+ addr->addr32[2] =
+ htonl(ntohl(addr->addr32[2]) + 1);
+ } else
+ addr->addr32[3] =
+ htonl(ntohl(addr->addr32[3]) + 1);
+ break;
+ }
+}
+#endif /* INET6 */
+
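+/*
+ * Find the local socket (PCB) owning this TCP/UDP packet and record its
+ * credentials in pd->lookup so that user/group rules can be evaluated.
+ * Returns 1 on success, -1 if no socket was found or the protocol is
+ * not supported.
+ */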
+int
+pf_socket_lookup(int direction, struct pf_pdesc *pd, struct mbuf *m)
+{
+ struct pf_addr *saddr, *daddr;
+ u_int16_t sport, dport;
+ struct inpcbinfo *pi;
+ struct inpcb *inp;
+
+ pd->lookup.uid = UID_MAX;
+ pd->lookup.gid = GID_MAX;
+
+ switch (pd->proto) {
+ case IPPROTO_TCP:
+ if (pd->hdr.tcp == NULL)
+ return (-1);
+ sport = pd->hdr.tcp->th_sport;
+ dport = pd->hdr.tcp->th_dport;
+ pi = &V_tcbinfo;
+ break;
+ case IPPROTO_UDP:
+ if (pd->hdr.udp == NULL)
+ return (-1);
+ sport = pd->hdr.udp->uh_sport;
+ dport = pd->hdr.udp->uh_dport;
+ pi = &V_udbinfo;
+ break;
+ default:
+ return (-1);
+ }
+ if (direction == PF_IN) {
+ saddr = pd->src;
+ daddr = pd->dst;
+ } else {
+ u_int16_t p;
+
+ p = sport;
+ sport = dport;
+ dport = p;
+ saddr = pd->dst;
+ daddr = pd->src;
+ }
+ switch (pd->af) {
+#ifdef INET
+ case AF_INET:
+ inp = in_pcblookup_mbuf(pi, saddr->v4, sport, daddr->v4,
+ dport, INPLOOKUP_RLOCKPCB, NULL, m);
+ if (inp == NULL) {
+ inp = in_pcblookup_mbuf(pi, saddr->v4, sport,
+ daddr->v4, dport, INPLOOKUP_WILDCARD |
+ INPLOOKUP_RLOCKPCB, NULL, m);
+ if (inp == NULL)
+ return (-1);
+ }
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ inp = in6_pcblookup_mbuf(pi, &saddr->v6, sport, &daddr->v6,
+ dport, INPLOOKUP_RLOCKPCB, NULL, m);
+ if (inp == NULL) {
+ inp = in6_pcblookup_mbuf(pi, &saddr->v6, sport,
+ &daddr->v6, dport, INPLOOKUP_WILDCARD |
+ INPLOOKUP_RLOCKPCB, NULL, m);
+ if (inp == NULL)
+ return (-1);
+ }
+ break;
+#endif /* INET6 */
+
+ default:
+ return (-1);
+ }
+ INP_RLOCK_ASSERT(inp);
+ pd->lookup.uid = inp->inp_cred->cr_uid;
+ pd->lookup.gid = inp->inp_cred->cr_groups[0];
+ INP_RUNLOCK(inp);
+
+ return (1);
+}
+
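+/*
+ * Walk the TCP options of a SYN segment and extract the window scale
+ * factor.  PF_WSCALE_FLAG is set in the return value to record that the
+ * option was present at all.
+ */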
+static u_int8_t
+pf_get_wscale(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af)
+{
+ int hlen;
+ u_int8_t hdr[60];
+ u_int8_t *opt, optlen;
+ u_int8_t wscale = 0;
+
+ hlen = th_off << 2; /* hlen <= sizeof(hdr) */
+ if (hlen <= sizeof(struct tcphdr))
+ return (0);
+ if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af))
+ return (0);
+ opt = hdr + sizeof(struct tcphdr);
+ hlen -= sizeof(struct tcphdr);
+ while (hlen >= 3) {
+ switch (*opt) {
+ case TCPOPT_EOL:
+ case TCPOPT_NOP:
+ ++opt;
+ --hlen;
+ break;
+ case TCPOPT_WINDOW:
+ wscale = opt[2];
+ if (wscale > TCP_MAX_WINSHIFT)
+ wscale = TCP_MAX_WINSHIFT;
+ wscale |= PF_WSCALE_FLAG;
+ /* FALLTHROUGH */
+ default:
+ optlen = opt[1];
+ if (optlen < 2)
+ optlen = 2;
+ hlen -= optlen;
+ opt += optlen;
+ break;
+ }
+ }
+ return (wscale);
+}
+
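+/*
+ * Walk the TCP options and extract the MSS option, falling back to the
+ * system default (V_tcp_mssdflt) if it is absent.
+ */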
+static u_int16_t
+pf_get_mss(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af)
+{
+ int hlen;
+ u_int8_t hdr[60];
+ u_int8_t *opt, optlen;
+ u_int16_t mss = V_tcp_mssdflt;
+
+ hlen = th_off << 2; /* hlen <= sizeof(hdr) */
+ if (hlen <= sizeof(struct tcphdr))
+ return (0);
+ if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af))
+ return (0);
+ opt = hdr + sizeof(struct tcphdr);
+ hlen -= sizeof(struct tcphdr);
+ while (hlen >= TCPOLEN_MAXSEG) {
+ switch (*opt) {
+ case TCPOPT_EOL:
+ case TCPOPT_NOP:
+ ++opt;
+ --hlen;
+ break;
+ case TCPOPT_MAXSEG:
+ bcopy((caddr_t)(opt + 2), (caddr_t)&mss, 2);
+ NTOHS(mss);
+ /* FALLTHROUGH */
+ default:
+ optlen = opt[1];
+ if (optlen < 2)
+ optlen = 2;
+ hlen -= optlen;
+ opt += optlen;
+ break;
+ }
+ }
+ return (mss);
+}
+
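+/*
+ * Clamp an offered MSS against the MTU of the route towards the given
+ * address; used when synproxy completes the handshake on behalf of the
+ * destination.
+ */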
+static u_int16_t
+pf_calc_mss(struct pf_addr *addr, sa_family_t af, int rtableid, u_int16_t offer)
+{
+#ifdef INET
+ struct sockaddr_in *dst;
+ struct route ro;
+#endif /* INET */
+#ifdef INET6
+ struct sockaddr_in6 *dst6;
+ struct route_in6 ro6;
+#endif /* INET6 */
+ struct rtentry *rt = NULL;
+ int hlen = 0;
+ u_int16_t mss = V_tcp_mssdflt;
+
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ hlen = sizeof(struct ip);
+ bzero(&ro, sizeof(ro));
+ dst = (struct sockaddr_in *)&ro.ro_dst;
+ dst->sin_family = AF_INET;
+ dst->sin_len = sizeof(*dst);
+ dst->sin_addr = addr->v4;
+ in_rtalloc_ign(&ro, 0, rtableid);
+ rt = ro.ro_rt;
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ hlen = sizeof(struct ip6_hdr);
+ bzero(&ro6, sizeof(ro6));
+ dst6 = (struct sockaddr_in6 *)&ro6.ro_dst;
+ dst6->sin6_family = AF_INET6;
+ dst6->sin6_len = sizeof(*dst6);
+ dst6->sin6_addr = addr->v6;
+ in6_rtalloc_ign(&ro6, 0, rtableid);
+ rt = ro6.ro_rt;
+ break;
+#endif /* INET6 */
+ }
+
+ if (rt && rt->rt_ifp) {
+ mss = rt->rt_ifp->if_mtu - hlen - sizeof(struct tcphdr);
+ mss = max(V_tcp_mssdflt, mss);
+ RTFREE(rt);
+ }
+ mss = min(mss, offer);
+ mss = max(mss, 64); /* sanity - at least max opt space */
+ return (mss);
+}
+
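+/*
+ * For route-to/reply-to rules, resolve the forwarding interface and
+ * address for this state from the rule's address pool.
+ */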
+static void
+pf_set_rt_ifp(struct pf_state *s, struct pf_addr *saddr)
+{
+ struct pf_rule *r = s->rule.ptr;
+ struct pf_src_node *sn = NULL;
+
+ s->rt_kif = NULL;
+ if (!r->rt || r->rt == PF_FASTROUTE)
+ return;
+ switch (s->key[PF_SK_WIRE]->af) {
+#ifdef INET
+ case AF_INET:
+ pf_map_addr(AF_INET, r, saddr, &s->rt_addr, NULL, &sn);
+ s->rt_kif = r->rpool.cur->kif;
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ pf_map_addr(AF_INET6, r, saddr, &s->rt_addr, NULL, &sn);
+ s->rt_kif = r->rpool.cur->kif;
+ break;
+#endif /* INET6 */
+ }
+}
+
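+/*
+ * Generate an unpredictable initial sequence number for sequence number
+ * modulation and synproxy: an MD5 over a boot-time random secret and the
+ * connection's addresses and ports, plus a per-call increasing offset
+ * and a small random increment.
+ */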
+static u_int32_t
+pf_tcp_iss(struct pf_pdesc *pd)
+{
+ MD5_CTX ctx;
+ u_int32_t digest[4];
+
+ if (V_pf_tcp_secret_init == 0) {
+ read_random(&V_pf_tcp_secret, sizeof(V_pf_tcp_secret));
+ MD5Init(&V_pf_tcp_secret_ctx);
+ MD5Update(&V_pf_tcp_secret_ctx, V_pf_tcp_secret,
+ sizeof(V_pf_tcp_secret));
+ V_pf_tcp_secret_init = 1;
+ }
+
+ ctx = V_pf_tcp_secret_ctx;
+
+ MD5Update(&ctx, (char *)&pd->hdr.tcp->th_sport, sizeof(u_short));
+ MD5Update(&ctx, (char *)&pd->hdr.tcp->th_dport, sizeof(u_short));
+ if (pd->af == AF_INET6) {
+ MD5Update(&ctx, (char *)&pd->src->v6, sizeof(struct in6_addr));
+ MD5Update(&ctx, (char *)&pd->dst->v6, sizeof(struct in6_addr));
+ } else {
+ MD5Update(&ctx, (char *)&pd->src->v4, sizeof(struct in_addr));
+ MD5Update(&ctx, (char *)&pd->dst->v4, sizeof(struct in_addr));
+ }
+ MD5Final((u_char *)digest, &ctx);
+ V_pf_tcp_iss_off += 4096;
+#define ISN_RANDOM_INCREMENT (4096 - 1)
+ return (digest[0] + (arc4random() & ISN_RANDOM_INCREMENT) +
+ V_pf_tcp_iss_off);
+#undef ISN_RANDOM_INCREMENT
+}
+
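+/*
+ * Run a packet that has no matching state through the filter ruleset:
+ * apply any BINAT/NAT/RDR translation first, evaluate the active filter
+ * rules (descending into anchors), emit TCP RSTs or ICMP errors for
+ * blocked packets that ask for them, and create state for pass rules
+ * that keep state.
+ */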
+static int
+pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction,
+ struct pfi_kif *kif, struct mbuf *m, int off, struct pf_pdesc *pd,
+ struct pf_rule **am, struct pf_ruleset **rsm, struct inpcb *inp)
+{
+ struct pf_rule *nr = NULL;
+ struct pf_addr * const saddr = pd->src;
+ struct pf_addr * const daddr = pd->dst;
+ sa_family_t af = pd->af;
+ struct pf_rule *r, *a = NULL;
+ struct pf_ruleset *ruleset = NULL;
+ struct pf_src_node *nsn = NULL;
+ struct tcphdr *th = pd->hdr.tcp;
+ struct pf_state_key *sk = NULL, *nk = NULL;
+ u_short reason;
+ int rewrite = 0, hdrlen = 0;
+ int tag = -1, rtableid = -1;
+ int asd = 0;
+ int match = 0;
+ int state_icmp = 0;
+ u_int16_t sport = 0, dport = 0;
+ u_int16_t bproto_sum = 0, bip_sum = 0;
+ u_int8_t icmptype = 0, icmpcode = 0;
+
+ PF_RULES_RASSERT();
+
+ if (inp != NULL) {
+ INP_LOCK_ASSERT(inp);
+ pd->lookup.uid = inp->inp_cred->cr_uid;
+ pd->lookup.gid = inp->inp_cred->cr_groups[0];
+ pd->lookup.done = 1;
+ }
+
+ switch (pd->proto) {
+ case IPPROTO_TCP:
+ sport = th->th_sport;
+ dport = th->th_dport;
+ hdrlen = sizeof(*th);
+ break;
+ case IPPROTO_UDP:
+ sport = pd->hdr.udp->uh_sport;
+ dport = pd->hdr.udp->uh_dport;
+ hdrlen = sizeof(*pd->hdr.udp);
+ break;
+#ifdef INET
+ case IPPROTO_ICMP:
+ if (pd->af != AF_INET)
+ break;
+ sport = dport = pd->hdr.icmp->icmp_id;
+ hdrlen = sizeof(*pd->hdr.icmp);
+ icmptype = pd->hdr.icmp->icmp_type;
+ icmpcode = pd->hdr.icmp->icmp_code;
+
+ if (icmptype == ICMP_UNREACH ||
+ icmptype == ICMP_SOURCEQUENCH ||
+ icmptype == ICMP_REDIRECT ||
+ icmptype == ICMP_TIMXCEED ||
+ icmptype == ICMP_PARAMPROB)
+ state_icmp++;
+ break;
+#endif /* INET */
+#ifdef INET6
+ case IPPROTO_ICMPV6:
+ if (af != AF_INET6)
+ break;
+ sport = dport = pd->hdr.icmp6->icmp6_id;
+ hdrlen = sizeof(*pd->hdr.icmp6);
+ icmptype = pd->hdr.icmp6->icmp6_type;
+ icmpcode = pd->hdr.icmp6->icmp6_code;
+
+ if (icmptype == ICMP6_DST_UNREACH ||
+ icmptype == ICMP6_PACKET_TOO_BIG ||
+ icmptype == ICMP6_TIME_EXCEEDED ||
+ icmptype == ICMP6_PARAM_PROB)
+ state_icmp++;
+ break;
+#endif /* INET6 */
+ default:
+ sport = dport = hdrlen = 0;
+ break;
+ }
+
+ r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr);
+
+ /* check packet for BINAT/NAT/RDR */
+ if ((nr = pf_get_translation(pd, m, off, direction, kif, &nsn, &sk,
+ &nk, saddr, daddr, sport, dport)) != NULL) {
+ KASSERT(sk != NULL, ("%s: null sk", __func__));
+ KASSERT(nk != NULL, ("%s: null nk", __func__));
+
+ if (pd->ip_sum)
+ bip_sum = *pd->ip_sum;
+
+ switch (pd->proto) {
+ case IPPROTO_TCP:
+ bproto_sum = th->th_sum;
+ pd->proto_sum = &th->th_sum;
+
+ if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) ||
+ nk->port[pd->sidx] != sport) {
+ pf_change_ap(saddr, &th->th_sport, pd->ip_sum,
+ &th->th_sum, &nk->addr[pd->sidx],
+ nk->port[pd->sidx], 0, af);
+ pd->sport = &th->th_sport;
+ sport = th->th_sport;
+ }
+
+ if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) ||
+ nk->port[pd->didx] != dport) {
+ pf_change_ap(daddr, &th->th_dport, pd->ip_sum,
+ &th->th_sum, &nk->addr[pd->didx],
+ nk->port[pd->didx], 0, af);
+ dport = th->th_dport;
+ pd->dport = &th->th_dport;
+ }
+ rewrite++;
+ break;
+ case IPPROTO_UDP:
+ bproto_sum = pd->hdr.udp->uh_sum;
+ pd->proto_sum = &pd->hdr.udp->uh_sum;
+
+ if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) ||
+ nk->port[pd->sidx] != sport) {
+ pf_change_ap(saddr, &pd->hdr.udp->uh_sport,
+ pd->ip_sum, &pd->hdr.udp->uh_sum,
+ &nk->addr[pd->sidx],
+ nk->port[pd->sidx], 1, af);
+ sport = pd->hdr.udp->uh_sport;
+ pd->sport = &pd->hdr.udp->uh_sport;
+ }
+
+ if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) ||
+ nk->port[pd->didx] != dport) {
+ pf_change_ap(daddr, &pd->hdr.udp->uh_dport,
+ pd->ip_sum, &pd->hdr.udp->uh_sum,
+ &nk->addr[pd->didx],
+ nk->port[pd->didx], 1, af);
+ dport = pd->hdr.udp->uh_dport;
+ pd->dport = &pd->hdr.udp->uh_dport;
+ }
+ rewrite++;
+ break;
+#ifdef INET
+ case IPPROTO_ICMP:
+ nk->port[0] = nk->port[1];
+ if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET))
+ pf_change_a(&saddr->v4.s_addr, pd->ip_sum,
+ nk->addr[pd->sidx].v4.s_addr, 0);
+
+ if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET))
+ pf_change_a(&daddr->v4.s_addr, pd->ip_sum,
+ nk->addr[pd->didx].v4.s_addr, 0);
+
+ if (nk->port[1] != pd->hdr.icmp->icmp_id) {
+ pd->hdr.icmp->icmp_cksum = pf_cksum_fixup(
+ pd->hdr.icmp->icmp_cksum, sport,
+ nk->port[1], 0);
+ pd->hdr.icmp->icmp_id = nk->port[1];
+ pd->sport = &pd->hdr.icmp->icmp_id;
+ }
+ m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp);
+ break;
+#endif /* INET */
+#ifdef INET6
+ case IPPROTO_ICMPV6:
+ nk->port[0] = nk->port[1];
+ if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET6))
+ pf_change_a6(saddr, &pd->hdr.icmp6->icmp6_cksum,
+ &nk->addr[pd->sidx], 0);
+
+ if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET6))
+ pf_change_a6(daddr, &pd->hdr.icmp6->icmp6_cksum,
+ &nk->addr[pd->didx], 0);
+ rewrite++;
+ break;
+#endif /* INET6 */
+ default:
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ if (PF_ANEQ(saddr,
+ &nk->addr[pd->sidx], AF_INET))
+ pf_change_a(&saddr->v4.s_addr,
+ pd->ip_sum,
+ nk->addr[pd->sidx].v4.s_addr, 0);
+
+ if (PF_ANEQ(daddr,
+ &nk->addr[pd->didx], AF_INET))
+ pf_change_a(&daddr->v4.s_addr,
+ pd->ip_sum,
+ nk->addr[pd->didx].v4.s_addr, 0);
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ if (PF_ANEQ(saddr,
+ &nk->addr[pd->sidx], AF_INET6))
+ PF_ACPY(saddr, &nk->addr[pd->sidx], af);
+
+ if (PF_ANEQ(daddr,
+ &nk->addr[pd->didx], AF_INET6))
+                               PF_ACPY(daddr, &nk->addr[pd->didx], af);
+ break;
+#endif /* INET6 */
+ }
+ break;
+ }
+ if (nr->natpass)
+ r = NULL;
+ pd->nat_rule = nr;
+ }
+
+ while (r != NULL) {
+ r->evaluations++;
+ if (pfi_kif_match(r->kif, kif) == r->ifnot)
+ r = r->skip[PF_SKIP_IFP].ptr;
+ else if (r->direction && r->direction != direction)
+ r = r->skip[PF_SKIP_DIR].ptr;
+ else if (r->af && r->af != af)
+ r = r->skip[PF_SKIP_AF].ptr;
+ else if (r->proto && r->proto != pd->proto)
+ r = r->skip[PF_SKIP_PROTO].ptr;
+ else if (PF_MISMATCHAW(&r->src.addr, saddr, af,
+ r->src.neg, kif, M_GETFIB(m)))
+ r = r->skip[PF_SKIP_SRC_ADDR].ptr;
+ /* tcp/udp only. port_op always 0 in other cases */
+ else if (r->src.port_op && !pf_match_port(r->src.port_op,
+ r->src.port[0], r->src.port[1], sport))
+ r = r->skip[PF_SKIP_SRC_PORT].ptr;
+ else if (PF_MISMATCHAW(&r->dst.addr, daddr, af,
+ r->dst.neg, NULL, M_GETFIB(m)))
+ r = r->skip[PF_SKIP_DST_ADDR].ptr;
+ /* tcp/udp only. port_op always 0 in other cases */
+ else if (r->dst.port_op && !pf_match_port(r->dst.port_op,
+ r->dst.port[0], r->dst.port[1], dport))
+ r = r->skip[PF_SKIP_DST_PORT].ptr;
+ /* icmp only. type always 0 in other cases */
+ else if (r->type && r->type != icmptype + 1)
+ r = TAILQ_NEXT(r, entries);
+               /* icmp only. code always 0 in other cases */
+ else if (r->code && r->code != icmpcode + 1)
+ r = TAILQ_NEXT(r, entries);
+ else if (r->tos && !(r->tos == pd->tos))
+ r = TAILQ_NEXT(r, entries);
+ else if (r->rule_flag & PFRULE_FRAGMENT)
+ r = TAILQ_NEXT(r, entries);
+ else if (pd->proto == IPPROTO_TCP &&
+ (r->flagset & th->th_flags) != r->flags)
+ r = TAILQ_NEXT(r, entries);
+ /* tcp/udp only. uid.op always 0 in other cases */
+ else if (r->uid.op && (pd->lookup.done || (pd->lookup.done =
+ pf_socket_lookup(direction, pd, m), 1)) &&
+ !pf_match_uid(r->uid.op, r->uid.uid[0], r->uid.uid[1],
+ pd->lookup.uid))
+ r = TAILQ_NEXT(r, entries);
+ /* tcp/udp only. gid.op always 0 in other cases */
+ else if (r->gid.op && (pd->lookup.done || (pd->lookup.done =
+ pf_socket_lookup(direction, pd, m), 1)) &&
+ !pf_match_gid(r->gid.op, r->gid.gid[0], r->gid.gid[1],
+ pd->lookup.gid))
+ r = TAILQ_NEXT(r, entries);
+ else if (r->prob &&
+ r->prob <= arc4random())
+ r = TAILQ_NEXT(r, entries);
+ else if (r->match_tag && !pf_match_tag(m, r, &tag,
+ pd->pf_mtag ? pd->pf_mtag->tag : 0))
+ r = TAILQ_NEXT(r, entries);
+ else if (r->os_fingerprint != PF_OSFP_ANY &&
+ (pd->proto != IPPROTO_TCP || !pf_osfp_match(
+ pf_osfp_fingerprint(pd, m, off, th),
+ r->os_fingerprint)))
+ r = TAILQ_NEXT(r, entries);
+ else {
+ if (r->tag)
+ tag = r->tag;
+ if (r->rtableid >= 0)
+ rtableid = r->rtableid;
+ if (r->anchor == NULL) {
+ match = 1;
+ *rm = r;
+ *am = a;
+ *rsm = ruleset;
+ if ((*rm)->quick)
+ break;
+ r = TAILQ_NEXT(r, entries);
+ } else
+ pf_step_into_anchor(&asd, &ruleset,
+ PF_RULESET_FILTER, &r, &a, &match);
+ }
+ if (r == NULL && pf_step_out_of_anchor(&asd, &ruleset,
+ PF_RULESET_FILTER, &r, &a, &match))
+ break;
+ }
+ r = *rm;
+ a = *am;
+ ruleset = *rsm;
+
+ REASON_SET(&reason, PFRES_MATCH);
+
+ if (r->log || (nr != NULL && nr->log)) {
+ if (rewrite)
+ m_copyback(m, off, hdrlen, pd->hdr.any);
+ PFLOG_PACKET(kif, m, af, direction, reason, r->log ? r : nr, a,
+ ruleset, pd, 1);
+ }
+
+ if ((r->action == PF_DROP) &&
+ ((r->rule_flag & PFRULE_RETURNRST) ||
+ (r->rule_flag & PFRULE_RETURNICMP) ||
+ (r->rule_flag & PFRULE_RETURN))) {
+ /* undo NAT changes, if they have taken place */
+ if (nr != NULL) {
+ PF_ACPY(saddr, &sk->addr[pd->sidx], af);
+ PF_ACPY(daddr, &sk->addr[pd->didx], af);
+ if (pd->sport)
+ *pd->sport = sk->port[pd->sidx];
+ if (pd->dport)
+ *pd->dport = sk->port[pd->didx];
+ if (pd->proto_sum)
+ *pd->proto_sum = bproto_sum;
+ if (pd->ip_sum)
+ *pd->ip_sum = bip_sum;
+ m_copyback(m, off, hdrlen, pd->hdr.any);
+ }
+ if (pd->proto == IPPROTO_TCP &&
+ ((r->rule_flag & PFRULE_RETURNRST) ||
+ (r->rule_flag & PFRULE_RETURN)) &&
+ !(th->th_flags & TH_RST)) {
+ u_int32_t ack = ntohl(th->th_seq) + pd->p_len;
+ int len = 0;
+#ifdef INET
+ struct ip *h4;
+#endif
+#ifdef INET6
+ struct ip6_hdr *h6;
+#endif
+
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ h4 = mtod(m, struct ip *);
+ len = ntohs(h4->ip_len) - off;
+ break;
+#endif
+#ifdef INET6
+ case AF_INET6:
+ h6 = mtod(m, struct ip6_hdr *);
+ len = ntohs(h6->ip6_plen) - (off - sizeof(*h6));
+ break;
+#endif
+ }
+
+ if (pf_check_proto_cksum(m, off, len, IPPROTO_TCP, af))
+ REASON_SET(&reason, PFRES_PROTCKSUM);
+ else {
+ if (th->th_flags & TH_SYN)
+ ack++;
+ if (th->th_flags & TH_FIN)
+ ack++;
+ pf_send_tcp(m, r, af, pd->dst,
+ pd->src, th->th_dport, th->th_sport,
+ ntohl(th->th_ack), ack, TH_RST|TH_ACK, 0, 0,
+ r->return_ttl, 1, 0, kif->pfik_ifp);
+ }
+ } else if (pd->proto != IPPROTO_ICMP && af == AF_INET &&
+ r->return_icmp)
+ pf_send_icmp(m, r->return_icmp >> 8,
+ r->return_icmp & 255, af, r);
+ else if (pd->proto != IPPROTO_ICMPV6 && af == AF_INET6 &&
+ r->return_icmp6)
+ pf_send_icmp(m, r->return_icmp6 >> 8,
+ r->return_icmp6 & 255, af, r);
+ }
+
+ if (r->action == PF_DROP)
+ goto cleanup;
+
+ if (tag > 0 && pf_tag_packet(m, pd, tag)) {
+ REASON_SET(&reason, PFRES_MEMORY);
+ goto cleanup;
+ }
+ if (rtableid >= 0)
+ M_SETFIB(m, rtableid);
+
+ if (!state_icmp && (r->keep_state || nr != NULL ||
+ (pd->flags & PFDESC_TCP_NORM))) {
+ int action;
+ action = pf_create_state(r, nr, a, pd, nsn, nk, sk, m, off,
+ sport, dport, &rewrite, kif, sm, tag, bproto_sum, bip_sum,
+ hdrlen);
+ if (action != PF_PASS)
+ return (action);
+ } else {
+ if (sk != NULL)
+ uma_zfree(V_pf_state_key_z, sk);
+ if (nk != NULL)
+ uma_zfree(V_pf_state_key_z, nk);
+ }
+
+ /* copy back packet headers if we performed NAT operations */
+ if (rewrite)
+ m_copyback(m, off, hdrlen, pd->hdr.any);
+
+ if (*sm != NULL && !((*sm)->state_flags & PFSTATE_NOSYNC) &&
+ direction == PF_OUT &&
+ pfsync_defer_ptr != NULL && pfsync_defer_ptr(*sm, m))
+ /*
+                * We want the state created, but we don't
+                * want to send this packet yet, in case a
+                * partner firewall has to know about it to
+                * allow replies through it.
+ */
+ return (PF_DEFER);
+
+ return (PF_PASS);
+
+cleanup:
+ if (sk != NULL)
+ uma_zfree(V_pf_state_key_z, sk);
+ if (nk != NULL)
+ uma_zfree(V_pf_state_key_z, nk);
+ return (PF_DROP);
+}
+
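+/*
+ * A rule with "keep state" (or a translation rule) matched: enforce the
+ * state and source-node limits, allocate and initialize the state entry,
+ * set up TCP sequence tracking, insert the state keys and, for synproxy
+ * rules, answer the client's SYN ourselves.
+ */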
+static int
+pf_create_state(struct pf_rule *r, struct pf_rule *nr, struct pf_rule *a,
+ struct pf_pdesc *pd, struct pf_src_node *nsn, struct pf_state_key *nk,
+ struct pf_state_key *sk, struct mbuf *m, int off, u_int16_t sport,
+ u_int16_t dport, int *rewrite, struct pfi_kif *kif, struct pf_state **sm,
+ int tag, u_int16_t bproto_sum, u_int16_t bip_sum, int hdrlen)
+{
+ struct pf_state *s = NULL;
+ struct pf_src_node *sn = NULL;
+ struct tcphdr *th = pd->hdr.tcp;
+ u_int16_t mss = V_tcp_mssdflt;
+ u_short reason;
+
+ /* check maximums */
+ if (r->max_states && (r->states_cur >= r->max_states)) {
+ V_pf_status.lcounters[LCNT_STATES]++;
+ REASON_SET(&reason, PFRES_MAXSTATES);
+ return (PF_DROP);
+ }
+ /* src node for filter rule */
+ if ((r->rule_flag & PFRULE_SRCTRACK ||
+ r->rpool.opts & PF_POOL_STICKYADDR) &&
+ pf_insert_src_node(&sn, r, pd->src, pd->af) != 0) {
+ REASON_SET(&reason, PFRES_SRCLIMIT);
+ goto csfailed;
+ }
+ /* src node for translation rule */
+ if (nr != NULL && (nr->rpool.opts & PF_POOL_STICKYADDR) &&
+ pf_insert_src_node(&nsn, nr, &sk->addr[pd->sidx], pd->af)) {
+ REASON_SET(&reason, PFRES_SRCLIMIT);
+ goto csfailed;
+ }
+ s = uma_zalloc(V_pf_state_z, M_NOWAIT | M_ZERO);
+ if (s == NULL) {
+ REASON_SET(&reason, PFRES_MEMORY);
+ goto csfailed;
+ }
+ s->rule.ptr = r;
+ s->nat_rule.ptr = nr;
+ s->anchor.ptr = a;
+ STATE_INC_COUNTERS(s);
+ if (r->allow_opts)
+ s->state_flags |= PFSTATE_ALLOWOPTS;
+ if (r->rule_flag & PFRULE_STATESLOPPY)
+ s->state_flags |= PFSTATE_SLOPPY;
+ s->log = r->log & PF_LOG_ALL;
+ s->sync_state = PFSYNC_S_NONE;
+ if (nr != NULL)
+ s->log |= nr->log & PF_LOG_ALL;
+ switch (pd->proto) {
+ case IPPROTO_TCP:
+ s->src.seqlo = ntohl(th->th_seq);
+ s->src.seqhi = s->src.seqlo + pd->p_len + 1;
+ if ((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN &&
+ r->keep_state == PF_STATE_MODULATE) {
+ /* Generate sequence number modulator */
+ if ((s->src.seqdiff = pf_tcp_iss(pd) - s->src.seqlo) ==
+ 0)
+ s->src.seqdiff = 1;
+ pf_change_a(&th->th_seq, &th->th_sum,
+ htonl(s->src.seqlo + s->src.seqdiff), 0);
+ *rewrite = 1;
+ } else
+ s->src.seqdiff = 0;
+ if (th->th_flags & TH_SYN) {
+ s->src.seqhi++;
+ s->src.wscale = pf_get_wscale(m, off,
+ th->th_off, pd->af);
+ }
+ s->src.max_win = MAX(ntohs(th->th_win), 1);
+ if (s->src.wscale & PF_WSCALE_MASK) {
+ /* Remove scale factor from initial window */
+ int win = s->src.max_win;
+ win += 1 << (s->src.wscale & PF_WSCALE_MASK);
+ s->src.max_win = (win - 1) >>
+ (s->src.wscale & PF_WSCALE_MASK);
+ }
+ if (th->th_flags & TH_FIN)
+ s->src.seqhi++;
+ s->dst.seqhi = 1;
+ s->dst.max_win = 1;
+ s->src.state = TCPS_SYN_SENT;
+ s->dst.state = TCPS_CLOSED;
+ s->timeout = PFTM_TCP_FIRST_PACKET;
+ break;
+ case IPPROTO_UDP:
+ s->src.state = PFUDPS_SINGLE;
+ s->dst.state = PFUDPS_NO_TRAFFIC;
+ s->timeout = PFTM_UDP_FIRST_PACKET;
+ break;
+ case IPPROTO_ICMP:
+#ifdef INET6
+ case IPPROTO_ICMPV6:
+#endif
+ s->timeout = PFTM_ICMP_FIRST_PACKET;
+ break;
+ default:
+ s->src.state = PFOTHERS_SINGLE;
+ s->dst.state = PFOTHERS_NO_TRAFFIC;
+ s->timeout = PFTM_OTHER_FIRST_PACKET;
+ }
+
+ s->creation = time_uptime;
+ s->expire = time_uptime;
+
+ if (sn != NULL) {
+ s->src_node = sn;
+ s->src_node->states++;
+ }
+ if (nsn != NULL) {
+ /* XXX We only modify one side for now. */
+ PF_ACPY(&nsn->raddr, &nk->addr[1], pd->af);
+ s->nat_src_node = nsn;
+ s->nat_src_node->states++;
+ }
+ if (pd->proto == IPPROTO_TCP) {
+ if ((pd->flags & PFDESC_TCP_NORM) && pf_normalize_tcp_init(m,
+ off, pd, th, &s->src, &s->dst)) {
+ REASON_SET(&reason, PFRES_MEMORY);
+ pf_src_tree_remove_state(s);
+ STATE_DEC_COUNTERS(s);
+ uma_zfree(V_pf_state_z, s);
+ return (PF_DROP);
+ }
+ if ((pd->flags & PFDESC_TCP_NORM) && s->src.scrub &&
+ pf_normalize_tcp_stateful(m, off, pd, &reason, th, s,
+ &s->src, &s->dst, rewrite)) {
+ /* This really shouldn't happen!!! */
+ DPFPRINTF(PF_DEBUG_URGENT,
+                           ("pf_normalize_tcp_stateful failed on first pkt\n"));
+ pf_normalize_tcp_cleanup(s);
+ pf_src_tree_remove_state(s);
+ STATE_DEC_COUNTERS(s);
+ uma_zfree(V_pf_state_z, s);
+ return (PF_DROP);
+ }
+ }
+ s->direction = pd->dir;
+
+ /*
+        * sk/nk could already have been set up by pf_get_translation().
+ */
+ if (nr == NULL) {
+ KASSERT((sk == NULL && nk == NULL), ("%s: nr %p sk %p, nk %p",
+ __func__, nr, sk, nk));
+ sk = pf_state_key_setup(pd, pd->src, pd->dst, sport, dport);
+ if (sk == NULL)
+ goto csfailed;
+ nk = sk;
+ } else
+ KASSERT((sk != NULL && nk != NULL), ("%s: nr %p sk %p, nk %p",
+ __func__, nr, sk, nk));
+
+ /* Swap sk/nk for PF_OUT. */
+ if (pf_state_insert(BOUND_IFACE(r, kif),
+ (pd->dir == PF_IN) ? sk : nk,
+ (pd->dir == PF_IN) ? nk : sk, s)) {
+ if (pd->proto == IPPROTO_TCP)
+ pf_normalize_tcp_cleanup(s);
+ REASON_SET(&reason, PFRES_STATEINS);
+ pf_src_tree_remove_state(s);
+ STATE_DEC_COUNTERS(s);
+ uma_zfree(V_pf_state_z, s);
+ return (PF_DROP);
+ } else
+ *sm = s;
+
+ pf_set_rt_ifp(s, pd->src); /* needs s->state_key set */
+ if (tag > 0)
+ s->tag = tag;
+ if (pd->proto == IPPROTO_TCP && (th->th_flags & (TH_SYN|TH_ACK)) ==
+ TH_SYN && r->keep_state == PF_STATE_SYNPROXY) {
+ s->src.state = PF_TCPS_PROXY_SRC;
+ /* undo NAT changes, if they have taken place */
+ if (nr != NULL) {
+ struct pf_state_key *skt = s->key[PF_SK_WIRE];
+ if (pd->dir == PF_OUT)
+ skt = s->key[PF_SK_STACK];
+ PF_ACPY(pd->src, &skt->addr[pd->sidx], pd->af);
+ PF_ACPY(pd->dst, &skt->addr[pd->didx], pd->af);
+ if (pd->sport)
+ *pd->sport = skt->port[pd->sidx];
+ if (pd->dport)
+ *pd->dport = skt->port[pd->didx];
+ if (pd->proto_sum)
+ *pd->proto_sum = bproto_sum;
+ if (pd->ip_sum)
+ *pd->ip_sum = bip_sum;
+ m_copyback(m, off, hdrlen, pd->hdr.any);
+ }
+ s->src.seqhi = htonl(arc4random());
+ /* Find mss option */
+ int rtid = M_GETFIB(m);
+ mss = pf_get_mss(m, off, th->th_off, pd->af);
+ mss = pf_calc_mss(pd->src, pd->af, rtid, mss);
+ mss = pf_calc_mss(pd->dst, pd->af, rtid, mss);
+ s->src.mss = mss;
+ pf_send_tcp(NULL, r, pd->af, pd->dst, pd->src, th->th_dport,
+ th->th_sport, s->src.seqhi, ntohl(th->th_seq) + 1,
+ TH_SYN|TH_ACK, 0, s->src.mss, 0, 1, 0, NULL);
+ REASON_SET(&reason, PFRES_SYNPROXY);
+ return (PF_SYNPROXY_DROP);
+ }
+
+ return (PF_PASS);
+
+csfailed:
+ if (sk != NULL)
+ uma_zfree(V_pf_state_key_z, sk);
+ if (nk != NULL)
+ uma_zfree(V_pf_state_key_z, nk);
+
+ if (sn != NULL && sn->states == 0 && sn->expire == 0) {
+ pf_remove_src_node(sn);
+ V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++;
+ V_pf_status.src_nodes--;
+ uma_zfree(V_pf_sources_z, sn);
+ }
+ if (nsn != sn && nsn != NULL && nsn->states == 0 && nsn->expire == 0) {
+ pf_remove_src_node(nsn);
+ V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++;
+ V_pf_status.src_nodes--;
+ uma_zfree(V_pf_sources_z, nsn);
+ }
+ return (PF_DROP);
+}
+
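+/*
+ * Evaluate the filter rules for a fragment that was not reassembled.
+ * Rules that need transport-layer information (ports, TCP flags, ICMP
+ * types or OS fingerprints) cannot match such packets.
+ */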
+static int
+pf_test_fragment(struct pf_rule **rm, int direction, struct pfi_kif *kif,
+ struct mbuf *m, void *h, struct pf_pdesc *pd, struct pf_rule **am,
+ struct pf_ruleset **rsm)
+{
+ struct pf_rule *r, *a = NULL;
+ struct pf_ruleset *ruleset = NULL;
+ sa_family_t af = pd->af;
+ u_short reason;
+ int tag = -1;
+ int asd = 0;
+ int match = 0;
+
+ PF_RULES_RASSERT();
+
+ r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr);
+ while (r != NULL) {
+ r->evaluations++;
+ if (pfi_kif_match(r->kif, kif) == r->ifnot)
+ r = r->skip[PF_SKIP_IFP].ptr;
+ else if (r->direction && r->direction != direction)
+ r = r->skip[PF_SKIP_DIR].ptr;
+ else if (r->af && r->af != af)
+ r = r->skip[PF_SKIP_AF].ptr;
+ else if (r->proto && r->proto != pd->proto)
+ r = r->skip[PF_SKIP_PROTO].ptr;
+ else if (PF_MISMATCHAW(&r->src.addr, pd->src, af,
+ r->src.neg, kif, M_GETFIB(m)))
+ r = r->skip[PF_SKIP_SRC_ADDR].ptr;
+ else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af,
+ r->dst.neg, NULL, M_GETFIB(m)))
+ r = r->skip[PF_SKIP_DST_ADDR].ptr;
+ else if (r->tos && !(r->tos == pd->tos))
+ r = TAILQ_NEXT(r, entries);
+ else if (r->os_fingerprint != PF_OSFP_ANY)
+ r = TAILQ_NEXT(r, entries);
+ else if (pd->proto == IPPROTO_UDP &&
+ (r->src.port_op || r->dst.port_op))
+ r = TAILQ_NEXT(r, entries);
+ else if (pd->proto == IPPROTO_TCP &&
+ (r->src.port_op || r->dst.port_op || r->flagset))
+ r = TAILQ_NEXT(r, entries);
+ else if ((pd->proto == IPPROTO_ICMP ||
+ pd->proto == IPPROTO_ICMPV6) &&
+ (r->type || r->code))
+ r = TAILQ_NEXT(r, entries);
+ else if (r->prob && r->prob <=
+ (arc4random() % (UINT_MAX - 1) + 1))
+ r = TAILQ_NEXT(r, entries);
+ else if (r->match_tag && !pf_match_tag(m, r, &tag,
+ pd->pf_mtag ? pd->pf_mtag->tag : 0))
+ r = TAILQ_NEXT(r, entries);
+ else {
+ if (r->anchor == NULL) {
+ match = 1;
+ *rm = r;
+ *am = a;
+ *rsm = ruleset;
+ if ((*rm)->quick)
+ break;
+ r = TAILQ_NEXT(r, entries);
+ } else
+ pf_step_into_anchor(&asd, &ruleset,
+ PF_RULESET_FILTER, &r, &a, &match);
+ }
+ if (r == NULL && pf_step_out_of_anchor(&asd, &ruleset,
+ PF_RULESET_FILTER, &r, &a, &match))
+ break;
+ }
+ r = *rm;
+ a = *am;
+ ruleset = *rsm;
+
+ REASON_SET(&reason, PFRES_MATCH);
+
+ if (r->log)
+ PFLOG_PACKET(kif, m, af, direction, reason, r, a, ruleset, pd,
+ 1);
+
+ if (r->action != PF_PASS)
+ return (PF_DROP);
+
+ if (tag > 0 && pf_tag_packet(m, pd, tag)) {
+ REASON_SET(&reason, PFRES_MEMORY);
+ return (PF_DROP);
+ }
+
+ return (PF_PASS);
+}
+
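+/*
+ * Full TCP tracking: validate the segment's sequence and ACK numbers
+ * against the windows recorded for both peers, apply deferred sequence
+ * number modulation, and advance the per-peer connection states and the
+ * state timeout accordingly.
+ */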
+static int
+pf_tcp_track_full(struct pf_state_peer *src, struct pf_state_peer *dst,
+ struct pf_state **state, struct pfi_kif *kif, struct mbuf *m, int off,
+ struct pf_pdesc *pd, u_short *reason, int *copyback)
+{
+ struct tcphdr *th = pd->hdr.tcp;
+ u_int16_t win = ntohs(th->th_win);
+ u_int32_t ack, end, seq, orig_seq;
+ u_int8_t sws, dws;
+ int ackskew;
+
+ if (src->wscale && dst->wscale && !(th->th_flags & TH_SYN)) {
+ sws = src->wscale & PF_WSCALE_MASK;
+ dws = dst->wscale & PF_WSCALE_MASK;
+ } else
+ sws = dws = 0;
+
+ /*
+ * Sequence tracking algorithm from Guido van Rooij's paper:
+ * http://www.madison-gurkha.com/publications/tcp_filtering/
+ * tcp_filtering.ps
+ */
+
+ orig_seq = seq = ntohl(th->th_seq);
+ if (src->seqlo == 0) {
+ /* First packet from this end. Set its state */
+
+ if ((pd->flags & PFDESC_TCP_NORM || dst->scrub) &&
+ src->scrub == NULL) {
+ if (pf_normalize_tcp_init(m, off, pd, th, src, dst)) {
+ REASON_SET(reason, PFRES_MEMORY);
+ return (PF_DROP);
+ }
+ }
+
+ /* Deferred generation of sequence number modulator */
+ if (dst->seqdiff && !src->seqdiff) {
+ /* use random iss for the TCP server */
+ while ((src->seqdiff = arc4random() - seq) == 0)
+ ;
+ ack = ntohl(th->th_ack) - dst->seqdiff;
+ pf_change_a(&th->th_seq, &th->th_sum, htonl(seq +
+ src->seqdiff), 0);
+ pf_change_a(&th->th_ack, &th->th_sum, htonl(ack), 0);
+ *copyback = 1;
+ } else {
+ ack = ntohl(th->th_ack);
+ }
+
+ end = seq + pd->p_len;
+ if (th->th_flags & TH_SYN) {
+ end++;
+ if (dst->wscale & PF_WSCALE_FLAG) {
+ src->wscale = pf_get_wscale(m, off, th->th_off,
+ pd->af);
+ if (src->wscale & PF_WSCALE_FLAG) {
+ /* Remove scale factor from initial
+ * window */
+ sws = src->wscale & PF_WSCALE_MASK;
+ win = ((u_int32_t)win + (1 << sws) - 1)
+ >> sws;
+ dws = dst->wscale & PF_WSCALE_MASK;
+ } else {
+ /* fixup other window */
+ dst->max_win <<= dst->wscale &
+ PF_WSCALE_MASK;
+ /* in case of a retrans SYN|ACK */
+ dst->wscale = 0;
+ }
+ }
+ }
+ if (th->th_flags & TH_FIN)
+ end++;
+
+ src->seqlo = seq;
+ if (src->state < TCPS_SYN_SENT)
+ src->state = TCPS_SYN_SENT;
+
+ /*
+ * May need to slide the window (seqhi may have been set by
+ * the crappy stack check or if we picked up the connection
+ * after establishment)
+ */
+ if (src->seqhi == 1 ||
+ SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi))
+ src->seqhi = end + MAX(1, dst->max_win << dws);
+ if (win > src->max_win)
+ src->max_win = win;
+
+ } else {
+ ack = ntohl(th->th_ack) - dst->seqdiff;
+ if (src->seqdiff) {
+ /* Modulate sequence numbers */
+ pf_change_a(&th->th_seq, &th->th_sum, htonl(seq +
+ src->seqdiff), 0);
+ pf_change_a(&th->th_ack, &th->th_sum, htonl(ack), 0);
+ *copyback = 1;
+ }
+ end = seq + pd->p_len;
+ if (th->th_flags & TH_SYN)
+ end++;
+ if (th->th_flags & TH_FIN)
+ end++;
+ }
+
+ if ((th->th_flags & TH_ACK) == 0) {
+ /* Let it pass through the ack skew check */
+ ack = dst->seqlo;
+ } else if ((ack == 0 &&
+ (th->th_flags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) ||
+ /* broken tcp stacks do not set ack */
+ (dst->state < TCPS_SYN_SENT)) {
+ /*
+                * Many stacks (ours included) will set the ACK number in a
+                * FIN|ACK if the SYN times out -- no sequence to ACK.
+ */
+ ack = dst->seqlo;
+ }
+
+ if (seq == end) {
+ /* Ease sequencing restrictions on no data packets */
+ seq = src->seqlo;
+ end = seq;
+ }
+
+ ackskew = dst->seqlo - ack;
+
+
+ /*
+ * Need to demodulate the sequence numbers in any TCP SACK options
+ * (Selective ACK). We could optionally validate the SACK values
+ * against the current ACK window, either forwards or backwards, but
+ * I'm not confident that SACK has been implemented properly
+        * everywhere. It wouldn't surprise me if several stacks accidentally
+ * SACK too far backwards of previously ACKed data. There really aren't
+ * any security implications of bad SACKing unless the target stack
+ * doesn't validate the option length correctly. Someone trying to
+ * spoof into a TCP connection won't bother blindly sending SACK
+ * options anyway.
+ */
+ if (dst->seqdiff && (th->th_off << 2) > sizeof(struct tcphdr)) {
+ if (pf_modulate_sack(m, off, pd, th, dst))
+ *copyback = 1;
+ }
+
+
+#define MAXACKWINDOW (0xffff + 1500) /* 1500 is an arbitrary fudge factor */
+ if (SEQ_GEQ(src->seqhi, end) &&
+ /* Last octet inside other's window space */
+ SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) &&
+ /* Retrans: not more than one window back */
+ (ackskew >= -MAXACKWINDOW) &&
+ /* Acking not more than one reassembled fragment backwards */
+ (ackskew <= (MAXACKWINDOW << sws)) &&
+ /* Acking not more than one window forward */
+ ((th->th_flags & TH_RST) == 0 || orig_seq == src->seqlo ||
+ (orig_seq == src->seqlo + 1) || (orig_seq + 1 == src->seqlo) ||
+ (pd->flags & PFDESC_IP_REAS) == 0)) {
+ /* Require an exact/+1 sequence match on resets when possible */
+
+ if (dst->scrub || src->scrub) {
+ if (pf_normalize_tcp_stateful(m, off, pd, reason, th,
+ *state, src, dst, copyback))
+ return (PF_DROP);
+ }
+
+ /* update max window */
+ if (src->max_win < win)
+ src->max_win = win;
+ /* synchronize sequencing */
+ if (SEQ_GT(end, src->seqlo))
+ src->seqlo = end;
+ /* slide the window of what the other end can send */
+ if (SEQ_GEQ(ack + (win << sws), dst->seqhi))
+ dst->seqhi = ack + MAX((win << sws), 1);
+
+
+ /* update states */
+ if (th->th_flags & TH_SYN)
+ if (src->state < TCPS_SYN_SENT)
+ src->state = TCPS_SYN_SENT;
+ if (th->th_flags & TH_FIN)
+ if (src->state < TCPS_CLOSING)
+ src->state = TCPS_CLOSING;
+ if (th->th_flags & TH_ACK) {
+ if (dst->state == TCPS_SYN_SENT) {
+ dst->state = TCPS_ESTABLISHED;
+ if (src->state == TCPS_ESTABLISHED &&
+ (*state)->src_node != NULL &&
+ pf_src_connlimit(state)) {
+ REASON_SET(reason, PFRES_SRCLIMIT);
+ return (PF_DROP);
+ }
+ } else if (dst->state == TCPS_CLOSING)
+ dst->state = TCPS_FIN_WAIT_2;
+ }
+ if (th->th_flags & TH_RST)
+ src->state = dst->state = TCPS_TIME_WAIT;
+
+ /* update expire time */
+ (*state)->expire = time_uptime;
+ if (src->state >= TCPS_FIN_WAIT_2 &&
+ dst->state >= TCPS_FIN_WAIT_2)
+ (*state)->timeout = PFTM_TCP_CLOSED;
+ else if (src->state >= TCPS_CLOSING &&
+ dst->state >= TCPS_CLOSING)
+ (*state)->timeout = PFTM_TCP_FIN_WAIT;
+ else if (src->state < TCPS_ESTABLISHED ||
+ dst->state < TCPS_ESTABLISHED)
+ (*state)->timeout = PFTM_TCP_OPENING;
+ else if (src->state >= TCPS_CLOSING ||
+ dst->state >= TCPS_CLOSING)
+ (*state)->timeout = PFTM_TCP_CLOSING;
+ else
+ (*state)->timeout = PFTM_TCP_ESTABLISHED;
+
+ /* Fall through to PASS packet */
+
+ } else if ((dst->state < TCPS_SYN_SENT ||
+ dst->state >= TCPS_FIN_WAIT_2 ||
+ src->state >= TCPS_FIN_WAIT_2) &&
+ SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) &&
+ /* Within a window forward of the originating packet */
+ SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) {
+ /* Within a window backward of the originating packet */
+
+ /*
+ * This currently handles three situations:
+ * 1) Stupid stacks will shotgun SYNs before their peer
+ * replies.
+ * 2) When PF catches an already established stream (the
+ * firewall rebooted, the state table was flushed, routes
+ * changed...)
+ * 3) Packets get funky immediately after the connection
+ * closes (this should catch Solaris spurious ACK|FINs
+ * that web servers like to spew after a close)
+ *
+ * This must be a little more careful than the above code
+ * since packet floods will also be caught here. We don't
+ * update the TTL here to mitigate the damage of a packet
+ * flood and so the same code can handle awkward establishment
+ * and a loosened connection close.
+ * In the establishment case, a correct peer response will
+ * validate the connection, go through the normal state code
+ * and keep updating the state TTL.
+ */
+
+ if (V_pf_status.debug >= PF_DEBUG_MISC) {
+ printf("pf: loose state match: ");
+ pf_print_state(*state);
+ pf_print_flags(th->th_flags);
+ printf(" seq=%u (%u) ack=%u len=%u ackskew=%d "
+ "pkts=%llu:%llu dir=%s,%s\n", seq, orig_seq, ack,
+ pd->p_len, ackskew, (unsigned long long)(*state)->packets[0],
+ (unsigned long long)(*state)->packets[1],
+ pd->dir == PF_IN ? "in" : "out",
+ pd->dir == (*state)->direction ? "fwd" : "rev");
+ }
+
+ if (dst->scrub || src->scrub) {
+ if (pf_normalize_tcp_stateful(m, off, pd, reason, th,
+ *state, src, dst, copyback))
+ return (PF_DROP);
+ }
+
+ /* update max window */
+ if (src->max_win < win)
+ src->max_win = win;
+ /* synchronize sequencing */
+ if (SEQ_GT(end, src->seqlo))
+ src->seqlo = end;
+ /* slide the window of what the other end can send */
+ if (SEQ_GEQ(ack + (win << sws), dst->seqhi))
+ dst->seqhi = ack + MAX((win << sws), 1);
+
+ /*
+ * Cannot set dst->seqhi here since this could be a shotgunned
+ * SYN and not an already established connection.
+ */
+
+ if (th->th_flags & TH_FIN)
+ if (src->state < TCPS_CLOSING)
+ src->state = TCPS_CLOSING;
+ if (th->th_flags & TH_RST)
+ src->state = dst->state = TCPS_TIME_WAIT;
+
+ /* Fall through to PASS packet */
+
+ } else {
+ if ((*state)->dst.state == TCPS_SYN_SENT &&
+ (*state)->src.state == TCPS_SYN_SENT) {
+ /* Send RST for state mismatches during handshake */
+ if (!(th->th_flags & TH_RST))
+ pf_send_tcp(NULL, (*state)->rule.ptr, pd->af,
+ pd->dst, pd->src, th->th_dport,
+ th->th_sport, ntohl(th->th_ack), 0,
+ TH_RST, 0, 0,
+ (*state)->rule.ptr->return_ttl, 1, 0,
+ kif->pfik_ifp);
+ src->seqlo = 0;
+ src->seqhi = 1;
+ src->max_win = 1;
+ } else if (V_pf_status.debug >= PF_DEBUG_MISC) {
+ printf("pf: BAD state: ");
+ pf_print_state(*state);
+ pf_print_flags(th->th_flags);
+ printf(" seq=%u (%u) ack=%u len=%u ackskew=%d "
+ "pkts=%llu:%llu dir=%s,%s\n",
+ seq, orig_seq, ack, pd->p_len, ackskew,
+ (unsigned long long)(*state)->packets[0],
+ (unsigned long long)(*state)->packets[1],
+ pd->dir == PF_IN ? "in" : "out",
+ pd->dir == (*state)->direction ? "fwd" : "rev");
+ printf("pf: State failure on: %c %c %c %c | %c %c\n",
+ SEQ_GEQ(src->seqhi, end) ? ' ' : '1',
+ SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) ?
+ ' ': '2',
+ (ackskew >= -MAXACKWINDOW) ? ' ' : '3',
+ (ackskew <= (MAXACKWINDOW << sws)) ? ' ' : '4',
+ SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) ?' ' :'5',
+ SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW) ?' ' :'6');
+ }
+ REASON_SET(reason, PFRES_BADSTATE);
+ return (PF_DROP);
+ }
+
+ return (PF_PASS);
+}
+
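+/*
+ * Sloppy TCP tracking (PFSTATE_SLOPPY): follow only the TCP flags to
+ * advance the connection state, without any sequence or window checks,
+ * so states survive asymmetric paths where pf sees only one direction.
+ */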
+static int
+pf_tcp_track_sloppy(struct pf_state_peer *src, struct pf_state_peer *dst,
+ struct pf_state **state, struct pf_pdesc *pd, u_short *reason)
+{
+ struct tcphdr *th = pd->hdr.tcp;
+
+ if (th->th_flags & TH_SYN)
+ if (src->state < TCPS_SYN_SENT)
+ src->state = TCPS_SYN_SENT;
+ if (th->th_flags & TH_FIN)
+ if (src->state < TCPS_CLOSING)
+ src->state = TCPS_CLOSING;
+ if (th->th_flags & TH_ACK) {
+ if (dst->state == TCPS_SYN_SENT) {
+ dst->state = TCPS_ESTABLISHED;
+ if (src->state == TCPS_ESTABLISHED &&
+ (*state)->src_node != NULL &&
+ pf_src_connlimit(state)) {
+ REASON_SET(reason, PFRES_SRCLIMIT);
+ return (PF_DROP);
+ }
+ } else if (dst->state == TCPS_CLOSING) {
+ dst->state = TCPS_FIN_WAIT_2;
+ } else if (src->state == TCPS_SYN_SENT &&
+ dst->state < TCPS_SYN_SENT) {
+ /*
+ * Handle a special sloppy case where we only see one
+                        * half of the connection. If there is an ACK after
+ * the initial SYN without ever seeing a packet from
+ * the destination, set the connection to established.
+ */
+ dst->state = src->state = TCPS_ESTABLISHED;
+ if ((*state)->src_node != NULL &&
+ pf_src_connlimit(state)) {
+ REASON_SET(reason, PFRES_SRCLIMIT);
+ return (PF_DROP);
+ }
+ } else if (src->state == TCPS_CLOSING &&
+ dst->state == TCPS_ESTABLISHED &&
+ dst->seqlo == 0) {
+ /*
+ * Handle the closing of half connections where we
+ * don't see the full bidirectional FIN/ACK+ACK
+ * handshake.
+ */
+ dst->state = TCPS_CLOSING;
+ }
+ }
+ if (th->th_flags & TH_RST)
+ src->state = dst->state = TCPS_TIME_WAIT;
+
+ /* update expire time */
+ (*state)->expire = time_uptime;
+ if (src->state >= TCPS_FIN_WAIT_2 &&
+ dst->state >= TCPS_FIN_WAIT_2)
+ (*state)->timeout = PFTM_TCP_CLOSED;
+ else if (src->state >= TCPS_CLOSING &&
+ dst->state >= TCPS_CLOSING)
+ (*state)->timeout = PFTM_TCP_FIN_WAIT;
+ else if (src->state < TCPS_ESTABLISHED ||
+ dst->state < TCPS_ESTABLISHED)
+ (*state)->timeout = PFTM_TCP_OPENING;
+ else if (src->state >= TCPS_CLOSING ||
+ dst->state >= TCPS_CLOSING)
+ (*state)->timeout = PFTM_TCP_CLOSING;
+ else
+ (*state)->timeout = PFTM_TCP_ESTABLISHED;
+
+ return (PF_PASS);
+}
+
+static int
+pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif,
+ struct mbuf *m, int off, void *h, struct pf_pdesc *pd,
+ u_short *reason)
+{
+ struct pf_state_key_cmp key;
+ struct tcphdr *th = pd->hdr.tcp;
+ int copyback = 0;
+ struct pf_state_peer *src, *dst;
+ struct pf_state_key *sk;
+
+ bzero(&key, sizeof(key));
+ key.af = pd->af;
+ key.proto = IPPROTO_TCP;
+ if (direction == PF_IN) { /* wire side, straight */
+ PF_ACPY(&key.addr[0], pd->src, key.af);
+ PF_ACPY(&key.addr[1], pd->dst, key.af);
+ key.port[0] = th->th_sport;
+ key.port[1] = th->th_dport;
+ } else { /* stack side, reverse */
+ PF_ACPY(&key.addr[1], pd->src, key.af);
+ PF_ACPY(&key.addr[0], pd->dst, key.af);
+ key.port[1] = th->th_sport;
+ key.port[0] = th->th_dport;
+ }
+
+ STATE_LOOKUP(kif, &key, direction, *state, pd);
+
+ if (direction == (*state)->direction) {
+ src = &(*state)->src;
+ dst = &(*state)->dst;
+ } else {
+ src = &(*state)->dst;
+ dst = &(*state)->src;
+ }
+
+ sk = (*state)->key[pd->didx];
+
+ if ((*state)->src.state == PF_TCPS_PROXY_SRC) {
+ if (direction != (*state)->direction) {
+ REASON_SET(reason, PFRES_SYNPROXY);
+ return (PF_SYNPROXY_DROP);
+ }
+ if (th->th_flags & TH_SYN) {
+ if (ntohl(th->th_seq) != (*state)->src.seqlo) {
+ REASON_SET(reason, PFRES_SYNPROXY);
+ return (PF_DROP);
+ }
+ pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, pd->dst,
+ pd->src, th->th_dport, th->th_sport,
+ (*state)->src.seqhi, ntohl(th->th_seq) + 1,
+ TH_SYN|TH_ACK, 0, (*state)->src.mss, 0, 1, 0, NULL);
+ REASON_SET(reason, PFRES_SYNPROXY);
+ return (PF_SYNPROXY_DROP);
+ } else if (!(th->th_flags & TH_ACK) ||
+ (ntohl(th->th_ack) != (*state)->src.seqhi + 1) ||
+ (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) {
+ REASON_SET(reason, PFRES_SYNPROXY);
+ return (PF_DROP);
+ } else if ((*state)->src_node != NULL &&
+ pf_src_connlimit(state)) {
+ REASON_SET(reason, PFRES_SRCLIMIT);
+ return (PF_DROP);
+ } else
+ (*state)->src.state = PF_TCPS_PROXY_DST;
+ }
+ if ((*state)->src.state == PF_TCPS_PROXY_DST) {
+ if (direction == (*state)->direction) {
+ if (((th->th_flags & (TH_SYN|TH_ACK)) != TH_ACK) ||
+ (ntohl(th->th_ack) != (*state)->src.seqhi + 1) ||
+ (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) {
+ REASON_SET(reason, PFRES_SYNPROXY);
+ return (PF_DROP);
+ }
+ (*state)->src.max_win = MAX(ntohs(th->th_win), 1);
+ if ((*state)->dst.seqhi == 1)
+ (*state)->dst.seqhi = htonl(arc4random());
+ pf_send_tcp(NULL, (*state)->rule.ptr, pd->af,
+ &sk->addr[pd->sidx], &sk->addr[pd->didx],
+ sk->port[pd->sidx], sk->port[pd->didx],
+ (*state)->dst.seqhi, 0, TH_SYN, 0,
+ (*state)->src.mss, 0, 0, (*state)->tag, NULL);
+ REASON_SET(reason, PFRES_SYNPROXY);
+ return (PF_SYNPROXY_DROP);
+ } else if (((th->th_flags & (TH_SYN|TH_ACK)) !=
+ (TH_SYN|TH_ACK)) ||
+ (ntohl(th->th_ack) != (*state)->dst.seqhi + 1)) {
+ REASON_SET(reason, PFRES_SYNPROXY);
+ return (PF_DROP);
+ } else {
+ (*state)->dst.max_win = MAX(ntohs(th->th_win), 1);
+ (*state)->dst.seqlo = ntohl(th->th_seq);
+ pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, pd->dst,
+ pd->src, th->th_dport, th->th_sport,
+ ntohl(th->th_ack), ntohl(th->th_seq) + 1,
+ TH_ACK, (*state)->src.max_win, 0, 0, 0,
+ (*state)->tag, NULL);
+ pf_send_tcp(NULL, (*state)->rule.ptr, pd->af,
+ &sk->addr[pd->sidx], &sk->addr[pd->didx],
+ sk->port[pd->sidx], sk->port[pd->didx],
+ (*state)->src.seqhi + 1, (*state)->src.seqlo + 1,
+ TH_ACK, (*state)->dst.max_win, 0, 0, 1, 0, NULL);
+ (*state)->src.seqdiff = (*state)->dst.seqhi -
+ (*state)->src.seqlo;
+ (*state)->dst.seqdiff = (*state)->src.seqhi -
+ (*state)->dst.seqlo;
+ (*state)->src.seqhi = (*state)->src.seqlo +
+ (*state)->dst.max_win;
+ (*state)->dst.seqhi = (*state)->dst.seqlo +
+ (*state)->src.max_win;
+ (*state)->src.wscale = (*state)->dst.wscale = 0;
+ (*state)->src.state = (*state)->dst.state =
+ TCPS_ESTABLISHED;
+ REASON_SET(reason, PFRES_SYNPROXY);
+ return (PF_SYNPROXY_DROP);
+ }
+ }
+
+ if (((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN) &&
+ dst->state >= TCPS_FIN_WAIT_2 &&
+ src->state >= TCPS_FIN_WAIT_2) {
+ if (V_pf_status.debug >= PF_DEBUG_MISC) {
+ printf("pf: state reuse ");
+ pf_print_state(*state);
+ pf_print_flags(th->th_flags);
+ printf("\n");
+ }
+ /* XXX make sure it's the same direction ?? */
+ (*state)->src.state = (*state)->dst.state = TCPS_CLOSED;
+ pf_unlink_state(*state, PF_ENTER_LOCKED);
+ *state = NULL;
+ return (PF_DROP);
+ }
+
+ if ((*state)->state_flags & PFSTATE_SLOPPY) {
+ if (pf_tcp_track_sloppy(src, dst, state, pd, reason) == PF_DROP)
+ return (PF_DROP);
+ } else {
+ if (pf_tcp_track_full(src, dst, state, kif, m, off, pd, reason,
+ &copyback) == PF_DROP)
+ return (PF_DROP);
+ }
+
+ /* translate source/destination address, if necessary */
+ if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
+ struct pf_state_key *nk = (*state)->key[pd->didx];
+
+ if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) ||
+ nk->port[pd->sidx] != th->th_sport)
+ pf_change_ap(pd->src, &th->th_sport, pd->ip_sum,
+ &th->th_sum, &nk->addr[pd->sidx],
+ nk->port[pd->sidx], 0, pd->af);
+
+ if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) ||
+ nk->port[pd->didx] != th->th_dport)
+ pf_change_ap(pd->dst, &th->th_dport, pd->ip_sum,
+ &th->th_sum, &nk->addr[pd->didx],
+ nk->port[pd->didx], 0, pd->af);
+ copyback = 1;
+ }
+
+ /* Copyback sequence modulation or stateful scrub changes if needed */
+ if (copyback)
+ m_copyback(m, off, sizeof(*th), (caddr_t)th);
+
+ return (PF_PASS);
+}
+
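+/*
+ * Match a UDP packet against an existing state: advance the
+ * SINGLE/MULTIPLE pseudo-states and the timeout, and rewrite addresses
+ * and ports when the state carries a translation.
+ */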
+static int
+pf_test_state_udp(struct pf_state **state, int direction, struct pfi_kif *kif,
+ struct mbuf *m, int off, void *h, struct pf_pdesc *pd)
+{
+ struct pf_state_peer *src, *dst;
+ struct pf_state_key_cmp key;
+ struct udphdr *uh = pd->hdr.udp;
+
+ bzero(&key, sizeof(key));
+ key.af = pd->af;
+ key.proto = IPPROTO_UDP;
+ if (direction == PF_IN) { /* wire side, straight */
+ PF_ACPY(&key.addr[0], pd->src, key.af);
+ PF_ACPY(&key.addr[1], pd->dst, key.af);
+ key.port[0] = uh->uh_sport;
+ key.port[1] = uh->uh_dport;
+ } else { /* stack side, reverse */
+ PF_ACPY(&key.addr[1], pd->src, key.af);
+ PF_ACPY(&key.addr[0], pd->dst, key.af);
+ key.port[1] = uh->uh_sport;
+ key.port[0] = uh->uh_dport;
+ }
+
+ STATE_LOOKUP(kif, &key, direction, *state, pd);
+
+ if (direction == (*state)->direction) {
+ src = &(*state)->src;
+ dst = &(*state)->dst;
+ } else {
+ src = &(*state)->dst;
+ dst = &(*state)->src;
+ }
+
+ /* update states */
+ if (src->state < PFUDPS_SINGLE)
+ src->state = PFUDPS_SINGLE;
+ if (dst->state == PFUDPS_SINGLE)
+ dst->state = PFUDPS_MULTIPLE;
+
+ /* update expire time */
+ (*state)->expire = time_uptime;
+ if (src->state == PFUDPS_MULTIPLE && dst->state == PFUDPS_MULTIPLE)
+ (*state)->timeout = PFTM_UDP_MULTIPLE;
+ else
+ (*state)->timeout = PFTM_UDP_SINGLE;
+
+ /* translate source/destination address, if necessary */
+ if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
+ struct pf_state_key *nk = (*state)->key[pd->didx];
+
+ if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) ||
+ nk->port[pd->sidx] != uh->uh_sport)
+ pf_change_ap(pd->src, &uh->uh_sport, pd->ip_sum,
+ &uh->uh_sum, &nk->addr[pd->sidx],
+ nk->port[pd->sidx], 1, pd->af);
+
+ if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) ||
+ nk->port[pd->didx] != uh->uh_dport)
+ pf_change_ap(pd->dst, &uh->uh_dport, pd->ip_sum,
+ &uh->uh_sum, &nk->addr[pd->didx],
+ nk->port[pd->didx], 1, pd->af);
+ m_copyback(m, off, sizeof(*uh), (caddr_t)uh);
+ }
+
+ return (PF_PASS);
+}
+
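+/*
+ * Match an ICMP/ICMPv6 packet against existing state.  Queries and
+ * replies are matched by their own (ICMP id based) state key; error
+ * messages are matched against the state of the embedded TCP, UDP or
+ * ICMP packet that triggered them.
+ */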
+static int
+pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif,
+ struct mbuf *m, int off, void *h, struct pf_pdesc *pd, u_short *reason)
+{
+ struct pf_addr *saddr = pd->src, *daddr = pd->dst;
+ u_int16_t icmpid = 0, *icmpsum;
+ u_int8_t icmptype;
+ int state_icmp = 0;
+ struct pf_state_key_cmp key;
+
+ bzero(&key, sizeof(key));
+ switch (pd->proto) {
+#ifdef INET
+ case IPPROTO_ICMP:
+ icmptype = pd->hdr.icmp->icmp_type;
+ icmpid = pd->hdr.icmp->icmp_id;
+ icmpsum = &pd->hdr.icmp->icmp_cksum;
+
+ if (icmptype == ICMP_UNREACH ||
+ icmptype == ICMP_SOURCEQUENCH ||
+ icmptype == ICMP_REDIRECT ||
+ icmptype == ICMP_TIMXCEED ||
+ icmptype == ICMP_PARAMPROB)
+ state_icmp++;
+ break;
+#endif /* INET */
+#ifdef INET6
+ case IPPROTO_ICMPV6:
+ icmptype = pd->hdr.icmp6->icmp6_type;
+ icmpid = pd->hdr.icmp6->icmp6_id;
+ icmpsum = &pd->hdr.icmp6->icmp6_cksum;
+
+ if (icmptype == ICMP6_DST_UNREACH ||
+ icmptype == ICMP6_PACKET_TOO_BIG ||
+ icmptype == ICMP6_TIME_EXCEEDED ||
+ icmptype == ICMP6_PARAM_PROB)
+ state_icmp++;
+ break;
+#endif /* INET6 */
+ }
+
+ if (!state_icmp) {
+
+ /*
+ * ICMP query/reply message not related to a TCP/UDP packet.
+ * Search for an ICMP state.
+ */
+ key.af = pd->af;
+ key.proto = pd->proto;
+ key.port[0] = key.port[1] = icmpid;
+ if (direction == PF_IN) { /* wire side, straight */
+ PF_ACPY(&key.addr[0], pd->src, key.af);
+ PF_ACPY(&key.addr[1], pd->dst, key.af);
+ } else { /* stack side, reverse */
+ PF_ACPY(&key.addr[1], pd->src, key.af);
+ PF_ACPY(&key.addr[0], pd->dst, key.af);
+ }
+
+ STATE_LOOKUP(kif, &key, direction, *state, pd);
+
+ (*state)->expire = time_uptime;
+ (*state)->timeout = PFTM_ICMP_ERROR_REPLY;
+
+ /* translate source/destination address, if necessary */
+ if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
+ struct pf_state_key *nk = (*state)->key[pd->didx];
+
+ switch (pd->af) {
+#ifdef INET
+ case AF_INET:
+ if (PF_ANEQ(pd->src,
+ &nk->addr[pd->sidx], AF_INET))
+ pf_change_a(&saddr->v4.s_addr,
+ pd->ip_sum,
+ nk->addr[pd->sidx].v4.s_addr, 0);
+
+ if (PF_ANEQ(pd->dst, &nk->addr[pd->didx],
+ AF_INET))
+ pf_change_a(&daddr->v4.s_addr,
+ pd->ip_sum,
+ nk->addr[pd->didx].v4.s_addr, 0);
+
+ if (nk->port[0] !=
+ pd->hdr.icmp->icmp_id) {
+ pd->hdr.icmp->icmp_cksum =
+ pf_cksum_fixup(
+ pd->hdr.icmp->icmp_cksum, icmpid,
+ nk->port[pd->sidx], 0);
+ pd->hdr.icmp->icmp_id =
+ nk->port[pd->sidx];
+ }
+
+ m_copyback(m, off, ICMP_MINLEN,
+ (caddr_t )pd->hdr.icmp);
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ if (PF_ANEQ(pd->src,
+ &nk->addr[pd->sidx], AF_INET6))
+ pf_change_a6(saddr,
+ &pd->hdr.icmp6->icmp6_cksum,
+ &nk->addr[pd->sidx], 0);
+
+ if (PF_ANEQ(pd->dst,
+ &nk->addr[pd->didx], AF_INET6))
+ pf_change_a6(daddr,
+ &pd->hdr.icmp6->icmp6_cksum,
+ &nk->addr[pd->didx], 0);
+
+ m_copyback(m, off, sizeof(struct icmp6_hdr),
+ (caddr_t )pd->hdr.icmp6);
+ break;
+#endif /* INET6 */
+ }
+ }
+ return (PF_PASS);
+
+ } else {
+ /*
+ * ICMP error message in response to a TCP/UDP packet.
+ * Extract the inner TCP/UDP header and search for that state.
+ */
+
+ struct pf_pdesc pd2;
+ bzero(&pd2, sizeof pd2);
+#ifdef INET
+ struct ip h2;
+#endif /* INET */
+#ifdef INET6
+ struct ip6_hdr h2_6;
+ int terminal = 0;
+#endif /* INET6 */
+ int ipoff2 = 0;
+ int off2 = 0;
+
+ pd2.af = pd->af;
+ /* Payload packet is from the opposite direction. */
+ pd2.sidx = (direction == PF_IN) ? 1 : 0;
+ pd2.didx = (direction == PF_IN) ? 0 : 1;
+ switch (pd->af) {
+#ifdef INET
+ case AF_INET:
+ /* offset of h2 in mbuf chain */
+ ipoff2 = off + ICMP_MINLEN;
+
+ if (!pf_pull_hdr(m, ipoff2, &h2, sizeof(h2),
+ NULL, reason, pd2.af)) {
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pf: ICMP error message too short "
+ "(ip)\n"));
+ return (PF_DROP);
+ }
+ /*
+ * ICMP error messages don't refer to non-first
+ * fragments
+ */
+ if (h2.ip_off & htons(IP_OFFMASK)) {
+ REASON_SET(reason, PFRES_FRAG);
+ return (PF_DROP);
+ }
+
+ /* offset of protocol header that follows h2 */
+ off2 = ipoff2 + (h2.ip_hl << 2);
+
+ pd2.proto = h2.ip_p;
+ pd2.src = (struct pf_addr *)&h2.ip_src;
+ pd2.dst = (struct pf_addr *)&h2.ip_dst;
+ pd2.ip_sum = &h2.ip_sum;
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ ipoff2 = off + sizeof(struct icmp6_hdr);
+
+ if (!pf_pull_hdr(m, ipoff2, &h2_6, sizeof(h2_6),
+ NULL, reason, pd2.af)) {
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pf: ICMP error message too short "
+ "(ip6)\n"));
+ return (PF_DROP);
+ }
+ pd2.proto = h2_6.ip6_nxt;
+ pd2.src = (struct pf_addr *)&h2_6.ip6_src;
+ pd2.dst = (struct pf_addr *)&h2_6.ip6_dst;
+ pd2.ip_sum = NULL;
+ off2 = ipoff2 + sizeof(h2_6);
+ do {
+ switch (pd2.proto) {
+ case IPPROTO_FRAGMENT:
+ /*
+ * ICMPv6 error messages for
+ * non-first fragments
+ */
+ REASON_SET(reason, PFRES_FRAG);
+ return (PF_DROP);
+ case IPPROTO_AH:
+ case IPPROTO_HOPOPTS:
+ case IPPROTO_ROUTING:
+ case IPPROTO_DSTOPTS: {
+ /* get next header and header length */
+ struct ip6_ext opt6;
+
+ if (!pf_pull_hdr(m, off2, &opt6,
+ sizeof(opt6), NULL, reason,
+ pd2.af)) {
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pf: ICMPv6 short opt\n"));
+ return (PF_DROP);
+ }
+ if (pd2.proto == IPPROTO_AH)
+ off2 += (opt6.ip6e_len + 2) * 4;
+ else
+ off2 += (opt6.ip6e_len + 1) * 8;
+ pd2.proto = opt6.ip6e_nxt;
+                               /* go to the next header */
+ break;
+ }
+ default:
+ terminal++;
+ break;
+ }
+ } while (!terminal);
+ break;
+#endif /* INET6 */
+ }
+
+ switch (pd2.proto) {
+ case IPPROTO_TCP: {
+ struct tcphdr th;
+ u_int32_t seq;
+ struct pf_state_peer *src, *dst;
+ u_int8_t dws;
+ int copyback = 0;
+
+ /*
+ * Only the first 8 bytes of the TCP header can be
+                        * expected. Don't access any TCP header fields after
+                        * th_seq; an ackskew test is not possible.
+ */
+ if (!pf_pull_hdr(m, off2, &th, 8, NULL, reason,
+ pd2.af)) {
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pf: ICMP error message too short "
+ "(tcp)\n"));
+ return (PF_DROP);
+ }
+
+ key.af = pd2.af;
+ key.proto = IPPROTO_TCP;
+ PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
+ PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
+ key.port[pd2.sidx] = th.th_sport;
+ key.port[pd2.didx] = th.th_dport;
+
+ STATE_LOOKUP(kif, &key, direction, *state, pd);
+
+ if (direction == (*state)->direction) {
+ src = &(*state)->dst;
+ dst = &(*state)->src;
+ } else {
+ src = &(*state)->src;
+ dst = &(*state)->dst;
+ }
+
+ if (src->wscale && dst->wscale)
+ dws = dst->wscale & PF_WSCALE_MASK;
+ else
+ dws = 0;
+
+ /* Demodulate sequence number */
+ seq = ntohl(th.th_seq) - src->seqdiff;
+ if (src->seqdiff) {
+ pf_change_a(&th.th_seq, icmpsum,
+ htonl(seq), 0);
+ copyback = 1;
+ }
+
+ if (!((*state)->state_flags & PFSTATE_SLOPPY) &&
+ (!SEQ_GEQ(src->seqhi, seq) ||
+ !SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)))) {
+ if (V_pf_status.debug >= PF_DEBUG_MISC) {
+ printf("pf: BAD ICMP %d:%d ",
+ icmptype, pd->hdr.icmp->icmp_code);
+ pf_print_host(pd->src, 0, pd->af);
+ printf(" -> ");
+ pf_print_host(pd->dst, 0, pd->af);
+ printf(" state: ");
+ pf_print_state(*state);
+ printf(" seq=%u\n", seq);
+ }
+ REASON_SET(reason, PFRES_BADSTATE);
+ return (PF_DROP);
+ } else {
+ if (V_pf_status.debug >= PF_DEBUG_MISC) {
+ printf("pf: OK ICMP %d:%d ",
+ icmptype, pd->hdr.icmp->icmp_code);
+ pf_print_host(pd->src, 0, pd->af);
+ printf(" -> ");
+ pf_print_host(pd->dst, 0, pd->af);
+ printf(" state: ");
+ pf_print_state(*state);
+ printf(" seq=%u\n", seq);
+ }
+ }
+
+ /* translate source/destination address, if necessary */
+ if ((*state)->key[PF_SK_WIRE] !=
+ (*state)->key[PF_SK_STACK]) {
+ struct pf_state_key *nk =
+ (*state)->key[pd->didx];
+
+ if (PF_ANEQ(pd2.src,
+ &nk->addr[pd2.sidx], pd2.af) ||
+ nk->port[pd2.sidx] != th.th_sport)
+ pf_change_icmp(pd2.src, &th.th_sport,
+ daddr, &nk->addr[pd2.sidx],
+ nk->port[pd2.sidx], NULL,
+ pd2.ip_sum, icmpsum,
+ pd->ip_sum, 0, pd2.af);
+
+ if (PF_ANEQ(pd2.dst,
+ &nk->addr[pd2.didx], pd2.af) ||
+ nk->port[pd2.didx] != th.th_dport)
+ pf_change_icmp(pd2.dst, &th.th_dport,
+ NULL, /* XXX Inbound NAT? */
+ &nk->addr[pd2.didx],
+ nk->port[pd2.didx], NULL,
+ pd2.ip_sum, icmpsum,
+ pd->ip_sum, 0, pd2.af);
+ copyback = 1;
+ }
+
+ if (copyback) {
+ switch (pd2.af) {
+#ifdef INET
+ case AF_INET:
+ m_copyback(m, off, ICMP_MINLEN,
+ (caddr_t )pd->hdr.icmp);
+ m_copyback(m, ipoff2, sizeof(h2),
+ (caddr_t )&h2);
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ m_copyback(m, off,
+ sizeof(struct icmp6_hdr),
+ (caddr_t )pd->hdr.icmp6);
+ m_copyback(m, ipoff2, sizeof(h2_6),
+ (caddr_t )&h2_6);
+ break;
+#endif /* INET6 */
+ }
+ m_copyback(m, off2, 8, (caddr_t)&th);
+ }
+
+ return (PF_PASS);
+ break;
+ }
+ case IPPROTO_UDP: {
+ struct udphdr uh;
+
+ if (!pf_pull_hdr(m, off2, &uh, sizeof(uh),
+ NULL, reason, pd2.af)) {
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pf: ICMP error message too short "
+ "(udp)\n"));
+ return (PF_DROP);
+ }
+
+ key.af = pd2.af;
+ key.proto = IPPROTO_UDP;
+ PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
+ PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
+ key.port[pd2.sidx] = uh.uh_sport;
+ key.port[pd2.didx] = uh.uh_dport;
+
+ STATE_LOOKUP(kif, &key, direction, *state, pd);
+
+ /* translate source/destination address, if necessary */
+ if ((*state)->key[PF_SK_WIRE] !=
+ (*state)->key[PF_SK_STACK]) {
+ struct pf_state_key *nk =
+ (*state)->key[pd->didx];
+
+ if (PF_ANEQ(pd2.src,
+ &nk->addr[pd2.sidx], pd2.af) ||
+ nk->port[pd2.sidx] != uh.uh_sport)
+ pf_change_icmp(pd2.src, &uh.uh_sport,
+ daddr, &nk->addr[pd2.sidx],
+ nk->port[pd2.sidx], &uh.uh_sum,
+ pd2.ip_sum, icmpsum,
+ pd->ip_sum, 1, pd2.af);
+
+ if (PF_ANEQ(pd2.dst,
+ &nk->addr[pd2.didx], pd2.af) ||
+ nk->port[pd2.didx] != uh.uh_dport)
+ pf_change_icmp(pd2.dst, &uh.uh_dport,
+ NULL, /* XXX Inbound NAT? */
+ &nk->addr[pd2.didx],
+ nk->port[pd2.didx], &uh.uh_sum,
+ pd2.ip_sum, icmpsum,
+ pd->ip_sum, 1, pd2.af);
+
+ switch (pd2.af) {
+#ifdef INET
+ case AF_INET:
+ m_copyback(m, off, ICMP_MINLEN,
+ (caddr_t )pd->hdr.icmp);
+ m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ m_copyback(m, off,
+ sizeof(struct icmp6_hdr),
+ (caddr_t )pd->hdr.icmp6);
+ m_copyback(m, ipoff2, sizeof(h2_6),
+ (caddr_t )&h2_6);
+ break;
+#endif /* INET6 */
+ }
+ m_copyback(m, off2, sizeof(uh), (caddr_t)&uh);
+ }
+ return (PF_PASS);
+ break;
+ }
+#ifdef INET
+ case IPPROTO_ICMP: {
+ struct icmp iih;
+
+ if (!pf_pull_hdr(m, off2, &iih, ICMP_MINLEN,
+ NULL, reason, pd2.af)) {
+ DPFPRINTF(PF_DEBUG_MISC,
+			    ("pf: ICMP error message too short "
+ "(icmp)\n"));
+ return (PF_DROP);
+ }
+
+ key.af = pd2.af;
+ key.proto = IPPROTO_ICMP;
+ PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
+ PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
+ key.port[0] = key.port[1] = iih.icmp_id;
+
+ STATE_LOOKUP(kif, &key, direction, *state, pd);
+
+ /* translate source/destination address, if necessary */
+ if ((*state)->key[PF_SK_WIRE] !=
+ (*state)->key[PF_SK_STACK]) {
+ struct pf_state_key *nk =
+ (*state)->key[pd->didx];
+
+ if (PF_ANEQ(pd2.src,
+ &nk->addr[pd2.sidx], pd2.af) ||
+ nk->port[pd2.sidx] != iih.icmp_id)
+ pf_change_icmp(pd2.src, &iih.icmp_id,
+ daddr, &nk->addr[pd2.sidx],
+ nk->port[pd2.sidx], NULL,
+ pd2.ip_sum, icmpsum,
+ pd->ip_sum, 0, AF_INET);
+
+ if (PF_ANEQ(pd2.dst,
+ &nk->addr[pd2.didx], pd2.af) ||
+ nk->port[pd2.didx] != iih.icmp_id)
+ pf_change_icmp(pd2.dst, &iih.icmp_id,
+ NULL, /* XXX Inbound NAT? */
+ &nk->addr[pd2.didx],
+ nk->port[pd2.didx], NULL,
+ pd2.ip_sum, icmpsum,
+ pd->ip_sum, 0, AF_INET);
+
+ m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp);
+ m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
+ m_copyback(m, off2, ICMP_MINLEN, (caddr_t)&iih);
+ }
+ return (PF_PASS);
+ break;
+ }
+#endif /* INET */
+#ifdef INET6
+ case IPPROTO_ICMPV6: {
+ struct icmp6_hdr iih;
+
+ if (!pf_pull_hdr(m, off2, &iih,
+ sizeof(struct icmp6_hdr), NULL, reason, pd2.af)) {
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pf: ICMP error message too short "
+ "(icmp6)\n"));
+ return (PF_DROP);
+ }
+
+ key.af = pd2.af;
+ key.proto = IPPROTO_ICMPV6;
+ PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
+ PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
+ key.port[0] = key.port[1] = iih.icmp6_id;
+
+ STATE_LOOKUP(kif, &key, direction, *state, pd);
+
+ /* translate source/destination address, if necessary */
+ if ((*state)->key[PF_SK_WIRE] !=
+ (*state)->key[PF_SK_STACK]) {
+ struct pf_state_key *nk =
+ (*state)->key[pd->didx];
+
+ if (PF_ANEQ(pd2.src,
+ &nk->addr[pd2.sidx], pd2.af) ||
+ nk->port[pd2.sidx] != iih.icmp6_id)
+ pf_change_icmp(pd2.src, &iih.icmp6_id,
+ daddr, &nk->addr[pd2.sidx],
+ nk->port[pd2.sidx], NULL,
+ pd2.ip_sum, icmpsum,
+ pd->ip_sum, 0, AF_INET6);
+
+ if (PF_ANEQ(pd2.dst,
+ &nk->addr[pd2.didx], pd2.af) ||
+ nk->port[pd2.didx] != iih.icmp6_id)
+ pf_change_icmp(pd2.dst, &iih.icmp6_id,
+ NULL, /* XXX Inbound NAT? */
+ &nk->addr[pd2.didx],
+ nk->port[pd2.didx], NULL,
+ pd2.ip_sum, icmpsum,
+ pd->ip_sum, 0, AF_INET6);
+
+ m_copyback(m, off, sizeof(struct icmp6_hdr),
+ (caddr_t)pd->hdr.icmp6);
+ m_copyback(m, ipoff2, sizeof(h2_6), (caddr_t)&h2_6);
+ m_copyback(m, off2, sizeof(struct icmp6_hdr),
+ (caddr_t)&iih);
+ }
+ return (PF_PASS);
+ break;
+ }
+#endif /* INET6 */
+ default: {
+ key.af = pd2.af;
+ key.proto = pd2.proto;
+ PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
+ PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
+ key.port[0] = key.port[1] = 0;
+
+ STATE_LOOKUP(kif, &key, direction, *state, pd);
+
+ /* translate source/destination address, if necessary */
+ if ((*state)->key[PF_SK_WIRE] !=
+ (*state)->key[PF_SK_STACK]) {
+ struct pf_state_key *nk =
+ (*state)->key[pd->didx];
+
+ if (PF_ANEQ(pd2.src,
+ &nk->addr[pd2.sidx], pd2.af))
+ pf_change_icmp(pd2.src, NULL, daddr,
+ &nk->addr[pd2.sidx], 0, NULL,
+ pd2.ip_sum, icmpsum,
+ pd->ip_sum, 0, pd2.af);
+
+ if (PF_ANEQ(pd2.dst,
+ &nk->addr[pd2.didx], pd2.af))
+				pf_change_icmp(pd2.dst, NULL,
+ NULL, /* XXX Inbound NAT? */
+ &nk->addr[pd2.didx], 0, NULL,
+ pd2.ip_sum, icmpsum,
+ pd->ip_sum, 0, pd2.af);
+
+ switch (pd2.af) {
+#ifdef INET
+ case AF_INET:
+ m_copyback(m, off, ICMP_MINLEN,
+ (caddr_t)pd->hdr.icmp);
+ m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ m_copyback(m, off,
+ sizeof(struct icmp6_hdr),
+ (caddr_t )pd->hdr.icmp6);
+ m_copyback(m, ipoff2, sizeof(h2_6),
+ (caddr_t )&h2_6);
+ break;
+#endif /* INET6 */
+ }
+ }
+ return (PF_PASS);
+ break;
+ }
+ }
+ }
+}
+
+static int
+pf_test_state_other(struct pf_state **state, int direction, struct pfi_kif *kif,
+ struct mbuf *m, struct pf_pdesc *pd)
+{
+ struct pf_state_peer *src, *dst;
+ struct pf_state_key_cmp key;
+
+ bzero(&key, sizeof(key));
+ key.af = pd->af;
+ key.proto = pd->proto;
+ if (direction == PF_IN) {
+ PF_ACPY(&key.addr[0], pd->src, key.af);
+ PF_ACPY(&key.addr[1], pd->dst, key.af);
+ key.port[0] = key.port[1] = 0;
+ } else {
+ PF_ACPY(&key.addr[1], pd->src, key.af);
+ PF_ACPY(&key.addr[0], pd->dst, key.af);
+ key.port[1] = key.port[0] = 0;
+ }
+
+ STATE_LOOKUP(kif, &key, direction, *state, pd);
+
+ if (direction == (*state)->direction) {
+ src = &(*state)->src;
+ dst = &(*state)->dst;
+ } else {
+ src = &(*state)->dst;
+ dst = &(*state)->src;
+ }
+
+ /* update states */
+ if (src->state < PFOTHERS_SINGLE)
+ src->state = PFOTHERS_SINGLE;
+ if (dst->state == PFOTHERS_SINGLE)
+ dst->state = PFOTHERS_MULTIPLE;
+
+ /* update expire time */
+ (*state)->expire = time_uptime;
+ if (src->state == PFOTHERS_MULTIPLE && dst->state == PFOTHERS_MULTIPLE)
+ (*state)->timeout = PFTM_OTHER_MULTIPLE;
+ else
+ (*state)->timeout = PFTM_OTHER_SINGLE;
+
+ /* translate source/destination address, if necessary */
+ if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
+ struct pf_state_key *nk = (*state)->key[pd->didx];
+
+ KASSERT(nk, ("%s: nk is null", __func__));
+ KASSERT(pd, ("%s: pd is null", __func__));
+ KASSERT(pd->src, ("%s: pd->src is null", __func__));
+ KASSERT(pd->dst, ("%s: pd->dst is null", __func__));
+ switch (pd->af) {
+#ifdef INET
+ case AF_INET:
+ if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET))
+ pf_change_a(&pd->src->v4.s_addr,
+ pd->ip_sum,
+ nk->addr[pd->sidx].v4.s_addr,
+ 0);
+
+
+ if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET))
+ pf_change_a(&pd->dst->v4.s_addr,
+ pd->ip_sum,
+ nk->addr[pd->didx].v4.s_addr,
+ 0);
+
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+			if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET6))
+				PF_ACPY(pd->src, &nk->addr[pd->sidx], pd->af);
+
+			if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET6))
+ PF_ACPY(pd->dst, &nk->addr[pd->didx], pd->af);
+#endif /* INET6 */
+ }
+ }
+ return (PF_PASS);
+}
+
+/*
+ * ipoff and off are measured from the start of the mbuf chain.
+ * h must be at "ipoff" on the mbuf chain.
+ */
+void *
+pf_pull_hdr(struct mbuf *m, int off, void *p, int len,
+ u_short *actionp, u_short *reasonp, sa_family_t af)
+{
+ switch (af) {
+#ifdef INET
+ case AF_INET: {
+ struct ip *h = mtod(m, struct ip *);
+ u_int16_t fragoff = (ntohs(h->ip_off) & IP_OFFMASK) << 3;
+
+ if (fragoff) {
+ if (fragoff >= len)
+ ACTION_SET(actionp, PF_PASS);
+ else {
+ ACTION_SET(actionp, PF_DROP);
+ REASON_SET(reasonp, PFRES_FRAG);
+ }
+ return (NULL);
+ }
+ if (m->m_pkthdr.len < off + len ||
+ ntohs(h->ip_len) < off + len) {
+ ACTION_SET(actionp, PF_DROP);
+ REASON_SET(reasonp, PFRES_SHORT);
+ return (NULL);
+ }
+ break;
+ }
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6: {
+ struct ip6_hdr *h = mtod(m, struct ip6_hdr *);
+
+ if (m->m_pkthdr.len < off + len ||
+ (ntohs(h->ip6_plen) + sizeof(struct ip6_hdr)) <
+ (unsigned)(off + len)) {
+ ACTION_SET(actionp, PF_DROP);
+ REASON_SET(reasonp, PFRES_SHORT);
+ return (NULL);
+ }
+ break;
+ }
+#endif /* INET6 */
+ }
+ m_copydata(m, off, len, p);
+ return (p);
+}
+
+int
+pf_routable(struct pf_addr *addr, sa_family_t af, struct pfi_kif *kif,
+ int rtableid)
+{
+#ifdef RADIX_MPATH
+ struct radix_node_head *rnh;
+#endif
+ struct sockaddr_in *dst;
+ int ret = 1;
+ int check_mpath;
+#ifdef INET6
+ struct sockaddr_in6 *dst6;
+ struct route_in6 ro;
+#else
+ struct route ro;
+#endif
+ struct radix_node *rn;
+ struct rtentry *rt;
+ struct ifnet *ifp;
+
+ check_mpath = 0;
+#ifdef RADIX_MPATH
+ /* XXX: stick to table 0 for now */
+ rnh = rt_tables_get_rnh(0, af);
+ if (rnh != NULL && rn_mpath_capable(rnh))
+ check_mpath = 1;
+#endif
+ bzero(&ro, sizeof(ro));
+ switch (af) {
+ case AF_INET:
+ dst = satosin(&ro.ro_dst);
+ dst->sin_family = AF_INET;
+ dst->sin_len = sizeof(*dst);
+ dst->sin_addr = addr->v4;
+ break;
+#ifdef INET6
+ case AF_INET6:
+ /*
+ * Skip check for addresses with embedded interface scope,
+ * as they would always match anyway.
+ */
+ if (IN6_IS_SCOPE_EMBED(&addr->v6))
+ goto out;
+ dst6 = (struct sockaddr_in6 *)&ro.ro_dst;
+ dst6->sin6_family = AF_INET6;
+ dst6->sin6_len = sizeof(*dst6);
+ dst6->sin6_addr = addr->v6;
+ break;
+#endif /* INET6 */
+ default:
+ return (0);
+ }
+
+ /* Skip checks for ipsec interfaces */
+ if (kif != NULL && kif->pfik_ifp->if_type == IFT_ENC)
+ goto out;
+
+ switch (af) {
+#ifdef INET6
+ case AF_INET6:
+ in6_rtalloc_ign(&ro, 0, rtableid);
+ break;
+#endif
+#ifdef INET
+ case AF_INET:
+ in_rtalloc_ign((struct route *)&ro, 0, rtableid);
+ break;
+#endif
+ default:
+ rtalloc_ign((struct route *)&ro, 0); /* No/default FIB. */
+ break;
+ }
+
+ if (ro.ro_rt != NULL) {
+ /* No interface given, this is a no-route check */
+ if (kif == NULL)
+ goto out;
+
+ if (kif->pfik_ifp == NULL) {
+ ret = 0;
+ goto out;
+ }
+
+ /* Perform uRPF check if passed input interface */
+ ret = 0;
+ rn = (struct radix_node *)ro.ro_rt;
+ do {
+ rt = (struct rtentry *)rn;
+ ifp = rt->rt_ifp;
+
+ if (kif->pfik_ifp == ifp)
+ ret = 1;
+#ifdef RADIX_MPATH
+ rn = rn_mpath_next(rn);
+#endif
+ } while (check_mpath == 1 && rn != NULL && ret == 0);
+ } else
+ ret = 0;
+out:
+ if (ro.ro_rt != NULL)
+ RTFREE(ro.ro_rt);
+ return (ret);
+}
+
+#ifdef INET
+static void
+pf_route(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp,
+ struct pf_state *s, struct pf_pdesc *pd)
+{
+ struct mbuf *m0, *m1;
+ struct sockaddr_in dst;
+ struct ip *ip;
+ struct ifnet *ifp = NULL;
+ struct pf_addr naddr;
+ struct pf_src_node *sn = NULL;
+ int error = 0;
+ int sw_csum;
+
+ KASSERT(m && *m && r && oifp, ("%s: invalid parameters", __func__));
+ KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: invalid direction",
+ __func__));
+
+ if ((pd->pf_mtag == NULL &&
+ ((pd->pf_mtag = pf_get_mtag(*m)) == NULL)) ||
+ pd->pf_mtag->routed++ > 3) {
+ m0 = *m;
+ *m = NULL;
+ goto bad_locked;
+ }
+
+ if (r->rt == PF_DUPTO) {
+ if ((m0 = m_dup(*m, M_NOWAIT)) == NULL) {
+ if (s)
+ PF_STATE_UNLOCK(s);
+ return;
+ }
+ } else {
+ if ((r->rt == PF_REPLYTO) == (r->direction == dir)) {
+ if (s)
+ PF_STATE_UNLOCK(s);
+ return;
+ }
+ m0 = *m;
+ }
+
+ ip = mtod(m0, struct ip *);
+
+ bzero(&dst, sizeof(dst));
+ dst.sin_family = AF_INET;
+ dst.sin_len = sizeof(dst);
+ dst.sin_addr = ip->ip_dst;
+
+ if (r->rt == PF_FASTROUTE) {
+ struct rtentry *rt;
+
+ if (s)
+ PF_STATE_UNLOCK(s);
+ rt = rtalloc1_fib(sintosa(&dst), 0, 0, M_GETFIB(m0));
+ if (rt == NULL) {
+ KMOD_IPSTAT_INC(ips_noroute);
+ error = EHOSTUNREACH;
+ goto bad;
+ }
+
+ ifp = rt->rt_ifp;
+ rt->rt_rmx.rmx_pksent++;
+
+ if (rt->rt_flags & RTF_GATEWAY)
+ bcopy(satosin(rt->rt_gateway), &dst, sizeof(dst));
+ RTFREE_LOCKED(rt);
+ } else {
+ if (TAILQ_EMPTY(&r->rpool.list)) {
+ DPFPRINTF(PF_DEBUG_URGENT,
+ ("%s: TAILQ_EMPTY(&r->rpool.list)\n", __func__));
+ goto bad_locked;
+ }
+ if (s == NULL) {
+ pf_map_addr(AF_INET, r, (struct pf_addr *)&ip->ip_src,
+ &naddr, NULL, &sn);
+ if (!PF_AZERO(&naddr, AF_INET))
+ dst.sin_addr.s_addr = naddr.v4.s_addr;
+ ifp = r->rpool.cur->kif ?
+ r->rpool.cur->kif->pfik_ifp : NULL;
+ } else {
+ if (!PF_AZERO(&s->rt_addr, AF_INET))
+ dst.sin_addr.s_addr =
+ s->rt_addr.v4.s_addr;
+ ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL;
+ PF_STATE_UNLOCK(s);
+ }
+ }
+ if (ifp == NULL)
+ goto bad;
+
+ if (oifp != ifp) {
+ if (pf_test(PF_OUT, ifp, &m0, NULL) != PF_PASS)
+ goto bad;
+ else if (m0 == NULL)
+ goto done;
+ if (m0->m_len < sizeof(struct ip)) {
+ DPFPRINTF(PF_DEBUG_URGENT,
+ ("%s: m0->m_len < sizeof(struct ip)\n", __func__));
+ goto bad;
+ }
+ ip = mtod(m0, struct ip *);
+ }
+
+ if (ifp->if_flags & IFF_LOOPBACK)
+ m0->m_flags |= M_SKIP_FIREWALL;
+
+ /* Back to host byte order. */
+ ip->ip_len = ntohs(ip->ip_len);
+ ip->ip_off = ntohs(ip->ip_off);
+
+ /* Copied from FreeBSD 10.0-CURRENT ip_output. */
+ m0->m_pkthdr.csum_flags |= CSUM_IP;
+ sw_csum = m0->m_pkthdr.csum_flags & ~ifp->if_hwassist;
+ if (sw_csum & CSUM_DELAY_DATA) {
+ in_delayed_cksum(m0);
+ sw_csum &= ~CSUM_DELAY_DATA;
+ }
+#ifdef SCTP
+ if (sw_csum & CSUM_SCTP) {
+ sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2));
+ sw_csum &= ~CSUM_SCTP;
+ }
+#endif
+ m0->m_pkthdr.csum_flags &= ifp->if_hwassist;
+
+ /*
+ * If small enough for interface, or the interface will take
+ * care of the fragmentation for us, we can just send directly.
+ */
+ if (ip->ip_len <= ifp->if_mtu ||
+ (m0->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0 ||
+ ((ip->ip_off & IP_DF) == 0 && (ifp->if_hwassist & CSUM_FRAGMENT))) {
+ ip->ip_len = htons(ip->ip_len);
+ ip->ip_off = htons(ip->ip_off);
+ ip->ip_sum = 0;
+ if (sw_csum & CSUM_DELAY_IP)
+ ip->ip_sum = in_cksum(m0, ip->ip_hl << 2);
+ m0->m_flags &= ~(M_PROTOFLAGS);
+ error = (*ifp->if_output)(ifp, m0, sintosa(&dst), NULL);
+ goto done;
+ }
+
+ /* Balk when DF bit is set or the interface didn't support TSO. */
+ if ((ip->ip_off & IP_DF) || (m0->m_pkthdr.csum_flags & CSUM_TSO)) {
+ error = EMSGSIZE;
+ KMOD_IPSTAT_INC(ips_cantfrag);
+ if (r->rt != PF_DUPTO) {
+ icmp_error(m0, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0,
+ ifp->if_mtu);
+ goto done;
+ } else
+ goto bad;
+ }
+
+ error = ip_fragment(ip, &m0, ifp->if_mtu, ifp->if_hwassist, sw_csum);
+ if (error)
+ goto bad;
+
+ for (; m0; m0 = m1) {
+ m1 = m0->m_nextpkt;
+ m0->m_nextpkt = NULL;
+ if (error == 0) {
+ m0->m_flags &= ~(M_PROTOFLAGS);
+ error = (*ifp->if_output)(ifp, m0, sintosa(&dst), NULL);
+ } else
+ m_freem(m0);
+ }
+
+ if (error == 0)
+ KMOD_IPSTAT_INC(ips_fragmented);
+
+done:
+ if (r->rt != PF_DUPTO)
+ *m = NULL;
+ return;
+
+bad_locked:
+ if (s)
+ PF_STATE_UNLOCK(s);
+bad:
+ m_freem(m0);
+ goto done;
+}
+#endif /* INET */
+
+#ifdef INET6
+static void
+pf_route6(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp,
+ struct pf_state *s, struct pf_pdesc *pd)
+{
+ struct mbuf *m0;
+ struct sockaddr_in6 dst;
+ struct ip6_hdr *ip6;
+ struct ifnet *ifp = NULL;
+ struct pf_addr naddr;
+ struct pf_src_node *sn = NULL;
+
+ KASSERT(m && *m && r && oifp, ("%s: invalid parameters", __func__));
+ KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: invalid direction",
+ __func__));
+
+ if ((pd->pf_mtag == NULL &&
+ ((pd->pf_mtag = pf_get_mtag(*m)) == NULL)) ||
+ pd->pf_mtag->routed++ > 3) {
+ m0 = *m;
+ *m = NULL;
+ goto bad_locked;
+ }
+
+ if (r->rt == PF_DUPTO) {
+ if ((m0 = m_dup(*m, M_NOWAIT)) == NULL) {
+ if (s)
+ PF_STATE_UNLOCK(s);
+ return;
+ }
+ } else {
+ if ((r->rt == PF_REPLYTO) == (r->direction == dir)) {
+ if (s)
+ PF_STATE_UNLOCK(s);
+ return;
+ }
+ m0 = *m;
+ }
+
+ ip6 = mtod(m0, struct ip6_hdr *);
+
+ bzero(&dst, sizeof(dst));
+ dst.sin6_family = AF_INET6;
+ dst.sin6_len = sizeof(dst);
+ dst.sin6_addr = ip6->ip6_dst;
+
+ /* Cheat. XXX why only in the v6 case??? */
+ if (r->rt == PF_FASTROUTE) {
+ if (s)
+ PF_STATE_UNLOCK(s);
+ m0->m_flags |= M_SKIP_FIREWALL;
+ ip6_output(m0, NULL, NULL, 0, NULL, NULL, NULL);
+ return;
+ }
+
+ if (TAILQ_EMPTY(&r->rpool.list)) {
+ DPFPRINTF(PF_DEBUG_URGENT,
+ ("%s: TAILQ_EMPTY(&r->rpool.list)\n", __func__));
+ goto bad_locked;
+ }
+ if (s == NULL) {
+ pf_map_addr(AF_INET6, r, (struct pf_addr *)&ip6->ip6_src,
+ &naddr, NULL, &sn);
+ if (!PF_AZERO(&naddr, AF_INET6))
+ PF_ACPY((struct pf_addr *)&dst.sin6_addr,
+ &naddr, AF_INET6);
+ ifp = r->rpool.cur->kif ? r->rpool.cur->kif->pfik_ifp : NULL;
+ } else {
+ if (!PF_AZERO(&s->rt_addr, AF_INET6))
+ PF_ACPY((struct pf_addr *)&dst.sin6_addr,
+ &s->rt_addr, AF_INET6);
+ ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL;
+ }
+
+ if (s)
+ PF_STATE_UNLOCK(s);
+
+ if (ifp == NULL)
+ goto bad;
+
+ if (oifp != ifp) {
+ if (pf_test6(PF_OUT, ifp, &m0, NULL) != PF_PASS)
+ goto bad;
+ else if (m0 == NULL)
+ goto done;
+ if (m0->m_len < sizeof(struct ip6_hdr)) {
+ DPFPRINTF(PF_DEBUG_URGENT,
+ ("%s: m0->m_len < sizeof(struct ip6_hdr)\n",
+ __func__));
+ goto bad;
+ }
+ ip6 = mtod(m0, struct ip6_hdr *);
+ }
+
+ if (ifp->if_flags & IFF_LOOPBACK)
+ m0->m_flags |= M_SKIP_FIREWALL;
+
+ /*
+ * If the packet is too large for the outgoing interface,
+ * send back an icmp6 error.
+ */
+ if (IN6_IS_SCOPE_EMBED(&dst.sin6_addr))
+ dst.sin6_addr.s6_addr16[1] = htons(ifp->if_index);
+ if ((u_long)m0->m_pkthdr.len <= ifp->if_mtu)
+ nd6_output(ifp, ifp, m0, &dst, NULL);
+ else {
+ in6_ifstat_inc(ifp, ifs6_in_toobig);
+ if (r->rt != PF_DUPTO)
+ icmp6_error(m0, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu);
+ else
+ goto bad;
+ }
+
+done:
+ if (r->rt != PF_DUPTO)
+ *m = NULL;
+ return;
+
+bad_locked:
+ if (s)
+ PF_STATE_UNLOCK(s);
+bad:
+ m_freem(m0);
+ goto done;
+}
+#endif /* INET6 */
+
+/*
+ * FreeBSD supports cksum offloads for the following drivers.
+ * em(4), fxp(4), ixgb(4), lge(4), ndis(4), nge(4), re(4),
+ * ti(4), txp(4), xl(4)
+ *
+ * CSUM_DATA_VALID | CSUM_PSEUDO_HDR :
+ * network driver performed the cksum including the pseudo header; only
+ * csum_data needs to be verified
+ * CSUM_DATA_VALID :
+ * network driver performed the cksum, but an additional pseudo header
+ * cksum computation with the partial csum_data is needed (i.e. lack of
+ * H/W support for the pseudo header, for instance hme(4), sk(4) and
+ * possibly gem(4))
+ *
+ * After validating the cksum of the packet, set both the CSUM_DATA_VALID
+ * and CSUM_PSEUDO_HDR flags in order to avoid recomputation of the cksum
+ * in the upper TCP/UDP layer.
+ * Also, set csum_data to 0xffff to force cksum validation.
+ */
+static int
+pf_check_proto_cksum(struct mbuf *m, int off, int len, u_int8_t p, sa_family_t af)
+{
+ u_int16_t sum = 0;
+ int hw_assist = 0;
+ struct ip *ip;
+
+ if (off < sizeof(struct ip) || len < sizeof(struct udphdr))
+ return (1);
+ if (m->m_pkthdr.len < off + len)
+ return (1);
+
+ switch (p) {
+ case IPPROTO_TCP:
+ if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
+ if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
+ sum = m->m_pkthdr.csum_data;
+ } else {
+ ip = mtod(m, struct ip *);
+ sum = in_pseudo(ip->ip_src.s_addr,
+ ip->ip_dst.s_addr, htonl((u_short)len +
+ m->m_pkthdr.csum_data + IPPROTO_TCP));
+ }
+ sum ^= 0xffff;
+ ++hw_assist;
+ }
+ break;
+ case IPPROTO_UDP:
+ if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
+ if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
+ sum = m->m_pkthdr.csum_data;
+ } else {
+ ip = mtod(m, struct ip *);
+ sum = in_pseudo(ip->ip_src.s_addr,
+ ip->ip_dst.s_addr, htonl((u_short)len +
+ m->m_pkthdr.csum_data + IPPROTO_UDP));
+ }
+ sum ^= 0xffff;
+ ++hw_assist;
+ }
+ break;
+ case IPPROTO_ICMP:
+#ifdef INET6
+ case IPPROTO_ICMPV6:
+#endif /* INET6 */
+ break;
+ default:
+ return (1);
+ }
+
+ if (!hw_assist) {
+ switch (af) {
+ case AF_INET:
+ if (p == IPPROTO_ICMP) {
+ if (m->m_len < off)
+ return (1);
+ m->m_data += off;
+ m->m_len -= off;
+ sum = in_cksum(m, len);
+ m->m_data -= off;
+ m->m_len += off;
+ } else {
+ if (m->m_len < sizeof(struct ip))
+ return (1);
+ sum = in4_cksum(m, p, off, len);
+ }
+ break;
+#ifdef INET6
+ case AF_INET6:
+ if (m->m_len < sizeof(struct ip6_hdr))
+ return (1);
+ sum = in6_cksum(m, p, off, len);
+ break;
+#endif /* INET6 */
+ default:
+ return (1);
+ }
+ }
+ if (sum) {
+ switch (p) {
+ case IPPROTO_TCP:
+ {
+ KMOD_TCPSTAT_INC(tcps_rcvbadsum);
+ break;
+ }
+ case IPPROTO_UDP:
+ {
+ KMOD_UDPSTAT_INC(udps_badsum);
+ break;
+ }
+#ifdef INET
+ case IPPROTO_ICMP:
+ {
+ KMOD_ICMPSTAT_INC(icps_checksum);
+ break;
+ }
+#endif
+#ifdef INET6
+ case IPPROTO_ICMPV6:
+ {
+ KMOD_ICMP6STAT_INC(icp6s_checksum);
+ break;
+ }
+#endif /* INET6 */
+ }
+ return (1);
+ } else {
+ if (p == IPPROTO_TCP || p == IPPROTO_UDP) {
+ m->m_pkthdr.csum_flags |=
+ (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
+ m->m_pkthdr.csum_data = 0xffff;
+ }
+ }
+ return (0);
+}
+
+
+#ifdef INET
+int
+pf_test(int dir, struct ifnet *ifp, struct mbuf **m0, struct inpcb *inp)
+{
+ struct pfi_kif *kif;
+ u_short action, reason = 0, log = 0;
+ struct mbuf *m = *m0;
+ struct ip *h = NULL;
+ struct m_tag *ipfwtag;
+ struct pf_rule *a = NULL, *r = &V_pf_default_rule, *tr, *nr;
+ struct pf_state *s = NULL;
+ struct pf_ruleset *ruleset = NULL;
+ struct pf_pdesc pd;
+ int off, dirndx, pqid = 0;
+
+ M_ASSERTPKTHDR(m);
+
+ if (!V_pf_status.running)
+ return (PF_PASS);
+
+ memset(&pd, 0, sizeof(pd));
+
+ kif = (struct pfi_kif *)ifp->if_pf_kif;
+
+ if (kif == NULL) {
+ DPFPRINTF(PF_DEBUG_URGENT,
+ ("pf_test: kif == NULL, if_xname %s\n", ifp->if_xname));
+ return (PF_DROP);
+ }
+ if (kif->pfik_flags & PFI_IFLAG_SKIP)
+ return (PF_PASS);
+
+ if (m->m_flags & M_SKIP_FIREWALL)
+ return (PF_PASS);
+
+ if (m->m_pkthdr.len < (int)sizeof(struct ip)) {
+ action = PF_DROP;
+ REASON_SET(&reason, PFRES_SHORT);
+ log = 1;
+ goto done;
+ }
+
+ pd.pf_mtag = pf_find_mtag(m);
+
+ PF_RULES_RLOCK();
+
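+	/*
+	 * Packets re-injected by divert(4) carry an ipfw tag: mark them
+	 * as looped and strip the tag before filtering them again.
+	 */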
+ if (ip_divert_ptr != NULL &&
+ ((ipfwtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL)) != NULL)) {
+ struct ipfw_rule_ref *rr = (struct ipfw_rule_ref *)(ipfwtag+1);
+ if (rr->info & IPFW_IS_DIVERT && rr->rulenum == 0) {
+ if (pd.pf_mtag == NULL &&
+ ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
+ action = PF_DROP;
+ goto done;
+ }
+ pd.pf_mtag->flags |= PF_PACKET_LOOPED;
+ m_tag_delete(m, ipfwtag);
+ }
+ if (pd.pf_mtag && pd.pf_mtag->flags & PF_FASTFWD_OURS_PRESENT) {
+ m->m_flags |= M_FASTFWD_OURS;
+ pd.pf_mtag->flags &= ~PF_FASTFWD_OURS_PRESENT;
+ }
+ } else if (pf_normalize_ip(m0, dir, kif, &reason, &pd) != PF_PASS) {
+ /* We do IP header normalization and packet reassembly here */
+ action = PF_DROP;
+ goto done;
+ }
+ m = *m0; /* pf_normalize messes with m0 */
+ h = mtod(m, struct ip *);
+
+ off = h->ip_hl << 2;
+ if (off < (int)sizeof(struct ip)) {
+ action = PF_DROP;
+ REASON_SET(&reason, PFRES_SHORT);
+ log = 1;
+ goto done;
+ }
+
+ pd.src = (struct pf_addr *)&h->ip_src;
+ pd.dst = (struct pf_addr *)&h->ip_dst;
+ pd.sport = pd.dport = NULL;
+ pd.ip_sum = &h->ip_sum;
+ pd.proto_sum = NULL;
+ pd.proto = h->ip_p;
+ pd.dir = dir;
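+	/* sidx/didx: state key address/port indexes for this direction */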
+ pd.sidx = (dir == PF_IN) ? 0 : 1;
+ pd.didx = (dir == PF_IN) ? 1 : 0;
+ pd.af = AF_INET;
+ pd.tos = h->ip_tos;
+ pd.tot_len = ntohs(h->ip_len);
+
+ /* handle fragments that didn't get reassembled by normalization */
+ if (h->ip_off & htons(IP_MF | IP_OFFMASK)) {
+ action = pf_test_fragment(&r, dir, kif, m, h,
+ &pd, &a, &ruleset);
+ goto done;
+ }
+
+ switch (h->ip_p) {
+
+ case IPPROTO_TCP: {
+ struct tcphdr th;
+
+ pd.hdr.tcp = &th;
+ if (!pf_pull_hdr(m, off, &th, sizeof(th),
+ &action, &reason, AF_INET)) {
+ log = action != PF_PASS;
+ goto done;
+ }
+ pd.p_len = pd.tot_len - off - (th.th_off << 2);
+ if ((th.th_flags & TH_ACK) && pd.p_len == 0)
+ pqid = 1;
+ action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd);
+ if (action == PF_DROP)
+ goto done;
+ action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd,
+ &reason);
+ if (action == PF_PASS) {
+ if (pfsync_update_state_ptr != NULL)
+ pfsync_update_state_ptr(s);
+ r = s->rule.ptr;
+ a = s->anchor.ptr;
+ log = s->log;
+ } else if (s == NULL)
+ action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
+ &a, &ruleset, inp);
+ break;
+ }
+
+ case IPPROTO_UDP: {
+ struct udphdr uh;
+
+ pd.hdr.udp = &uh;
+ if (!pf_pull_hdr(m, off, &uh, sizeof(uh),
+ &action, &reason, AF_INET)) {
+ log = action != PF_PASS;
+ goto done;
+ }
+ if (uh.uh_dport == 0 ||
+ ntohs(uh.uh_ulen) > m->m_pkthdr.len - off ||
+ ntohs(uh.uh_ulen) < sizeof(struct udphdr)) {
+ action = PF_DROP;
+ REASON_SET(&reason, PFRES_SHORT);
+ goto done;
+ }
+ action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd);
+ if (action == PF_PASS) {
+ if (pfsync_update_state_ptr != NULL)
+ pfsync_update_state_ptr(s);
+ r = s->rule.ptr;
+ a = s->anchor.ptr;
+ log = s->log;
+ } else if (s == NULL)
+ action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
+ &a, &ruleset, inp);
+ break;
+ }
+
+ case IPPROTO_ICMP: {
+ struct icmp ih;
+
+ pd.hdr.icmp = &ih;
+ if (!pf_pull_hdr(m, off, &ih, ICMP_MINLEN,
+ &action, &reason, AF_INET)) {
+ log = action != PF_PASS;
+ goto done;
+ }
+ action = pf_test_state_icmp(&s, dir, kif, m, off, h, &pd,
+ &reason);
+ if (action == PF_PASS) {
+ if (pfsync_update_state_ptr != NULL)
+ pfsync_update_state_ptr(s);
+ r = s->rule.ptr;
+ a = s->anchor.ptr;
+ log = s->log;
+ } else if (s == NULL)
+ action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
+ &a, &ruleset, inp);
+ break;
+ }
+
+#ifdef INET6
+ case IPPROTO_ICMPV6: {
+ action = PF_DROP;
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pf: dropping IPv4 packet with ICMPv6 payload\n"));
+ goto done;
+ }
+#endif
+
+ default:
+ action = pf_test_state_other(&s, dir, kif, m, &pd);
+ if (action == PF_PASS) {
+ if (pfsync_update_state_ptr != NULL)
+ pfsync_update_state_ptr(s);
+ r = s->rule.ptr;
+ a = s->anchor.ptr;
+ log = s->log;
+ } else if (s == NULL)
+ action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
+ &a, &ruleset, inp);
+ break;
+ }
+
+done:
+ PF_RULES_RUNLOCK();
+ if (action == PF_PASS && h->ip_hl > 5 &&
+ !((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) {
+ action = PF_DROP;
+ REASON_SET(&reason, PFRES_IPOPTIONS);
+ log = 1;
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pf: dropping packet with ip options\n"));
+ }
+
+ if (s && s->tag > 0 && pf_tag_packet(m, &pd, s->tag)) {
+ action = PF_DROP;
+ REASON_SET(&reason, PFRES_MEMORY);
+ }
+ if (r->rtableid >= 0)
+ M_SETFIB(m, r->rtableid);
+
+#ifdef ALTQ
+ if (action == PF_PASS && r->qid) {
+ if (pd.pf_mtag == NULL &&
+ ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
+ action = PF_DROP;
+ REASON_SET(&reason, PFRES_MEMORY);
+		} else {
+			if (pqid || (pd.tos & IPTOS_LOWDELAY))
+				pd.pf_mtag->qid = r->pqid;
+			else
+				pd.pf_mtag->qid = r->qid;
+			/* add hints for ecn */
+			pd.pf_mtag->hdr = h;
+		}
+
+ }
+#endif /* ALTQ */
+
+ /*
+ * connections redirected to loopback should not match sockets
+ * bound specifically to loopback due to security implications,
+ * see tcp_input() and in_pcblookup_listen().
+ */
+ if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP ||
+ pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL &&
+ (s->nat_rule.ptr->action == PF_RDR ||
+ s->nat_rule.ptr->action == PF_BINAT) &&
+ (ntohl(pd.dst->v4.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
+ m->m_flags |= M_SKIP_FIREWALL;
+
+ if (action == PF_PASS && r->divert.port && ip_divert_ptr != NULL &&
+ !PACKET_LOOPED(&pd)) {
+
+ ipfwtag = m_tag_alloc(MTAG_IPFW_RULE, 0,
+ sizeof(struct ipfw_rule_ref), M_NOWAIT | M_ZERO);
+ if (ipfwtag != NULL) {
+ ((struct ipfw_rule_ref *)(ipfwtag+1))->info =
+ ntohs(r->divert.port);
+ ((struct ipfw_rule_ref *)(ipfwtag+1))->rulenum = dir;
+
+ if (s)
+ PF_STATE_UNLOCK(s);
+
+ m_tag_prepend(m, ipfwtag);
+ if (m->m_flags & M_FASTFWD_OURS) {
+ if (pd.pf_mtag == NULL &&
+ ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
+ action = PF_DROP;
+ REASON_SET(&reason, PFRES_MEMORY);
+ log = 1;
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pf: failed to allocate tag\n"));
+				} else {
+					pd.pf_mtag->flags |=
+					    PF_FASTFWD_OURS_PRESENT;
+					m->m_flags &= ~M_FASTFWD_OURS;
+				}
+ }
+ ip_divert_ptr(*m0, dir == PF_IN ? DIR_IN : DIR_OUT);
+ *m0 = NULL;
+
+ return (action);
+ } else {
+ /* XXX: ipfw has the same behaviour! */
+ action = PF_DROP;
+ REASON_SET(&reason, PFRES_MEMORY);
+ log = 1;
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pf: failed to allocate divert tag\n"));
+ }
+ }
+
+ if (log) {
+ struct pf_rule *lr;
+
+ if (s != NULL && s->nat_rule.ptr != NULL &&
+ s->nat_rule.ptr->log & PF_LOG_ALL)
+ lr = s->nat_rule.ptr;
+ else
+ lr = r;
+ PFLOG_PACKET(kif, m, AF_INET, dir, reason, lr, a, ruleset, &pd,
+ (s == NULL));
+ }
+
+ kif->pfik_bytes[0][dir == PF_OUT][action != PF_PASS] += pd.tot_len;
+ kif->pfik_packets[0][dir == PF_OUT][action != PF_PASS]++;
+
+ if (action == PF_PASS || r->action == PF_DROP) {
+ dirndx = (dir == PF_OUT);
+ r->packets[dirndx]++;
+ r->bytes[dirndx] += pd.tot_len;
+ if (a != NULL) {
+ a->packets[dirndx]++;
+ a->bytes[dirndx] += pd.tot_len;
+ }
+ if (s != NULL) {
+ if (s->nat_rule.ptr != NULL) {
+ s->nat_rule.ptr->packets[dirndx]++;
+ s->nat_rule.ptr->bytes[dirndx] += pd.tot_len;
+ }
+ if (s->src_node != NULL) {
+ s->src_node->packets[dirndx]++;
+ s->src_node->bytes[dirndx] += pd.tot_len;
+ }
+ if (s->nat_src_node != NULL) {
+ s->nat_src_node->packets[dirndx]++;
+ s->nat_src_node->bytes[dirndx] += pd.tot_len;
+ }
+ dirndx = (dir == s->direction) ? 0 : 1;
+ s->packets[dirndx]++;
+ s->bytes[dirndx] += pd.tot_len;
+ }
+ tr = r;
+ nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule;
+ if (nr != NULL && r == &V_pf_default_rule)
+ tr = nr;
+ if (tr->src.addr.type == PF_ADDR_TABLE)
+ pfr_update_stats(tr->src.addr.p.tbl,
+ (s == NULL) ? pd.src :
+ &s->key[(s->direction == PF_IN)]->
+ addr[(s->direction == PF_OUT)],
+ pd.af, pd.tot_len, dir == PF_OUT,
+ r->action == PF_PASS, tr->src.neg);
+ if (tr->dst.addr.type == PF_ADDR_TABLE)
+ pfr_update_stats(tr->dst.addr.p.tbl,
+ (s == NULL) ? pd.dst :
+ &s->key[(s->direction == PF_IN)]->
+ addr[(s->direction == PF_IN)],
+ pd.af, pd.tot_len, dir == PF_OUT,
+ r->action == PF_PASS, tr->dst.neg);
+ }
+
+ switch (action) {
+ case PF_SYNPROXY_DROP:
+ m_freem(*m0);
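+		/* FALLTHROUGH */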
+ case PF_DEFER:
+ *m0 = NULL;
+ action = PF_PASS;
+ break;
+ default:
+ /* pf_route() returns unlocked. */
+ if (r->rt) {
+ pf_route(m0, r, dir, kif->pfik_ifp, s, &pd);
+ return (action);
+ }
+ break;
+ }
+ if (s)
+ PF_STATE_UNLOCK(s);
+
+ return (action);
+}
+#endif /* INET */
+
+#ifdef INET6
+int
+pf_test6(int dir, struct ifnet *ifp, struct mbuf **m0, struct inpcb *inp)
+{
+ struct pfi_kif *kif;
+ u_short action, reason = 0, log = 0;
+ struct mbuf *m = *m0, *n = NULL;
+ struct ip6_hdr *h = NULL;
+ struct pf_rule *a = NULL, *r = &V_pf_default_rule, *tr, *nr;
+ struct pf_state *s = NULL;
+ struct pf_ruleset *ruleset = NULL;
+ struct pf_pdesc pd;
+ int off, terminal = 0, dirndx, rh_cnt = 0;
+
+ M_ASSERTPKTHDR(m);
+
+ if (!V_pf_status.running)
+ return (PF_PASS);
+
+ memset(&pd, 0, sizeof(pd));
+ pd.pf_mtag = pf_find_mtag(m);
+
+ if (pd.pf_mtag && pd.pf_mtag->flags & PF_TAG_GENERATED)
+ return (PF_PASS);
+
+ kif = (struct pfi_kif *)ifp->if_pf_kif;
+ if (kif == NULL) {
+ DPFPRINTF(PF_DEBUG_URGENT,
+ ("pf_test6: kif == NULL, if_xname %s\n", ifp->if_xname));
+ return (PF_DROP);
+ }
+ if (kif->pfik_flags & PFI_IFLAG_SKIP)
+ return (PF_PASS);
+
+ if (m->m_pkthdr.len < (int)sizeof(*h)) {
+ action = PF_DROP;
+ REASON_SET(&reason, PFRES_SHORT);
+ log = 1;
+ goto done;
+ }
+
+ PF_RULES_RLOCK();
+
+ /* We do IP header normalization and packet reassembly here */
+ if (pf_normalize_ip6(m0, dir, kif, &reason, &pd) != PF_PASS) {
+ action = PF_DROP;
+ goto done;
+ }
+ m = *m0; /* pf_normalize messes with m0 */
+ h = mtod(m, struct ip6_hdr *);
+
+#if 1
+ /*
+	 * We do not support jumbograms yet. If we keep going, a zero ip6_plen
+ * will do something bad, so drop the packet for now.
+ */
+ if (htons(h->ip6_plen) == 0) {
+ action = PF_DROP;
+ REASON_SET(&reason, PFRES_NORM); /*XXX*/
+ goto done;
+ }
+#endif
+
+ pd.src = (struct pf_addr *)&h->ip6_src;
+ pd.dst = (struct pf_addr *)&h->ip6_dst;
+ pd.sport = pd.dport = NULL;
+ pd.ip_sum = NULL;
+ pd.proto_sum = NULL;
+ pd.dir = dir;
+ pd.sidx = (dir == PF_IN) ? 0 : 1;
+ pd.didx = (dir == PF_IN) ? 1 : 0;
+ pd.af = AF_INET6;
+ pd.tos = 0;
+ pd.tot_len = ntohs(h->ip6_plen) + sizeof(struct ip6_hdr);
+
+ off = ((caddr_t)h - m->m_data) + sizeof(struct ip6_hdr);
+ pd.proto = h->ip6_nxt;
+ do {
+ switch (pd.proto) {
+ case IPPROTO_FRAGMENT:
+ action = pf_test_fragment(&r, dir, kif, m, h,
+ &pd, &a, &ruleset);
+ if (action == PF_DROP)
+ REASON_SET(&reason, PFRES_FRAG);
+ goto done;
+ case IPPROTO_ROUTING: {
+ struct ip6_rthdr rthdr;
+
+ if (rh_cnt++) {
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pf: IPv6 more than one rthdr\n"));
+ action = PF_DROP;
+ REASON_SET(&reason, PFRES_IPOPTIONS);
+ log = 1;
+ goto done;
+ }
+ if (!pf_pull_hdr(m, off, &rthdr, sizeof(rthdr), NULL,
+ &reason, pd.af)) {
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pf: IPv6 short rthdr\n"));
+ action = PF_DROP;
+ REASON_SET(&reason, PFRES_SHORT);
+ log = 1;
+ goto done;
+ }
+ if (rthdr.ip6r_type == IPV6_RTHDR_TYPE_0) {
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pf: IPv6 rthdr0\n"));
+ action = PF_DROP;
+ REASON_SET(&reason, PFRES_IPOPTIONS);
+ log = 1;
+ goto done;
+ }
+ /* FALLTHROUGH */
+ }
+ case IPPROTO_AH:
+ case IPPROTO_HOPOPTS:
+ case IPPROTO_DSTOPTS: {
+ /* get next header and header length */
+ struct ip6_ext opt6;
+
+ if (!pf_pull_hdr(m, off, &opt6, sizeof(opt6),
+ NULL, &reason, pd.af)) {
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pf: IPv6 short opt\n"));
+ action = PF_DROP;
+ log = 1;
+ goto done;
+ }
+ if (pd.proto == IPPROTO_AH)
+ off += (opt6.ip6e_len + 2) * 4;
+ else
+ off += (opt6.ip6e_len + 1) * 8;
+ pd.proto = opt6.ip6e_nxt;
+			/* go to the next header */
+ break;
+ }
+ default:
+ terminal++;
+ break;
+ }
+ } while (!terminal);
+
+ /* if there's no routing header, use unmodified mbuf for checksumming */
+ if (!n)
+ n = m;
+
+ switch (pd.proto) {
+
+ case IPPROTO_TCP: {
+ struct tcphdr th;
+
+ pd.hdr.tcp = &th;
+ if (!pf_pull_hdr(m, off, &th, sizeof(th),
+ &action, &reason, AF_INET6)) {
+ log = action != PF_PASS;
+ goto done;
+ }
+ pd.p_len = pd.tot_len - off - (th.th_off << 2);
+ action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd);
+ if (action == PF_DROP)
+ goto done;
+ action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd,
+ &reason);
+ if (action == PF_PASS) {
+ if (pfsync_update_state_ptr != NULL)
+ pfsync_update_state_ptr(s);
+ r = s->rule.ptr;
+ a = s->anchor.ptr;
+ log = s->log;
+ } else if (s == NULL)
+ action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
+ &a, &ruleset, inp);
+ break;
+ }
+
+ case IPPROTO_UDP: {
+ struct udphdr uh;
+
+ pd.hdr.udp = &uh;
+ if (!pf_pull_hdr(m, off, &uh, sizeof(uh),
+ &action, &reason, AF_INET6)) {
+ log = action != PF_PASS;
+ goto done;
+ }
+ if (uh.uh_dport == 0 ||
+ ntohs(uh.uh_ulen) > m->m_pkthdr.len - off ||
+ ntohs(uh.uh_ulen) < sizeof(struct udphdr)) {
+ action = PF_DROP;
+ REASON_SET(&reason, PFRES_SHORT);
+ goto done;
+ }
+ action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd);
+ if (action == PF_PASS) {
+ if (pfsync_update_state_ptr != NULL)
+ pfsync_update_state_ptr(s);
+ r = s->rule.ptr;
+ a = s->anchor.ptr;
+ log = s->log;
+ } else if (s == NULL)
+ action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
+ &a, &ruleset, inp);
+ break;
+ }
+
+ case IPPROTO_ICMP: {
+ action = PF_DROP;
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pf: dropping IPv6 packet with ICMPv4 payload\n"));
+ goto done;
+ }
+
+ case IPPROTO_ICMPV6: {
+ struct icmp6_hdr ih;
+
+ pd.hdr.icmp6 = &ih;
+ if (!pf_pull_hdr(m, off, &ih, sizeof(ih),
+ &action, &reason, AF_INET6)) {
+ log = action != PF_PASS;
+ goto done;
+ }
+ action = pf_test_state_icmp(&s, dir, kif,
+ m, off, h, &pd, &reason);
+ if (action == PF_PASS) {
+ if (pfsync_update_state_ptr != NULL)
+ pfsync_update_state_ptr(s);
+ r = s->rule.ptr;
+ a = s->anchor.ptr;
+ log = s->log;
+ } else if (s == NULL)
+ action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
+ &a, &ruleset, inp);
+ break;
+ }
+
+ default:
+ action = pf_test_state_other(&s, dir, kif, m, &pd);
+ if (action == PF_PASS) {
+ if (pfsync_update_state_ptr != NULL)
+ pfsync_update_state_ptr(s);
+ r = s->rule.ptr;
+ a = s->anchor.ptr;
+ log = s->log;
+ } else if (s == NULL)
+ action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
+ &a, &ruleset, inp);
+ break;
+ }
+
+done:
+ PF_RULES_RUNLOCK();
+ if (n != m) {
+ m_freem(n);
+ n = NULL;
+ }
+
+ /* handle dangerous IPv6 extension headers. */
+ if (action == PF_PASS && rh_cnt &&
+ !((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) {
+ action = PF_DROP;
+ REASON_SET(&reason, PFRES_IPOPTIONS);
+ log = 1;
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pf: dropping packet with dangerous v6 headers\n"));
+ }
+
+ if (s && s->tag > 0 && pf_tag_packet(m, &pd, s->tag)) {
+ action = PF_DROP;
+ REASON_SET(&reason, PFRES_MEMORY);
+ }
+ if (r->rtableid >= 0)
+ M_SETFIB(m, r->rtableid);
+
+#ifdef ALTQ
+ if (action == PF_PASS && r->qid) {
+ if (pd.pf_mtag == NULL &&
+ ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
+ action = PF_DROP;
+ REASON_SET(&reason, PFRES_MEMORY);
+		} else {
+			if (pd.tos & IPTOS_LOWDELAY)
+				pd.pf_mtag->qid = r->pqid;
+			else
+				pd.pf_mtag->qid = r->qid;
+			/* add hints for ecn */
+			pd.pf_mtag->hdr = h;
+		}
+ }
+#endif /* ALTQ */
+
+ if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP ||
+ pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL &&
+ (s->nat_rule.ptr->action == PF_RDR ||
+ s->nat_rule.ptr->action == PF_BINAT) &&
+ IN6_IS_ADDR_LOOPBACK(&pd.dst->v6))
+ m->m_flags |= M_SKIP_FIREWALL;
+
+ /* XXX: Anybody working on it?! */
+ if (r->divert.port)
+ printf("pf: divert(9) is not supported for IPv6\n");
+
+ if (log) {
+ struct pf_rule *lr;
+
+ if (s != NULL && s->nat_rule.ptr != NULL &&
+ s->nat_rule.ptr->log & PF_LOG_ALL)
+ lr = s->nat_rule.ptr;
+ else
+ lr = r;
+ PFLOG_PACKET(kif, m, AF_INET6, dir, reason, lr, a, ruleset,
+ &pd, (s == NULL));
+ }
+
+ kif->pfik_bytes[1][dir == PF_OUT][action != PF_PASS] += pd.tot_len;
+ kif->pfik_packets[1][dir == PF_OUT][action != PF_PASS]++;
+
+ if (action == PF_PASS || r->action == PF_DROP) {
+ dirndx = (dir == PF_OUT);
+ r->packets[dirndx]++;
+ r->bytes[dirndx] += pd.tot_len;
+ if (a != NULL) {
+ a->packets[dirndx]++;
+ a->bytes[dirndx] += pd.tot_len;
+ }
+ if (s != NULL) {
+ if (s->nat_rule.ptr != NULL) {
+ s->nat_rule.ptr->packets[dirndx]++;
+ s->nat_rule.ptr->bytes[dirndx] += pd.tot_len;
+ }
+ if (s->src_node != NULL) {
+ s->src_node->packets[dirndx]++;
+ s->src_node->bytes[dirndx] += pd.tot_len;
+ }
+ if (s->nat_src_node != NULL) {
+ s->nat_src_node->packets[dirndx]++;
+ s->nat_src_node->bytes[dirndx] += pd.tot_len;
+ }
+ dirndx = (dir == s->direction) ? 0 : 1;
+ s->packets[dirndx]++;
+ s->bytes[dirndx] += pd.tot_len;
+ }
+ tr = r;
+ nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule;
+ if (nr != NULL && r == &V_pf_default_rule)
+ tr = nr;
+ if (tr->src.addr.type == PF_ADDR_TABLE)
+ pfr_update_stats(tr->src.addr.p.tbl,
+ (s == NULL) ? pd.src :
+ &s->key[(s->direction == PF_IN)]->addr[0],
+ pd.af, pd.tot_len, dir == PF_OUT,
+ r->action == PF_PASS, tr->src.neg);
+ if (tr->dst.addr.type == PF_ADDR_TABLE)
+ pfr_update_stats(tr->dst.addr.p.tbl,
+ (s == NULL) ? pd.dst :
+ &s->key[(s->direction == PF_IN)]->addr[1],
+ pd.af, pd.tot_len, dir == PF_OUT,
+ r->action == PF_PASS, tr->dst.neg);
+ }
+
+ switch (action) {
+ case PF_SYNPROXY_DROP:
+ m_freem(*m0);
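+		/* FALLTHROUGH */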
+ case PF_DEFER:
+ *m0 = NULL;
+ action = PF_PASS;
+ break;
+ default:
+ /* pf_route6() returns unlocked. */
+ if (r->rt) {
+ pf_route6(m0, r, dir, kif->pfik_ifp, s, &pd);
+ return (action);
+ }
+ break;
+ }
+
+ if (s)
+ PF_STATE_UNLOCK(s);
+
+ return (action);
+}
+#endif /* INET6 */
diff --git a/sys/netpfil/pf/pf_if.c b/sys/netpfil/pf/pf_if.c
new file mode 100644
index 0000000..c010b65
--- /dev/null
+++ b/sys/netpfil/pf/pf_if.c
@@ -0,0 +1,859 @@
+/* $OpenBSD: pf_if.c,v 1.54 2008/06/14 16:55:28 mk Exp $ */
+
+/*
+ * Copyright 2005 Henning Brauer <henning@openbsd.org>
+ * Copyright 2005 Ryan McBride <mcbride@openbsd.org>
+ * Copyright (c) 2001 Daniel Hartmeier
+ * Copyright (c) 2003 Cedric Berger
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/socket.h>
+
+#include <net/if.h>
+#include <net/pfvar.h>
+#include <net/route.h>
+
+VNET_DEFINE(struct pfi_kif *, pfi_all);
+static VNET_DEFINE(long, pfi_update);
+#define V_pfi_update VNET(pfi_update)
+#define PFI_BUFFER_MAX 0x10000
+
+static VNET_DEFINE(struct pfr_addr *, pfi_buffer);
+static VNET_DEFINE(int, pfi_buffer_cnt);
+static VNET_DEFINE(int, pfi_buffer_max);
+#define V_pfi_buffer VNET(pfi_buffer)
+#define V_pfi_buffer_cnt VNET(pfi_buffer_cnt)
+#define V_pfi_buffer_max VNET(pfi_buffer_max)
+
+eventhandler_tag pfi_attach_cookie;
+eventhandler_tag pfi_detach_cookie;
+eventhandler_tag pfi_attach_group_cookie;
+eventhandler_tag pfi_change_group_cookie;
+eventhandler_tag pfi_detach_group_cookie;
+eventhandler_tag pfi_ifaddr_event_cookie;
+
+static void pfi_attach_ifnet(struct ifnet *);
+static void pfi_attach_ifgroup(struct ifg_group *);
+
+static void pfi_kif_update(struct pfi_kif *);
+static void pfi_dynaddr_update(struct pfi_dynaddr *dyn);
+static void pfi_table_update(struct pfr_ktable *, struct pfi_kif *, int,
+ int);
+static void pfi_instance_add(struct ifnet *, int, int);
+static void pfi_address_add(struct sockaddr *, int, int);
+static int pfi_if_compare(struct pfi_kif *, struct pfi_kif *);
+static int pfi_skip_if(const char *, struct pfi_kif *);
+static int pfi_unmask(void *);
+static void pfi_attach_ifnet_event(void * __unused, struct ifnet *);
+static void pfi_detach_ifnet_event(void * __unused, struct ifnet *);
+static void pfi_attach_group_event(void *, struct ifg_group *);
+static void pfi_change_group_event(void *, char *);
+static void pfi_detach_group_event(void *, struct ifg_group *);
+static void pfi_ifaddr_event(void * __unused, struct ifnet *);
+
+RB_HEAD(pfi_ifhead, pfi_kif);
+static RB_PROTOTYPE(pfi_ifhead, pfi_kif, pfik_tree, pfi_if_compare);
+static RB_GENERATE(pfi_ifhead, pfi_kif, pfik_tree, pfi_if_compare);
+static VNET_DEFINE(struct pfi_ifhead, pfi_ifs);
+#define V_pfi_ifs VNET(pfi_ifs)
+
+#define PFI_BUFFER_MAX 0x10000
+MALLOC_DEFINE(PFI_MTYPE, "pf_ifnet", "pf(4) interface database");
+
+LIST_HEAD(pfi_list, pfi_kif);
+static VNET_DEFINE(struct pfi_list, pfi_unlinked_kifs);
+#define V_pfi_unlinked_kifs VNET(pfi_unlinked_kifs)
+static struct mtx pfi_unlnkdkifs_mtx;
+
+void
+pfi_initialize(void)
+{
+ struct ifg_group *ifg;
+ struct ifnet *ifp;
+ struct pfi_kif *kif;
+
+ V_pfi_buffer_max = 64;
+ V_pfi_buffer = malloc(V_pfi_buffer_max * sizeof(*V_pfi_buffer),
+ PFI_MTYPE, M_WAITOK);
+
+ mtx_init(&pfi_unlnkdkifs_mtx, "pf unlinked interfaces", NULL, MTX_DEF);
+
+ kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK);
+ PF_RULES_WLOCK();
+ V_pfi_all = pfi_kif_attach(kif, IFG_ALL);
+ PF_RULES_WUNLOCK();
+
+ IFNET_RLOCK();
+ TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next)
+ pfi_attach_ifgroup(ifg);
+ TAILQ_FOREACH(ifp, &V_ifnet, if_link)
+ pfi_attach_ifnet(ifp);
+ IFNET_RUNLOCK();
+
+ pfi_attach_cookie = EVENTHANDLER_REGISTER(ifnet_arrival_event,
+ pfi_attach_ifnet_event, NULL, EVENTHANDLER_PRI_ANY);
+ pfi_detach_cookie = EVENTHANDLER_REGISTER(ifnet_departure_event,
+ pfi_detach_ifnet_event, NULL, EVENTHANDLER_PRI_ANY);
+ pfi_attach_group_cookie = EVENTHANDLER_REGISTER(group_attach_event,
+ pfi_attach_group_event, curvnet, EVENTHANDLER_PRI_ANY);
+ pfi_change_group_cookie = EVENTHANDLER_REGISTER(group_change_event,
+ pfi_change_group_event, curvnet, EVENTHANDLER_PRI_ANY);
+ pfi_detach_group_cookie = EVENTHANDLER_REGISTER(group_detach_event,
+ pfi_detach_group_event, curvnet, EVENTHANDLER_PRI_ANY);
+ pfi_ifaddr_event_cookie = EVENTHANDLER_REGISTER(ifaddr_event,
+ pfi_ifaddr_event, NULL, EVENTHANDLER_PRI_ANY);
+}
+
+void
+pfi_cleanup(void)
+{
+ struct pfi_kif *p;
+
+ EVENTHANDLER_DEREGISTER(ifnet_arrival_event, pfi_attach_cookie);
+ EVENTHANDLER_DEREGISTER(ifnet_departure_event, pfi_detach_cookie);
+ EVENTHANDLER_DEREGISTER(group_attach_event, pfi_attach_group_cookie);
+ EVENTHANDLER_DEREGISTER(group_change_event, pfi_change_group_cookie);
+ EVENTHANDLER_DEREGISTER(group_detach_event, pfi_detach_group_cookie);
+ EVENTHANDLER_DEREGISTER(ifaddr_event, pfi_ifaddr_event_cookie);
+
+ V_pfi_all = NULL;
+ while ((p = RB_MIN(pfi_ifhead, &V_pfi_ifs))) {
+ RB_REMOVE(pfi_ifhead, &V_pfi_ifs, p);
+ free(p, PFI_MTYPE);
+ }
+
+ while ((p = LIST_FIRST(&V_pfi_unlinked_kifs))) {
+ LIST_REMOVE(p, pfik_list);
+ free(p, PFI_MTYPE);
+ }
+
+ mtx_destroy(&pfi_unlnkdkifs_mtx);
+
+ free(V_pfi_buffer, PFI_MTYPE);
+}
+
+struct pfi_kif *
+pfi_kif_find(const char *kif_name)
+{
+ struct pfi_kif_cmp s;
+
+ PF_RULES_ASSERT();
+
+ bzero(&s, sizeof(s));
+ strlcpy(s.pfik_name, kif_name, sizeof(s.pfik_name));
+
+ return (RB_FIND(pfi_ifhead, &V_pfi_ifs, (struct pfi_kif *)&s));
+}
+
+struct pfi_kif *
+pfi_kif_attach(struct pfi_kif *kif, const char *kif_name)
+{
+ struct pfi_kif *kif1;
+
+ PF_RULES_WASSERT();
+ KASSERT(kif != NULL, ("%s: null kif", __func__));
+
+ kif1 = pfi_kif_find(kif_name);
+ if (kif1 != NULL) {
+ free(kif, PFI_MTYPE);
+ return (kif1);
+ }
+
+ bzero(kif, sizeof(*kif));
+ strlcpy(kif->pfik_name, kif_name, sizeof(kif->pfik_name));
+ /*
+	 * It seems that the value of time_second is in an uninitialized
+	 * state when pf sets the interface statistics clear time during the
+	 * boot phase if pf was statically linked into the kernel. Instead
+	 * of setting a bogus time value here, have pfi_get_ifaces() handle
+	 * this case: it uses time_second if it sees that the time is 0.
+ */
+ kif->pfik_tzero = time_second > 1 ? time_second : 0;
+ TAILQ_INIT(&kif->pfik_dynaddrs);
+
+ RB_INSERT(pfi_ifhead, &V_pfi_ifs, kif);
+
+ return (kif);
+}
+
+void
+pfi_kif_ref(struct pfi_kif *kif)
+{
+
+ PF_RULES_WASSERT();
+ kif->pfik_rulerefs++;
+}
+
+void
+pfi_kif_unref(struct pfi_kif *kif)
+{
+
+ PF_RULES_WASSERT();
+ KASSERT(kif->pfik_rulerefs > 0, ("%s: %p has zero refs", __func__, kif));
+
+ kif->pfik_rulerefs--;
+
+ if (kif->pfik_rulerefs > 0)
+ return;
+
+ /* kif referencing an existing ifnet or group should exist. */
+ if (kif->pfik_ifp != NULL || kif->pfik_group != NULL || kif == V_pfi_all)
+ return;
+
+ RB_REMOVE(pfi_ifhead, &V_pfi_ifs, kif);
+
+ kif->pfik_flags |= PFI_IFLAG_REFS;
+
+ mtx_lock(&pfi_unlnkdkifs_mtx);
+ LIST_INSERT_HEAD(&V_pfi_unlinked_kifs, kif, pfik_list);
+ mtx_unlock(&pfi_unlnkdkifs_mtx);
+}
+
+void
+pfi_kif_purge(void)
+{
+ struct pfi_kif *kif, *kif1;
+
+ /*
+ * Do naive mark-and-sweep garbage collecting of old kifs.
+ * Reference flag is raised by pf_purge_expired_states().
+ */
+ mtx_lock(&pfi_unlnkdkifs_mtx);
+ LIST_FOREACH_SAFE(kif, &V_pfi_unlinked_kifs, pfik_list, kif1) {
+ if (!(kif->pfik_flags & PFI_IFLAG_REFS)) {
+ LIST_REMOVE(kif, pfik_list);
+ free(kif, PFI_MTYPE);
+ } else
+ kif->pfik_flags &= ~PFI_IFLAG_REFS;
+ }
+ mtx_unlock(&pfi_unlnkdkifs_mtx);
+}
+
+int
+pfi_kif_match(struct pfi_kif *rule_kif, struct pfi_kif *packet_kif)
+{
+ struct ifg_list *p;
+
+ if (rule_kif == NULL || rule_kif == packet_kif)
+ return (1);
+
+ if (rule_kif->pfik_group != NULL)
+ /* XXXGL: locking? */
+ TAILQ_FOREACH(p, &packet_kif->pfik_ifp->if_groups, ifgl_next)
+ if (p->ifgl_group == rule_kif->pfik_group)
+ return (1);
+
+ return (0);
+}
+
+static void
+pfi_attach_ifnet(struct ifnet *ifp)
+{
+ struct pfi_kif *kif;
+
+ kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK);
+
+ PF_RULES_WLOCK();
+ V_pfi_update++;
+ kif = pfi_kif_attach(kif, ifp->if_xname);
+
+ kif->pfik_ifp = ifp;
+ ifp->if_pf_kif = kif;
+
+ pfi_kif_update(kif);
+ PF_RULES_WUNLOCK();
+}
+
+static void
+pfi_attach_ifgroup(struct ifg_group *ifg)
+{
+ struct pfi_kif *kif;
+
+ kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK);
+
+ PF_RULES_WLOCK();
+ V_pfi_update++;
+ kif = pfi_kif_attach(kif, ifg->ifg_group);
+
+ kif->pfik_group = ifg;
+ ifg->ifg_pf_kif = kif;
+ PF_RULES_WUNLOCK();
+}
+
+int
+pfi_match_addr(struct pfi_dynaddr *dyn, struct pf_addr *a, sa_family_t af)
+{
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ switch (dyn->pfid_acnt4) {
+ case 0:
+ return (0);
+ case 1:
+ return (PF_MATCHA(0, &dyn->pfid_addr4,
+ &dyn->pfid_mask4, a, AF_INET));
+ default:
+ return (pfr_match_addr(dyn->pfid_kt, a, AF_INET));
+ }
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ switch (dyn->pfid_acnt6) {
+ case 0:
+ return (0);
+ case 1:
+ return (PF_MATCHA(0, &dyn->pfid_addr6,
+ &dyn->pfid_mask6, a, AF_INET6));
+ default:
+ return (pfr_match_addr(dyn->pfid_kt, a, AF_INET6));
+ }
+ break;
+#endif /* INET6 */
+ default:
+ return (0);
+ }
+}
+
+int
+pfi_dynaddr_setup(struct pf_addr_wrap *aw, sa_family_t af)
+{
+ struct pfi_dynaddr *dyn;
+ char tblname[PF_TABLE_NAME_SIZE];
+ struct pf_ruleset *ruleset = NULL;
+ struct pfi_kif *kif;
+ int rv = 0;
+
+ PF_RULES_WASSERT();
+ KASSERT(aw->type == PF_ADDR_DYNIFTL, ("%s: type %u",
+ __func__, aw->type));
+ KASSERT(aw->p.dyn == NULL, ("%s: dyn is %p", __func__, aw->p.dyn));
+
+ if ((dyn = malloc(sizeof(*dyn), PFI_MTYPE, M_NOWAIT | M_ZERO)) == NULL)
+ return (ENOMEM);
+
+ if ((kif = malloc(sizeof(*kif), PFI_MTYPE, M_NOWAIT)) == NULL) {
+ free(dyn, PFI_MTYPE);
+ return (ENOMEM);
+ }
+
+ if (!strcmp(aw->v.ifname, "self"))
+ dyn->pfid_kif = pfi_kif_attach(kif, IFG_ALL);
+ else
+ dyn->pfid_kif = pfi_kif_attach(kif, aw->v.ifname);
+ pfi_kif_ref(dyn->pfid_kif);
+
+ dyn->pfid_net = pfi_unmask(&aw->v.a.mask);
+ if (af == AF_INET && dyn->pfid_net == 32)
+ dyn->pfid_net = 128;
+ strlcpy(tblname, aw->v.ifname, sizeof(tblname));
+ if (aw->iflags & PFI_AFLAG_NETWORK)
+ strlcat(tblname, ":network", sizeof(tblname));
+ if (aw->iflags & PFI_AFLAG_BROADCAST)
+ strlcat(tblname, ":broadcast", sizeof(tblname));
+ if (aw->iflags & PFI_AFLAG_PEER)
+ strlcat(tblname, ":peer", sizeof(tblname));
+ if (aw->iflags & PFI_AFLAG_NOALIAS)
+ strlcat(tblname, ":0", sizeof(tblname));
+ if (dyn->pfid_net != 128)
+ snprintf(tblname + strlen(tblname),
+ sizeof(tblname) - strlen(tblname), "/%d", dyn->pfid_net);
+ if ((ruleset = pf_find_or_create_ruleset(PF_RESERVED_ANCHOR)) == NULL) {
+ rv = ENOMEM;
+ goto _bad;
+ }
+
+ if ((dyn->pfid_kt = pfr_attach_table(ruleset, tblname)) == NULL) {
+ rv = ENOMEM;
+ goto _bad;
+ }
+
+ dyn->pfid_kt->pfrkt_flags |= PFR_TFLAG_ACTIVE;
+ dyn->pfid_iflags = aw->iflags;
+ dyn->pfid_af = af;
+
+ TAILQ_INSERT_TAIL(&dyn->pfid_kif->pfik_dynaddrs, dyn, entry);
+ aw->p.dyn = dyn;
+ pfi_kif_update(dyn->pfid_kif);
+
+ return (0);
+
+_bad:
+ if (dyn->pfid_kt != NULL)
+ pfr_detach_table(dyn->pfid_kt);
+ if (ruleset != NULL)
+ pf_remove_if_empty_ruleset(ruleset);
+ if (dyn->pfid_kif != NULL)
+ pfi_kif_unref(dyn->pfid_kif);
+ free(dyn, PFI_MTYPE);
+
+ return (rv);
+}
+
+static void
+pfi_kif_update(struct pfi_kif *kif)
+{
+ struct ifg_list *ifgl;
+ struct pfi_dynaddr *p;
+
+ PF_RULES_WASSERT();
+
+ /* update all dynaddr */
+ TAILQ_FOREACH(p, &kif->pfik_dynaddrs, entry)
+ pfi_dynaddr_update(p);
+
+ /* again for all groups kif is member of */
+ if (kif->pfik_ifp != NULL) {
+ IF_ADDR_RLOCK(kif->pfik_ifp);
+ TAILQ_FOREACH(ifgl, &kif->pfik_ifp->if_groups, ifgl_next)
+ pfi_kif_update((struct pfi_kif *)
+ ifgl->ifgl_group->ifg_pf_kif);
+ IF_ADDR_RUNLOCK(kif->pfik_ifp);
+ }
+}
+
+static void
+pfi_dynaddr_update(struct pfi_dynaddr *dyn)
+{
+ struct pfi_kif *kif;
+ struct pfr_ktable *kt;
+
+ PF_RULES_WASSERT();
+ KASSERT(dyn && dyn->pfid_kif && dyn->pfid_kt,
+ ("%s: bad argument", __func__));
+
+ kif = dyn->pfid_kif;
+ kt = dyn->pfid_kt;
+
+ if (kt->pfrkt_larg != V_pfi_update) {
+ /* this table needs to be brought up-to-date */
+ pfi_table_update(kt, kif, dyn->pfid_net, dyn->pfid_iflags);
+ kt->pfrkt_larg = V_pfi_update;
+ }
+ pfr_dynaddr_update(kt, dyn);
+}
+
+static void
+pfi_table_update(struct pfr_ktable *kt, struct pfi_kif *kif, int net, int flags)
+{
+ int e, size2 = 0;
+ struct ifg_member *ifgm;
+
+ V_pfi_buffer_cnt = 0;
+
+ if (kif->pfik_ifp != NULL)
+ pfi_instance_add(kif->pfik_ifp, net, flags);
+ else if (kif->pfik_group != NULL) {
+ IFNET_RLOCK();
+ TAILQ_FOREACH(ifgm, &kif->pfik_group->ifg_members, ifgm_next)
+ pfi_instance_add(ifgm->ifgm_ifp, net, flags);
+ IFNET_RUNLOCK();
+ }
+
+ if ((e = pfr_set_addrs(&kt->pfrkt_t, V_pfi_buffer, V_pfi_buffer_cnt, &size2,
+ NULL, NULL, NULL, 0, PFR_TFLAG_ALLMASK)))
+ printf("%s: cannot set %d new addresses into table %s: %d\n",
+ __func__, V_pfi_buffer_cnt, kt->pfrkt_name, e);
+}
+
+static void
+pfi_instance_add(struct ifnet *ifp, int net, int flags)
+{
+ struct ifaddr *ia;
+ int got4 = 0, got6 = 0;
+ int net2, af;
+
+ IF_ADDR_RLOCK(ifp);
+ TAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_list) {
+ if (ia->ifa_addr == NULL)
+ continue;
+ af = ia->ifa_addr->sa_family;
+ if (af != AF_INET && af != AF_INET6)
+ continue;
+ /*
+ * XXX: For point-to-point interfaces, (ifname:0) and IPv4,
+ * jump over addresses without a proper route to work
+ * around a problem with ppp not fully removing the
+ * address used during IPCP.
+ */
+ if ((ifp->if_flags & IFF_POINTOPOINT) &&
+ !(ia->ifa_flags & IFA_ROUTE) &&
+ (flags & PFI_AFLAG_NOALIAS) && (af == AF_INET))
+ continue;
+ if ((flags & PFI_AFLAG_BROADCAST) && af == AF_INET6)
+ continue;
+ if ((flags & PFI_AFLAG_BROADCAST) &&
+ !(ifp->if_flags & IFF_BROADCAST))
+ continue;
+ if ((flags & PFI_AFLAG_PEER) &&
+ !(ifp->if_flags & IFF_POINTOPOINT))
+ continue;
+ if ((flags & PFI_AFLAG_NETWORK) && af == AF_INET6 &&
+ IN6_IS_ADDR_LINKLOCAL(
+ &((struct sockaddr_in6 *)ia->ifa_addr)->sin6_addr))
+ continue;
+ if (flags & PFI_AFLAG_NOALIAS) {
+ if (af == AF_INET && got4)
+ continue;
+ if (af == AF_INET6 && got6)
+ continue;
+ }
+ if (af == AF_INET)
+ got4 = 1;
+ else if (af == AF_INET6)
+ got6 = 1;
+ net2 = net;
+ if (net2 == 128 && (flags & PFI_AFLAG_NETWORK)) {
+ if (af == AF_INET)
+ net2 = pfi_unmask(&((struct sockaddr_in *)
+ ia->ifa_netmask)->sin_addr);
+ else if (af == AF_INET6)
+ net2 = pfi_unmask(&((struct sockaddr_in6 *)
+ ia->ifa_netmask)->sin6_addr);
+ }
+ if (af == AF_INET && net2 > 32)
+ net2 = 32;
+ if (flags & PFI_AFLAG_BROADCAST)
+ pfi_address_add(ia->ifa_broadaddr, af, net2);
+ else if (flags & PFI_AFLAG_PEER)
+ pfi_address_add(ia->ifa_dstaddr, af, net2);
+ else
+ pfi_address_add(ia->ifa_addr, af, net2);
+ }
+ IF_ADDR_RUNLOCK(ifp);
+}
+
+static void
+pfi_address_add(struct sockaddr *sa, int af, int net)
+{
+ struct pfr_addr *p;
+ int i;
+
+ if (V_pfi_buffer_cnt >= V_pfi_buffer_max) {
+ int new_max = V_pfi_buffer_max * 2;
+
+ if (new_max > PFI_BUFFER_MAX) {
+ printf("%s: address buffer full (%d/%d)\n", __func__,
+ V_pfi_buffer_cnt, PFI_BUFFER_MAX);
+ return;
+ }
+ p = malloc(new_max * sizeof(*V_pfi_buffer), PFI_MTYPE,
+ M_NOWAIT);
+ if (p == NULL) {
+ printf("%s: no memory to grow buffer (%d/%d)\n",
+ __func__, V_pfi_buffer_cnt, PFI_BUFFER_MAX);
+ return;
+ }
+		memcpy(p, V_pfi_buffer, V_pfi_buffer_cnt * sizeof(*V_pfi_buffer));
+ /* no need to zero buffer */
+ free(V_pfi_buffer, PFI_MTYPE);
+ V_pfi_buffer = p;
+ V_pfi_buffer_max = new_max;
+ }
+ if (af == AF_INET && net > 32)
+ net = 128;
+ p = V_pfi_buffer + V_pfi_buffer_cnt++;
+ bzero(p, sizeof(*p));
+ p->pfra_af = af;
+ p->pfra_net = net;
+ if (af == AF_INET)
+ p->pfra_ip4addr = ((struct sockaddr_in *)sa)->sin_addr;
+ else if (af == AF_INET6) {
+ p->pfra_ip6addr = ((struct sockaddr_in6 *)sa)->sin6_addr;
+ if (IN6_IS_SCOPE_EMBED(&p->pfra_ip6addr))
+ p->pfra_ip6addr.s6_addr16[1] = 0;
+ }
+ /* mask network address bits */
+ if (net < 128)
+ ((caddr_t)p)[p->pfra_net/8] &= ~(0xFF >> (p->pfra_net%8));
+ for (i = (p->pfra_net+7)/8; i < sizeof(p->pfra_u); i++)
+ ((caddr_t)p)[i] = 0;
+}
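+
+/*
+ * A note on the masking above: pfra_u is the leading member of struct
+ * pfr_addr, so indexing the entry as a byte array touches the address
+ * bytes directly.  Worked example, assuming net = 20: the byte at offset
+ * 20/8 = 2 is ANDed with ~(0xFF >> 4) = 0xF0, keeping only its top four
+ * bits, and the loop then zeroes bytes 3..15, leaving exactly a /20
+ * network address in the buffer entry.
+ */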
+
+void
+pfi_dynaddr_remove(struct pfi_dynaddr *dyn)
+{
+
+ KASSERT(dyn->pfid_kif != NULL, ("%s: null pfid_kif", __func__));
+ KASSERT(dyn->pfid_kt != NULL, ("%s: null pfid_kt", __func__));
+
+ TAILQ_REMOVE(&dyn->pfid_kif->pfik_dynaddrs, dyn, entry);
+ pfi_kif_unref(dyn->pfid_kif);
+ pfr_detach_table(dyn->pfid_kt);
+ free(dyn, PFI_MTYPE);
+}
+
+void
+pfi_dynaddr_copyout(struct pf_addr_wrap *aw)
+{
+
+ KASSERT(aw->type == PF_ADDR_DYNIFTL,
+ ("%s: type %u", __func__, aw->type));
+
+ if (aw->p.dyn == NULL || aw->p.dyn->pfid_kif == NULL)
+ return;
+ aw->p.dyncnt = aw->p.dyn->pfid_acnt4 + aw->p.dyn->pfid_acnt6;
+}
+
+static int
+pfi_if_compare(struct pfi_kif *p, struct pfi_kif *q)
+{
+ return (strncmp(p->pfik_name, q->pfik_name, IFNAMSIZ));
+}
+
+void
+pfi_update_status(const char *name, struct pf_status *pfs)
+{
+ struct pfi_kif *p;
+ struct pfi_kif_cmp key;
+ struct ifg_member p_member, *ifgm;
+ TAILQ_HEAD(, ifg_member) ifg_members;
+ int i, j, k;
+
+ strlcpy(key.pfik_name, name, sizeof(key.pfik_name));
+ p = RB_FIND(pfi_ifhead, &V_pfi_ifs, (struct pfi_kif *)&key);
+ if (p == NULL)
+ return;
+
+ if (p->pfik_group != NULL) {
+ bcopy(&p->pfik_group->ifg_members, &ifg_members,
+ sizeof(ifg_members));
+ } else {
+ /* build a temporary list for p only */
+ bzero(&p_member, sizeof(p_member));
+ p_member.ifgm_ifp = p->pfik_ifp;
+ TAILQ_INIT(&ifg_members);
+ TAILQ_INSERT_TAIL(&ifg_members, &p_member, ifgm_next);
+ }
+ if (pfs) {
+ bzero(pfs->pcounters, sizeof(pfs->pcounters));
+ bzero(pfs->bcounters, sizeof(pfs->bcounters));
+ }
+ TAILQ_FOREACH(ifgm, &ifg_members, ifgm_next) {
+ if (ifgm->ifgm_ifp == NULL)
+ continue;
+ p = (struct pfi_kif *)ifgm->ifgm_ifp->if_pf_kif;
+
+ /* just clear statistics */
+ if (pfs == NULL) {
+ bzero(p->pfik_packets, sizeof(p->pfik_packets));
+ bzero(p->pfik_bytes, sizeof(p->pfik_bytes));
+ p->pfik_tzero = time_second;
+ continue;
+ }
+ for (i = 0; i < 2; i++)
+ for (j = 0; j < 2; j++)
+ for (k = 0; k < 2; k++) {
+ pfs->pcounters[i][j][k] +=
+ p->pfik_packets[i][j][k];
+ pfs->bcounters[i][j] +=
+ p->pfik_bytes[i][j][k];
+ }
+ }
+}
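+
+/*
+ * The triple loop above folds per-interface counters into the pf_status
+ * counters handed back to userland; the indices are, roughly, [address
+ * family][direction][pass/block], mirroring how pfik_packets/pfik_bytes
+ * are bumped on the forwarding path.
+ */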
+
+void
+pfi_get_ifaces(const char *name, struct pfi_kif *buf, int *size)
+{
+ struct pfi_kif *p, *nextp;
+ int n = 0;
+
+ for (p = RB_MIN(pfi_ifhead, &V_pfi_ifs); p; p = nextp) {
+ nextp = RB_NEXT(pfi_ifhead, &V_pfi_ifs, p);
+ if (pfi_skip_if(name, p))
+ continue;
+ if (*size <= n++)
+ break;
+ if (!p->pfik_tzero)
+ p->pfik_tzero = time_second;
+		bcopy(p, buf++, sizeof(*buf));
+ }
+ *size = n;
+}
+
+static int
+pfi_skip_if(const char *filter, struct pfi_kif *p)
+{
+ int n;
+
+ if (filter == NULL || !*filter)
+ return (0);
+ if (!strcmp(p->pfik_name, filter))
+ return (0); /* exact match */
+ n = strlen(filter);
+ if (n < 1 || n >= IFNAMSIZ)
+ return (1); /* sanity check */
+ if (filter[n-1] >= '0' && filter[n-1] <= '9')
+ return (1); /* only do exact match in that case */
+ if (strncmp(p->pfik_name, filter, n))
+ return (1); /* prefix doesn't match */
+ return (p->pfik_name[n] < '0' || p->pfik_name[n] > '9');
+}
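+
+/*
+ * pfi_skip_if() filter semantics, by example: an empty filter matches every
+ * kif; "em0" matches only the interface named "em0" (a trailing digit forces
+ * an exact match); "em" matches "em" itself plus "em0", "em1", ... but not
+ * "emx0", because the character following the prefix must be a digit.
+ */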
+
+int
+pfi_set_flags(const char *name, int flags)
+{
+ struct pfi_kif *p;
+
+ RB_FOREACH(p, pfi_ifhead, &V_pfi_ifs) {
+ if (pfi_skip_if(name, p))
+ continue;
+ p->pfik_flags |= flags;
+ }
+ return (0);
+}
+
+int
+pfi_clear_flags(const char *name, int flags)
+{
+ struct pfi_kif *p;
+
+ RB_FOREACH(p, pfi_ifhead, &V_pfi_ifs) {
+ if (pfi_skip_if(name, p))
+ continue;
+ p->pfik_flags &= ~flags;
+ }
+ return (0);
+}
+
+/* from pf_print_state.c */
+static int
+pfi_unmask(void *addr)
+{
+ struct pf_addr *m = addr;
+ int i = 31, j = 0, b = 0;
+ u_int32_t tmp;
+
+ while (j < 4 && m->addr32[j] == 0xffffffff) {
+ b += 32;
+ j++;
+ }
+ if (j < 4) {
+ tmp = ntohl(m->addr32[j]);
+ for (i = 31; tmp & (1 << i); --i)
+ b++;
+ }
+ return (b);
+}
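+
+/*
+ * pfi_unmask() converts a contiguous netmask into a prefix length, e.g.
+ * 255.255.255.0 -> 24 and ffff:ffff:ffff:ffff:: -> 64.  Non-contiguous
+ * masks are not expected here; counting simply stops at the first clear
+ * bit of the first word that is not all-ones.
+ */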
+
+static void
+pfi_attach_ifnet_event(void *arg __unused, struct ifnet *ifp)
+{
+
+ CURVNET_SET(ifp->if_vnet);
+ pfi_attach_ifnet(ifp);
+#ifdef ALTQ
+ PF_RULES_WLOCK();
+ pf_altq_ifnet_event(ifp, 0);
+ PF_RULES_WUNLOCK();
+#endif
+ CURVNET_RESTORE();
+}
+
+static void
+pfi_detach_ifnet_event(void *arg __unused, struct ifnet *ifp)
+{
+ struct pfi_kif *kif = (struct pfi_kif *)ifp->if_pf_kif;
+
+ CURVNET_SET(ifp->if_vnet);
+ PF_RULES_WLOCK();
+ V_pfi_update++;
+ pfi_kif_update(kif);
+
+ kif->pfik_ifp = NULL;
+ ifp->if_pf_kif = NULL;
+#ifdef ALTQ
+ pf_altq_ifnet_event(ifp, 1);
+#endif
+ PF_RULES_WUNLOCK();
+ CURVNET_RESTORE();
+}
+
+static void
+pfi_attach_group_event(void *arg, struct ifg_group *ifg)
+{
+
+ CURVNET_SET((struct vnet *)arg);
+ pfi_attach_ifgroup(ifg);
+ CURVNET_RESTORE();
+}
+
+static void
+pfi_change_group_event(void *arg, char *gname)
+{
+ struct pfi_kif *kif;
+
+ kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK);
+
+ CURVNET_SET((struct vnet *)arg);
+ PF_RULES_WLOCK();
+ V_pfi_update++;
+ kif = pfi_kif_attach(kif, gname);
+ pfi_kif_update(kif);
+ PF_RULES_WUNLOCK();
+ CURVNET_RESTORE();
+}
+
+static void
+pfi_detach_group_event(void *arg, struct ifg_group *ifg)
+{
+ struct pfi_kif *kif = (struct pfi_kif *)ifg->ifg_pf_kif;
+
+ CURVNET_SET((struct vnet *)arg);
+ PF_RULES_WLOCK();
+ V_pfi_update++;
+
+ kif->pfik_group = NULL;
+ ifg->ifg_pf_kif = NULL;
+ PF_RULES_WUNLOCK();
+ CURVNET_RESTORE();
+}
+
+static void
+pfi_ifaddr_event(void *arg __unused, struct ifnet *ifp)
+{
+
+ CURVNET_SET(ifp->if_vnet);
+ PF_RULES_WLOCK();
+ if (ifp && ifp->if_pf_kif) {
+ V_pfi_update++;
+ pfi_kif_update(ifp->if_pf_kif);
+ }
+ PF_RULES_WUNLOCK();
+ CURVNET_RESTORE();
+}
diff --git a/sys/netpfil/pf/pf_ioctl.c b/sys/netpfil/pf/pf_ioctl.c
new file mode 100644
index 0000000..032f051
--- /dev/null
+++ b/sys/netpfil/pf/pf_ioctl.c
@@ -0,0 +1,3774 @@
+/* $OpenBSD: pf_ioctl.c,v 1.213 2009/02/15 21:46:12 mbalmer Exp $ */
+
+/*
+ * Copyright (c) 2001 Daniel Hartmeier
+ * Copyright (c) 2002,2003 Henning Brauer
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Effort sponsored in part by the Defense Advanced Research Projects
+ * Agency (DARPA) and Air Force Research Laboratory, Air Force
+ * Materiel Command, USAF, under agreement number F30602-01-2-0537.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_bpf.h"
+#include "opt_pf.h"
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/conf.h>
+#include <sys/endian.h>
+#include <sys/fcntl.h>
+#include <sys/filio.h>
+#include <sys/interrupt.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/proc.h>
+#include <sys/smp.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/md5.h>
+#include <sys/ucred.h>
+
+#include <net/if.h>
+#include <net/route.h>
+#include <net/pfil.h>
+#include <net/pfvar.h>
+#include <net/if_pfsync.h>
+#include <net/if_pflog.h>
+
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip_icmp.h>
+
+#ifdef INET6
+#include <netinet/ip6.h>
+#endif /* INET6 */
+
+#ifdef ALTQ
+#include <altq/altq.h>
+#endif
+
+static int pfattach(void);
+static struct pf_pool *pf_get_pool(char *, u_int32_t, u_int8_t, u_int32_t,
+ u_int8_t, u_int8_t, u_int8_t);
+
+static void pf_mv_pool(struct pf_palist *, struct pf_palist *);
+static void pf_empty_pool(struct pf_palist *);
+static int pfioctl(struct cdev *, u_long, caddr_t, int,
+ struct thread *);
+#ifdef ALTQ
+static int pf_begin_altq(u_int32_t *);
+static int pf_rollback_altq(u_int32_t);
+static int pf_commit_altq(u_int32_t);
+static int pf_enable_altq(struct pf_altq *);
+static int pf_disable_altq(struct pf_altq *);
+static u_int32_t pf_qname2qid(char *);
+static void pf_qid_unref(u_int32_t);
+#endif /* ALTQ */
+static int pf_begin_rules(u_int32_t *, int, const char *);
+static int pf_rollback_rules(u_int32_t, int, char *);
+static int pf_setup_pfsync_matching(struct pf_ruleset *);
+static void pf_hash_rule(MD5_CTX *, struct pf_rule *);
+static void pf_hash_rule_addr(MD5_CTX *, struct pf_rule_addr *);
+static int pf_commit_rules(u_int32_t, int, char *);
+static int pf_addr_setup(struct pf_ruleset *,
+ struct pf_addr_wrap *, sa_family_t);
+static void pf_addr_copyout(struct pf_addr_wrap *);
+
+VNET_DEFINE(struct pf_rule, pf_default_rule);
+
+#ifdef ALTQ
+static VNET_DEFINE(int, pf_altq_running);
+#define V_pf_altq_running VNET(pf_altq_running)
+#endif
+
+#define TAGID_MAX 50000
+struct pf_tagname {
+ TAILQ_ENTRY(pf_tagname) entries;
+ char name[PF_TAG_NAME_SIZE];
+ uint16_t tag;
+ int ref;
+};
+
+TAILQ_HEAD(pf_tags, pf_tagname);
+#define V_pf_tags VNET(pf_tags)
+VNET_DEFINE(struct pf_tags, pf_tags);
+#define V_pf_qids VNET(pf_qids)
+VNET_DEFINE(struct pf_tags, pf_qids);
+static MALLOC_DEFINE(M_PFTAG, "pf_tag", "pf(4) tag names");
+static MALLOC_DEFINE(M_PFALTQ, "pf_altq", "pf(4) altq configuration db");
+static MALLOC_DEFINE(M_PFRULE, "pf_rule", "pf(4) rules");
+
+#if (PF_QNAME_SIZE != PF_TAG_NAME_SIZE)
+#error PF_QNAME_SIZE must be equal to PF_TAG_NAME_SIZE
+#endif
+
+static u_int16_t tagname2tag(struct pf_tags *, char *);
+static u_int16_t pf_tagname2tag(char *);
+static void tag_unref(struct pf_tags *, u_int16_t);
+
+#define DPFPRINTF(n, x) if (V_pf_status.debug >= (n)) printf x
+
+struct cdev *pf_dev;
+
+/*
+ * XXX - These are new and need to be checked when moving to a new version
+ */
+static void pf_clear_states(void);
+static int pf_clear_tables(void);
+static void pf_clear_srcnodes(struct pf_src_node *);
+static void pf_tbladdr_copyout(struct pf_addr_wrap *);
+
+/*
+ * Wrapper functions for pfil(9) hooks
+ */
+#ifdef INET
+static int pf_check_in(void *arg, struct mbuf **m, struct ifnet *ifp,
+ int dir, struct inpcb *inp);
+static int pf_check_out(void *arg, struct mbuf **m, struct ifnet *ifp,
+ int dir, struct inpcb *inp);
+#endif
+#ifdef INET6
+static int pf_check6_in(void *arg, struct mbuf **m, struct ifnet *ifp,
+ int dir, struct inpcb *inp);
+static int pf_check6_out(void *arg, struct mbuf **m, struct ifnet *ifp,
+ int dir, struct inpcb *inp);
+#endif
+
+static int hook_pf(void);
+static int dehook_pf(void);
+static int shutdown_pf(void);
+static int pf_load(void);
+static int pf_unload(void);
+
+static struct cdevsw pf_cdevsw = {
+ .d_ioctl = pfioctl,
+ .d_name = PF_NAME,
+ .d_version = D_VERSION,
+};
+
+static volatile VNET_DEFINE(int, pf_pfil_hooked);
+#define V_pf_pfil_hooked VNET(pf_pfil_hooked)
+VNET_DEFINE(int, pf_end_threads);
+
+struct rwlock pf_rules_lock;
+
+/* pfsync */
+pfsync_state_import_t *pfsync_state_import_ptr = NULL;
+pfsync_insert_state_t *pfsync_insert_state_ptr = NULL;
+pfsync_update_state_t *pfsync_update_state_ptr = NULL;
+pfsync_delete_state_t *pfsync_delete_state_ptr = NULL;
+pfsync_clear_states_t *pfsync_clear_states_ptr = NULL;
+pfsync_defer_t *pfsync_defer_ptr = NULL;
+/* pflog */
+pflog_packet_t *pflog_packet_ptr = NULL;
+
+static int
+pfattach(void)
+{
+ u_int32_t *my_timeout = V_pf_default_rule.timeout;
+ int error;
+
+ pf_initialize();
+ pfr_initialize();
+ pfi_initialize();
+ pf_normalize_init();
+
+ V_pf_limits[PF_LIMIT_STATES].limit = PFSTATE_HIWAT;
+ V_pf_limits[PF_LIMIT_SRC_NODES].limit = PFSNODE_HIWAT;
+
+ RB_INIT(&V_pf_anchors);
+ pf_init_ruleset(&pf_main_ruleset);
+
+ /* default rule should never be garbage collected */
+ V_pf_default_rule.entries.tqe_prev = &V_pf_default_rule.entries.tqe_next;
+ V_pf_default_rule.action = PF_PASS;
+ V_pf_default_rule.nr = -1;
+ V_pf_default_rule.rtableid = -1;
+
+ /* initialize default timeouts */
+ my_timeout[PFTM_TCP_FIRST_PACKET] = PFTM_TCP_FIRST_PACKET_VAL;
+ my_timeout[PFTM_TCP_OPENING] = PFTM_TCP_OPENING_VAL;
+ my_timeout[PFTM_TCP_ESTABLISHED] = PFTM_TCP_ESTABLISHED_VAL;
+ my_timeout[PFTM_TCP_CLOSING] = PFTM_TCP_CLOSING_VAL;
+ my_timeout[PFTM_TCP_FIN_WAIT] = PFTM_TCP_FIN_WAIT_VAL;
+ my_timeout[PFTM_TCP_CLOSED] = PFTM_TCP_CLOSED_VAL;
+ my_timeout[PFTM_UDP_FIRST_PACKET] = PFTM_UDP_FIRST_PACKET_VAL;
+ my_timeout[PFTM_UDP_SINGLE] = PFTM_UDP_SINGLE_VAL;
+ my_timeout[PFTM_UDP_MULTIPLE] = PFTM_UDP_MULTIPLE_VAL;
+ my_timeout[PFTM_ICMP_FIRST_PACKET] = PFTM_ICMP_FIRST_PACKET_VAL;
+ my_timeout[PFTM_ICMP_ERROR_REPLY] = PFTM_ICMP_ERROR_REPLY_VAL;
+ my_timeout[PFTM_OTHER_FIRST_PACKET] = PFTM_OTHER_FIRST_PACKET_VAL;
+ my_timeout[PFTM_OTHER_SINGLE] = PFTM_OTHER_SINGLE_VAL;
+ my_timeout[PFTM_OTHER_MULTIPLE] = PFTM_OTHER_MULTIPLE_VAL;
+ my_timeout[PFTM_FRAG] = PFTM_FRAG_VAL;
+ my_timeout[PFTM_INTERVAL] = PFTM_INTERVAL_VAL;
+ my_timeout[PFTM_SRC_NODE] = PFTM_SRC_NODE_VAL;
+ my_timeout[PFTM_TS_DIFF] = PFTM_TS_DIFF_VAL;
+ my_timeout[PFTM_ADAPTIVE_START] = PFSTATE_ADAPT_START;
+ my_timeout[PFTM_ADAPTIVE_END] = PFSTATE_ADAPT_END;
+
+ bzero(&V_pf_status, sizeof(V_pf_status));
+ V_pf_status.debug = PF_DEBUG_URGENT;
+
+ V_pf_pfil_hooked = 0;
+
+ /* XXX do our best to avoid a conflict */
+ V_pf_status.hostid = arc4random();
+
+ if ((error = kproc_create(pf_purge_thread, curvnet, NULL, 0, 0,
+ "pf purge")) != 0)
+ /* XXXGL: leaked all above. */
+ return (error);
+ if ((error = swi_add(NULL, "pf send", pf_intr, curvnet, SWI_NET,
+ INTR_MPSAFE, &V_pf_swi_cookie)) != 0)
+ /* XXXGL: leaked all above. */
+ return (error);
+
+ return (0);
+}
+
+static struct pf_pool *
+pf_get_pool(char *anchor, u_int32_t ticket, u_int8_t rule_action,
+ u_int32_t rule_number, u_int8_t r_last, u_int8_t active,
+ u_int8_t check_ticket)
+{
+ struct pf_ruleset *ruleset;
+ struct pf_rule *rule;
+ int rs_num;
+
+ ruleset = pf_find_ruleset(anchor);
+ if (ruleset == NULL)
+ return (NULL);
+ rs_num = pf_get_ruleset_number(rule_action);
+ if (rs_num >= PF_RULESET_MAX)
+ return (NULL);
+ if (active) {
+ if (check_ticket && ticket !=
+ ruleset->rules[rs_num].active.ticket)
+ return (NULL);
+ if (r_last)
+ rule = TAILQ_LAST(ruleset->rules[rs_num].active.ptr,
+ pf_rulequeue);
+ else
+ rule = TAILQ_FIRST(ruleset->rules[rs_num].active.ptr);
+ } else {
+ if (check_ticket && ticket !=
+ ruleset->rules[rs_num].inactive.ticket)
+ return (NULL);
+ if (r_last)
+ rule = TAILQ_LAST(ruleset->rules[rs_num].inactive.ptr,
+ pf_rulequeue);
+ else
+ rule = TAILQ_FIRST(ruleset->rules[rs_num].inactive.ptr);
+ }
+ if (!r_last) {
+ while ((rule != NULL) && (rule->nr != rule_number))
+ rule = TAILQ_NEXT(rule, entries);
+ }
+ if (rule == NULL)
+ return (NULL);
+
+ return (&rule->rpool);
+}
+
+static void
+pf_mv_pool(struct pf_palist *poola, struct pf_palist *poolb)
+{
+ struct pf_pooladdr *mv_pool_pa;
+
+ while ((mv_pool_pa = TAILQ_FIRST(poola)) != NULL) {
+ TAILQ_REMOVE(poola, mv_pool_pa, entries);
+ TAILQ_INSERT_TAIL(poolb, mv_pool_pa, entries);
+ }
+}
+
+static void
+pf_empty_pool(struct pf_palist *poola)
+{
+ struct pf_pooladdr *pa;
+
+ while ((pa = TAILQ_FIRST(poola)) != NULL) {
+ switch (pa->addr.type) {
+ case PF_ADDR_DYNIFTL:
+ pfi_dynaddr_remove(pa->addr.p.dyn);
+ break;
+ case PF_ADDR_TABLE:
+ pfr_detach_table(pa->addr.p.tbl);
+ break;
+ }
+ if (pa->kif)
+ pfi_kif_unref(pa->kif);
+ TAILQ_REMOVE(poola, pa, entries);
+ free(pa, M_PFRULE);
+ }
+}
+
+static void
+pf_unlink_rule(struct pf_rulequeue *rulequeue, struct pf_rule *rule)
+{
+
+ PF_RULES_WASSERT();
+
+ TAILQ_REMOVE(rulequeue, rule, entries);
+
+ PF_UNLNKDRULES_LOCK();
+ rule->rule_flag |= PFRULE_REFS;
+ TAILQ_INSERT_TAIL(&V_pf_unlinked_rules, rule, entries);
+ PF_UNLNKDRULES_UNLOCK();
+}
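+
+/*
+ * Unlinked rules are not freed here.  They are parked on V_pf_unlinked_rules
+ * with PFRULE_REFS set and reclaimed later by the purge thread, once no
+ * state can still reference them, so rule teardown never has to walk the
+ * state table under the rules write lock.
+ */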
+
+void
+pf_free_rule(struct pf_rule *rule)
+{
+
+ PF_RULES_WASSERT();
+
+ if (rule->tag)
+ tag_unref(&V_pf_tags, rule->tag);
+ if (rule->match_tag)
+ tag_unref(&V_pf_tags, rule->match_tag);
+#ifdef ALTQ
+ if (rule->pqid != rule->qid)
+ pf_qid_unref(rule->pqid);
+ pf_qid_unref(rule->qid);
+#endif
+ switch (rule->src.addr.type) {
+ case PF_ADDR_DYNIFTL:
+ pfi_dynaddr_remove(rule->src.addr.p.dyn);
+ break;
+ case PF_ADDR_TABLE:
+ pfr_detach_table(rule->src.addr.p.tbl);
+ break;
+ }
+ switch (rule->dst.addr.type) {
+ case PF_ADDR_DYNIFTL:
+ pfi_dynaddr_remove(rule->dst.addr.p.dyn);
+ break;
+ case PF_ADDR_TABLE:
+ pfr_detach_table(rule->dst.addr.p.tbl);
+ break;
+ }
+ if (rule->overload_tbl)
+ pfr_detach_table(rule->overload_tbl);
+ if (rule->kif)
+ pfi_kif_unref(rule->kif);
+ pf_anchor_remove(rule);
+ pf_empty_pool(&rule->rpool.list);
+ free(rule, M_PFRULE);
+}
+
+static u_int16_t
+tagname2tag(struct pf_tags *head, char *tagname)
+{
+ struct pf_tagname *tag, *p = NULL;
+ u_int16_t new_tagid = 1;
+
+ PF_RULES_WASSERT();
+
+ TAILQ_FOREACH(tag, head, entries)
+ if (strcmp(tagname, tag->name) == 0) {
+ tag->ref++;
+ return (tag->tag);
+ }
+
+ /*
+	 * To avoid fragmentation, we do a linear search from the beginning
+	 * and take the first free slot we find.  If there is none, or the
+	 * list is empty, append a new entry at the end.
+ */
+
+ /* new entry */
+ if (!TAILQ_EMPTY(head))
+ for (p = TAILQ_FIRST(head); p != NULL &&
+ p->tag == new_tagid; p = TAILQ_NEXT(p, entries))
+ new_tagid = p->tag + 1;
+
+ if (new_tagid > TAGID_MAX)
+ return (0);
+
+ /* allocate and fill new struct pf_tagname */
+ tag = malloc(sizeof(*tag), M_PFTAG, M_NOWAIT|M_ZERO);
+ if (tag == NULL)
+ return (0);
+ strlcpy(tag->name, tagname, sizeof(tag->name));
+ tag->tag = new_tagid;
+ tag->ref++;
+
+ if (p != NULL) /* insert new entry before p */
+ TAILQ_INSERT_BEFORE(p, tag, entries);
+ else /* either list empty or no free slot in between */
+ TAILQ_INSERT_TAIL(head, tag, entries);
+
+ return (tag->tag);
+}
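+
+/*
+ * Tag IDs are allocated first-fit from a list kept sorted by tag value, so
+ * the smallest unused ID is recycled after a release.  For example, with
+ * existing tags {1, 2, 4} the scan stops at the gap and a new name is
+ * assigned tag 3, inserted before the entry holding 4.
+ */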
+
+static void
+tag_unref(struct pf_tags *head, u_int16_t tag)
+{
+ struct pf_tagname *p, *next;
+
+ PF_RULES_WASSERT();
+
+ for (p = TAILQ_FIRST(head); p != NULL; p = next) {
+ next = TAILQ_NEXT(p, entries);
+ if (tag == p->tag) {
+ if (--p->ref == 0) {
+ TAILQ_REMOVE(head, p, entries);
+ free(p, M_PFTAG);
+ }
+ break;
+ }
+ }
+}
+
+static u_int16_t
+pf_tagname2tag(char *tagname)
+{
+ return (tagname2tag(&V_pf_tags, tagname));
+}
+
+#ifdef ALTQ
+static u_int32_t
+pf_qname2qid(char *qname)
+{
+ return ((u_int32_t)tagname2tag(&V_pf_qids, qname));
+}
+
+static void
+pf_qid_unref(u_int32_t qid)
+{
+ tag_unref(&V_pf_qids, (u_int16_t)qid);
+}
+
+static int
+pf_begin_altq(u_int32_t *ticket)
+{
+ struct pf_altq *altq;
+ int error = 0;
+
+ PF_RULES_WASSERT();
+
+ /* Purge the old altq list */
+ while ((altq = TAILQ_FIRST(V_pf_altqs_inactive)) != NULL) {
+ TAILQ_REMOVE(V_pf_altqs_inactive, altq, entries);
+ if (altq->qname[0] == 0 &&
+ (altq->local_flags & PFALTQ_FLAG_IF_REMOVED) == 0) {
+ /* detach and destroy the discipline */
+ error = altq_remove(altq);
+ } else
+ pf_qid_unref(altq->qid);
+ free(altq, M_PFALTQ);
+ }
+ if (error)
+ return (error);
+ *ticket = ++V_ticket_altqs_inactive;
+ V_altqs_inactive_open = 1;
+ return (0);
+}
+
+static int
+pf_rollback_altq(u_int32_t ticket)
+{
+ struct pf_altq *altq;
+ int error = 0;
+
+ PF_RULES_WASSERT();
+
+ if (!V_altqs_inactive_open || ticket != V_ticket_altqs_inactive)
+ return (0);
+ /* Purge the old altq list */
+ while ((altq = TAILQ_FIRST(V_pf_altqs_inactive)) != NULL) {
+ TAILQ_REMOVE(V_pf_altqs_inactive, altq, entries);
+ if (altq->qname[0] == 0 &&
+ (altq->local_flags & PFALTQ_FLAG_IF_REMOVED) == 0) {
+ /* detach and destroy the discipline */
+ error = altq_remove(altq);
+ } else
+ pf_qid_unref(altq->qid);
+ free(altq, M_PFALTQ);
+ }
+ V_altqs_inactive_open = 0;
+ return (error);
+}
+
+static int
+pf_commit_altq(u_int32_t ticket)
+{
+ struct pf_altqqueue *old_altqs;
+ struct pf_altq *altq;
+ int err, error = 0;
+
+ PF_RULES_WASSERT();
+
+ if (!V_altqs_inactive_open || ticket != V_ticket_altqs_inactive)
+ return (EBUSY);
+
+ /* swap altqs, keep the old. */
+ old_altqs = V_pf_altqs_active;
+ V_pf_altqs_active = V_pf_altqs_inactive;
+ V_pf_altqs_inactive = old_altqs;
+ V_ticket_altqs_active = V_ticket_altqs_inactive;
+
+ /* Attach new disciplines */
+ TAILQ_FOREACH(altq, V_pf_altqs_active, entries) {
+ if (altq->qname[0] == 0 &&
+ (altq->local_flags & PFALTQ_FLAG_IF_REMOVED) == 0) {
+ /* attach the discipline */
+ error = altq_pfattach(altq);
+ if (error == 0 && V_pf_altq_running)
+ error = pf_enable_altq(altq);
+ if (error != 0)
+ return (error);
+ }
+ }
+
+ /* Purge the old altq list */
+ while ((altq = TAILQ_FIRST(V_pf_altqs_inactive)) != NULL) {
+ TAILQ_REMOVE(V_pf_altqs_inactive, altq, entries);
+ if (altq->qname[0] == 0 &&
+ (altq->local_flags & PFALTQ_FLAG_IF_REMOVED) == 0) {
+ /* detach and destroy the discipline */
+ if (V_pf_altq_running)
+ error = pf_disable_altq(altq);
+ err = altq_pfdetach(altq);
+ if (err != 0 && error == 0)
+ error = err;
+ err = altq_remove(altq);
+ if (err != 0 && error == 0)
+ error = err;
+ } else
+ pf_qid_unref(altq->qid);
+ free(altq, M_PFALTQ);
+ }
+
+ V_altqs_inactive_open = 0;
+ return (error);
+}
+
+static int
+pf_enable_altq(struct pf_altq *altq)
+{
+ struct ifnet *ifp;
+ struct tb_profile tb;
+ int error = 0;
+
+ if ((ifp = ifunit(altq->ifname)) == NULL)
+ return (EINVAL);
+
+ if (ifp->if_snd.altq_type != ALTQT_NONE)
+ error = altq_enable(&ifp->if_snd);
+
+ /* set tokenbucket regulator */
+ if (error == 0 && ifp != NULL && ALTQ_IS_ENABLED(&ifp->if_snd)) {
+ tb.rate = altq->ifbandwidth;
+ tb.depth = altq->tbrsize;
+ error = tbr_set(&ifp->if_snd, &tb);
+ }
+
+ return (error);
+}
+
+static int
+pf_disable_altq(struct pf_altq *altq)
+{
+ struct ifnet *ifp;
+ struct tb_profile tb;
+ int error;
+
+ if ((ifp = ifunit(altq->ifname)) == NULL)
+ return (EINVAL);
+
+ /*
+	 * If the discipline is no longer referenced, it has been overridden
+	 * by a new one; in that case, just return.
+ */
+ if (altq->altq_disc != ifp->if_snd.altq_disc)
+ return (0);
+
+ error = altq_disable(&ifp->if_snd);
+
+ if (error == 0) {
+ /* clear tokenbucket regulator */
+ tb.rate = 0;
+ error = tbr_set(&ifp->if_snd, &tb);
+ }
+
+ return (error);
+}
+
+void
+pf_altq_ifnet_event(struct ifnet *ifp, int remove)
+{
+ struct ifnet *ifp1;
+ struct pf_altq *a1, *a2, *a3;
+ u_int32_t ticket;
+ int error = 0;
+
+ /* Interrupt userland queue modifications */
+ if (V_altqs_inactive_open)
+ pf_rollback_altq(V_ticket_altqs_inactive);
+
+ /* Start new altq ruleset */
+ if (pf_begin_altq(&ticket))
+ return;
+
+ /* Copy the current active set */
+ TAILQ_FOREACH(a1, V_pf_altqs_active, entries) {
+ a2 = malloc(sizeof(*a2), M_PFALTQ, M_NOWAIT);
+ if (a2 == NULL) {
+ error = ENOMEM;
+ break;
+ }
+ bcopy(a1, a2, sizeof(struct pf_altq));
+
+ if (a2->qname[0] != 0) {
+ if ((a2->qid = pf_qname2qid(a2->qname)) == 0) {
+ error = EBUSY;
+ free(a2, M_PFALTQ);
+ break;
+ }
+ a2->altq_disc = NULL;
+ TAILQ_FOREACH(a3, V_pf_altqs_inactive, entries) {
+ if (strncmp(a3->ifname, a2->ifname,
+ IFNAMSIZ) == 0 && a3->qname[0] == 0) {
+ a2->altq_disc = a3->altq_disc;
+ break;
+ }
+ }
+ }
+ /* Deactivate the interface in question */
+ a2->local_flags &= ~PFALTQ_FLAG_IF_REMOVED;
+ if ((ifp1 = ifunit(a2->ifname)) == NULL ||
+ (remove && ifp1 == ifp)) {
+ a2->local_flags |= PFALTQ_FLAG_IF_REMOVED;
+ } else {
+ error = altq_add(a2);
+
+ if (ticket != V_ticket_altqs_inactive)
+ error = EBUSY;
+
+ if (error) {
+ free(a2, M_PFALTQ);
+ break;
+ }
+ }
+
+ TAILQ_INSERT_TAIL(V_pf_altqs_inactive, a2, entries);
+ }
+
+ if (error != 0)
+ pf_rollback_altq(ticket);
+ else
+ pf_commit_altq(ticket);
+}
+#endif /* ALTQ */
+
+static int
+pf_begin_rules(u_int32_t *ticket, int rs_num, const char *anchor)
+{
+ struct pf_ruleset *rs;
+ struct pf_rule *rule;
+
+ PF_RULES_WASSERT();
+
+ if (rs_num < 0 || rs_num >= PF_RULESET_MAX)
+ return (EINVAL);
+ rs = pf_find_or_create_ruleset(anchor);
+ if (rs == NULL)
+ return (EINVAL);
+ while ((rule = TAILQ_FIRST(rs->rules[rs_num].inactive.ptr)) != NULL) {
+ pf_unlink_rule(rs->rules[rs_num].inactive.ptr, rule);
+ rs->rules[rs_num].inactive.rcount--;
+ }
+ *ticket = ++rs->rules[rs_num].inactive.ticket;
+ rs->rules[rs_num].inactive.open = 1;
+ return (0);
+}
+
+static int
+pf_rollback_rules(u_int32_t ticket, int rs_num, char *anchor)
+{
+ struct pf_ruleset *rs;
+ struct pf_rule *rule;
+
+ PF_RULES_WASSERT();
+
+ if (rs_num < 0 || rs_num >= PF_RULESET_MAX)
+ return (EINVAL);
+ rs = pf_find_ruleset(anchor);
+ if (rs == NULL || !rs->rules[rs_num].inactive.open ||
+ rs->rules[rs_num].inactive.ticket != ticket)
+ return (0);
+ while ((rule = TAILQ_FIRST(rs->rules[rs_num].inactive.ptr)) != NULL) {
+ pf_unlink_rule(rs->rules[rs_num].inactive.ptr, rule);
+ rs->rules[rs_num].inactive.rcount--;
+ }
+ rs->rules[rs_num].inactive.open = 0;
+ return (0);
+}
+
+#define PF_MD5_UPD(st, elm) \
+ MD5Update(ctx, (u_int8_t *) &(st)->elm, sizeof((st)->elm))
+
+#define PF_MD5_UPD_STR(st, elm) \
+ MD5Update(ctx, (u_int8_t *) (st)->elm, strlen((st)->elm))
+
+#define PF_MD5_UPD_HTONL(st, elm, stor) do { \
+ (stor) = htonl((st)->elm); \
+ MD5Update(ctx, (u_int8_t *) &(stor), sizeof(u_int32_t));\
+} while (0)
+
+#define PF_MD5_UPD_HTONS(st, elm, stor) do { \
+ (stor) = htons((st)->elm); \
+ MD5Update(ctx, (u_int8_t *) &(stor), sizeof(u_int16_t));\
+} while (0)
+
+static void
+pf_hash_rule_addr(MD5_CTX *ctx, struct pf_rule_addr *pfr)
+{
+ PF_MD5_UPD(pfr, addr.type);
+ switch (pfr->addr.type) {
+ case PF_ADDR_DYNIFTL:
+ PF_MD5_UPD(pfr, addr.v.ifname);
+ PF_MD5_UPD(pfr, addr.iflags);
+ break;
+ case PF_ADDR_TABLE:
+ PF_MD5_UPD(pfr, addr.v.tblname);
+ break;
+ case PF_ADDR_ADDRMASK:
+ /* XXX ignore af? */
+ PF_MD5_UPD(pfr, addr.v.a.addr.addr32);
+ PF_MD5_UPD(pfr, addr.v.a.mask.addr32);
+ break;
+ }
+
+ PF_MD5_UPD(pfr, port[0]);
+ PF_MD5_UPD(pfr, port[1]);
+ PF_MD5_UPD(pfr, neg);
+ PF_MD5_UPD(pfr, port_op);
+}
+
+static void
+pf_hash_rule(MD5_CTX *ctx, struct pf_rule *rule)
+{
+ u_int16_t x;
+ u_int32_t y;
+
+ pf_hash_rule_addr(ctx, &rule->src);
+ pf_hash_rule_addr(ctx, &rule->dst);
+ PF_MD5_UPD_STR(rule, label);
+ PF_MD5_UPD_STR(rule, ifname);
+ PF_MD5_UPD_STR(rule, match_tagname);
+ PF_MD5_UPD_HTONS(rule, match_tag, x); /* dup? */
+ PF_MD5_UPD_HTONL(rule, os_fingerprint, y);
+ PF_MD5_UPD_HTONL(rule, prob, y);
+ PF_MD5_UPD_HTONL(rule, uid.uid[0], y);
+ PF_MD5_UPD_HTONL(rule, uid.uid[1], y);
+ PF_MD5_UPD(rule, uid.op);
+ PF_MD5_UPD_HTONL(rule, gid.gid[0], y);
+ PF_MD5_UPD_HTONL(rule, gid.gid[1], y);
+ PF_MD5_UPD(rule, gid.op);
+ PF_MD5_UPD_HTONL(rule, rule_flag, y);
+ PF_MD5_UPD(rule, action);
+ PF_MD5_UPD(rule, direction);
+ PF_MD5_UPD(rule, af);
+ PF_MD5_UPD(rule, quick);
+ PF_MD5_UPD(rule, ifnot);
+ PF_MD5_UPD(rule, match_tag_not);
+ PF_MD5_UPD(rule, natpass);
+ PF_MD5_UPD(rule, keep_state);
+ PF_MD5_UPD(rule, proto);
+ PF_MD5_UPD(rule, type);
+ PF_MD5_UPD(rule, code);
+ PF_MD5_UPD(rule, flags);
+ PF_MD5_UPD(rule, flagset);
+ PF_MD5_UPD(rule, allow_opts);
+ PF_MD5_UPD(rule, rt);
+ PF_MD5_UPD(rule, tos);
+}
+
+static int
+pf_commit_rules(u_int32_t ticket, int rs_num, char *anchor)
+{
+ struct pf_ruleset *rs;
+ struct pf_rule *rule, **old_array;
+ struct pf_rulequeue *old_rules;
+ int error;
+ u_int32_t old_rcount;
+
+ PF_RULES_WASSERT();
+
+ if (rs_num < 0 || rs_num >= PF_RULESET_MAX)
+ return (EINVAL);
+ rs = pf_find_ruleset(anchor);
+ if (rs == NULL || !rs->rules[rs_num].inactive.open ||
+ ticket != rs->rules[rs_num].inactive.ticket)
+ return (EBUSY);
+
+ /* Calculate checksum for the main ruleset */
+ if (rs == &pf_main_ruleset) {
+ error = pf_setup_pfsync_matching(rs);
+ if (error != 0)
+ return (error);
+ }
+
+ /* Swap rules, keep the old. */
+ old_rules = rs->rules[rs_num].active.ptr;
+ old_rcount = rs->rules[rs_num].active.rcount;
+ old_array = rs->rules[rs_num].active.ptr_array;
+
+ rs->rules[rs_num].active.ptr =
+ rs->rules[rs_num].inactive.ptr;
+ rs->rules[rs_num].active.ptr_array =
+ rs->rules[rs_num].inactive.ptr_array;
+ rs->rules[rs_num].active.rcount =
+ rs->rules[rs_num].inactive.rcount;
+ rs->rules[rs_num].inactive.ptr = old_rules;
+ rs->rules[rs_num].inactive.ptr_array = old_array;
+ rs->rules[rs_num].inactive.rcount = old_rcount;
+
+ rs->rules[rs_num].active.ticket =
+ rs->rules[rs_num].inactive.ticket;
+ pf_calc_skip_steps(rs->rules[rs_num].active.ptr);
+
+
+ /* Purge the old rule list. */
+ while ((rule = TAILQ_FIRST(old_rules)) != NULL)
+ pf_unlink_rule(old_rules, rule);
+ if (rs->rules[rs_num].inactive.ptr_array)
+ free(rs->rules[rs_num].inactive.ptr_array, M_TEMP);
+ rs->rules[rs_num].inactive.ptr_array = NULL;
+ rs->rules[rs_num].inactive.rcount = 0;
+ rs->rules[rs_num].inactive.open = 0;
+ pf_remove_if_empty_ruleset(rs);
+
+ return (0);
+}
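+
+/*
+ * Rule loading is transactional: userland (pfctl) begins a transaction to
+ * obtain a ticket, stages rules on the inactive list with DIOCADDRULE
+ * (which verifies that ticket), and then commits, at which point the
+ * active and inactive lists are swapped under the rules write lock and the
+ * superseded rules are unlinked.  Roughly:
+ *
+ *	begin	-> *ticket = ++inactive.ticket		(pf_begin_rules)
+ *	add	-> insert on inactive.ptr, ticket check	(DIOCADDRULE below)
+ *	commit	-> swap active/inactive, recalc skips	(pf_commit_rules)
+ */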
+
+static int
+pf_setup_pfsync_matching(struct pf_ruleset *rs)
+{
+ MD5_CTX ctx;
+ struct pf_rule *rule;
+ int rs_cnt;
+ u_int8_t digest[PF_MD5_DIGEST_LENGTH];
+
+ MD5Init(&ctx);
+ for (rs_cnt = 0; rs_cnt < PF_RULESET_MAX; rs_cnt++) {
+ /* XXX PF_RULESET_SCRUB as well? */
+ if (rs_cnt == PF_RULESET_SCRUB)
+ continue;
+
+ if (rs->rules[rs_cnt].inactive.ptr_array)
+ free(rs->rules[rs_cnt].inactive.ptr_array, M_TEMP);
+ rs->rules[rs_cnt].inactive.ptr_array = NULL;
+
+ if (rs->rules[rs_cnt].inactive.rcount) {
+ rs->rules[rs_cnt].inactive.ptr_array =
+ malloc(sizeof(caddr_t) *
+ rs->rules[rs_cnt].inactive.rcount,
+ M_TEMP, M_NOWAIT);
+
+ if (!rs->rules[rs_cnt].inactive.ptr_array)
+ return (ENOMEM);
+ }
+
+ TAILQ_FOREACH(rule, rs->rules[rs_cnt].inactive.ptr,
+ entries) {
+ pf_hash_rule(&ctx, rule);
+ (rs->rules[rs_cnt].inactive.ptr_array)[rule->nr] = rule;
+ }
+ }
+
+ MD5Final(digest, &ctx);
+ memcpy(V_pf_status.pf_chksum, digest, sizeof(V_pf_status.pf_chksum));
+ return (0);
+}
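+
+/*
+ * The MD5 digest computed above covers every rule of the main ruleset
+ * (scrub rules excepted) and is exported as pf_status.pf_chksum.  pfsync
+ * peers compare this checksum to decide whether their rulesets match, so
+ * that synced states can be tied to rules by number instead of by a full
+ * rule lookup.
+ */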
+
+static int
+pf_addr_setup(struct pf_ruleset *ruleset, struct pf_addr_wrap *addr,
+ sa_family_t af)
+{
+ int error = 0;
+
+ switch (addr->type) {
+ case PF_ADDR_TABLE:
+ addr->p.tbl = pfr_attach_table(ruleset, addr->v.tblname);
+ if (addr->p.tbl == NULL)
+ error = ENOMEM;
+ break;
+ case PF_ADDR_DYNIFTL:
+ error = pfi_dynaddr_setup(addr, af);
+ break;
+ }
+
+ return (error);
+}
+
+static void
+pf_addr_copyout(struct pf_addr_wrap *addr)
+{
+
+ switch (addr->type) {
+ case PF_ADDR_DYNIFTL:
+ pfi_dynaddr_copyout(addr);
+ break;
+ case PF_ADDR_TABLE:
+ pf_tbladdr_copyout(addr);
+ break;
+ }
+}
+
+static int
+pfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td)
+{
+ int error = 0;
+
+ CURVNET_SET(TD_TO_VNET(td));
+
+ /* XXX keep in sync with switch() below */
+ if (securelevel_gt(td->td_ucred, 2))
+ switch (cmd) {
+ case DIOCGETRULES:
+ case DIOCGETRULE:
+ case DIOCGETADDRS:
+ case DIOCGETADDR:
+ case DIOCGETSTATE:
+ case DIOCSETSTATUSIF:
+ case DIOCGETSTATUS:
+ case DIOCCLRSTATUS:
+ case DIOCNATLOOK:
+ case DIOCSETDEBUG:
+ case DIOCGETSTATES:
+ case DIOCGETTIMEOUT:
+ case DIOCCLRRULECTRS:
+ case DIOCGETLIMIT:
+ case DIOCGETALTQS:
+ case DIOCGETALTQ:
+ case DIOCGETQSTATS:
+ case DIOCGETRULESETS:
+ case DIOCGETRULESET:
+ case DIOCRGETTABLES:
+ case DIOCRGETTSTATS:
+ case DIOCRCLRTSTATS:
+ case DIOCRCLRADDRS:
+ case DIOCRADDADDRS:
+ case DIOCRDELADDRS:
+ case DIOCRSETADDRS:
+ case DIOCRGETADDRS:
+ case DIOCRGETASTATS:
+ case DIOCRCLRASTATS:
+ case DIOCRTSTADDRS:
+ case DIOCOSFPGET:
+ case DIOCGETSRCNODES:
+ case DIOCCLRSRCNODES:
+ case DIOCIGETIFACES:
+ case DIOCGIFSPEED:
+ case DIOCSETIFFLAG:
+ case DIOCCLRIFFLAG:
+ break;
+ case DIOCRCLRTABLES:
+ case DIOCRADDTABLES:
+ case DIOCRDELTABLES:
+ case DIOCRSETTFLAGS:
+ if (((struct pfioc_table *)addr)->pfrio_flags &
+ PFR_FLAG_DUMMY)
+ break; /* dummy operation ok */
+ return (EPERM);
+ default:
+ return (EPERM);
+ }
+
+ if (!(flags & FWRITE))
+ switch (cmd) {
+ case DIOCGETRULES:
+ case DIOCGETADDRS:
+ case DIOCGETADDR:
+ case DIOCGETSTATE:
+ case DIOCGETSTATUS:
+ case DIOCGETSTATES:
+ case DIOCGETTIMEOUT:
+ case DIOCGETLIMIT:
+ case DIOCGETALTQS:
+ case DIOCGETALTQ:
+ case DIOCGETQSTATS:
+ case DIOCGETRULESETS:
+ case DIOCGETRULESET:
+ case DIOCNATLOOK:
+ case DIOCRGETTABLES:
+ case DIOCRGETTSTATS:
+ case DIOCRGETADDRS:
+ case DIOCRGETASTATS:
+ case DIOCRTSTADDRS:
+ case DIOCOSFPGET:
+ case DIOCGETSRCNODES:
+ case DIOCIGETIFACES:
+ case DIOCGIFSPEED:
+ break;
+ case DIOCRCLRTABLES:
+ case DIOCRADDTABLES:
+ case DIOCRDELTABLES:
+ case DIOCRCLRTSTATS:
+ case DIOCRCLRADDRS:
+ case DIOCRADDADDRS:
+ case DIOCRDELADDRS:
+ case DIOCRSETADDRS:
+ case DIOCRSETTFLAGS:
+ if (((struct pfioc_table *)addr)->pfrio_flags &
+ PFR_FLAG_DUMMY) {
+ flags |= FWRITE; /* need write lock for dummy */
+ break; /* dummy operation ok */
+ }
+ return (EACCES);
+ case DIOCGETRULE:
+ if (((struct pfioc_rule *)addr)->action ==
+ PF_GET_CLR_CNTR)
+ return (EACCES);
+ break;
+ default:
+ return (EACCES);
+ }
+
+ switch (cmd) {
+ case DIOCSTART:
+ PF_RULES_WLOCK();
+ if (V_pf_status.running)
+ error = EEXIST;
+ else {
+ int cpu;
+
+ PF_RULES_WUNLOCK();
+ error = hook_pf();
+ if (error) {
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pf: pfil registration failed\n"));
+ break;
+ }
+ PF_RULES_WLOCK();
+ V_pf_status.running = 1;
+ V_pf_status.since = time_second;
+
+ CPU_FOREACH(cpu)
+ V_pf_stateid[cpu] = time_second;
+
+ DPFPRINTF(PF_DEBUG_MISC, ("pf: started\n"));
+ }
+ PF_RULES_WUNLOCK();
+ break;
+
+ case DIOCSTOP:
+ PF_RULES_WLOCK();
+ if (!V_pf_status.running)
+ error = ENOENT;
+ else {
+ V_pf_status.running = 0;
+ PF_RULES_WUNLOCK();
+ error = dehook_pf();
+ if (error) {
+ V_pf_status.running = 1;
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pf: pfil unregistration failed\n"));
+ }
+ PF_RULES_WLOCK();
+ V_pf_status.since = time_second;
+ DPFPRINTF(PF_DEBUG_MISC, ("pf: stopped\n"));
+ }
+ PF_RULES_WUNLOCK();
+ break;
+
+ case DIOCADDRULE: {
+ struct pfioc_rule *pr = (struct pfioc_rule *)addr;
+ struct pf_ruleset *ruleset;
+ struct pf_rule *rule, *tail;
+ struct pf_pooladdr *pa;
+ struct pfi_kif *kif = NULL;
+ int rs_num;
+
+ if (pr->rule.return_icmp >> 8 > ICMP_MAXTYPE) {
+ error = EINVAL;
+ break;
+ }
+#ifndef INET
+ if (pr->rule.af == AF_INET) {
+ error = EAFNOSUPPORT;
+ break;
+ }
+#endif /* INET */
+#ifndef INET6
+ if (pr->rule.af == AF_INET6) {
+ error = EAFNOSUPPORT;
+ break;
+ }
+#endif /* INET6 */
+
+ rule = malloc(sizeof(*rule), M_PFRULE, M_WAITOK);
+ bcopy(&pr->rule, rule, sizeof(struct pf_rule));
+ if (rule->ifname[0])
+ kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK);
+ rule->cuid = td->td_ucred->cr_ruid;
+ rule->cpid = td->td_proc ? td->td_proc->p_pid : 0;
+ TAILQ_INIT(&rule->rpool.list);
+
+#define ERROUT(x) { error = (x); goto DIOCADDRULE_error; }
+
+ PF_RULES_WLOCK();
+ pr->anchor[sizeof(pr->anchor) - 1] = 0;
+ ruleset = pf_find_ruleset(pr->anchor);
+ if (ruleset == NULL)
+ ERROUT(EINVAL);
+ rs_num = pf_get_ruleset_number(pr->rule.action);
+ if (rs_num >= PF_RULESET_MAX)
+ ERROUT(EINVAL);
+ if (pr->ticket != ruleset->rules[rs_num].inactive.ticket) {
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("ticket: %d != [%d]%d\n", pr->ticket, rs_num,
+ ruleset->rules[rs_num].inactive.ticket));
+ ERROUT(EBUSY);
+ }
+ if (pr->pool_ticket != V_ticket_pabuf) {
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pool_ticket: %d != %d\n", pr->pool_ticket,
+ V_ticket_pabuf));
+ ERROUT(EBUSY);
+ }
+
+ tail = TAILQ_LAST(ruleset->rules[rs_num].inactive.ptr,
+ pf_rulequeue);
+ if (tail)
+ rule->nr = tail->nr + 1;
+ else
+ rule->nr = 0;
+ if (rule->ifname[0]) {
+ rule->kif = pfi_kif_attach(kif, rule->ifname);
+ pfi_kif_ref(rule->kif);
+ } else
+ rule->kif = NULL;
+
+ if (rule->rtableid > 0 && rule->rtableid >= rt_numfibs)
+ error = EBUSY;
+
+#ifdef ALTQ
+ /* set queue IDs */
+ if (rule->qname[0] != 0) {
+ if ((rule->qid = pf_qname2qid(rule->qname)) == 0)
+ error = EBUSY;
+ else if (rule->pqname[0] != 0) {
+ if ((rule->pqid =
+ pf_qname2qid(rule->pqname)) == 0)
+ error = EBUSY;
+ } else
+ rule->pqid = rule->qid;
+ }
+#endif
+ if (rule->tagname[0])
+ if ((rule->tag = pf_tagname2tag(rule->tagname)) == 0)
+ error = EBUSY;
+ if (rule->match_tagname[0])
+ if ((rule->match_tag =
+ pf_tagname2tag(rule->match_tagname)) == 0)
+ error = EBUSY;
+ if (rule->rt && !rule->direction)
+ error = EINVAL;
+ if (!rule->log)
+ rule->logif = 0;
+ if (rule->logif >= PFLOGIFS_MAX)
+ error = EINVAL;
+ if (pf_addr_setup(ruleset, &rule->src.addr, rule->af))
+ error = ENOMEM;
+ if (pf_addr_setup(ruleset, &rule->dst.addr, rule->af))
+ error = ENOMEM;
+ if (pf_anchor_setup(rule, ruleset, pr->anchor_call))
+ error = EINVAL;
+ TAILQ_FOREACH(pa, &V_pf_pabuf, entries)
+ if (pa->addr.type == PF_ADDR_TABLE) {
+ pa->addr.p.tbl = pfr_attach_table(ruleset,
+ pa->addr.v.tblname);
+ if (pa->addr.p.tbl == NULL)
+ error = ENOMEM;
+ }
+
+ if (rule->overload_tblname[0]) {
+ if ((rule->overload_tbl = pfr_attach_table(ruleset,
+ rule->overload_tblname)) == NULL)
+ error = EINVAL;
+ else
+ rule->overload_tbl->pfrkt_flags |=
+ PFR_TFLAG_ACTIVE;
+ }
+
+ pf_mv_pool(&V_pf_pabuf, &rule->rpool.list);
+ if (((((rule->action == PF_NAT) || (rule->action == PF_RDR) ||
+ (rule->action == PF_BINAT)) && rule->anchor == NULL) ||
+ (rule->rt > PF_FASTROUTE)) &&
+ (TAILQ_FIRST(&rule->rpool.list) == NULL))
+ error = EINVAL;
+
+ if (error) {
+ pf_free_rule(rule);
+ PF_RULES_WUNLOCK();
+ break;
+ }
+
+ rule->rpool.cur = TAILQ_FIRST(&rule->rpool.list);
+ rule->evaluations = rule->packets[0] = rule->packets[1] =
+ rule->bytes[0] = rule->bytes[1] = 0;
+ TAILQ_INSERT_TAIL(ruleset->rules[rs_num].inactive.ptr,
+ rule, entries);
+ ruleset->rules[rs_num].inactive.rcount++;
+ PF_RULES_WUNLOCK();
+ break;
+
+#undef ERROUT
+DIOCADDRULE_error:
+ PF_RULES_WUNLOCK();
+ free(rule, M_PFRULE);
+ if (kif)
+ free(kif, PFI_MTYPE);
+ break;
+ }
+
+ case DIOCGETRULES: {
+ struct pfioc_rule *pr = (struct pfioc_rule *)addr;
+ struct pf_ruleset *ruleset;
+ struct pf_rule *tail;
+ int rs_num;
+
+ PF_RULES_WLOCK();
+ pr->anchor[sizeof(pr->anchor) - 1] = 0;
+ ruleset = pf_find_ruleset(pr->anchor);
+ if (ruleset == NULL) {
+ PF_RULES_WUNLOCK();
+ error = EINVAL;
+ break;
+ }
+ rs_num = pf_get_ruleset_number(pr->rule.action);
+ if (rs_num >= PF_RULESET_MAX) {
+ PF_RULES_WUNLOCK();
+ error = EINVAL;
+ break;
+ }
+ tail = TAILQ_LAST(ruleset->rules[rs_num].active.ptr,
+ pf_rulequeue);
+ if (tail)
+ pr->nr = tail->nr + 1;
+ else
+ pr->nr = 0;
+ pr->ticket = ruleset->rules[rs_num].active.ticket;
+ PF_RULES_WUNLOCK();
+ break;
+ }
+
+ case DIOCGETRULE: {
+ struct pfioc_rule *pr = (struct pfioc_rule *)addr;
+ struct pf_ruleset *ruleset;
+ struct pf_rule *rule;
+ int rs_num, i;
+
+ PF_RULES_WLOCK();
+ pr->anchor[sizeof(pr->anchor) - 1] = 0;
+ ruleset = pf_find_ruleset(pr->anchor);
+ if (ruleset == NULL) {
+ PF_RULES_WUNLOCK();
+ error = EINVAL;
+ break;
+ }
+ rs_num = pf_get_ruleset_number(pr->rule.action);
+ if (rs_num >= PF_RULESET_MAX) {
+ PF_RULES_WUNLOCK();
+ error = EINVAL;
+ break;
+ }
+ if (pr->ticket != ruleset->rules[rs_num].active.ticket) {
+ PF_RULES_WUNLOCK();
+ error = EBUSY;
+ break;
+ }
+ rule = TAILQ_FIRST(ruleset->rules[rs_num].active.ptr);
+ while ((rule != NULL) && (rule->nr != pr->nr))
+ rule = TAILQ_NEXT(rule, entries);
+ if (rule == NULL) {
+ PF_RULES_WUNLOCK();
+ error = EBUSY;
+ break;
+ }
+ bcopy(rule, &pr->rule, sizeof(struct pf_rule));
+ if (pf_anchor_copyout(ruleset, rule, pr)) {
+ PF_RULES_WUNLOCK();
+ error = EBUSY;
+ break;
+ }
+ pf_addr_copyout(&pr->rule.src.addr);
+ pf_addr_copyout(&pr->rule.dst.addr);
+ for (i = 0; i < PF_SKIP_COUNT; ++i)
+ if (rule->skip[i].ptr == NULL)
+ pr->rule.skip[i].nr = -1;
+ else
+ pr->rule.skip[i].nr =
+ rule->skip[i].ptr->nr;
+
+ if (pr->action == PF_GET_CLR_CNTR) {
+ rule->evaluations = 0;
+ rule->packets[0] = rule->packets[1] = 0;
+ rule->bytes[0] = rule->bytes[1] = 0;
+ rule->states_tot = 0;
+ }
+ PF_RULES_WUNLOCK();
+ break;
+ }
+
+ case DIOCCHANGERULE: {
+ struct pfioc_rule *pcr = (struct pfioc_rule *)addr;
+ struct pf_ruleset *ruleset;
+ struct pf_rule *oldrule = NULL, *newrule = NULL;
+ struct pfi_kif *kif = NULL;
+ struct pf_pooladdr *pa;
+ u_int32_t nr = 0;
+ int rs_num;
+
+ if (pcr->action < PF_CHANGE_ADD_HEAD ||
+ pcr->action > PF_CHANGE_GET_TICKET) {
+ error = EINVAL;
+ break;
+ }
+ if (pcr->rule.return_icmp >> 8 > ICMP_MAXTYPE) {
+ error = EINVAL;
+ break;
+ }
+
+ if (pcr->action != PF_CHANGE_REMOVE) {
+#ifndef INET
+ if (pcr->rule.af == AF_INET) {
+ error = EAFNOSUPPORT;
+ break;
+ }
+#endif /* INET */
+#ifndef INET6
+ if (pcr->rule.af == AF_INET6) {
+ error = EAFNOSUPPORT;
+ break;
+ }
+#endif /* INET6 */
+ newrule = malloc(sizeof(*newrule), M_PFRULE, M_WAITOK);
+ bcopy(&pcr->rule, newrule, sizeof(struct pf_rule));
+ newrule->cuid = td->td_ucred->cr_ruid;
+ newrule->cpid = td->td_proc ? td->td_proc->p_pid : 0;
+ TAILQ_INIT(&newrule->rpool.list);
+ /* Initialize refcounting. */
+ newrule->states_cur = 0;
+ newrule->entries.tqe_prev = NULL;
+
+ if (newrule->ifname[0])
+ kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK);
+ }
+
+#define ERROUT(x) { error = (x); goto DIOCCHANGERULE_error; }
+
+ PF_RULES_WLOCK();
+ if (!(pcr->action == PF_CHANGE_REMOVE ||
+ pcr->action == PF_CHANGE_GET_TICKET) &&
+ pcr->pool_ticket != V_ticket_pabuf)
+ ERROUT(EBUSY);
+
+ ruleset = pf_find_ruleset(pcr->anchor);
+ if (ruleset == NULL)
+ ERROUT(EINVAL);
+
+ rs_num = pf_get_ruleset_number(pcr->rule.action);
+ if (rs_num >= PF_RULESET_MAX)
+ ERROUT(EINVAL);
+
+ if (pcr->action == PF_CHANGE_GET_TICKET) {
+ pcr->ticket = ++ruleset->rules[rs_num].active.ticket;
+ ERROUT(0);
+ } else if (pcr->ticket !=
+ ruleset->rules[rs_num].active.ticket)
+ ERROUT(EINVAL);
+
+ if (pcr->action != PF_CHANGE_REMOVE) {
+ if (newrule->ifname[0]) {
+ newrule->kif = pfi_kif_attach(kif,
+ newrule->ifname);
+ pfi_kif_ref(newrule->kif);
+ } else
+ newrule->kif = NULL;
+
+ if (newrule->rtableid > 0 &&
+ newrule->rtableid >= rt_numfibs)
+ error = EBUSY;
+
+#ifdef ALTQ
+ /* set queue IDs */
+ if (newrule->qname[0] != 0) {
+ if ((newrule->qid =
+ pf_qname2qid(newrule->qname)) == 0)
+ error = EBUSY;
+ else if (newrule->pqname[0] != 0) {
+ if ((newrule->pqid =
+ pf_qname2qid(newrule->pqname)) == 0)
+ error = EBUSY;
+ } else
+ newrule->pqid = newrule->qid;
+ }
+#endif /* ALTQ */
+ if (newrule->tagname[0])
+ if ((newrule->tag =
+ pf_tagname2tag(newrule->tagname)) == 0)
+ error = EBUSY;
+ if (newrule->match_tagname[0])
+ if ((newrule->match_tag = pf_tagname2tag(
+ newrule->match_tagname)) == 0)
+ error = EBUSY;
+ if (newrule->rt && !newrule->direction)
+ error = EINVAL;
+ if (!newrule->log)
+ newrule->logif = 0;
+ if (newrule->logif >= PFLOGIFS_MAX)
+ error = EINVAL;
+ if (pf_addr_setup(ruleset, &newrule->src.addr, newrule->af))
+ error = ENOMEM;
+ if (pf_addr_setup(ruleset, &newrule->dst.addr, newrule->af))
+ error = ENOMEM;
+ if (pf_anchor_setup(newrule, ruleset, pcr->anchor_call))
+ error = EINVAL;
+ TAILQ_FOREACH(pa, &V_pf_pabuf, entries)
+ if (pa->addr.type == PF_ADDR_TABLE) {
+ pa->addr.p.tbl =
+ pfr_attach_table(ruleset,
+ pa->addr.v.tblname);
+ if (pa->addr.p.tbl == NULL)
+ error = ENOMEM;
+ }
+
+ if (newrule->overload_tblname[0]) {
+ if ((newrule->overload_tbl = pfr_attach_table(
+ ruleset, newrule->overload_tblname)) ==
+ NULL)
+ error = EINVAL;
+ else
+ newrule->overload_tbl->pfrkt_flags |=
+ PFR_TFLAG_ACTIVE;
+ }
+
+ pf_mv_pool(&V_pf_pabuf, &newrule->rpool.list);
+ if (((((newrule->action == PF_NAT) ||
+ (newrule->action == PF_RDR) ||
+ (newrule->action == PF_BINAT) ||
+ (newrule->rt > PF_FASTROUTE)) &&
+ !newrule->anchor)) &&
+ (TAILQ_FIRST(&newrule->rpool.list) == NULL))
+ error = EINVAL;
+
+ if (error) {
+ pf_free_rule(newrule);
+ PF_RULES_WUNLOCK();
+ break;
+ }
+
+ newrule->rpool.cur = TAILQ_FIRST(&newrule->rpool.list);
+ newrule->evaluations = 0;
+ newrule->packets[0] = newrule->packets[1] = 0;
+ newrule->bytes[0] = newrule->bytes[1] = 0;
+ }
+ pf_empty_pool(&V_pf_pabuf);
+
+ if (pcr->action == PF_CHANGE_ADD_HEAD)
+ oldrule = TAILQ_FIRST(
+ ruleset->rules[rs_num].active.ptr);
+ else if (pcr->action == PF_CHANGE_ADD_TAIL)
+ oldrule = TAILQ_LAST(
+ ruleset->rules[rs_num].active.ptr, pf_rulequeue);
+ else {
+ oldrule = TAILQ_FIRST(
+ ruleset->rules[rs_num].active.ptr);
+ while ((oldrule != NULL) && (oldrule->nr != pcr->nr))
+ oldrule = TAILQ_NEXT(oldrule, entries);
+ if (oldrule == NULL) {
+ if (newrule != NULL)
+ pf_free_rule(newrule);
+ PF_RULES_WUNLOCK();
+ error = EINVAL;
+ break;
+ }
+ }
+
+ if (pcr->action == PF_CHANGE_REMOVE) {
+ pf_unlink_rule(ruleset->rules[rs_num].active.ptr,
+ oldrule);
+ ruleset->rules[rs_num].active.rcount--;
+ } else {
+ if (oldrule == NULL)
+ TAILQ_INSERT_TAIL(
+ ruleset->rules[rs_num].active.ptr,
+ newrule, entries);
+ else if (pcr->action == PF_CHANGE_ADD_HEAD ||
+ pcr->action == PF_CHANGE_ADD_BEFORE)
+ TAILQ_INSERT_BEFORE(oldrule, newrule, entries);
+ else
+ TAILQ_INSERT_AFTER(
+ ruleset->rules[rs_num].active.ptr,
+ oldrule, newrule, entries);
+ ruleset->rules[rs_num].active.rcount++;
+ }
+
+ nr = 0;
+ TAILQ_FOREACH(oldrule,
+ ruleset->rules[rs_num].active.ptr, entries)
+ oldrule->nr = nr++;
+
+ ruleset->rules[rs_num].active.ticket++;
+
+ pf_calc_skip_steps(ruleset->rules[rs_num].active.ptr);
+ pf_remove_if_empty_ruleset(ruleset);
+
+ PF_RULES_WUNLOCK();
+ break;
+
+#undef ERROUT
+DIOCCHANGERULE_error:
+ PF_RULES_WUNLOCK();
+ if (newrule != NULL)
+ free(newrule, M_PFRULE);
+ if (kif != NULL)
+ free(kif, PFI_MTYPE);
+ break;
+ }
+
+ case DIOCCLRSTATES: {
+ struct pf_state *s;
+ struct pfioc_state_kill *psk = (struct pfioc_state_kill *)addr;
+ u_int i, killed = 0;
+
+ for (i = 0; i <= V_pf_hashmask; i++) {
+ struct pf_idhash *ih = &V_pf_idhash[i];
+
+relock_DIOCCLRSTATES:
+ PF_HASHROW_LOCK(ih);
+ LIST_FOREACH(s, &ih->states, entry)
+ if (!psk->psk_ifname[0] ||
+ !strcmp(psk->psk_ifname,
+ s->kif->pfik_name)) {
+ /*
+ * Don't send out individual
+ * delete messages.
+ */
+ s->state_flags |= PFSTATE_NOSYNC;
+ pf_unlink_state(s, PF_ENTER_LOCKED);
+ killed++;
+ goto relock_DIOCCLRSTATES;
+ }
+ PF_HASHROW_UNLOCK(ih);
+ }
+ psk->psk_killed = killed;
+ if (pfsync_clear_states_ptr != NULL)
+ pfsync_clear_states_ptr(V_pf_status.hostid, psk->psk_ifname);
+ break;
+ }
+
+ case DIOCKILLSTATES: {
+ struct pf_state *s;
+ struct pf_state_key *sk;
+ struct pf_addr *srcaddr, *dstaddr;
+ u_int16_t srcport, dstport;
+ struct pfioc_state_kill *psk = (struct pfioc_state_kill *)addr;
+ u_int i, killed = 0;
+
+ if (psk->psk_pfcmp.id) {
+ if (psk->psk_pfcmp.creatorid == 0)
+ psk->psk_pfcmp.creatorid = V_pf_status.hostid;
+ if ((s = pf_find_state_byid(psk->psk_pfcmp.id,
+ psk->psk_pfcmp.creatorid))) {
+ pf_unlink_state(s, PF_ENTER_LOCKED);
+ psk->psk_killed = 1;
+ }
+ break;
+ }
+
+ for (i = 0; i <= V_pf_hashmask; i++) {
+ struct pf_idhash *ih = &V_pf_idhash[i];
+
+relock_DIOCKILLSTATES:
+ PF_HASHROW_LOCK(ih);
+ LIST_FOREACH(s, &ih->states, entry) {
+ sk = s->key[PF_SK_WIRE];
+ if (s->direction == PF_OUT) {
+ srcaddr = &sk->addr[1];
+ dstaddr = &sk->addr[0];
+					srcport = sk->port[1];
+					dstport = sk->port[0];
+ } else {
+ srcaddr = &sk->addr[0];
+ dstaddr = &sk->addr[1];
+					srcport = sk->port[0];
+					dstport = sk->port[1];
+ }
+
+ if ((!psk->psk_af || sk->af == psk->psk_af)
+ && (!psk->psk_proto || psk->psk_proto ==
+ sk->proto) &&
+ PF_MATCHA(psk->psk_src.neg,
+ &psk->psk_src.addr.v.a.addr,
+ &psk->psk_src.addr.v.a.mask,
+ srcaddr, sk->af) &&
+ PF_MATCHA(psk->psk_dst.neg,
+ &psk->psk_dst.addr.v.a.addr,
+ &psk->psk_dst.addr.v.a.mask,
+ dstaddr, sk->af) &&
+ (psk->psk_src.port_op == 0 ||
+ pf_match_port(psk->psk_src.port_op,
+ psk->psk_src.port[0], psk->psk_src.port[1],
+ srcport)) &&
+ (psk->psk_dst.port_op == 0 ||
+ pf_match_port(psk->psk_dst.port_op,
+ psk->psk_dst.port[0], psk->psk_dst.port[1],
+ dstport)) &&
+ (!psk->psk_label[0] ||
+ (s->rule.ptr->label[0] &&
+ !strcmp(psk->psk_label,
+ s->rule.ptr->label))) &&
+ (!psk->psk_ifname[0] ||
+ !strcmp(psk->psk_ifname,
+ s->kif->pfik_name))) {
+ pf_unlink_state(s, PF_ENTER_LOCKED);
+ killed++;
+ goto relock_DIOCKILLSTATES;
+ }
+ }
+ PF_HASHROW_UNLOCK(ih);
+ }
+ psk->psk_killed = killed;
+ break;
+ }
+
+ case DIOCADDSTATE: {
+ struct pfioc_state *ps = (struct pfioc_state *)addr;
+ struct pfsync_state *sp = &ps->state;
+
+ if (sp->timeout >= PFTM_MAX &&
+ sp->timeout != PFTM_UNTIL_PACKET) {
+ error = EINVAL;
+ break;
+ }
+ if (pfsync_state_import_ptr != NULL) {
+ PF_RULES_RLOCK();
+ error = pfsync_state_import_ptr(sp, PFSYNC_SI_IOCTL);
+ PF_RULES_RUNLOCK();
+		} else
+			error = EOPNOTSUPP;
+ break;
+ }
+
+ case DIOCGETSTATE: {
+ struct pfioc_state *ps = (struct pfioc_state *)addr;
+ struct pf_state *s;
+
+ s = pf_find_state_byid(ps->state.id, ps->state.creatorid);
+ if (s == NULL) {
+ error = ENOENT;
+ break;
+ }
+
+ pfsync_state_export(&ps->state, s);
+ PF_STATE_UNLOCK(s);
+ break;
+ }
+
+ case DIOCGETSTATES: {
+ struct pfioc_states *ps = (struct pfioc_states *)addr;
+ struct pf_state *s;
+ struct pfsync_state *pstore, *p;
+ int i, nr;
+
+ if (ps->ps_len == 0) {
+ nr = uma_zone_get_cur(V_pf_state_z);
+ ps->ps_len = sizeof(struct pfsync_state) * nr;
+ break;
+ }
+
+ p = pstore = malloc(ps->ps_len, M_TEMP, M_WAITOK);
+ nr = 0;
+
+ for (i = 0; i <= V_pf_hashmask; i++) {
+ struct pf_idhash *ih = &V_pf_idhash[i];
+
+ PF_HASHROW_LOCK(ih);
+ LIST_FOREACH(s, &ih->states, entry) {
+
+ if (s->timeout == PFTM_UNLINKED)
+ continue;
+
+ if ((nr+1) * sizeof(*p) > ps->ps_len) {
+ PF_HASHROW_UNLOCK(ih);
+ goto DIOCGETSTATES_full;
+ }
+ pfsync_state_export(p, s);
+ p++;
+ nr++;
+ }
+ PF_HASHROW_UNLOCK(ih);
+ }
+DIOCGETSTATES_full:
+ error = copyout(pstore, ps->ps_states,
+ sizeof(struct pfsync_state) * nr);
+ if (error) {
+ free(pstore, M_TEMP);
+ break;
+ }
+ ps->ps_len = sizeof(struct pfsync_state) * nr;
+ free(pstore, M_TEMP);
+
+ break;
+ }
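+
+	/*
+	 * DIOCGETSTATES follows the usual two-pass sizing convention: a
+	 * caller passing ps_len == 0 gets back the number of bytes currently
+	 * needed, allocates that much, and calls again to receive the
+	 * exported states.  A rough userland sketch (error handling omitted):
+	 *
+	 *	struct pfioc_states ps;
+	 *
+	 *	memset(&ps, 0, sizeof(ps));
+	 *	ioctl(dev, DIOCGETSTATES, &ps);	  (ps.ps_len = required size)
+	 *	ps.ps_states = malloc(ps.ps_len);
+	 *	ioctl(dev, DIOCGETSTATES, &ps);	  (buffer filled, ps_len set)
+	 */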
+
+ case DIOCGETSTATUS: {
+ struct pf_status *s = (struct pf_status *)addr;
+ PF_RULES_RLOCK();
+ bcopy(&V_pf_status, s, sizeof(struct pf_status));
+ pfi_update_status(s->ifname, s);
+ PF_RULES_RUNLOCK();
+ break;
+ }
+
+ case DIOCSETSTATUSIF: {
+ struct pfioc_if *pi = (struct pfioc_if *)addr;
+
+ if (pi->ifname[0] == 0) {
+ bzero(V_pf_status.ifname, IFNAMSIZ);
+ break;
+ }
+ PF_RULES_WLOCK();
+ strlcpy(V_pf_status.ifname, pi->ifname, IFNAMSIZ);
+ PF_RULES_WUNLOCK();
+ break;
+ }
+
+ case DIOCCLRSTATUS: {
+ PF_RULES_WLOCK();
+ bzero(V_pf_status.counters, sizeof(V_pf_status.counters));
+ bzero(V_pf_status.fcounters, sizeof(V_pf_status.fcounters));
+ bzero(V_pf_status.scounters, sizeof(V_pf_status.scounters));
+ V_pf_status.since = time_second;
+ if (*V_pf_status.ifname)
+ pfi_update_status(V_pf_status.ifname, NULL);
+ PF_RULES_WUNLOCK();
+ break;
+ }
+
+ case DIOCNATLOOK: {
+ struct pfioc_natlook *pnl = (struct pfioc_natlook *)addr;
+ struct pf_state_key *sk;
+ struct pf_state *state;
+ struct pf_state_key_cmp key;
+ int m = 0, direction = pnl->direction;
+ int sidx, didx;
+
+ /* NATLOOK src and dst are reversed, so reverse sidx/didx */
+ sidx = (direction == PF_IN) ? 1 : 0;
+ didx = (direction == PF_IN) ? 0 : 1;
+
+ if (!pnl->proto ||
+ PF_AZERO(&pnl->saddr, pnl->af) ||
+ PF_AZERO(&pnl->daddr, pnl->af) ||
+ ((pnl->proto == IPPROTO_TCP ||
+ pnl->proto == IPPROTO_UDP) &&
+ (!pnl->dport || !pnl->sport)))
+ error = EINVAL;
+ else {
+ key.af = pnl->af;
+ key.proto = pnl->proto;
+ PF_ACPY(&key.addr[sidx], &pnl->saddr, pnl->af);
+ key.port[sidx] = pnl->sport;
+ PF_ACPY(&key.addr[didx], &pnl->daddr, pnl->af);
+ key.port[didx] = pnl->dport;
+
+ state = pf_find_state_all(&key, direction, &m);
+
+ if (m > 1)
+ error = E2BIG; /* more than one state */
+ else if (state != NULL) {
+ /* XXXGL: not locked read */
+ sk = state->key[sidx];
+ PF_ACPY(&pnl->rsaddr, &sk->addr[sidx], sk->af);
+ pnl->rsport = sk->port[sidx];
+ PF_ACPY(&pnl->rdaddr, &sk->addr[didx], sk->af);
+ pnl->rdport = sk->port[didx];
+ } else
+ error = ENOENT;
+ }
+ break;
+ }
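+
+	/*
+	 * DIOCNATLOOK lets a proxy map a connection it observes back to the
+	 * state pf holds for it.  A rough userland sketch, assuming IPv4/TCP
+	 * and a descriptor from open("/dev/pf", O_RDONLY); the addresses and
+	 * ports below are placeholders:
+	 *
+	 *	struct pfioc_natlook pnl;
+	 *
+	 *	memset(&pnl, 0, sizeof(pnl));
+	 *	pnl.af = AF_INET;
+	 *	pnl.proto = IPPROTO_TCP;
+	 *	pnl.direction = PF_OUT;
+	 *	inet_pton(AF_INET, "192.0.2.10", &pnl.saddr.v4);
+	 *	pnl.sport = htons(54321);
+	 *	inet_pton(AF_INET, "198.51.100.1", &pnl.daddr.v4);
+	 *	pnl.dport = htons(80);
+	 *	if (ioctl(dev, DIOCNATLOOK, &pnl) == 0)
+	 *		use pnl.rsaddr/rsport and pnl.rdaddr/rdport, filled
+	 *		from the matching state's key.
+	 */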
+
+ case DIOCSETTIMEOUT: {
+ struct pfioc_tm *pt = (struct pfioc_tm *)addr;
+ int old;
+
+ if (pt->timeout < 0 || pt->timeout >= PFTM_MAX ||
+ pt->seconds < 0) {
+ error = EINVAL;
+ break;
+ }
+ PF_RULES_WLOCK();
+ old = V_pf_default_rule.timeout[pt->timeout];
+ if (pt->timeout == PFTM_INTERVAL && pt->seconds == 0)
+ pt->seconds = 1;
+ V_pf_default_rule.timeout[pt->timeout] = pt->seconds;
+ if (pt->timeout == PFTM_INTERVAL && pt->seconds < old)
+ wakeup(pf_purge_thread);
+ pt->seconds = old;
+ PF_RULES_WUNLOCK();
+ break;
+ }
+
+ case DIOCGETTIMEOUT: {
+ struct pfioc_tm *pt = (struct pfioc_tm *)addr;
+
+ if (pt->timeout < 0 || pt->timeout >= PFTM_MAX) {
+ error = EINVAL;
+ break;
+ }
+ PF_RULES_RLOCK();
+ pt->seconds = V_pf_default_rule.timeout[pt->timeout];
+ PF_RULES_RUNLOCK();
+ break;
+ }
+
+ case DIOCGETLIMIT: {
+ struct pfioc_limit *pl = (struct pfioc_limit *)addr;
+
+ if (pl->index < 0 || pl->index >= PF_LIMIT_MAX) {
+ error = EINVAL;
+ break;
+ }
+ PF_RULES_RLOCK();
+ pl->limit = V_pf_limits[pl->index].limit;
+ PF_RULES_RUNLOCK();
+ break;
+ }
+
+ case DIOCSETLIMIT: {
+ struct pfioc_limit *pl = (struct pfioc_limit *)addr;
+ int old_limit;
+
+ PF_RULES_WLOCK();
+ if (pl->index < 0 || pl->index >= PF_LIMIT_MAX ||
+ V_pf_limits[pl->index].zone == NULL) {
+ PF_RULES_WUNLOCK();
+ error = EINVAL;
+ break;
+ }
+ uma_zone_set_max(V_pf_limits[pl->index].zone, pl->limit);
+ old_limit = V_pf_limits[pl->index].limit;
+ V_pf_limits[pl->index].limit = pl->limit;
+ pl->limit = old_limit;
+ PF_RULES_WUNLOCK();
+ break;
+ }
+
+ case DIOCSETDEBUG: {
+ u_int32_t *level = (u_int32_t *)addr;
+
+ PF_RULES_WLOCK();
+ V_pf_status.debug = *level;
+ PF_RULES_WUNLOCK();
+ break;
+ }
+
+ case DIOCCLRRULECTRS: {
+ /* obsoleted by DIOCGETRULE with action=PF_GET_CLR_CNTR */
+ struct pf_ruleset *ruleset = &pf_main_ruleset;
+ struct pf_rule *rule;
+
+ PF_RULES_WLOCK();
+ TAILQ_FOREACH(rule,
+ ruleset->rules[PF_RULESET_FILTER].active.ptr, entries) {
+ rule->evaluations = 0;
+ rule->packets[0] = rule->packets[1] = 0;
+ rule->bytes[0] = rule->bytes[1] = 0;
+ }
+ PF_RULES_WUNLOCK();
+ break;
+ }
+
+ case DIOCGIFSPEED: {
+ struct pf_ifspeed *psp = (struct pf_ifspeed *)addr;
+ struct pf_ifspeed ps;
+ struct ifnet *ifp;
+
+ if (psp->ifname[0] != 0) {
+ /* Can we completely trust user-land? */
+ strlcpy(ps.ifname, psp->ifname, IFNAMSIZ);
+ ifp = ifunit(ps.ifname);
+ if (ifp != NULL)
+ psp->baudrate = ifp->if_baudrate;
+ else
+ error = EINVAL;
+ } else
+ error = EINVAL;
+ break;
+ }
+
+#ifdef ALTQ
+ case DIOCSTARTALTQ: {
+ struct pf_altq *altq;
+
+ PF_RULES_WLOCK();
+ /* enable all altq interfaces on active list */
+ TAILQ_FOREACH(altq, V_pf_altqs_active, entries) {
+ if (altq->qname[0] == 0 && (altq->local_flags &
+ PFALTQ_FLAG_IF_REMOVED) == 0) {
+ error = pf_enable_altq(altq);
+ if (error != 0)
+ break;
+ }
+ }
+ if (error == 0)
+ V_pf_altq_running = 1;
+ PF_RULES_WUNLOCK();
+ DPFPRINTF(PF_DEBUG_MISC, ("altq: started\n"));
+ break;
+ }
+
+ case DIOCSTOPALTQ: {
+ struct pf_altq *altq;
+
+ PF_RULES_WLOCK();
+ /* disable all altq interfaces on active list */
+ TAILQ_FOREACH(altq, V_pf_altqs_active, entries) {
+ if (altq->qname[0] == 0 && (altq->local_flags &
+ PFALTQ_FLAG_IF_REMOVED) == 0) {
+ error = pf_disable_altq(altq);
+ if (error != 0)
+ break;
+ }
+ }
+ if (error == 0)
+ V_pf_altq_running = 0;
+ PF_RULES_WUNLOCK();
+ DPFPRINTF(PF_DEBUG_MISC, ("altq: stopped\n"));
+ break;
+ }
+
+ case DIOCADDALTQ: {
+ struct pfioc_altq *pa = (struct pfioc_altq *)addr;
+ struct pf_altq *altq, *a;
+ struct ifnet *ifp;
+
+ altq = malloc(sizeof(*altq), M_PFALTQ, M_WAITOK);
+ bcopy(&pa->altq, altq, sizeof(struct pf_altq));
+ altq->local_flags = 0;
+
+ PF_RULES_WLOCK();
+ if (pa->ticket != V_ticket_altqs_inactive) {
+ PF_RULES_WUNLOCK();
+ free(altq, M_PFALTQ);
+ error = EBUSY;
+ break;
+ }
+
+ /*
+ * if this is for a queue, find the discipline and
+ * copy the necessary fields
+ */
+ if (altq->qname[0] != 0) {
+ if ((altq->qid = pf_qname2qid(altq->qname)) == 0) {
+ PF_RULES_WUNLOCK();
+ error = EBUSY;
+ free(altq, M_PFALTQ);
+ break;
+ }
+ altq->altq_disc = NULL;
+ TAILQ_FOREACH(a, V_pf_altqs_inactive, entries) {
+ if (strncmp(a->ifname, altq->ifname,
+ IFNAMSIZ) == 0 && a->qname[0] == 0) {
+ altq->altq_disc = a->altq_disc;
+ break;
+ }
+ }
+ }
+
+ if ((ifp = ifunit(altq->ifname)) == NULL)
+ altq->local_flags |= PFALTQ_FLAG_IF_REMOVED;
+ else
+ error = altq_add(altq);
+
+ if (error) {
+ PF_RULES_WUNLOCK();
+ free(altq, M_PFALTQ);
+ break;
+ }
+
+ TAILQ_INSERT_TAIL(V_pf_altqs_inactive, altq, entries);
+ bcopy(altq, &pa->altq, sizeof(struct pf_altq));
+ PF_RULES_WUNLOCK();
+ break;
+ }
+
+ case DIOCGETALTQS: {
+ struct pfioc_altq *pa = (struct pfioc_altq *)addr;
+ struct pf_altq *altq;
+
+ PF_RULES_RLOCK();
+ pa->nr = 0;
+ TAILQ_FOREACH(altq, V_pf_altqs_active, entries)
+ pa->nr++;
+ pa->ticket = V_ticket_altqs_active;
+ PF_RULES_RUNLOCK();
+ break;
+ }
+
+ case DIOCGETALTQ: {
+ struct pfioc_altq *pa = (struct pfioc_altq *)addr;
+ struct pf_altq *altq;
+ u_int32_t nr;
+
+ PF_RULES_RLOCK();
+ if (pa->ticket != V_ticket_altqs_active) {
+ PF_RULES_RUNLOCK();
+ error = EBUSY;
+ break;
+ }
+ nr = 0;
+ altq = TAILQ_FIRST(V_pf_altqs_active);
+ while ((altq != NULL) && (nr < pa->nr)) {
+ altq = TAILQ_NEXT(altq, entries);
+ nr++;
+ }
+ if (altq == NULL) {
+ PF_RULES_RUNLOCK();
+ error = EBUSY;
+ break;
+ }
+ bcopy(altq, &pa->altq, sizeof(struct pf_altq));
+ PF_RULES_RUNLOCK();
+ break;
+ }
+
+ case DIOCCHANGEALTQ:
+ /* CHANGEALTQ not supported yet! */
+ error = ENODEV;
+ break;
+
+ case DIOCGETQSTATS: {
+ struct pfioc_qstats *pq = (struct pfioc_qstats *)addr;
+ struct pf_altq *altq;
+ u_int32_t nr;
+ int nbytes;
+
+ PF_RULES_RLOCK();
+ if (pq->ticket != V_ticket_altqs_active) {
+ PF_RULES_RUNLOCK();
+ error = EBUSY;
+ break;
+ }
+ nbytes = pq->nbytes;
+ nr = 0;
+ altq = TAILQ_FIRST(V_pf_altqs_active);
+ while ((altq != NULL) && (nr < pq->nr)) {
+ altq = TAILQ_NEXT(altq, entries);
+ nr++;
+ }
+ if (altq == NULL) {
+ PF_RULES_RUNLOCK();
+ error = EBUSY;
+ break;
+ }
+
+ if ((altq->local_flags & PFALTQ_FLAG_IF_REMOVED) != 0) {
+ PF_RULES_RUNLOCK();
+ error = ENXIO;
+ break;
+ }
+ PF_RULES_RUNLOCK();
+ error = altq_getqstats(altq, pq->buf, &nbytes);
+ if (error == 0) {
+ pq->scheduler = altq->scheduler;
+ pq->nbytes = nbytes;
+ }
+ break;
+ }
+#endif /* ALTQ */
+
+ case DIOCBEGINADDRS: {
+ struct pfioc_pooladdr *pp = (struct pfioc_pooladdr *)addr;
+
+ PF_RULES_WLOCK();
+ pf_empty_pool(&V_pf_pabuf);
+ pp->ticket = ++V_ticket_pabuf;
+ PF_RULES_WUNLOCK();
+ break;
+ }
+
+ case DIOCADDADDR: {
+ struct pfioc_pooladdr *pp = (struct pfioc_pooladdr *)addr;
+ struct pf_pooladdr *pa;
+ struct pfi_kif *kif = NULL;
+
+#ifndef INET
+ if (pp->af == AF_INET) {
+ error = EAFNOSUPPORT;
+ break;
+ }
+#endif /* INET */
+#ifndef INET6
+ if (pp->af == AF_INET6) {
+ error = EAFNOSUPPORT;
+ break;
+ }
+#endif /* INET6 */
+ if (pp->addr.addr.type != PF_ADDR_ADDRMASK &&
+ pp->addr.addr.type != PF_ADDR_DYNIFTL &&
+ pp->addr.addr.type != PF_ADDR_TABLE) {
+ error = EINVAL;
+ break;
+ }
+ pa = malloc(sizeof(*pa), M_PFRULE, M_WAITOK);
+ bcopy(&pp->addr, pa, sizeof(struct pf_pooladdr));
+ if (pa->ifname[0])
+ kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK);
+ PF_RULES_WLOCK();
+ if (pp->ticket != V_ticket_pabuf) {
+ PF_RULES_WUNLOCK();
+ if (pa->ifname[0])
+ free(kif, PFI_MTYPE);
+ free(pa, M_PFRULE);
+ error = EBUSY;
+ break;
+ }
+ if (pa->ifname[0]) {
+ pa->kif = pfi_kif_attach(kif, pa->ifname);
+ pfi_kif_ref(pa->kif);
+ } else
+ pa->kif = NULL;
+ if (pa->addr.type == PF_ADDR_DYNIFTL && ((error =
+ pfi_dynaddr_setup(&pa->addr, pp->af)) != 0)) {
+ if (pa->ifname[0])
+ pfi_kif_unref(pa->kif);
+ PF_RULES_WUNLOCK();
+ free(pa, M_PFRULE);
+ break;
+ }
+ TAILQ_INSERT_TAIL(&V_pf_pabuf, pa, entries);
+ PF_RULES_WUNLOCK();
+ break;
+ }
+
+ case DIOCGETADDRS: {
+ struct pfioc_pooladdr *pp = (struct pfioc_pooladdr *)addr;
+ struct pf_pool *pool;
+ struct pf_pooladdr *pa;
+
+ PF_RULES_RLOCK();
+ pp->nr = 0;
+ pool = pf_get_pool(pp->anchor, pp->ticket, pp->r_action,
+ pp->r_num, 0, 1, 0);
+ if (pool == NULL) {
+ PF_RULES_RUNLOCK();
+ error = EBUSY;
+ break;
+ }
+ TAILQ_FOREACH(pa, &pool->list, entries)
+ pp->nr++;
+ PF_RULES_RUNLOCK();
+ break;
+ }
+
+ case DIOCGETADDR: {
+ struct pfioc_pooladdr *pp = (struct pfioc_pooladdr *)addr;
+ struct pf_pool *pool;
+ struct pf_pooladdr *pa;
+ u_int32_t nr = 0;
+
+ PF_RULES_RLOCK();
+ pool = pf_get_pool(pp->anchor, pp->ticket, pp->r_action,
+ pp->r_num, 0, 1, 1);
+ if (pool == NULL) {
+ PF_RULES_RUNLOCK();
+ error = EBUSY;
+ break;
+ }
+ pa = TAILQ_FIRST(&pool->list);
+ while ((pa != NULL) && (nr < pp->nr)) {
+ pa = TAILQ_NEXT(pa, entries);
+ nr++;
+ }
+ if (pa == NULL) {
+ PF_RULES_RUNLOCK();
+ error = EBUSY;
+ break;
+ }
+ bcopy(pa, &pp->addr, sizeof(struct pf_pooladdr));
+ pf_addr_copyout(&pp->addr.addr);
+ PF_RULES_RUNLOCK();
+ break;
+ }
+
+ case DIOCCHANGEADDR: {
+ struct pfioc_pooladdr *pca = (struct pfioc_pooladdr *)addr;
+ struct pf_pool *pool;
+ struct pf_pooladdr *oldpa = NULL, *newpa = NULL;
+ struct pf_ruleset *ruleset;
+ struct pfi_kif *kif = NULL;
+
+ if (pca->action < PF_CHANGE_ADD_HEAD ||
+ pca->action > PF_CHANGE_REMOVE) {
+ error = EINVAL;
+ break;
+ }
+ if (pca->addr.addr.type != PF_ADDR_ADDRMASK &&
+ pca->addr.addr.type != PF_ADDR_DYNIFTL &&
+ pca->addr.addr.type != PF_ADDR_TABLE) {
+ error = EINVAL;
+ break;
+ }
+
+ if (pca->action != PF_CHANGE_REMOVE) {
+#ifndef INET
+ if (pca->af == AF_INET) {
+ error = EAFNOSUPPORT;
+ break;
+ }
+#endif /* INET */
+#ifndef INET6
+ if (pca->af == AF_INET6) {
+ error = EAFNOSUPPORT;
+ break;
+ }
+#endif /* INET6 */
+ newpa = malloc(sizeof(*newpa), M_PFRULE, M_WAITOK);
+ bcopy(&pca->addr, newpa, sizeof(struct pf_pooladdr));
+ if (newpa->ifname[0])
+ kif = malloc(sizeof(*kif), PFI_MTYPE, M_WAITOK);
+ }
+
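+/* On error, record the code and jump to the common unlock/cleanup path below. */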
+#define ERROUT(x) { error = (x); goto DIOCCHANGEADDR_error; }
+ PF_RULES_WLOCK();
+ ruleset = pf_find_ruleset(pca->anchor);
+ if (ruleset == NULL)
+ ERROUT(EBUSY);
+
+ pool = pf_get_pool(pca->anchor, pca->ticket, pca->r_action,
+ pca->r_num, pca->r_last, 1, 1);
+ if (pool == NULL)
+ ERROUT(EBUSY);
+
+ if (pca->action != PF_CHANGE_REMOVE) {
+ if (newpa->ifname[0]) {
+ newpa->kif = pfi_kif_attach(kif, newpa->ifname);
+ pfi_kif_ref(newpa->kif);
+ } else
+ newpa->kif = NULL;
+
+ switch (newpa->addr.type) {
+ case PF_ADDR_DYNIFTL:
+ error = pfi_dynaddr_setup(&newpa->addr,
+ pca->af);
+ break;
+ case PF_ADDR_TABLE:
+ newpa->addr.p.tbl = pfr_attach_table(ruleset,
+ newpa->addr.v.tblname);
+ if (newpa->addr.p.tbl == NULL)
+ error = ENOMEM;
+ break;
+ }
+ if (error) {
+ if (newpa->kif)
+ pfi_kif_unref(newpa->kif);
+ PF_RULES_WUNLOCK();
+ free(newpa, M_PFRULE);
+ break;
+ }
+ }
+
+ if (pca->action == PF_CHANGE_ADD_HEAD)
+ oldpa = TAILQ_FIRST(&pool->list);
+ else if (pca->action == PF_CHANGE_ADD_TAIL)
+ oldpa = TAILQ_LAST(&pool->list, pf_palist);
+ else {
+ int i = 0;
+
+ oldpa = TAILQ_FIRST(&pool->list);
+ while ((oldpa != NULL) && (i < pca->nr)) {
+ oldpa = TAILQ_NEXT(oldpa, entries);
+ i++;
+ }
+ if (oldpa == NULL) {
+ PF_RULES_WUNLOCK();
+ error = EINVAL;
+ break;
+ }
+ }
+
+ if (pca->action == PF_CHANGE_REMOVE) {
+ TAILQ_REMOVE(&pool->list, oldpa, entries);
+ switch (oldpa->addr.type) {
+ case PF_ADDR_DYNIFTL:
+ pfi_dynaddr_remove(oldpa->addr.p.dyn);
+ break;
+ case PF_ADDR_TABLE:
+ pfr_detach_table(oldpa->addr.p.tbl);
+ break;
+ }
+ if (oldpa->kif)
+ pfi_kif_unref(oldpa->kif);
+ free(oldpa, M_PFRULE);
+ } else {
+ if (oldpa == NULL)
+ TAILQ_INSERT_TAIL(&pool->list, newpa, entries);
+ else if (pca->action == PF_CHANGE_ADD_HEAD ||
+ pca->action == PF_CHANGE_ADD_BEFORE)
+ TAILQ_INSERT_BEFORE(oldpa, newpa, entries);
+ else
+ TAILQ_INSERT_AFTER(&pool->list, oldpa,
+ newpa, entries);
+ }
+
+ pool->cur = TAILQ_FIRST(&pool->list);
+ PF_ACPY(&pool->counter, &pool->cur->addr.v.a.addr,
+ pca->af);
+ PF_RULES_WUNLOCK();
+ break;
+
+#undef ERROUT
+DIOCCHANGEADDR_error:
+ PF_RULES_WUNLOCK();
+ if (newpa != NULL)
+ free(newpa, M_PFRULE);
+ if (kif != NULL)
+ free(kif, PFI_MTYPE);
+ break;
+ }
+
+ case DIOCGETRULESETS: {
+ struct pfioc_ruleset *pr = (struct pfioc_ruleset *)addr;
+ struct pf_ruleset *ruleset;
+ struct pf_anchor *anchor;
+
+ PF_RULES_RLOCK();
+ pr->path[sizeof(pr->path) - 1] = 0;
+ if ((ruleset = pf_find_ruleset(pr->path)) == NULL) {
+ PF_RULES_RUNLOCK();
+ error = ENOENT;
+ break;
+ }
+ pr->nr = 0;
+ if (ruleset->anchor == NULL) {
+ /* XXX kludge for pf_main_ruleset */
+ RB_FOREACH(anchor, pf_anchor_global, &V_pf_anchors)
+ if (anchor->parent == NULL)
+ pr->nr++;
+ } else {
+ RB_FOREACH(anchor, pf_anchor_node,
+ &ruleset->anchor->children)
+ pr->nr++;
+ }
+ PF_RULES_RUNLOCK();
+ break;
+ }
+
+ case DIOCGETRULESET: {
+ struct pfioc_ruleset *pr = (struct pfioc_ruleset *)addr;
+ struct pf_ruleset *ruleset;
+ struct pf_anchor *anchor;
+ u_int32_t nr = 0;
+
+ PF_RULES_RLOCK();
+ pr->path[sizeof(pr->path) - 1] = 0;
+ if ((ruleset = pf_find_ruleset(pr->path)) == NULL) {
+ PF_RULES_RUNLOCK();
+ error = ENOENT;
+ break;
+ }
+ pr->name[0] = 0;
+ if (ruleset->anchor == NULL) {
+ /* XXX kludge for pf_main_ruleset */
+ RB_FOREACH(anchor, pf_anchor_global, &V_pf_anchors)
+ if (anchor->parent == NULL && nr++ == pr->nr) {
+ strlcpy(pr->name, anchor->name,
+ sizeof(pr->name));
+ break;
+ }
+ } else {
+ RB_FOREACH(anchor, pf_anchor_node,
+ &ruleset->anchor->children)
+ if (nr++ == pr->nr) {
+ strlcpy(pr->name, anchor->name,
+ sizeof(pr->name));
+ break;
+ }
+ }
+ if (!pr->name[0])
+ error = EBUSY;
+ PF_RULES_RUNLOCK();
+ break;
+ }
+
+ case DIOCRCLRTABLES: {
+ struct pfioc_table *io = (struct pfioc_table *)addr;
+
+ if (io->pfrio_esize != 0) {
+ error = ENODEV;
+ break;
+ }
+ PF_RULES_WLOCK();
+ error = pfr_clr_tables(&io->pfrio_table, &io->pfrio_ndel,
+ io->pfrio_flags | PFR_FLAG_USERIOCTL);
+ PF_RULES_WUNLOCK();
+ break;
+ }
+
+ case DIOCRADDTABLES: {
+ struct pfioc_table *io = (struct pfioc_table *)addr;
+ struct pfr_table *pfrts;
+ size_t totlen;
+
+ if (io->pfrio_esize != sizeof(struct pfr_table)) {
+ error = ENODEV;
+ break;
+ }
+ totlen = io->pfrio_size * sizeof(struct pfr_table);
+ pfrts = malloc(totlen, M_TEMP, M_WAITOK);
+ error = copyin(io->pfrio_buffer, pfrts, totlen);
+ if (error) {
+ free(pfrts, M_TEMP);
+ break;
+ }
+ PF_RULES_WLOCK();
+ error = pfr_add_tables(pfrts, io->pfrio_size,
+ &io->pfrio_nadd, io->pfrio_flags | PFR_FLAG_USERIOCTL);
+ PF_RULES_WUNLOCK();
+ free(pfrts, M_TEMP);
+ break;
+ }
+
+ case DIOCRDELTABLES: {
+ struct pfioc_table *io = (struct pfioc_table *)addr;
+ struct pfr_table *pfrts;
+ size_t totlen;
+
+ if (io->pfrio_esize != sizeof(struct pfr_table)) {
+ error = ENODEV;
+ break;
+ }
+ totlen = io->pfrio_size * sizeof(struct pfr_table);
+ pfrts = malloc(totlen, M_TEMP, M_WAITOK);
+ error = copyin(io->pfrio_buffer, pfrts, totlen);
+ if (error) {
+ free(pfrts, M_TEMP);
+ break;
+ }
+ PF_RULES_WLOCK();
+ error = pfr_del_tables(pfrts, io->pfrio_size,
+ &io->pfrio_ndel, io->pfrio_flags | PFR_FLAG_USERIOCTL);
+ PF_RULES_WUNLOCK();
+ free(pfrts, M_TEMP);
+ break;
+ }
+
+ case DIOCRGETTABLES: {
+ struct pfioc_table *io = (struct pfioc_table *)addr;
+ struct pfr_table *pfrts;
+ size_t totlen;
+
+ if (io->pfrio_esize != sizeof(struct pfr_table)) {
+ error = ENODEV;
+ break;
+ }
+ totlen = io->pfrio_size * sizeof(struct pfr_table);
+ pfrts = malloc(totlen, M_TEMP, M_WAITOK);
+ PF_RULES_RLOCK();
+ error = pfr_get_tables(&io->pfrio_table, pfrts,
+ &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL);
+ PF_RULES_RUNLOCK();
+ if (error == 0)
+ error = copyout(pfrts, io->pfrio_buffer, totlen);
+ free(pfrts, M_TEMP);
+ break;
+ }
+
+ case DIOCRGETTSTATS: {
+ struct pfioc_table *io = (struct pfioc_table *)addr;
+ struct pfr_tstats *pfrtstats;
+ size_t totlen;
+
+ if (io->pfrio_esize != sizeof(struct pfr_tstats)) {
+ error = ENODEV;
+ break;
+ }
+ totlen = io->pfrio_size * sizeof(struct pfr_tstats);
+ pfrtstats = malloc(totlen, M_TEMP, M_WAITOK);
+ PF_RULES_WLOCK();
+ error = pfr_get_tstats(&io->pfrio_table, pfrtstats,
+ &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL);
+ PF_RULES_WUNLOCK();
+ if (error == 0)
+ error = copyout(pfrtstats, io->pfrio_buffer, totlen);
+ free(pfrtstats, M_TEMP);
+ break;
+ }
+
+ case DIOCRCLRTSTATS: {
+ struct pfioc_table *io = (struct pfioc_table *)addr;
+ struct pfr_table *pfrts;
+ size_t totlen;
+
+ if (io->pfrio_esize != sizeof(struct pfr_table)) {
+ error = ENODEV;
+ break;
+ }
+ totlen = io->pfrio_size * sizeof(struct pfr_table);
+ pfrts = malloc(totlen, M_TEMP, M_WAITOK);
+ error = copyin(io->pfrio_buffer, pfrts, totlen);
+ if (error) {
+ free(pfrts, M_TEMP);
+ break;
+ }
+ PF_RULES_WLOCK();
+ error = pfr_clr_tstats(pfrts, io->pfrio_size,
+ &io->pfrio_nzero, io->pfrio_flags | PFR_FLAG_USERIOCTL);
+ PF_RULES_WUNLOCK();
+ free(pfrts, M_TEMP);
+ break;
+ }
+
+ case DIOCRSETTFLAGS: {
+ struct pfioc_table *io = (struct pfioc_table *)addr;
+ struct pfr_table *pfrts;
+ size_t totlen;
+
+ if (io->pfrio_esize != sizeof(struct pfr_table)) {
+ error = ENODEV;
+ break;
+ }
+ totlen = io->pfrio_size * sizeof(struct pfr_table);
+ pfrts = malloc(totlen, M_TEMP, M_WAITOK);
+ error = copyin(io->pfrio_buffer, pfrts, totlen);
+ if (error) {
+ free(pfrts, M_TEMP);
+ break;
+ }
+ PF_RULES_WLOCK();
+ error = pfr_set_tflags(pfrts, io->pfrio_size,
+ io->pfrio_setflag, io->pfrio_clrflag, &io->pfrio_nchange,
+ &io->pfrio_ndel, io->pfrio_flags | PFR_FLAG_USERIOCTL);
+ PF_RULES_WUNLOCK();
+ free(pfrts, M_TEMP);
+ break;
+ }
+
+ case DIOCRCLRADDRS: {
+ struct pfioc_table *io = (struct pfioc_table *)addr;
+
+ if (io->pfrio_esize != 0) {
+ error = ENODEV;
+ break;
+ }
+ PF_RULES_WLOCK();
+ error = pfr_clr_addrs(&io->pfrio_table, &io->pfrio_ndel,
+ io->pfrio_flags | PFR_FLAG_USERIOCTL);
+ PF_RULES_WUNLOCK();
+ break;
+ }
+
+ case DIOCRADDADDRS: {
+ struct pfioc_table *io = (struct pfioc_table *)addr;
+ struct pfr_addr *pfras;
+ size_t totlen;
+
+ if (io->pfrio_esize != sizeof(struct pfr_addr)) {
+ error = ENODEV;
+ break;
+ }
+ totlen = io->pfrio_size * sizeof(struct pfr_addr);
+ pfras = malloc(totlen, M_TEMP, M_WAITOK);
+ error = copyin(io->pfrio_buffer, pfras, totlen);
+ if (error) {
+ free(pfras, M_TEMP);
+ break;
+ }
+ PF_RULES_WLOCK();
+ error = pfr_add_addrs(&io->pfrio_table, pfras,
+ io->pfrio_size, &io->pfrio_nadd, io->pfrio_flags |
+ PFR_FLAG_USERIOCTL);
+ PF_RULES_WUNLOCK();
+ if (error == 0 && io->pfrio_flags & PFR_FLAG_FEEDBACK)
+ error = copyout(pfras, io->pfrio_buffer, totlen);
+ free(pfras, M_TEMP);
+ break;
+ }
+
+ case DIOCRDELADDRS: {
+ struct pfioc_table *io = (struct pfioc_table *)addr;
+ struct pfr_addr *pfras;
+ size_t totlen;
+
+ if (io->pfrio_esize != sizeof(struct pfr_addr)) {
+ error = ENODEV;
+ break;
+ }
+ totlen = io->pfrio_size * sizeof(struct pfr_addr);
+ pfras = malloc(totlen, M_TEMP, M_WAITOK);
+ error = copyin(io->pfrio_buffer, pfras, totlen);
+ if (error) {
+ free(pfras, M_TEMP);
+ break;
+ }
+ PF_RULES_WLOCK();
+ error = pfr_del_addrs(&io->pfrio_table, pfras,
+ io->pfrio_size, &io->pfrio_ndel, io->pfrio_flags |
+ PFR_FLAG_USERIOCTL);
+ PF_RULES_WUNLOCK();
+ if (error == 0 && io->pfrio_flags & PFR_FLAG_FEEDBACK)
+ error = copyout(pfras, io->pfrio_buffer, totlen);
+ free(pfras, M_TEMP);
+ break;
+ }
+
+ case DIOCRSETADDRS: {
+ struct pfioc_table *io = (struct pfioc_table *)addr;
+ struct pfr_addr *pfras;
+ size_t totlen;
+
+ if (io->pfrio_esize != sizeof(struct pfr_addr)) {
+ error = ENODEV;
+ break;
+ }
+ totlen = (io->pfrio_size + io->pfrio_size2) *
+ sizeof(struct pfr_addr);
+ pfras = malloc(totlen, M_TEMP, M_WAITOK);
+ error = copyin(io->pfrio_buffer, pfras, totlen);
+ if (error) {
+ free(pfras, M_TEMP);
+ break;
+ }
+ PF_RULES_WLOCK();
+ error = pfr_set_addrs(&io->pfrio_table, pfras,
+ io->pfrio_size, &io->pfrio_size2, &io->pfrio_nadd,
+ &io->pfrio_ndel, &io->pfrio_nchange, io->pfrio_flags |
+ PFR_FLAG_USERIOCTL, 0);
+ PF_RULES_WUNLOCK();
+ if (error == 0 && io->pfrio_flags & PFR_FLAG_FEEDBACK)
+ error = copyout(pfras, io->pfrio_buffer, totlen);
+ free(pfras, M_TEMP);
+ break;
+ }
+
+ case DIOCRGETADDRS: {
+ struct pfioc_table *io = (struct pfioc_table *)addr;
+ struct pfr_addr *pfras;
+ size_t totlen;
+
+ if (io->pfrio_esize != sizeof(struct pfr_addr)) {
+ error = ENODEV;
+ break;
+ }
+ totlen = io->pfrio_size * sizeof(struct pfr_addr);
+ pfras = malloc(totlen, M_TEMP, M_WAITOK);
+ PF_RULES_RLOCK();
+ error = pfr_get_addrs(&io->pfrio_table, pfras,
+ &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL);
+ PF_RULES_RUNLOCK();
+ if (error == 0)
+ error = copyout(pfras, io->pfrio_buffer, totlen);
+ free(pfras, M_TEMP);
+ break;
+ }
+
+ case DIOCRGETASTATS: {
+ struct pfioc_table *io = (struct pfioc_table *)addr;
+ struct pfr_astats *pfrastats;
+ size_t totlen;
+
+ if (io->pfrio_esize != sizeof(struct pfr_astats)) {
+ error = ENODEV;
+ break;
+ }
+ totlen = io->pfrio_size * sizeof(struct pfr_astats);
+ pfrastats = malloc(totlen, M_TEMP, M_WAITOK);
+ PF_RULES_RLOCK();
+ error = pfr_get_astats(&io->pfrio_table, pfrastats,
+ &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL);
+ PF_RULES_RUNLOCK();
+ if (error == 0)
+ error = copyout(pfrastats, io->pfrio_buffer, totlen);
+ free(pfrastats, M_TEMP);
+ break;
+ }
+
+ case DIOCRCLRASTATS: {
+ struct pfioc_table *io = (struct pfioc_table *)addr;
+ struct pfr_addr *pfras;
+ size_t totlen;
+
+ if (io->pfrio_esize != sizeof(struct pfr_addr)) {
+ error = ENODEV;
+ break;
+ }
+ totlen = io->pfrio_size * sizeof(struct pfr_addr);
+ pfras = malloc(totlen, M_TEMP, M_WAITOK);
+ error = copyin(io->pfrio_buffer, pfras, totlen);
+ if (error) {
+ free(pfras, M_TEMP);
+ break;
+ }
+ PF_RULES_WLOCK();
+ error = pfr_clr_astats(&io->pfrio_table, pfras,
+ io->pfrio_size, &io->pfrio_nzero, io->pfrio_flags |
+ PFR_FLAG_USERIOCTL);
+ PF_RULES_WUNLOCK();
+ if (error == 0 && io->pfrio_flags & PFR_FLAG_FEEDBACK)
+ error = copyout(pfras, io->pfrio_buffer, totlen);
+ free(pfras, M_TEMP);
+ break;
+ }
+
+ case DIOCRTSTADDRS: {
+ struct pfioc_table *io = (struct pfioc_table *)addr;
+ struct pfr_addr *pfras;
+ size_t totlen;
+
+ if (io->pfrio_esize != sizeof(struct pfr_addr)) {
+ error = ENODEV;
+ break;
+ }
+ totlen = io->pfrio_size * sizeof(struct pfr_addr);
+ pfras = malloc(totlen, M_TEMP, M_WAITOK);
+ error = copyin(io->pfrio_buffer, pfras, totlen);
+ if (error) {
+ free(pfras, M_TEMP);
+ break;
+ }
+ PF_RULES_RLOCK();
+ error = pfr_tst_addrs(&io->pfrio_table, pfras,
+ io->pfrio_size, &io->pfrio_nmatch, io->pfrio_flags |
+ PFR_FLAG_USERIOCTL);
+ PF_RULES_RUNLOCK();
+ if (error == 0)
+ error = copyout(pfras, io->pfrio_buffer, totlen);
+ free(pfras, M_TEMP);
+ break;
+ }
+
+ case DIOCRINADEFINE: {
+ struct pfioc_table *io = (struct pfioc_table *)addr;
+ struct pfr_addr *pfras;
+ size_t totlen;
+
+ if (io->pfrio_esize != sizeof(struct pfr_addr)) {
+ error = ENODEV;
+ break;
+ }
+ totlen = io->pfrio_size * sizeof(struct pfr_addr);
+ pfras = malloc(totlen, M_TEMP, M_WAITOK);
+ error = copyin(io->pfrio_buffer, pfras, totlen);
+ if (error) {
+ free(pfras, M_TEMP);
+ break;
+ }
+ PF_RULES_WLOCK();
+ error = pfr_ina_define(&io->pfrio_table, pfras,
+ io->pfrio_size, &io->pfrio_nadd, &io->pfrio_naddr,
+ io->pfrio_ticket, io->pfrio_flags | PFR_FLAG_USERIOCTL);
+ PF_RULES_WUNLOCK();
+ free(pfras, M_TEMP);
+ break;
+ }
+
+ case DIOCOSFPADD: {
+ struct pf_osfp_ioctl *io = (struct pf_osfp_ioctl *)addr;
+ PF_RULES_WLOCK();
+ error = pf_osfp_add(io);
+ PF_RULES_WUNLOCK();
+ break;
+ }
+
+ case DIOCOSFPGET: {
+ struct pf_osfp_ioctl *io = (struct pf_osfp_ioctl *)addr;
+ PF_RULES_RLOCK();
+ error = pf_osfp_get(io);
+ PF_RULES_RUNLOCK();
+ break;
+ }
+
+ case DIOCXBEGIN: {
+ struct pfioc_trans *io = (struct pfioc_trans *)addr;
+ struct pfioc_trans_e *ioes, *ioe;
+ size_t totlen;
+ int i;
+
+ if (io->esize != sizeof(*ioe)) {
+ error = ENODEV;
+ break;
+ }
+ totlen = sizeof(struct pfioc_trans_e) * io->size;
+ ioes = malloc(totlen, M_TEMP, M_WAITOK);
+ error = copyin(io->array, ioes, totlen);
+ if (error) {
+ free(ioes, M_TEMP);
+ break;
+ }
+ PF_RULES_WLOCK();
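+		/* Open an inactive (transaction) copy for each ruleset listed in the request. */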
+ for (i = 0, ioe = ioes; i < io->size; i++, ioe++) {
+ switch (ioe->rs_num) {
+#ifdef ALTQ
+ case PF_RULESET_ALTQ:
+ if (ioe->anchor[0]) {
+ PF_RULES_WUNLOCK();
+ free(ioes, M_TEMP);
+ error = EINVAL;
+ goto fail;
+ }
+ if ((error = pf_begin_altq(&ioe->ticket))) {
+ PF_RULES_WUNLOCK();
+ free(ioes, M_TEMP);
+ goto fail;
+ }
+ break;
+#endif /* ALTQ */
+ case PF_RULESET_TABLE:
+ {
+ struct pfr_table table;
+
+ bzero(&table, sizeof(table));
+ strlcpy(table.pfrt_anchor, ioe->anchor,
+ sizeof(table.pfrt_anchor));
+ if ((error = pfr_ina_begin(&table,
+ &ioe->ticket, NULL, 0))) {
+ PF_RULES_WUNLOCK();
+ free(ioes, M_TEMP);
+ goto fail;
+ }
+ break;
+ }
+ default:
+ if ((error = pf_begin_rules(&ioe->ticket,
+ ioe->rs_num, ioe->anchor))) {
+ PF_RULES_WUNLOCK();
+ free(ioes, M_TEMP);
+ goto fail;
+ }
+ break;
+ }
+ }
+ PF_RULES_WUNLOCK();
+ error = copyout(ioes, io->array, totlen);
+ free(ioes, M_TEMP);
+ break;
+ }
+
+ case DIOCXROLLBACK: {
+ struct pfioc_trans *io = (struct pfioc_trans *)addr;
+ struct pfioc_trans_e *ioe, *ioes;
+ size_t totlen;
+ int i;
+
+ if (io->esize != sizeof(*ioe)) {
+ error = ENODEV;
+ break;
+ }
+ totlen = sizeof(struct pfioc_trans_e) * io->size;
+ ioes = malloc(totlen, M_TEMP, M_WAITOK);
+ error = copyin(io->array, ioes, totlen);
+ if (error) {
+ free(ioes, M_TEMP);
+ break;
+ }
+ PF_RULES_WLOCK();
+ for (i = 0, ioe = ioes; i < io->size; i++, ioe++) {
+ switch (ioe->rs_num) {
+#ifdef ALTQ
+ case PF_RULESET_ALTQ:
+ if (ioe->anchor[0]) {
+ PF_RULES_WUNLOCK();
+ free(ioes, M_TEMP);
+ error = EINVAL;
+ goto fail;
+ }
+ if ((error = pf_rollback_altq(ioe->ticket))) {
+ PF_RULES_WUNLOCK();
+ free(ioes, M_TEMP);
+ goto fail; /* really bad */
+ }
+ break;
+#endif /* ALTQ */
+ case PF_RULESET_TABLE:
+ {
+ struct pfr_table table;
+
+ bzero(&table, sizeof(table));
+ strlcpy(table.pfrt_anchor, ioe->anchor,
+ sizeof(table.pfrt_anchor));
+ if ((error = pfr_ina_rollback(&table,
+ ioe->ticket, NULL, 0))) {
+ PF_RULES_WUNLOCK();
+ free(ioes, M_TEMP);
+ goto fail; /* really bad */
+ }
+ break;
+ }
+ default:
+ if ((error = pf_rollback_rules(ioe->ticket,
+ ioe->rs_num, ioe->anchor))) {
+ PF_RULES_WUNLOCK();
+ free(ioes, M_TEMP);
+ goto fail; /* really bad */
+ }
+ break;
+ }
+ }
+ PF_RULES_WUNLOCK();
+ free(ioes, M_TEMP);
+ break;
+ }
+
+ case DIOCXCOMMIT: {
+ struct pfioc_trans *io = (struct pfioc_trans *)addr;
+ struct pfioc_trans_e *ioe, *ioes;
+ struct pf_ruleset *rs;
+ size_t totlen;
+ int i;
+
+ if (io->esize != sizeof(*ioe)) {
+ error = ENODEV;
+ break;
+ }
+ totlen = sizeof(struct pfioc_trans_e) * io->size;
+ ioes = malloc(totlen, M_TEMP, M_WAITOK);
+ error = copyin(io->array, ioes, totlen);
+ if (error) {
+ free(ioes, M_TEMP);
+ break;
+ }
+ PF_RULES_WLOCK();
+		/* First make sure everything will succeed. */
+ for (i = 0, ioe = ioes; i < io->size; i++, ioe++) {
+ switch (ioe->rs_num) {
+#ifdef ALTQ
+ case PF_RULESET_ALTQ:
+ if (ioe->anchor[0]) {
+ PF_RULES_WUNLOCK();
+ free(ioes, M_TEMP);
+ error = EINVAL;
+ goto fail;
+ }
+ if (!V_altqs_inactive_open || ioe->ticket !=
+ V_ticket_altqs_inactive) {
+ PF_RULES_WUNLOCK();
+ free(ioes, M_TEMP);
+ error = EBUSY;
+ goto fail;
+ }
+ break;
+#endif /* ALTQ */
+ case PF_RULESET_TABLE:
+ rs = pf_find_ruleset(ioe->anchor);
+ if (rs == NULL || !rs->topen || ioe->ticket !=
+ rs->tticket) {
+ PF_RULES_WUNLOCK();
+ free(ioes, M_TEMP);
+ error = EBUSY;
+ goto fail;
+ }
+ break;
+ default:
+ if (ioe->rs_num < 0 || ioe->rs_num >=
+ PF_RULESET_MAX) {
+ PF_RULES_WUNLOCK();
+ free(ioes, M_TEMP);
+ error = EINVAL;
+ goto fail;
+ }
+ rs = pf_find_ruleset(ioe->anchor);
+ if (rs == NULL ||
+ !rs->rules[ioe->rs_num].inactive.open ||
+ rs->rules[ioe->rs_num].inactive.ticket !=
+ ioe->ticket) {
+ PF_RULES_WUNLOCK();
+ free(ioes, M_TEMP);
+ error = EBUSY;
+ goto fail;
+ }
+ break;
+ }
+ }
+ /* Now do the commit - no errors should happen here. */
+ for (i = 0, ioe = ioes; i < io->size; i++, ioe++) {
+ switch (ioe->rs_num) {
+#ifdef ALTQ
+ case PF_RULESET_ALTQ:
+ if ((error = pf_commit_altq(ioe->ticket))) {
+ PF_RULES_WUNLOCK();
+ free(ioes, M_TEMP);
+ goto fail; /* really bad */
+ }
+ break;
+#endif /* ALTQ */
+ case PF_RULESET_TABLE:
+ {
+ struct pfr_table table;
+
+ bzero(&table, sizeof(table));
+ strlcpy(table.pfrt_anchor, ioe->anchor,
+ sizeof(table.pfrt_anchor));
+ if ((error = pfr_ina_commit(&table,
+ ioe->ticket, NULL, NULL, 0))) {
+ PF_RULES_WUNLOCK();
+ free(ioes, M_TEMP);
+ goto fail; /* really bad */
+ }
+ break;
+ }
+ default:
+ if ((error = pf_commit_rules(ioe->ticket,
+ ioe->rs_num, ioe->anchor))) {
+ PF_RULES_WUNLOCK();
+ free(ioes, M_TEMP);
+ goto fail; /* really bad */
+ }
+ break;
+ }
+ }
+ PF_RULES_WUNLOCK();
+ free(ioes, M_TEMP);
+ break;
+ }
+
+ case DIOCGETSRCNODES: {
+ struct pfioc_src_nodes *psn = (struct pfioc_src_nodes *)addr;
+ struct pf_srchash *sh;
+ struct pf_src_node *n, *p, *pstore;
+ uint32_t i, nr = 0;
+
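+		/* A zero psn_len means the caller is only asking for the required buffer size. */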
+ if (psn->psn_len == 0) {
+ for (i = 0, sh = V_pf_srchash; i < V_pf_srchashmask;
+ i++, sh++) {
+ PF_HASHROW_LOCK(sh);
+ LIST_FOREACH(n, &sh->nodes, entry)
+ nr++;
+ PF_HASHROW_UNLOCK(sh);
+ }
+ psn->psn_len = sizeof(struct pf_src_node) * nr;
+ break;
+ }
+
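+		/* Copy the source nodes out, converting absolute times to values relative to now. */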
+ p = pstore = malloc(psn->psn_len, M_TEMP, M_WAITOK);
+ for (i = 0, sh = V_pf_srchash; i < V_pf_srchashmask;
+ i++, sh++) {
+ PF_HASHROW_LOCK(sh);
+ LIST_FOREACH(n, &sh->nodes, entry) {
+ int secs = time_uptime, diff;
+
+ if ((nr + 1) * sizeof(*p) > (unsigned)psn->psn_len)
+ break;
+
+ bcopy(n, p, sizeof(struct pf_src_node));
+ if (n->rule.ptr != NULL)
+ p->rule.nr = n->rule.ptr->nr;
+ p->creation = secs - p->creation;
+ if (p->expire > secs)
+ p->expire -= secs;
+ else
+ p->expire = 0;
+
+ /* Adjust the connection rate estimate. */
+ diff = secs - n->conn_rate.last;
+ if (diff >= n->conn_rate.seconds)
+ p->conn_rate.count = 0;
+ else
+ p->conn_rate.count -=
+ n->conn_rate.count * diff /
+ n->conn_rate.seconds;
+ p++;
+ nr++;
+ }
+ PF_HASHROW_UNLOCK(sh);
+ }
+ error = copyout(pstore, psn->psn_src_nodes,
+ sizeof(struct pf_src_node) * nr);
+ if (error) {
+ free(pstore, M_TEMP);
+ break;
+ }
+ psn->psn_len = sizeof(struct pf_src_node) * nr;
+ free(pstore, M_TEMP);
+ break;
+ }
+
+ case DIOCCLRSRCNODES: {
+
+ pf_clear_srcnodes(NULL);
+ pf_purge_expired_src_nodes();
+ V_pf_status.src_nodes = 0;
+ break;
+ }
+
+ case DIOCKILLSRCNODES: {
+ struct pfioc_src_node_kill *psnk =
+ (struct pfioc_src_node_kill *)addr;
+ struct pf_srchash *sh;
+ struct pf_src_node *sn;
+ u_int i, killed = 0;
+
+ for (i = 0, sh = V_pf_srchash; i < V_pf_srchashmask;
+ i++, sh++) {
+			/*
+			 * XXXGL: we don't ever acquire the source hash lock
+			 * in pf_clear_srcnodes(), but if we ever do, the call
+			 * below would lead to a LOR.
+			 */
+ PF_HASHROW_LOCK(sh);
+ LIST_FOREACH(sn, &sh->nodes, entry)
+ if (PF_MATCHA(psnk->psnk_src.neg,
+ &psnk->psnk_src.addr.v.a.addr,
+ &psnk->psnk_src.addr.v.a.mask,
+ &sn->addr, sn->af) &&
+ PF_MATCHA(psnk->psnk_dst.neg,
+ &psnk->psnk_dst.addr.v.a.addr,
+ &psnk->psnk_dst.addr.v.a.mask,
+ &sn->raddr, sn->af)) {
+ /* Handle state to src_node linkage */
+ if (sn->states != 0)
+ pf_clear_srcnodes(sn);
+ sn->expire = 1;
+ killed++;
+ }
+ PF_HASHROW_UNLOCK(sh);
+ }
+
+ if (killed > 0)
+ pf_purge_expired_src_nodes();
+
+ psnk->psnk_killed = killed;
+ break;
+ }
+
+ case DIOCSETHOSTID: {
+ u_int32_t *hostid = (u_int32_t *)addr;
+
+ PF_RULES_WLOCK();
+ if (*hostid == 0)
+ V_pf_status.hostid = arc4random();
+ else
+ V_pf_status.hostid = *hostid;
+ PF_RULES_WUNLOCK();
+ break;
+ }
+
+ case DIOCOSFPFLUSH:
+ PF_RULES_WLOCK();
+ pf_osfp_flush();
+ PF_RULES_WUNLOCK();
+ break;
+
+ case DIOCIGETIFACES: {
+ struct pfioc_iface *io = (struct pfioc_iface *)addr;
+ struct pfi_kif *ifstore;
+ size_t bufsiz;
+
+ if (io->pfiio_esize != sizeof(struct pfi_kif)) {
+ error = ENODEV;
+ break;
+ }
+
+ bufsiz = io->pfiio_size * sizeof(struct pfi_kif);
+ ifstore = malloc(bufsiz, M_TEMP, M_WAITOK);
+ PF_RULES_RLOCK();
+ pfi_get_ifaces(io->pfiio_name, ifstore, &io->pfiio_size);
+ PF_RULES_RUNLOCK();
+ error = copyout(ifstore, io->pfiio_buffer, bufsiz);
+ free(ifstore, M_TEMP);
+ break;
+ }
+
+ case DIOCSETIFFLAG: {
+ struct pfioc_iface *io = (struct pfioc_iface *)addr;
+
+ PF_RULES_WLOCK();
+ error = pfi_set_flags(io->pfiio_name, io->pfiio_flags);
+ PF_RULES_WUNLOCK();
+ break;
+ }
+
+ case DIOCCLRIFFLAG: {
+ struct pfioc_iface *io = (struct pfioc_iface *)addr;
+
+ PF_RULES_WLOCK();
+ error = pfi_clear_flags(io->pfiio_name, io->pfiio_flags);
+ PF_RULES_WUNLOCK();
+ break;
+ }
+
+ default:
+ error = ENODEV;
+ break;
+ }
+fail:
+ CURVNET_RESTORE();
+
+ return (error);
+}
+
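+/*
+ * Export an in-kernel pf_state into the wire/ioctl representation.
+ * Times, rule numbers and counters are converted to network byte order.
+ */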
+void
+pfsync_state_export(struct pfsync_state *sp, struct pf_state *st)
+{
+ bzero(sp, sizeof(struct pfsync_state));
+
+ /* copy from state key */
+ sp->key[PF_SK_WIRE].addr[0] = st->key[PF_SK_WIRE]->addr[0];
+ sp->key[PF_SK_WIRE].addr[1] = st->key[PF_SK_WIRE]->addr[1];
+ sp->key[PF_SK_WIRE].port[0] = st->key[PF_SK_WIRE]->port[0];
+ sp->key[PF_SK_WIRE].port[1] = st->key[PF_SK_WIRE]->port[1];
+ sp->key[PF_SK_STACK].addr[0] = st->key[PF_SK_STACK]->addr[0];
+ sp->key[PF_SK_STACK].addr[1] = st->key[PF_SK_STACK]->addr[1];
+ sp->key[PF_SK_STACK].port[0] = st->key[PF_SK_STACK]->port[0];
+ sp->key[PF_SK_STACK].port[1] = st->key[PF_SK_STACK]->port[1];
+ sp->proto = st->key[PF_SK_WIRE]->proto;
+ sp->af = st->key[PF_SK_WIRE]->af;
+
+ /* copy from state */
+ strlcpy(sp->ifname, st->kif->pfik_name, sizeof(sp->ifname));
+ bcopy(&st->rt_addr, &sp->rt_addr, sizeof(sp->rt_addr));
+ sp->creation = htonl(time_uptime - st->creation);
+ sp->expire = pf_state_expires(st);
+ if (sp->expire <= time_uptime)
+ sp->expire = htonl(0);
+ else
+ sp->expire = htonl(sp->expire - time_uptime);
+
+ sp->direction = st->direction;
+ sp->log = st->log;
+ sp->timeout = st->timeout;
+ sp->state_flags = st->state_flags;
+ if (st->src_node)
+ sp->sync_flags |= PFSYNC_FLAG_SRCNODE;
+ if (st->nat_src_node)
+ sp->sync_flags |= PFSYNC_FLAG_NATSRCNODE;
+
+ sp->id = st->id;
+ sp->creatorid = st->creatorid;
+ pf_state_peer_hton(&st->src, &sp->src);
+ pf_state_peer_hton(&st->dst, &sp->dst);
+
+ if (st->rule.ptr == NULL)
+ sp->rule = htonl(-1);
+ else
+ sp->rule = htonl(st->rule.ptr->nr);
+ if (st->anchor.ptr == NULL)
+ sp->anchor = htonl(-1);
+ else
+ sp->anchor = htonl(st->anchor.ptr->nr);
+ if (st->nat_rule.ptr == NULL)
+ sp->nat_rule = htonl(-1);
+ else
+ sp->nat_rule = htonl(st->nat_rule.ptr->nr);
+
+ pf_state_counter_hton(st->packets[0], sp->packets[0]);
+ pf_state_counter_hton(st->packets[1], sp->packets[1]);
+ pf_state_counter_hton(st->bytes[0], sp->bytes[0]);
+	pf_state_counter_hton(st->bytes[1], sp->bytes[1]);
+}
+
+static void
+pf_tbladdr_copyout(struct pf_addr_wrap *aw)
+{
+ struct pfr_ktable *kt;
+
+ KASSERT(aw->type == PF_ADDR_TABLE, ("%s: type %u", __func__, aw->type));
+
+ kt = aw->p.tbl;
+ if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL)
+ kt = kt->pfrkt_root;
+ aw->p.tbl = NULL;
+ aw->p.tblcnt = (kt->pfrkt_flags & PFR_TFLAG_ACTIVE) ?
+ kt->pfrkt_cnt : -1;
+}
+
+/*
+ * XXX - Check for version mismatch!!!
+ */
+static void
+pf_clear_states(void)
+{
+ struct pf_state *s;
+ u_int i;
+
+ for (i = 0; i <= V_pf_hashmask; i++) {
+ struct pf_idhash *ih = &V_pf_idhash[i];
+relock:
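+		/*
+		 * pf_unlink_state() releases the hash row lock, so rescan
+		 * this row from the start after every unlink.
+		 */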
+ PF_HASHROW_LOCK(ih);
+ LIST_FOREACH(s, &ih->states, entry) {
+ s->timeout = PFTM_PURGE;
+ /* Don't send out individual delete messages. */
+ s->sync_state = PFSTATE_NOSYNC;
+ pf_unlink_state(s, PF_ENTER_LOCKED);
+ goto relock;
+ }
+ PF_HASHROW_UNLOCK(ih);
+ }
+}
+
+static int
+pf_clear_tables(void)
+{
+ struct pfioc_table io;
+ int error;
+
+ bzero(&io, sizeof(io));
+
+ error = pfr_clr_tables(&io.pfrio_table, &io.pfrio_ndel,
+ io.pfrio_flags);
+
+ return (error);
+}
+
+static void
+pf_clear_srcnodes(struct pf_src_node *n)
+{
+ struct pf_state *s;
+ int i;
+
+ for (i = 0; i <= V_pf_hashmask; i++) {
+ struct pf_idhash *ih = &V_pf_idhash[i];
+
+ PF_HASHROW_LOCK(ih);
+ LIST_FOREACH(s, &ih->states, entry) {
+ if (n == NULL || n == s->src_node)
+ s->src_node = NULL;
+ if (n == NULL || n == s->nat_src_node)
+ s->nat_src_node = NULL;
+ }
+ PF_HASHROW_UNLOCK(ih);
+ }
+
+ if (n == NULL) {
+ struct pf_srchash *sh;
+
+ for (i = 0, sh = V_pf_srchash; i < V_pf_srchashmask;
+ i++, sh++) {
+ PF_HASHROW_LOCK(sh);
+ LIST_FOREACH(n, &sh->nodes, entry) {
+ n->expire = 1;
+ n->states = 0;
+ }
+ PF_HASHROW_UNLOCK(sh);
+ }
+ } else {
+ /* XXX: hash slot should already be locked here. */
+ n->expire = 1;
+ n->states = 0;
+ }
+}
+
+/*
+ * XXX - Check for version mismatch!!!
+ */
+
+/*
+ * Duplicate pfctl -Fa operation to get rid of as much as we can.
+ */
+static int
+shutdown_pf(void)
+{
+ int error = 0;
+ u_int32_t t[5];
+ char nn = '\0';
+
+ V_pf_status.running = 0;
+ do {
+ if ((error = pf_begin_rules(&t[0], PF_RULESET_SCRUB, &nn))
+ != 0) {
+ DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: SCRUB\n"));
+ break;
+ }
+ if ((error = pf_begin_rules(&t[1], PF_RULESET_FILTER, &nn))
+ != 0) {
+ DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: FILTER\n"));
+ break; /* XXX: rollback? */
+ }
+ if ((error = pf_begin_rules(&t[2], PF_RULESET_NAT, &nn))
+ != 0) {
+ DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: NAT\n"));
+ break; /* XXX: rollback? */
+ }
+ if ((error = pf_begin_rules(&t[3], PF_RULESET_BINAT, &nn))
+ != 0) {
+ DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: BINAT\n"));
+ break; /* XXX: rollback? */
+ }
+ if ((error = pf_begin_rules(&t[4], PF_RULESET_RDR, &nn))
+ != 0) {
+ DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: RDR\n"));
+ break; /* XXX: rollback? */
+ }
+
+ /* XXX: these should always succeed here */
+ pf_commit_rules(t[0], PF_RULESET_SCRUB, &nn);
+ pf_commit_rules(t[1], PF_RULESET_FILTER, &nn);
+ pf_commit_rules(t[2], PF_RULESET_NAT, &nn);
+ pf_commit_rules(t[3], PF_RULESET_BINAT, &nn);
+ pf_commit_rules(t[4], PF_RULESET_RDR, &nn);
+
+ if ((error = pf_clear_tables()) != 0)
+ break;
+
+#ifdef ALTQ
+ if ((error = pf_begin_altq(&t[0])) != 0) {
+ DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: ALTQ\n"));
+ break;
+ }
+ pf_commit_altq(t[0]);
+#endif
+
+ pf_clear_states();
+
+ pf_clear_srcnodes(NULL);
+
+		/* status does not use malloc'ed memory, so no cleanup is needed */
+		/* fingerprints and interfaces have their own cleanup code */
+ } while(0);
+
+ return (error);
+}
+
+#ifdef INET
+static int
+pf_check_in(void *arg, struct mbuf **m, struct ifnet *ifp, int dir,
+ struct inpcb *inp)
+{
+	/*
+	 * XXX Wed Jul 9 22:03:16 2003 UTC
+	 * OpenBSD has changed its byte ordering convention for ip_len/ip_off
+	 * in the network stack.  OpenBSD's stack used to convert ip_len/ip_off
+	 * to host byte order first, as FreeBSD does.  This is no longer the
+	 * case, so we must convert back to network byte order here.
+	 */
+ struct ip *h = NULL;
+ int chk;
+
+ if ((*m)->m_pkthdr.len >= (int)sizeof(struct ip)) {
+ /* if m_pkthdr.len is less than ip header, pf will handle. */
+ h = mtod(*m, struct ip *);
+ HTONS(h->ip_len);
+ HTONS(h->ip_off);
+ }
+ CURVNET_SET(ifp->if_vnet);
+ chk = pf_test(PF_IN, ifp, m, inp);
+ CURVNET_RESTORE();
+ if (chk && *m) {
+ m_freem(*m);
+ *m = NULL;
+ }
+ if (*m != NULL) {
+ /* pf_test can change ip header location */
+ h = mtod(*m, struct ip *);
+ NTOHS(h->ip_len);
+ NTOHS(h->ip_off);
+ }
+ return chk;
+}
+
+static int
+pf_check_out(void *arg, struct mbuf **m, struct ifnet *ifp, int dir,
+ struct inpcb *inp)
+{
+	/*
+	 * XXX Wed Jul 9 22:03:16 2003 UTC
+	 * OpenBSD has changed its byte ordering convention for ip_len/ip_off
+	 * in the network stack.  OpenBSD's stack used to convert ip_len/ip_off
+	 * to host byte order first, as FreeBSD does.  This is no longer the
+	 * case, so we must convert back to network byte order here.
+	 */
+ struct ip *h = NULL;
+ int chk;
+
+	/* We need a proper CSUM before we start (see OpenBSD ip_output) */
+ if ((*m)->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
+ in_delayed_cksum(*m);
+ (*m)->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
+ }
+ if ((*m)->m_pkthdr.len >= (int)sizeof(*h)) {
+ /* if m_pkthdr.len is less than ip header, pf will handle. */
+ h = mtod(*m, struct ip *);
+ HTONS(h->ip_len);
+ HTONS(h->ip_off);
+ }
+ CURVNET_SET(ifp->if_vnet);
+ chk = pf_test(PF_OUT, ifp, m, inp);
+ CURVNET_RESTORE();
+ if (chk && *m) {
+ m_freem(*m);
+ *m = NULL;
+ }
+ if (*m != NULL) {
+ /* pf_test can change ip header location */
+ h = mtod(*m, struct ip *);
+ NTOHS(h->ip_len);
+ NTOHS(h->ip_off);
+ }
+ return chk;
+}
+#endif
+
+#ifdef INET6
+static int
+pf_check6_in(void *arg, struct mbuf **m, struct ifnet *ifp, int dir,
+ struct inpcb *inp)
+{
+
+ /*
+ * IPv6 is not affected by ip_len/ip_off byte order changes.
+ */
+ int chk;
+
+	/*
+	 * For loopback traffic IPv6 uses the real interface in order to
+	 * support scoped addresses.  To support stateful filtering we change
+	 * this to lo0, as is done for IPv4.
+	 */
+ CURVNET_SET(ifp->if_vnet);
+ chk = pf_test6(PF_IN, (*m)->m_flags & M_LOOP ? V_loif : ifp, m, inp);
+ CURVNET_RESTORE();
+ if (chk && *m) {
+ m_freem(*m);
+ *m = NULL;
+ }
+ return chk;
+}
+
+static int
+pf_check6_out(void *arg, struct mbuf **m, struct ifnet *ifp, int dir,
+ struct inpcb *inp)
+{
+ /*
+	 * IPv6 is not affected by the ip_len/ip_off byte order changes.
+ */
+ int chk;
+
+	/* We need a proper CSUM before we start (see OpenBSD ip_output) */
+ if ((*m)->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
+#ifdef INET
+ /* XXX-BZ copy&paste error from r126261? */
+ in_delayed_cksum(*m);
+#endif
+ (*m)->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
+ }
+ CURVNET_SET(ifp->if_vnet);
+ chk = pf_test6(PF_OUT, ifp, m, inp);
+ CURVNET_RESTORE();
+ if (chk && *m) {
+ m_freem(*m);
+ *m = NULL;
+ }
+ return chk;
+}
+#endif /* INET6 */
+
+static int
+hook_pf(void)
+{
+#ifdef INET
+ struct pfil_head *pfh_inet;
+#endif
+#ifdef INET6
+ struct pfil_head *pfh_inet6;
+#endif
+
+ if (V_pf_pfil_hooked)
+ return (0);
+
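+	/*
+	 * Register the pf check functions with pfil(9) so that inbound and
+	 * outbound IPv4/IPv6 packets are passed to pf_test()/pf_test6().
+	 */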
+#ifdef INET
+ pfh_inet = pfil_head_get(PFIL_TYPE_AF, AF_INET);
+ if (pfh_inet == NULL)
+ return (ESRCH); /* XXX */
+ pfil_add_hook(pf_check_in, NULL, PFIL_IN | PFIL_WAITOK, pfh_inet);
+ pfil_add_hook(pf_check_out, NULL, PFIL_OUT | PFIL_WAITOK, pfh_inet);
+#endif
+#ifdef INET6
+ pfh_inet6 = pfil_head_get(PFIL_TYPE_AF, AF_INET6);
+ if (pfh_inet6 == NULL) {
+#ifdef INET
+ pfil_remove_hook(pf_check_in, NULL, PFIL_IN | PFIL_WAITOK,
+ pfh_inet);
+ pfil_remove_hook(pf_check_out, NULL, PFIL_OUT | PFIL_WAITOK,
+ pfh_inet);
+#endif
+ return (ESRCH); /* XXX */
+ }
+ pfil_add_hook(pf_check6_in, NULL, PFIL_IN | PFIL_WAITOK, pfh_inet6);
+ pfil_add_hook(pf_check6_out, NULL, PFIL_OUT | PFIL_WAITOK, pfh_inet6);
+#endif
+
+ V_pf_pfil_hooked = 1;
+ return (0);
+}
+
+static int
+dehook_pf(void)
+{
+#ifdef INET
+ struct pfil_head *pfh_inet;
+#endif
+#ifdef INET6
+ struct pfil_head *pfh_inet6;
+#endif
+
+ if (V_pf_pfil_hooked == 0)
+ return (0);
+
+#ifdef INET
+ pfh_inet = pfil_head_get(PFIL_TYPE_AF, AF_INET);
+ if (pfh_inet == NULL)
+ return (ESRCH); /* XXX */
+ pfil_remove_hook(pf_check_in, NULL, PFIL_IN | PFIL_WAITOK,
+ pfh_inet);
+ pfil_remove_hook(pf_check_out, NULL, PFIL_OUT | PFIL_WAITOK,
+ pfh_inet);
+#endif
+#ifdef INET6
+ pfh_inet6 = pfil_head_get(PFIL_TYPE_AF, AF_INET6);
+ if (pfh_inet6 == NULL)
+ return (ESRCH); /* XXX */
+ pfil_remove_hook(pf_check6_in, NULL, PFIL_IN | PFIL_WAITOK,
+ pfh_inet6);
+ pfil_remove_hook(pf_check6_out, NULL, PFIL_OUT | PFIL_WAITOK,
+ pfh_inet6);
+#endif
+
+ V_pf_pfil_hooked = 0;
+ return (0);
+}
+
+static int
+pf_load(void)
+{
+ int error;
+
+ VNET_ITERATOR_DECL(vnet_iter);
+
+ VNET_LIST_RLOCK();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET(vnet_iter);
+ V_pf_pfil_hooked = 0;
+ V_pf_end_threads = 0;
+ TAILQ_INIT(&V_pf_tags);
+ TAILQ_INIT(&V_pf_qids);
+ CURVNET_RESTORE();
+ }
+ VNET_LIST_RUNLOCK();
+
+ rw_init(&pf_rules_lock, "pf rulesets");
+
+ pf_dev = make_dev(&pf_cdevsw, 0, 0, 0, 0600, PF_NAME);
+ if ((error = pfattach()) != 0)
+ return (error);
+
+ return (0);
+}
+
+static int
+pf_unload(void)
+{
+ int error = 0;
+
+ PF_RULES_WLOCK();
+ V_pf_status.running = 0;
+ PF_RULES_WUNLOCK();
+ swi_remove(V_pf_swi_cookie);
+ error = dehook_pf();
+ if (error) {
+ /*
+ * Should not happen!
+ * XXX Due to error code ESRCH, kldunload will show
+ * a message like 'No such process'.
+ */
+		printf("%s: pfil unregistration failed\n", __FUNCTION__);
+ return error;
+ }
+ PF_RULES_WLOCK();
+ shutdown_pf();
+ V_pf_end_threads = 1;
+ while (V_pf_end_threads < 2) {
+ wakeup_one(pf_purge_thread);
+ rw_sleep(pf_purge_thread, &pf_rules_lock, 0, "pftmo", 0);
+ }
+ pf_normalize_cleanup();
+ pfi_cleanup();
+ pfr_cleanup();
+ pf_osfp_flush();
+ pf_cleanup();
+ PF_RULES_WUNLOCK();
+ destroy_dev(pf_dev);
+ rw_destroy(&pf_rules_lock);
+
+ return (error);
+}
+
+static int
+pf_modevent(module_t mod, int type, void *data)
+{
+ int error = 0;
+
+ switch(type) {
+ case MOD_LOAD:
+ error = pf_load();
+ break;
+ case MOD_QUIESCE:
+ /*
+ * Module should not be unloaded due to race conditions.
+ */
+ error = EPERM;
+ break;
+ case MOD_UNLOAD:
+ error = pf_unload();
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ return (error);
+}
+
+static moduledata_t pf_mod = {
+ "pf",
+ pf_modevent,
+ 0
+};
+
+DECLARE_MODULE(pf, pf_mod, SI_SUB_PSEUDO, SI_ORDER_FIRST);
+MODULE_VERSION(pf, PF_MODVER);
diff --git a/sys/netpfil/pf/pf_lb.c b/sys/netpfil/pf/pf_lb.c
new file mode 100644
index 0000000..5b47852
--- /dev/null
+++ b/sys/netpfil/pf/pf_lb.c
@@ -0,0 +1,663 @@
+/* $OpenBSD: pf_lb.c,v 1.2 2009/02/12 02:13:15 sthen Exp $ */
+
+/*
+ * Copyright (c) 2001 Daniel Hartmeier
+ * Copyright (c) 2002 - 2008 Henning Brauer
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Effort sponsored in part by the Defense Advanced Research Projects
+ * Agency (DARPA) and Air Force Research Laboratory, Air Force
+ * Materiel Command, USAF, under agreement number F30602-01-2-0537.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_pf.h"
+#include "opt_inet.h"
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+
+#include <net/if.h>
+#include <net/pfvar.h>
+#include <net/if_pflog.h>
+#include <net/pf_mtag.h>
+
+#define DPFPRINTF(n, x) if (V_pf_status.debug >= (n)) printf x
+
+static void pf_hash(struct pf_addr *, struct pf_addr *,
+ struct pf_poolhashkey *, sa_family_t);
+static struct pf_rule *pf_match_translation(struct pf_pdesc *, struct mbuf *,
+ int, int, struct pfi_kif *,
+ struct pf_addr *, u_int16_t, struct pf_addr *,
+ u_int16_t, int);
+static int pf_get_sport(sa_family_t, u_int8_t, struct pf_rule *,
+ struct pf_addr *, struct pf_addr *, u_int16_t,
+ struct pf_addr *, u_int16_t*, u_int16_t, u_int16_t,
+ struct pf_src_node **);
+
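+/*
+ * Mix three 32-bit words; this is the mixing step of the hash used by
+ * pf_hash() below (cf. bridge_hash() in if_bridge.c).
+ */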
+#define mix(a,b,c) \
+ do { \
+ a -= b; a -= c; a ^= (c >> 13); \
+ b -= c; b -= a; b ^= (a << 8); \
+ c -= a; c -= b; c ^= (b >> 13); \
+ a -= b; a -= c; a ^= (c >> 12); \
+ b -= c; b -= a; b ^= (a << 16); \
+ c -= a; c -= b; c ^= (b >> 5); \
+ a -= b; a -= c; a ^= (c >> 3); \
+ b -= c; b -= a; b ^= (a << 10); \
+ c -= a; c -= b; c ^= (b >> 15); \
+ } while (0)
+
+/*
+ * hash function based on bridge_hash in if_bridge.c
+ */
+static void
+pf_hash(struct pf_addr *inaddr, struct pf_addr *hash,
+ struct pf_poolhashkey *key, sa_family_t af)
+{
+ u_int32_t a = 0x9e3779b9, b = 0x9e3779b9, c = key->key32[0];
+
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ a += inaddr->addr32[0];
+ b += key->key32[1];
+ mix(a, b, c);
+ hash->addr32[0] = c + key->key32[2];
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ a += inaddr->addr32[0];
+ b += inaddr->addr32[2];
+ mix(a, b, c);
+ hash->addr32[0] = c;
+ a += inaddr->addr32[1];
+ b += inaddr->addr32[3];
+ c += key->key32[1];
+ mix(a, b, c);
+ hash->addr32[1] = c;
+ a += inaddr->addr32[2];
+ b += inaddr->addr32[1];
+ c += key->key32[2];
+ mix(a, b, c);
+ hash->addr32[2] = c;
+ a += inaddr->addr32[3];
+ b += inaddr->addr32[0];
+ c += key->key32[3];
+ mix(a, b, c);
+ hash->addr32[3] = c;
+ break;
+#endif /* INET6 */
+ }
+}
+
+static struct pf_rule *
+pf_match_translation(struct pf_pdesc *pd, struct mbuf *m, int off,
+ int direction, struct pfi_kif *kif, struct pf_addr *saddr, u_int16_t sport,
+ struct pf_addr *daddr, u_int16_t dport, int rs_num)
+{
+ struct pf_rule *r, *rm = NULL;
+ struct pf_ruleset *ruleset = NULL;
+ int tag = -1;
+ int rtableid = -1;
+ int asd = 0;
+
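+	/*
+	 * Walk the active ruleset, using the precomputed skip steps to jump
+	 * over rules that cannot match this packet.
+	 */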
+ r = TAILQ_FIRST(pf_main_ruleset.rules[rs_num].active.ptr);
+ while (r && rm == NULL) {
+ struct pf_rule_addr *src = NULL, *dst = NULL;
+ struct pf_addr_wrap *xdst = NULL;
+
+ if (r->action == PF_BINAT && direction == PF_IN) {
+ src = &r->dst;
+ if (r->rpool.cur != NULL)
+ xdst = &r->rpool.cur->addr;
+ } else {
+ src = &r->src;
+ dst = &r->dst;
+ }
+
+ r->evaluations++;
+ if (pfi_kif_match(r->kif, kif) == r->ifnot)
+ r = r->skip[PF_SKIP_IFP].ptr;
+ else if (r->direction && r->direction != direction)
+ r = r->skip[PF_SKIP_DIR].ptr;
+ else if (r->af && r->af != pd->af)
+ r = r->skip[PF_SKIP_AF].ptr;
+ else if (r->proto && r->proto != pd->proto)
+ r = r->skip[PF_SKIP_PROTO].ptr;
+ else if (PF_MISMATCHAW(&src->addr, saddr, pd->af,
+ src->neg, kif, M_GETFIB(m)))
+ r = r->skip[src == &r->src ? PF_SKIP_SRC_ADDR :
+ PF_SKIP_DST_ADDR].ptr;
+ else if (src->port_op && !pf_match_port(src->port_op,
+ src->port[0], src->port[1], sport))
+ r = r->skip[src == &r->src ? PF_SKIP_SRC_PORT :
+ PF_SKIP_DST_PORT].ptr;
+ else if (dst != NULL &&
+ PF_MISMATCHAW(&dst->addr, daddr, pd->af, dst->neg, NULL,
+ M_GETFIB(m)))
+ r = r->skip[PF_SKIP_DST_ADDR].ptr;
+ else if (xdst != NULL && PF_MISMATCHAW(xdst, daddr, pd->af,
+ 0, NULL, M_GETFIB(m)))
+ r = TAILQ_NEXT(r, entries);
+ else if (dst != NULL && dst->port_op &&
+ !pf_match_port(dst->port_op, dst->port[0],
+ dst->port[1], dport))
+ r = r->skip[PF_SKIP_DST_PORT].ptr;
+ else if (r->match_tag && !pf_match_tag(m, r, &tag,
+ pd->pf_mtag ? pd->pf_mtag->tag : 0))
+ r = TAILQ_NEXT(r, entries);
+ else if (r->os_fingerprint != PF_OSFP_ANY && (pd->proto !=
+ IPPROTO_TCP || !pf_osfp_match(pf_osfp_fingerprint(pd, m,
+ off, pd->hdr.tcp), r->os_fingerprint)))
+ r = TAILQ_NEXT(r, entries);
+ else {
+ if (r->tag)
+ tag = r->tag;
+ if (r->rtableid >= 0)
+ rtableid = r->rtableid;
+ if (r->anchor == NULL) {
+ rm = r;
+ } else
+ pf_step_into_anchor(&asd, &ruleset, rs_num,
+ &r, NULL, NULL);
+ }
+ if (r == NULL)
+ pf_step_out_of_anchor(&asd, &ruleset, rs_num, &r,
+ NULL, NULL);
+ }
+
+ if (tag > 0 && pf_tag_packet(m, pd, tag))
+ return (NULL);
+ if (rtableid >= 0)
+ M_SETFIB(m, rtableid);
+
+ if (rm != NULL && (rm->action == PF_NONAT ||
+ rm->action == PF_NORDR || rm->action == PF_NOBINAT))
+ return (NULL);
+ return (rm);
+}
+
+static int
+pf_get_sport(sa_family_t af, u_int8_t proto, struct pf_rule *r,
+ struct pf_addr *saddr, struct pf_addr *daddr, u_int16_t dport,
+ struct pf_addr *naddr, u_int16_t *nport, u_int16_t low, u_int16_t high,
+ struct pf_src_node **sn)
+{
+ struct pf_state_key_cmp key;
+ struct pf_addr init_addr;
+ u_int16_t cut;
+
+ bzero(&init_addr, sizeof(init_addr));
+ if (pf_map_addr(af, r, saddr, naddr, &init_addr, sn))
+ return (1);
+
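+	/* ICMP has no real ports; search the full 16-bit id space instead. */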
+ if (proto == IPPROTO_ICMP) {
+ low = 1;
+ high = 65535;
+ }
+
+ do {
+ key.af = af;
+ key.proto = proto;
+ PF_ACPY(&key.addr[1], daddr, key.af);
+ PF_ACPY(&key.addr[0], naddr, key.af);
+ key.port[1] = dport;
+
+		/*
+		 * Port search: start at a random port and step through the
+		 * range, similar to the port allocation loop in in_pcbbind().
+		 */
+ if (!(proto == IPPROTO_TCP || proto == IPPROTO_UDP ||
+ proto == IPPROTO_ICMP)) {
+ key.port[0] = dport;
+ if (pf_find_state_all(&key, PF_IN, NULL) == NULL)
+ return (0);
+ } else if (low == 0 && high == 0) {
+ key.port[0] = *nport;
+ if (pf_find_state_all(&key, PF_IN, NULL) == NULL)
+ return (0);
+ } else if (low == high) {
+ key.port[0] = htons(low);
+ if (pf_find_state_all(&key, PF_IN, NULL) == NULL) {
+ *nport = htons(low);
+ return (0);
+ }
+ } else {
+ u_int16_t tmp;
+
+ if (low > high) {
+ tmp = low;
+ low = high;
+ high = tmp;
+ }
+ /* low < high */
+ cut = htonl(arc4random()) % (1 + high - low) + low;
+ /* low <= cut <= high */
+ for (tmp = cut; tmp <= high; ++(tmp)) {
+ key.port[0] = htons(tmp);
+ if (pf_find_state_all(&key, PF_IN, NULL) ==
+ NULL) {
+ *nport = htons(tmp);
+ return (0);
+ }
+ }
+ for (tmp = cut - 1; tmp >= low; --(tmp)) {
+ key.port[0] = htons(tmp);
+ if (pf_find_state_all(&key, PF_IN, NULL) ==
+ NULL) {
+ *nport = htons(tmp);
+ return (0);
+ }
+ }
+ }
+
+ switch (r->rpool.opts & PF_POOL_TYPEMASK) {
+ case PF_POOL_RANDOM:
+ case PF_POOL_ROUNDROBIN:
+ if (pf_map_addr(af, r, saddr, naddr, &init_addr, sn))
+ return (1);
+ break;
+ case PF_POOL_NONE:
+ case PF_POOL_SRCHASH:
+ case PF_POOL_BITMASK:
+ default:
+ return (1);
+ }
+	} while (!PF_AEQ(&init_addr, naddr, af));
+ return (1); /* none available */
+}
+
+int
+pf_map_addr(sa_family_t af, struct pf_rule *r, struct pf_addr *saddr,
+ struct pf_addr *naddr, struct pf_addr *init_addr, struct pf_src_node **sn)
+{
+ struct pf_pool *rpool = &r->rpool;
+ struct pf_addr *raddr = NULL, *rmask = NULL;
+
+ if (*sn == NULL && r->rpool.opts & PF_POOL_STICKYADDR &&
+ (r->rpool.opts & PF_POOL_TYPEMASK) != PF_POOL_NONE) {
+ *sn = pf_find_src_node(saddr, r, af, 0);
+ if (*sn != NULL && !PF_AZERO(&(*sn)->raddr, af)) {
+ PF_ACPY(naddr, &(*sn)->raddr, af);
+ if (V_pf_status.debug >= PF_DEBUG_MISC) {
+ printf("pf_map_addr: src tracking maps ");
+ pf_print_host(saddr, 0, af);
+ printf(" to ");
+ pf_print_host(naddr, 0, af);
+ printf("\n");
+ }
+ return (0);
+ }
+ }
+
+ if (rpool->cur->addr.type == PF_ADDR_NOROUTE)
+ return (1);
+ if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) {
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ if (rpool->cur->addr.p.dyn->pfid_acnt4 < 1 &&
+ (rpool->opts & PF_POOL_TYPEMASK) !=
+ PF_POOL_ROUNDROBIN)
+ return (1);
+ raddr = &rpool->cur->addr.p.dyn->pfid_addr4;
+ rmask = &rpool->cur->addr.p.dyn->pfid_mask4;
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ if (rpool->cur->addr.p.dyn->pfid_acnt6 < 1 &&
+ (rpool->opts & PF_POOL_TYPEMASK) !=
+ PF_POOL_ROUNDROBIN)
+ return (1);
+ raddr = &rpool->cur->addr.p.dyn->pfid_addr6;
+ rmask = &rpool->cur->addr.p.dyn->pfid_mask6;
+ break;
+#endif /* INET6 */
+ }
+ } else if (rpool->cur->addr.type == PF_ADDR_TABLE) {
+ if ((rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_ROUNDROBIN)
+ return (1); /* unsupported */
+ } else {
+ raddr = &rpool->cur->addr.v.a.addr;
+ rmask = &rpool->cur->addr.v.a.mask;
+ }
+
+ switch (rpool->opts & PF_POOL_TYPEMASK) {
+ case PF_POOL_NONE:
+ PF_ACPY(naddr, raddr, af);
+ break;
+ case PF_POOL_BITMASK:
+ PF_POOLMASK(naddr, raddr, rmask, saddr, af);
+ break;
+ case PF_POOL_RANDOM:
+ if (init_addr != NULL && PF_AZERO(init_addr, af)) {
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ rpool->counter.addr32[0] = htonl(arc4random());
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
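+				/*
+				 * Randomize the low-order address words that
+				 * are not fully covered by the mask, stopping
+				 * at the first all-ones mask word.
+				 */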
+ if (rmask->addr32[3] != 0xffffffff)
+ rpool->counter.addr32[3] =
+ htonl(arc4random());
+ else
+ break;
+ if (rmask->addr32[2] != 0xffffffff)
+ rpool->counter.addr32[2] =
+ htonl(arc4random());
+ else
+ break;
+ if (rmask->addr32[1] != 0xffffffff)
+ rpool->counter.addr32[1] =
+ htonl(arc4random());
+ else
+ break;
+ if (rmask->addr32[0] != 0xffffffff)
+ rpool->counter.addr32[0] =
+ htonl(arc4random());
+ break;
+#endif /* INET6 */
+ }
+ PF_POOLMASK(naddr, raddr, rmask, &rpool->counter, af);
+ PF_ACPY(init_addr, naddr, af);
+
+ } else {
+ PF_AINC(&rpool->counter, af);
+ PF_POOLMASK(naddr, raddr, rmask, &rpool->counter, af);
+ }
+ break;
+ case PF_POOL_SRCHASH:
+ {
+ unsigned char hash[16];
+
+ pf_hash(saddr, (struct pf_addr *)&hash, &rpool->key, af);
+ PF_POOLMASK(naddr, raddr, rmask, (struct pf_addr *)&hash, af);
+ break;
+ }
+ case PF_POOL_ROUNDROBIN:
+ {
+ struct pf_pooladdr *acur = rpool->cur;
+
+		/*
+		 * XXXGL: in the round-robin case we need to store the
+		 * round-robin machine state in the rule, so the forwarding
+		 * thread needs to modify the rule.
+		 *
+		 * This is done without locking, because performance is
+		 * assumed to be more important than round-robin precision.
+		 *
+		 * In the simplest case we just update the "rpool->cur"
+		 * pointer.  However, if the pool contains tables or dynamic
+		 * addresses, then "tblidx" is also used to store machine
+		 * state.  Since "tblidx" is an int, concurrent access to it
+		 * can't lead to inconsistency, only to loss of precision.
+		 *
+		 * Things get worse if the table contains prefixes rather
+		 * than hosts.  In that case the counter also stores machine
+		 * state, and for an IPv6 address the counter can't be
+		 * updated atomically.  Using round-robin on a table
+		 * containing IPv6 (or even IPv4) prefixes would probably
+		 * cause a panic.
+		 */
+
+ if (rpool->cur->addr.type == PF_ADDR_TABLE) {
+ if (!pfr_pool_get(rpool->cur->addr.p.tbl,
+ &rpool->tblidx, &rpool->counter, af))
+ goto get_addr;
+ } else if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) {
+ if (!pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt,
+ &rpool->tblidx, &rpool->counter, af))
+ goto get_addr;
+ } else if (pf_match_addr(0, raddr, rmask, &rpool->counter, af))
+ goto get_addr;
+
+ try_next:
+ if (TAILQ_NEXT(rpool->cur, entries) == NULL)
+ rpool->cur = TAILQ_FIRST(&rpool->list);
+ else
+ rpool->cur = TAILQ_NEXT(rpool->cur, entries);
+ if (rpool->cur->addr.type == PF_ADDR_TABLE) {
+ rpool->tblidx = -1;
+ if (pfr_pool_get(rpool->cur->addr.p.tbl,
+ &rpool->tblidx, &rpool->counter, af)) {
+ /* table contains no address of type 'af' */
+ if (rpool->cur != acur)
+ goto try_next;
+ return (1);
+ }
+ } else if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) {
+ rpool->tblidx = -1;
+ if (pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt,
+ &rpool->tblidx, &rpool->counter, af)) {
+ /* table contains no address of type 'af' */
+ if (rpool->cur != acur)
+ goto try_next;
+ return (1);
+ }
+ } else {
+ raddr = &rpool->cur->addr.v.a.addr;
+ rmask = &rpool->cur->addr.v.a.mask;
+ PF_ACPY(&rpool->counter, raddr, af);
+ }
+
+ get_addr:
+ PF_ACPY(naddr, &rpool->counter, af);
+ if (init_addr != NULL && PF_AZERO(init_addr, af))
+ PF_ACPY(init_addr, naddr, af);
+ PF_AINC(&rpool->counter, af);
+ break;
+ }
+ }
+ if (*sn != NULL)
+ PF_ACPY(&(*sn)->raddr, naddr, af);
+
+ if (V_pf_status.debug >= PF_DEBUG_MISC &&
+ (rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_NONE) {
+ printf("pf_map_addr: selected address ");
+ pf_print_host(naddr, 0, af);
+ printf("\n");
+ }
+
+ return (0);
+}
+
+struct pf_rule *
+pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off, int direction,
+ struct pfi_kif *kif, struct pf_src_node **sn,
+ struct pf_state_key **skp, struct pf_state_key **nkp,
+ struct pf_addr *saddr, struct pf_addr *daddr,
+ u_int16_t sport, u_int16_t dport)
+{
+ struct pf_rule *r = NULL;
+ struct pf_addr *naddr;
+ uint16_t *nport;
+
+ PF_RULES_RASSERT();
+ KASSERT(*skp == NULL, ("*skp not NULL"));
+ KASSERT(*nkp == NULL, ("*nkp not NULL"));
+
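+	/* Outbound: try binat then nat.  Inbound: try rdr then binat. */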
+ if (direction == PF_OUT) {
+ r = pf_match_translation(pd, m, off, direction, kif, saddr,
+ sport, daddr, dport, PF_RULESET_BINAT);
+ if (r == NULL)
+ r = pf_match_translation(pd, m, off, direction, kif,
+ saddr, sport, daddr, dport, PF_RULESET_NAT);
+ } else {
+ r = pf_match_translation(pd, m, off, direction, kif, saddr,
+ sport, daddr, dport, PF_RULESET_RDR);
+ if (r == NULL)
+ r = pf_match_translation(pd, m, off, direction, kif,
+ saddr, sport, daddr, dport, PF_RULESET_BINAT);
+ }
+
+ if (r == NULL)
+ return (NULL);
+
+ switch (r->action) {
+ case PF_NONAT:
+ case PF_NOBINAT:
+ case PF_NORDR:
+ return (NULL);
+ }
+
+ *skp = pf_state_key_setup(pd, saddr, daddr, sport, dport);
+ if (*skp == NULL)
+ return (NULL);
+ *nkp = pf_state_key_clone(*skp);
+ if (*nkp == NULL) {
+ uma_zfree(V_pf_state_key_z, skp);
+ *skp = NULL;
+ return (NULL);
+ }
+
+ /* XXX We only modify one side for now. */
+ naddr = &(*nkp)->addr[1];
+ nport = &(*nkp)->port[1];
+
+ switch (r->action) {
+ case PF_NAT:
+ if (pf_get_sport(pd->af, pd->proto, r, saddr, daddr, dport,
+ naddr, nport, r->rpool.proxy_port[0],
+ r->rpool.proxy_port[1], sn)) {
+ DPFPRINTF(PF_DEBUG_MISC,
+ ("pf: NAT proxy port allocation (%u-%u) failed\n",
+ r->rpool.proxy_port[0], r->rpool.proxy_port[1]));
+ goto notrans;
+ }
+ break;
+ case PF_BINAT:
+ switch (direction) {
+ case PF_OUT:
+ if (r->rpool.cur->addr.type == PF_ADDR_DYNIFTL){
+ switch (pd->af) {
+#ifdef INET
+ case AF_INET:
+ if (r->rpool.cur->addr.p.dyn->
+ pfid_acnt4 < 1)
+ goto notrans;
+ PF_POOLMASK(naddr,
+ &r->rpool.cur->addr.p.dyn->
+ pfid_addr4,
+ &r->rpool.cur->addr.p.dyn->
+ pfid_mask4, saddr, AF_INET);
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ if (r->rpool.cur->addr.p.dyn->
+ pfid_acnt6 < 1)
+ goto notrans;
+ PF_POOLMASK(naddr,
+ &r->rpool.cur->addr.p.dyn->
+ pfid_addr6,
+ &r->rpool.cur->addr.p.dyn->
+ pfid_mask6, saddr, AF_INET6);
+ break;
+#endif /* INET6 */
+ }
+ } else
+ PF_POOLMASK(naddr,
+ &r->rpool.cur->addr.v.a.addr,
+ &r->rpool.cur->addr.v.a.mask, saddr,
+ pd->af);
+ break;
+ case PF_IN:
+ if (r->src.addr.type == PF_ADDR_DYNIFTL) {
+ switch (pd->af) {
+#ifdef INET
+ case AF_INET:
+ if (r->src.addr.p.dyn->pfid_acnt4 < 1)
+ goto notrans;
+ PF_POOLMASK(naddr,
+ &r->src.addr.p.dyn->pfid_addr4,
+ &r->src.addr.p.dyn->pfid_mask4,
+ daddr, AF_INET);
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ if (r->src.addr.p.dyn->pfid_acnt6 < 1)
+ goto notrans;
+ PF_POOLMASK(naddr,
+ &r->src.addr.p.dyn->pfid_addr6,
+ &r->src.addr.p.dyn->pfid_mask6,
+ daddr, AF_INET6);
+ break;
+#endif /* INET6 */
+ }
+ } else
+ PF_POOLMASK(naddr, &r->src.addr.v.a.addr,
+ &r->src.addr.v.a.mask, daddr, pd->af);
+ break;
+ }
+ break;
+ case PF_RDR: {
+ if (pf_map_addr(pd->af, r, saddr, naddr, NULL, sn))
+ goto notrans;
+ if ((r->rpool.opts & PF_POOL_TYPEMASK) == PF_POOL_BITMASK)
+ PF_POOLMASK(naddr, naddr, &r->rpool.cur->addr.v.a.mask,
+ daddr, pd->af);
+
+ if (r->rpool.proxy_port[1]) {
+ uint32_t tmp_nport;
+
+ tmp_nport = ((ntohs(dport) - ntohs(r->dst.port[0])) %
+ (r->rpool.proxy_port[1] - r->rpool.proxy_port[0] +
+ 1)) + r->rpool.proxy_port[0];
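+ /*
+ * Illustrative mapping (made-up rule, not from this file): for
+ * "rdr ... port 80:89 -> ... port 8000:8009" and an incoming
+ * dport of 85, this yields ((85 - 80) % 10) + 8000 = 8005.
+ */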
+
+ /* Wrap around if necessary. */
+ if (tmp_nport > 65535)
+ tmp_nport -= 65535;
+ *nport = htons((uint16_t)tmp_nport);
+ } else if (r->rpool.proxy_port[0])
+ *nport = htons(r->rpool.proxy_port[0]);
+ break;
+ }
+ default:
+ panic("%s: unknown action %u", __func__, r->action);
+ }
+
+ /* Return success only if translation really happened. */
+ if (bcmp(*skp, *nkp, sizeof(struct pf_state_key_cmp)))
+ return (r);
+
+notrans:
+ uma_zfree(V_pf_state_key_z, *nkp);
+ uma_zfree(V_pf_state_key_z, *skp);
+ *skp = *nkp = NULL;
+
+ return (NULL);
+}
diff --git a/sys/netpfil/pf/pf_norm.c b/sys/netpfil/pf/pf_norm.c
new file mode 100644
index 0000000..9063fe8
--- /dev/null
+++ b/sys/netpfil/pf/pf_norm.c
@@ -0,0 +1,1999 @@
+/* $OpenBSD: pf_norm.c,v 1.114 2009/01/29 14:11:45 henning Exp $ */
+
+/*
+ * Copyright 2001 Niels Provos <provos@citi.umich.edu>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_pf.h"
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/mbuf.h>
+#include <sys/mutex.h>
+#include <sys/refcount.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+
+#include <net/if.h>
+#include <net/vnet.h>
+#include <net/pfvar.h>
+#include <net/pf_mtag.h>
+#include <net/if_pflog.h>
+
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+
+#ifdef INET6
+#include <netinet/ip6.h>
+#endif /* INET6 */
+
+struct pf_frent {
+ LIST_ENTRY(pf_frent) fr_next;
+ union {
+ struct {
+ struct ip *_fr_ip;
+ struct mbuf *_fr_m;
+ } _frag;
+ struct {
+ uint16_t _fr_off;
+ uint16_t _fr_end;
+ } _cache;
+ } _u;
+};
+#define fr_ip _u._frag._fr_ip
+#define fr_m _u._frag._fr_m
+#define fr_off _u._cache._fr_off
+#define fr_end _u._cache._fr_end
+
+struct pf_fragment {
+ RB_ENTRY(pf_fragment) fr_entry;
+ TAILQ_ENTRY(pf_fragment) frag_next;
+ struct in_addr fr_src;
+ struct in_addr fr_dst;
+ u_int8_t fr_p; /* protocol of this fragment */
+ u_int8_t fr_flags; /* status flags */
+#define PFFRAG_SEENLAST 0x0001 /* Seen the last fragment for this */
+#define PFFRAG_NOBUFFER 0x0002 /* Non-buffering fragment cache */
+#define PFFRAG_DROP 0x0004 /* Drop all fragments */
+#define BUFFER_FRAGMENTS(fr) (!((fr)->fr_flags & PFFRAG_NOBUFFER))
+ u_int16_t fr_id; /* fragment id for reassemble */
+ u_int16_t fr_max; /* fragment data max */
+ u_int32_t fr_timeout;
+ LIST_HEAD(, pf_frent) fr_queue;
+};
+
+static struct mtx pf_frag_mtx;
+#define PF_FRAG_LOCK() mtx_lock(&pf_frag_mtx)
+#define PF_FRAG_UNLOCK() mtx_unlock(&pf_frag_mtx)
+#define PF_FRAG_ASSERT() mtx_assert(&pf_frag_mtx, MA_OWNED)
+
+VNET_DEFINE(uma_zone_t, pf_state_scrub_z); /* XXX: shared with pfsync */
+
+static VNET_DEFINE(uma_zone_t, pf_frent_z);
+#define V_pf_frent_z VNET(pf_frent_z)
+static VNET_DEFINE(uma_zone_t, pf_frag_z);
+#define V_pf_frag_z VNET(pf_frag_z)
+
+TAILQ_HEAD(pf_fragqueue, pf_fragment);
+TAILQ_HEAD(pf_cachequeue, pf_fragment);
+static VNET_DEFINE(struct pf_fragqueue, pf_fragqueue);
+#define V_pf_fragqueue VNET(pf_fragqueue)
+static VNET_DEFINE(struct pf_cachequeue, pf_cachequeue);
+#define V_pf_cachequeue VNET(pf_cachequeue)
+RB_HEAD(pf_frag_tree, pf_fragment);
+static VNET_DEFINE(struct pf_frag_tree, pf_frag_tree);
+#define V_pf_frag_tree VNET(pf_frag_tree)
+static VNET_DEFINE(struct pf_frag_tree, pf_cache_tree);
+#define V_pf_cache_tree VNET(pf_cache_tree)
+static int pf_frag_compare(struct pf_fragment *,
+ struct pf_fragment *);
+static RB_PROTOTYPE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare);
+static RB_GENERATE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare);
+
+/* Private prototypes */
+static void pf_free_fragment(struct pf_fragment *);
+static void pf_remove_fragment(struct pf_fragment *);
+static int pf_normalize_tcpopt(struct pf_rule *, struct mbuf *,
+ struct tcphdr *, int, sa_family_t);
+#ifdef INET
+static void pf_ip2key(struct pf_fragment *, struct ip *);
+static void pf_scrub_ip(struct mbuf **, u_int32_t, u_int8_t,
+ u_int8_t);
+static void pf_flush_fragments(void);
+static struct pf_fragment *pf_find_fragment(struct ip *, struct pf_frag_tree *);
+static struct mbuf *pf_reassemble(struct mbuf **, struct pf_fragment **,
+ struct pf_frent *, int);
+static struct mbuf *pf_fragcache(struct mbuf **, struct ip*,
+ struct pf_fragment **, int, int, int *);
+#endif /* INET */
+#ifdef INET6
+static void pf_scrub_ip6(struct mbuf **, u_int8_t);
+#endif
+#define DPFPRINTF(x) do { \
+ if (V_pf_status.debug >= PF_DEBUG_MISC) { \
+ printf("%s: ", __func__); \
+ printf x ; \
+ } \
+} while(0)
+
+void
+pf_normalize_init(void)
+{
+
+ V_pf_frag_z = uma_zcreate("pf frags", sizeof(struct pf_fragment),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+ V_pf_frent_z = uma_zcreate("pf frag entries", sizeof(struct pf_frent),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+ V_pf_state_scrub_z = uma_zcreate("pf state scrubs",
+ sizeof(struct pf_state_scrub), NULL, NULL, NULL, NULL,
+ UMA_ALIGN_PTR, 0);
+
+ V_pf_limits[PF_LIMIT_FRAGS].zone = V_pf_frent_z;
+ V_pf_limits[PF_LIMIT_FRAGS].limit = PFFRAG_FRENT_HIWAT;
+ uma_zone_set_max(V_pf_frent_z, PFFRAG_FRENT_HIWAT);
+
+ mtx_init(&pf_frag_mtx, "pf fragments", NULL, MTX_DEF);
+
+ TAILQ_INIT(&V_pf_fragqueue);
+ TAILQ_INIT(&V_pf_cachequeue);
+}
+
+void
+pf_normalize_cleanup(void)
+{
+
+ uma_zdestroy(V_pf_state_scrub_z);
+ uma_zdestroy(V_pf_frent_z);
+ uma_zdestroy(V_pf_frag_z);
+
+ mtx_destroy(&pf_frag_mtx);
+}
+
+static int
+pf_frag_compare(struct pf_fragment *a, struct pf_fragment *b)
+{
+ int diff;
+
+ if ((diff = a->fr_id - b->fr_id))
+ return (diff);
+ else if ((diff = a->fr_p - b->fr_p))
+ return (diff);
+ else if (a->fr_src.s_addr < b->fr_src.s_addr)
+ return (-1);
+ else if (a->fr_src.s_addr > b->fr_src.s_addr)
+ return (1);
+ else if (a->fr_dst.s_addr < b->fr_dst.s_addr)
+ return (-1);
+ else if (a->fr_dst.s_addr > b->fr_dst.s_addr)
+ return (1);
+ return (0);
+}
+
+void
+pf_purge_expired_fragments(void)
+{
+ struct pf_fragment *frag;
+ u_int32_t expire = time_uptime -
+ V_pf_default_rule.timeout[PFTM_FRAG];
+
+ PF_FRAG_LOCK();
+ while ((frag = TAILQ_LAST(&V_pf_fragqueue, pf_fragqueue)) != NULL) {
+ KASSERT((BUFFER_FRAGMENTS(frag)),
+ ("BUFFER_FRAGMENTS(frag) == 0: %s", __FUNCTION__));
+ if (frag->fr_timeout > expire)
+ break;
+
+ DPFPRINTF(("expiring %d(%p)\n", frag->fr_id, frag));
+ pf_free_fragment(frag);
+ }
+
+ while ((frag = TAILQ_LAST(&V_pf_cachequeue, pf_cachequeue)) != NULL) {
+ KASSERT((!BUFFER_FRAGMENTS(frag)),
+ ("BUFFER_FRAGMENTS(frag) != 0: %s", __FUNCTION__));
+ if (frag->fr_timeout > expire)
+ break;
+
+ DPFPRINTF(("expiring %d(%p)\n", frag->fr_id, frag));
+ pf_free_fragment(frag);
+ KASSERT((TAILQ_EMPTY(&V_pf_cachequeue) ||
+ TAILQ_LAST(&V_pf_cachequeue, pf_cachequeue) != frag),
+ ("!(TAILQ_EMPTY() || TAILQ_LAST() == frag): %s",
+ __FUNCTION__));
+ }
+ PF_FRAG_UNLOCK();
+}
+
+#ifdef INET
+/*
+ * Try to flush old fragments to make space for new ones
+ */
+static void
+pf_flush_fragments(void)
+{
+ struct pf_fragment *frag, *cache;
+ int goal;
+
+ PF_FRAG_ASSERT();
+
+ goal = uma_zone_get_cur(V_pf_frent_z) * 9 / 10;
+ DPFPRINTF(("trying to free %d frag entries\n", goal));
+ while (goal < uma_zone_get_cur(V_pf_frent_z)) {
+ frag = TAILQ_LAST(&V_pf_fragqueue, pf_fragqueue);
+ if (frag)
+ pf_free_fragment(frag);
+ cache = TAILQ_LAST(&V_pf_cachequeue, pf_cachequeue);
+ if (cache)
+ pf_free_fragment(cache);
+ if (frag == NULL && cache == NULL)
+ break;
+ }
+}
+#endif /* INET */
+
+/* Frees the fragments and all associated entries */
+static void
+pf_free_fragment(struct pf_fragment *frag)
+{
+ struct pf_frent *frent;
+
+ PF_FRAG_ASSERT();
+
+ /* Free all fragments */
+ if (BUFFER_FRAGMENTS(frag)) {
+ for (frent = LIST_FIRST(&frag->fr_queue); frent;
+ frent = LIST_FIRST(&frag->fr_queue)) {
+ LIST_REMOVE(frent, fr_next);
+
+ m_freem(frent->fr_m);
+ uma_zfree(V_pf_frent_z, frent);
+ }
+ } else {
+ for (frent = LIST_FIRST(&frag->fr_queue); frent;
+ frent = LIST_FIRST(&frag->fr_queue)) {
+ LIST_REMOVE(frent, fr_next);
+
+ KASSERT((LIST_EMPTY(&frag->fr_queue) ||
+ LIST_FIRST(&frag->fr_queue)->fr_off >
+ frent->fr_end),
+ ("! (LIST_EMPTY() || LIST_FIRST()->fr_off >"
+ " frent->fr_end): %s", __func__));
+
+ uma_zfree(V_pf_frent_z, frent);
+ }
+ }
+
+ pf_remove_fragment(frag);
+}
+
+#ifdef INET
+static void
+pf_ip2key(struct pf_fragment *key, struct ip *ip)
+{
+ key->fr_p = ip->ip_p;
+ key->fr_id = ip->ip_id;
+ key->fr_src.s_addr = ip->ip_src.s_addr;
+ key->fr_dst.s_addr = ip->ip_dst.s_addr;
+}
+
+static struct pf_fragment *
+pf_find_fragment(struct ip *ip, struct pf_frag_tree *tree)
+{
+ struct pf_fragment key;
+ struct pf_fragment *frag;
+
+ PF_FRAG_ASSERT();
+
+ pf_ip2key(&key, ip);
+
+ frag = RB_FIND(pf_frag_tree, tree, &key);
+ if (frag != NULL) {
+ /* XXX Are we sure we want to update the timeout? */
+ frag->fr_timeout = time_uptime;
+ if (BUFFER_FRAGMENTS(frag)) {
+ TAILQ_REMOVE(&V_pf_fragqueue, frag, frag_next);
+ TAILQ_INSERT_HEAD(&V_pf_fragqueue, frag, frag_next);
+ } else {
+ TAILQ_REMOVE(&V_pf_cachequeue, frag, frag_next);
+ TAILQ_INSERT_HEAD(&V_pf_cachequeue, frag, frag_next);
+ }
+ }
+
+ return (frag);
+}
+#endif /* INET */
+
+/* Removes a fragment from the fragment queue and frees the fragment */
+
+static void
+pf_remove_fragment(struct pf_fragment *frag)
+{
+
+ PF_FRAG_ASSERT();
+
+ if (BUFFER_FRAGMENTS(frag)) {
+ RB_REMOVE(pf_frag_tree, &V_pf_frag_tree, frag);
+ TAILQ_REMOVE(&V_pf_fragqueue, frag, frag_next);
+ uma_zfree(V_pf_frag_z, frag);
+ } else {
+ RB_REMOVE(pf_frag_tree, &V_pf_cache_tree, frag);
+ TAILQ_REMOVE(&V_pf_cachequeue, frag, frag_next);
+ uma_zfree(V_pf_frag_z, frag);
+ }
+}
+
+#ifdef INET
+#define FR_IP_OFF(fr) ((ntohs((fr)->fr_ip->ip_off) & IP_OFFMASK) << 3)
+static struct mbuf *
+pf_reassemble(struct mbuf **m0, struct pf_fragment **frag,
+ struct pf_frent *frent, int mff)
+{
+ struct mbuf *m = *m0, *m2;
+ struct pf_frent *frea, *next;
+ struct pf_frent *frep = NULL;
+ struct ip *ip = frent->fr_ip;
+ int hlen = ip->ip_hl << 2;
+ u_int16_t off = (ntohs(ip->ip_off) & IP_OFFMASK) << 3;
+ u_int16_t ip_len = ntohs(ip->ip_len) - ip->ip_hl * 4;
+ u_int16_t max = ip_len + off;
+
+ PF_FRAG_ASSERT();
+ KASSERT((*frag == NULL || BUFFER_FRAGMENTS(*frag)),
+ ("! (*frag == NULL || BUFFER_FRAGMENTS(*frag)): %s", __FUNCTION__));
+
+ /* Strip off ip header */
+ m->m_data += hlen;
+ m->m_len -= hlen;
+
+ /* Create a new reassembly queue for this packet */
+ if (*frag == NULL) {
+ *frag = uma_zalloc(V_pf_frag_z, M_NOWAIT);
+ if (*frag == NULL) {
+ pf_flush_fragments();
+ *frag = uma_zalloc(V_pf_frag_z, M_NOWAIT);
+ if (*frag == NULL)
+ goto drop_fragment;
+ }
+
+ (*frag)->fr_flags = 0;
+ (*frag)->fr_max = 0;
+ (*frag)->fr_src = frent->fr_ip->ip_src;
+ (*frag)->fr_dst = frent->fr_ip->ip_dst;
+ (*frag)->fr_p = frent->fr_ip->ip_p;
+ (*frag)->fr_id = frent->fr_ip->ip_id;
+ (*frag)->fr_timeout = time_uptime;
+ LIST_INIT(&(*frag)->fr_queue);
+
+ RB_INSERT(pf_frag_tree, &V_pf_frag_tree, *frag);
+ TAILQ_INSERT_HEAD(&V_pf_fragqueue, *frag, frag_next);
+
+ /* We do not have a previous fragment */
+ frep = NULL;
+ goto insert;
+ }
+
+ /*
+ * Find a fragment after the current one:
+ * - off contains the real shifted offset.
+ */
+ LIST_FOREACH(frea, &(*frag)->fr_queue, fr_next) {
+ if (FR_IP_OFF(frea) > off)
+ break;
+ frep = frea;
+ }
+
+ KASSERT((frep != NULL || frea != NULL),
+ ("!(frep != NULL || frea != NULL): %s", __FUNCTION__));
+
+ if (frep != NULL &&
+ FR_IP_OFF(frep) + ntohs(frep->fr_ip->ip_len) - frep->fr_ip->ip_hl *
+ 4 > off)
+ {
+ u_int16_t precut;
+
+ precut = FR_IP_OFF(frep) + ntohs(frep->fr_ip->ip_len) -
+ frep->fr_ip->ip_hl * 4 - off;
+ if (precut >= ip_len)
+ goto drop_fragment;
+ m_adj(frent->fr_m, precut);
+ DPFPRINTF(("overlap -%d\n", precut));
+ /* Enforce 8 byte boundaries */
+ ip->ip_off = htons(ntohs(ip->ip_off) + (precut >> 3));
+ off = (ntohs(ip->ip_off) & IP_OFFMASK) << 3;
+ ip_len -= precut;
+ ip->ip_len = htons(ip_len);
+ }
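+ /*
+ * Worked example for the precut above (made-up sizes): if the
+ * previous fragment starts at offset 0 with 1480 bytes of payload
+ * and the new one claims to start at 1472, precut is 8; the first
+ * 8 bytes are trimmed, ip_off advances by one 8-byte unit and the
+ * effective offset becomes 1480.
+ */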
+
+ for (; frea != NULL && ip_len + off > FR_IP_OFF(frea);
+ frea = next)
+ {
+ u_int16_t aftercut;
+
+ aftercut = ip_len + off - FR_IP_OFF(frea);
+ DPFPRINTF(("adjust overlap %d\n", aftercut));
+ if (aftercut < ntohs(frea->fr_ip->ip_len) - frea->fr_ip->ip_hl
+ * 4)
+ {
+ frea->fr_ip->ip_len =
+ htons(ntohs(frea->fr_ip->ip_len) - aftercut);
+ frea->fr_ip->ip_off = htons(ntohs(frea->fr_ip->ip_off) +
+ (aftercut >> 3));
+ m_adj(frea->fr_m, aftercut);
+ break;
+ }
+
+ /* This fragment is completely overlapped, lose it */
+ next = LIST_NEXT(frea, fr_next);
+ m_freem(frea->fr_m);
+ LIST_REMOVE(frea, fr_next);
+ uma_zfree(V_pf_frent_z, frea);
+ }
+
+ insert:
+ /* Update maximum data size */
+ if ((*frag)->fr_max < max)
+ (*frag)->fr_max = max;
+ /* This is the last segment */
+ if (!mff)
+ (*frag)->fr_flags |= PFFRAG_SEENLAST;
+
+ if (frep == NULL)
+ LIST_INSERT_HEAD(&(*frag)->fr_queue, frent, fr_next);
+ else
+ LIST_INSERT_AFTER(frep, frent, fr_next);
+
+ /* Check if we are completely reassembled */
+ if (!((*frag)->fr_flags & PFFRAG_SEENLAST))
+ return (NULL);
+
+ /* Check if we have all the data */
+ off = 0;
+ for (frep = LIST_FIRST(&(*frag)->fr_queue); frep; frep = next) {
+ next = LIST_NEXT(frep, fr_next);
+
+ off += ntohs(frep->fr_ip->ip_len) - frep->fr_ip->ip_hl * 4;
+ if (off < (*frag)->fr_max &&
+ (next == NULL || FR_IP_OFF(next) != off))
+ {
+ DPFPRINTF(("missing fragment at %d, next %d, max %d\n",
+ off, next == NULL ? -1 : FR_IP_OFF(next),
+ (*frag)->fr_max));
+ return (NULL);
+ }
+ }
+ DPFPRINTF(("%d < %d?\n", off, (*frag)->fr_max));
+ if (off < (*frag)->fr_max)
+ return (NULL);
+
+ /* We have all the data */
+ frent = LIST_FIRST(&(*frag)->fr_queue);
+ KASSERT((frent != NULL), ("frent == NULL: %s", __FUNCTION__));
+ if ((frent->fr_ip->ip_hl << 2) + off > IP_MAXPACKET) {
+ DPFPRINTF(("drop: too big: %d\n", off));
+ pf_free_fragment(*frag);
+ *frag = NULL;
+ return (NULL);
+ }
+ next = LIST_NEXT(frent, fr_next);
+
+ /* Magic from ip_input */
+ ip = frent->fr_ip;
+ m = frent->fr_m;
+ m2 = m->m_next;
+ m->m_next = NULL;
+ m_cat(m, m2);
+ uma_zfree(V_pf_frent_z, frent);
+ for (frent = next; frent != NULL; frent = next) {
+ next = LIST_NEXT(frent, fr_next);
+
+ m2 = frent->fr_m;
+ uma_zfree(V_pf_frent_z, frent);
+ m->m_pkthdr.csum_flags &= m2->m_pkthdr.csum_flags;
+ m->m_pkthdr.csum_data += m2->m_pkthdr.csum_data;
+ m_cat(m, m2);
+ }
+
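+ /*
+ * Fold the accumulated 32-bit csum_data back into 16 bits with
+ * ones'-complement carry wrap.  Example (made-up value):
+ * 0x0002fffd -> 0xfffd + 0x0002 = 0xffff.
+ */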
+ while (m->m_pkthdr.csum_data & 0xffff0000)
+ m->m_pkthdr.csum_data = (m->m_pkthdr.csum_data & 0xffff) +
+ (m->m_pkthdr.csum_data >> 16);
+ ip->ip_src = (*frag)->fr_src;
+ ip->ip_dst = (*frag)->fr_dst;
+
+ /* Remove from fragment queue */
+ pf_remove_fragment(*frag);
+ *frag = NULL;
+
+ hlen = ip->ip_hl << 2;
+ ip->ip_len = htons(off + hlen);
+ m->m_len += hlen;
+ m->m_data -= hlen;
+
+ /* some debugging cruft by sklower, below, will go away soon */
+ /* XXX this should be done elsewhere */
+ if (m->m_flags & M_PKTHDR) {
+ int plen = 0;
+ for (m2 = m; m2; m2 = m2->m_next)
+ plen += m2->m_len;
+ m->m_pkthdr.len = plen;
+ }
+
+ DPFPRINTF(("complete: %p(%d)\n", m, ntohs(ip->ip_len)));
+ return (m);
+
+ drop_fragment:
+ /* Oops - fail safe - drop packet */
+ uma_zfree(V_pf_frent_z, frent);
+ m_freem(m);
+ return (NULL);
+}
+
+static struct mbuf *
+pf_fragcache(struct mbuf **m0, struct ip *h, struct pf_fragment **frag, int mff,
+ int drop, int *nomem)
+{
+ struct mbuf *m = *m0;
+ struct pf_frent *frp, *fra, *cur = NULL;
+ int ip_len = ntohs(h->ip_len) - (h->ip_hl << 2);
+ u_int16_t off = ntohs(h->ip_off) << 3;
+ u_int16_t max = ip_len + off;
+ int hosed = 0;
+
+ PF_FRAG_ASSERT();
+ KASSERT((*frag == NULL || !BUFFER_FRAGMENTS(*frag)),
+ ("!(*frag == NULL || !BUFFER_FRAGMENTS(*frag)): %s", __FUNCTION__));
+
+ /* Create a new range queue for this packet */
+ if (*frag == NULL) {
+ *frag = uma_zalloc(V_pf_frag_z, M_NOWAIT);
+ if (*frag == NULL) {
+ pf_flush_fragments();
+ *frag = uma_zalloc(V_pf_frag_z, M_NOWAIT);
+ if (*frag == NULL)
+ goto no_mem;
+ }
+
+ /* Get an entry for the queue */
+ cur = uma_zalloc(V_pf_frent_z, M_NOWAIT);
+ if (cur == NULL) {
+ uma_zfree(V_pf_frag_z, *frag);
+ *frag = NULL;
+ goto no_mem;
+ }
+
+ (*frag)->fr_flags = PFFRAG_NOBUFFER;
+ (*frag)->fr_max = 0;
+ (*frag)->fr_src = h->ip_src;
+ (*frag)->fr_dst = h->ip_dst;
+ (*frag)->fr_p = h->ip_p;
+ (*frag)->fr_id = h->ip_id;
+ (*frag)->fr_timeout = time_uptime;
+
+ cur->fr_off = off;
+ cur->fr_end = max;
+ LIST_INIT(&(*frag)->fr_queue);
+ LIST_INSERT_HEAD(&(*frag)->fr_queue, cur, fr_next);
+
+ RB_INSERT(pf_frag_tree, &V_pf_cache_tree, *frag);
+ TAILQ_INSERT_HEAD(&V_pf_cachequeue, *frag, frag_next);
+
+ DPFPRINTF(("fragcache[%d]: new %d-%d\n", h->ip_id, off, max));
+
+ goto pass;
+ }
+
+ /*
+ * Find a fragment after the current one:
+ * - off contains the real shifted offset.
+ */
+ frp = NULL;
+ LIST_FOREACH(fra, &(*frag)->fr_queue, fr_next) {
+ if (fra->fr_off > off)
+ break;
+ frp = fra;
+ }
+
+ KASSERT((frp != NULL || fra != NULL),
+ ("!(frp != NULL || fra != NULL): %s", __FUNCTION__));
+
+ if (frp != NULL) {
+ int precut;
+
+ precut = frp->fr_end - off;
+ if (precut >= ip_len) {
+ /* Fragment is entirely a duplicate */
+ DPFPRINTF(("fragcache[%d]: dead (%d-%d) %d-%d\n",
+ h->ip_id, frp->fr_off, frp->fr_end, off, max));
+ goto drop_fragment;
+ }
+ if (precut == 0) {
+ /* They are adjacent. Fixup cache entry */
+ DPFPRINTF(("fragcache[%d]: adjacent (%d-%d) %d-%d\n",
+ h->ip_id, frp->fr_off, frp->fr_end, off, max));
+ frp->fr_end = max;
+ } else if (precut > 0) {
+ /* The first part of this payload overlaps with a
+ * fragment that has already been passed.
+ * Need to trim off the first part of the payload.
+ * But to do so easily, we need to create another
+ * mbuf to throw the original header into.
+ */
+
+ DPFPRINTF(("fragcache[%d]: chop %d (%d-%d) %d-%d\n",
+ h->ip_id, precut, frp->fr_off, frp->fr_end, off,
+ max));
+
+ off += precut;
+ max -= precut;
+ /* Update the previous frag to encompass this one */
+ frp->fr_end = max;
+
+ if (!drop) {
+ /* XXX Optimization opportunity
+ * This is a very heavy way to trim the payload.
+ * we could do it much faster by diddling mbuf
+ * internals but that would be even less legible
+ * than this mbuf magic. For my next trick,
+ * I'll pull a rabbit out of my laptop.
+ */
+ *m0 = m_dup(m, M_NOWAIT);
+ if (*m0 == NULL)
+ goto no_mem;
+ /* From KAME Project : We have missed this! */
+ m_adj(*m0, (h->ip_hl << 2) -
+ (*m0)->m_pkthdr.len);
+
+ KASSERT(((*m0)->m_next == NULL),
+ ("(*m0)->m_next != NULL: %s",
+ __FUNCTION__));
+ m_adj(m, precut + (h->ip_hl << 2));
+ m_cat(*m0, m);
+ m = *m0;
+ if (m->m_flags & M_PKTHDR) {
+ int plen = 0;
+ struct mbuf *t;
+ for (t = m; t; t = t->m_next)
+ plen += t->m_len;
+ m->m_pkthdr.len = plen;
+ }
+
+
+ h = mtod(m, struct ip *);
+
+ KASSERT(((int)m->m_len ==
+ ntohs(h->ip_len) - precut),
+ ("m->m_len != ntohs(h->ip_len) - precut: %s",
+ __FUNCTION__));
+ h->ip_off = htons(ntohs(h->ip_off) +
+ (precut >> 3));
+ h->ip_len = htons(ntohs(h->ip_len) - precut);
+ } else {
+ hosed++;
+ }
+ } else {
+ /* There is a gap between fragments */
+
+ DPFPRINTF(("fragcache[%d]: gap %d (%d-%d) %d-%d\n",
+ h->ip_id, -precut, frp->fr_off, frp->fr_end, off,
+ max));
+
+ cur = uma_zalloc(V_pf_frent_z, M_NOWAIT);
+ if (cur == NULL)
+ goto no_mem;
+
+ cur->fr_off = off;
+ cur->fr_end = max;
+ LIST_INSERT_AFTER(frp, cur, fr_next);
+ }
+ }
+
+ if (fra != NULL) {
+ int aftercut;
+ int merge = 0;
+
+ aftercut = max - fra->fr_off;
+ if (aftercut == 0) {
+ /* Adjacent fragments */
+ DPFPRINTF(("fragcache[%d]: adjacent %d-%d (%d-%d)\n",
+ h->ip_id, off, max, fra->fr_off, fra->fr_end));
+ fra->fr_off = off;
+ merge = 1;
+ } else if (aftercut > 0) {
+ /* Need to chop off the tail of this fragment */
+ DPFPRINTF(("fragcache[%d]: chop %d %d-%d (%d-%d)\n",
+ h->ip_id, aftercut, off, max, fra->fr_off,
+ fra->fr_end));
+ fra->fr_off = off;
+ max -= aftercut;
+
+ merge = 1;
+
+ if (!drop) {
+ m_adj(m, -aftercut);
+ if (m->m_flags & M_PKTHDR) {
+ int plen = 0;
+ struct mbuf *t;
+ for (t = m; t; t = t->m_next)
+ plen += t->m_len;
+ m->m_pkthdr.len = plen;
+ }
+ h = mtod(m, struct ip *);
+ KASSERT(((int)m->m_len == ntohs(h->ip_len) - aftercut),
+ ("m->m_len != ntohs(h->ip_len) - aftercut: %s",
+ __FUNCTION__));
+ h->ip_len = htons(ntohs(h->ip_len) - aftercut);
+ } else {
+ hosed++;
+ }
+ } else if (frp == NULL) {
+ /* There is a gap between fragments */
+ DPFPRINTF(("fragcache[%d]: gap %d %d-%d (%d-%d)\n",
+ h->ip_id, -aftercut, off, max, fra->fr_off,
+ fra->fr_end));
+
+ cur = uma_zalloc(V_pf_frent_z, M_NOWAIT);
+ if (cur == NULL)
+ goto no_mem;
+
+ cur->fr_off = off;
+ cur->fr_end = max;
+ LIST_INSERT_BEFORE(fra, cur, fr_next);
+ }
+
+
+ /* Need to glue together two separate fragment descriptors */
+ if (merge) {
+ if (cur && fra->fr_off <= cur->fr_end) {
+ /* Need to merge in a previous 'cur' */
+ DPFPRINTF(("fragcache[%d]: adjacent(merge "
+ "%d-%d) %d-%d (%d-%d)\n",
+ h->ip_id, cur->fr_off, cur->fr_end, off,
+ max, fra->fr_off, fra->fr_end));
+ fra->fr_off = cur->fr_off;
+ LIST_REMOVE(cur, fr_next);
+ uma_zfree(V_pf_frent_z, cur);
+ cur = NULL;
+
+ } else if (frp && fra->fr_off <= frp->fr_end) {
+ /* Need to merge in a modified 'frp' */
+ KASSERT((cur == NULL), ("cur != NULL: %s",
+ __FUNCTION__));
+ DPFPRINTF(("fragcache[%d]: adjacent(merge "
+ "%d-%d) %d-%d (%d-%d)\n",
+ h->ip_id, frp->fr_off, frp->fr_end, off,
+ max, fra->fr_off, fra->fr_end));
+ fra->fr_off = frp->fr_off;
+ LIST_REMOVE(frp, fr_next);
+ uma_zfree(V_pf_frent_z, frp);
+ frp = NULL;
+
+ }
+ }
+ }
+
+ if (hosed) {
+ /*
+ * We must keep tracking the overall fragment even when
+ * we're going to drop it anyway so that we know when to
+ * free the overall descriptor. Thus we drop the frag late.
+ */
+ goto drop_fragment;
+ }
+
+
+ pass:
+ /* Update maximum data size */
+ if ((*frag)->fr_max < max)
+ (*frag)->fr_max = max;
+
+ /* This is the last segment */
+ if (!mff)
+ (*frag)->fr_flags |= PFFRAG_SEENLAST;
+
+ /* Check if we are completely reassembled */
+ if (((*frag)->fr_flags & PFFRAG_SEENLAST) &&
+ LIST_FIRST(&(*frag)->fr_queue)->fr_off == 0 &&
+ LIST_FIRST(&(*frag)->fr_queue)->fr_end == (*frag)->fr_max) {
+ /* Remove from fragment queue */
+ DPFPRINTF(("fragcache[%d]: done 0-%d\n", h->ip_id,
+ (*frag)->fr_max));
+ pf_free_fragment(*frag);
+ *frag = NULL;
+ }
+
+ return (m);
+
+ no_mem:
+ *nomem = 1;
+
+ /* Still need to pay attention to !IP_MF */
+ if (!mff && *frag != NULL)
+ (*frag)->fr_flags |= PFFRAG_SEENLAST;
+
+ m_freem(m);
+ return (NULL);
+
+ drop_fragment:
+
+ /* Still need to pay attention to !IP_MF */
+ if (!mff && *frag != NULL)
+ (*frag)->fr_flags |= PFFRAG_SEENLAST;
+
+ if (drop) {
+ /* This fragment has been deemed bad. Don't reassemble it. */
+ if (((*frag)->fr_flags & PFFRAG_DROP) == 0)
+ DPFPRINTF(("fragcache[%d]: dropping overall fragment\n",
+ h->ip_id));
+ (*frag)->fr_flags |= PFFRAG_DROP;
+ }
+
+ m_freem(m);
+ return (NULL);
+}
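+
+/*
+ * Example of the range bookkeeping in pf_fragcache() above (made-up
+ * offsets): a first fragment covering bytes 0-1480 creates a single
+ * cache range 0-1480; a second fragment covering 1480-2960 is adjacent,
+ * so the existing descriptor is extended to 0-2960 instead of a new
+ * one being inserted.
+ */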
+
+int
+pf_normalize_ip(struct mbuf **m0, int dir, struct pfi_kif *kif, u_short *reason,
+ struct pf_pdesc *pd)
+{
+ struct mbuf *m = *m0;
+ struct pf_rule *r;
+ struct pf_frent *frent;
+ struct pf_fragment *frag = NULL;
+ struct ip *h = mtod(m, struct ip *);
+ int mff = (ntohs(h->ip_off) & IP_MF);
+ int hlen = h->ip_hl << 2;
+ u_int16_t fragoff = (ntohs(h->ip_off) & IP_OFFMASK) << 3;
+ u_int16_t max;
+ int ip_len;
+ int ip_off;
+ int tag = -1;
+
+ PF_RULES_RASSERT();
+
+ r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr);
+ while (r != NULL) {
+ r->evaluations++;
+ if (pfi_kif_match(r->kif, kif) == r->ifnot)
+ r = r->skip[PF_SKIP_IFP].ptr;
+ else if (r->direction && r->direction != dir)
+ r = r->skip[PF_SKIP_DIR].ptr;
+ else if (r->af && r->af != AF_INET)
+ r = r->skip[PF_SKIP_AF].ptr;
+ else if (r->proto && r->proto != h->ip_p)
+ r = r->skip[PF_SKIP_PROTO].ptr;
+ else if (PF_MISMATCHAW(&r->src.addr,
+ (struct pf_addr *)&h->ip_src.s_addr, AF_INET,
+ r->src.neg, kif, M_GETFIB(m)))
+ r = r->skip[PF_SKIP_SRC_ADDR].ptr;
+ else if (PF_MISMATCHAW(&r->dst.addr,
+ (struct pf_addr *)&h->ip_dst.s_addr, AF_INET,
+ r->dst.neg, NULL, M_GETFIB(m)))
+ r = r->skip[PF_SKIP_DST_ADDR].ptr;
+ else if (r->match_tag && !pf_match_tag(m, r, &tag,
+ pd->pf_mtag ? pd->pf_mtag->tag : 0))
+ r = TAILQ_NEXT(r, entries);
+ else
+ break;
+ }
+
+ if (r == NULL || r->action == PF_NOSCRUB)
+ return (PF_PASS);
+ else {
+ r->packets[dir == PF_OUT]++;
+ r->bytes[dir == PF_OUT] += pd->tot_len;
+ }
+
+ /* Check for illegal packets */
+ if (hlen < (int)sizeof(struct ip))
+ goto drop;
+
+ if (hlen > ntohs(h->ip_len))
+ goto drop;
+
+ /* Clear IP_DF if the rule uses the no-df option */
+ if (r->rule_flag & PFRULE_NODF && h->ip_off & htons(IP_DF)) {
+ u_int16_t ip_off = h->ip_off;
+
+ h->ip_off &= htons(~IP_DF);
+ h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0);
+ }
+
+ /* We will need other tests here */
+ if (!fragoff && !mff)
+ goto no_fragment;
+
+ /* We're dealing with a fragment now. Don't allow fragments
+ * with IP_DF to enter the cache. If the flag was cleared by
+ * no-df above, fine. Otherwise drop it.
+ */
+ if (h->ip_off & htons(IP_DF)) {
+ DPFPRINTF(("IP_DF\n"));
+ goto bad;
+ }
+
+ ip_len = ntohs(h->ip_len) - hlen;
+ ip_off = (ntohs(h->ip_off) & IP_OFFMASK) << 3;
+
+ /* All fragments are 8 byte aligned */
+ if (mff && (ip_len & 0x7)) {
+ DPFPRINTF(("mff and %d\n", ip_len));
+ goto bad;
+ }
+
+ /* Respect maximum length */
+ if (fragoff + ip_len > IP_MAXPACKET) {
+ DPFPRINTF(("max packet %d\n", fragoff + ip_len));
+ goto bad;
+ }
+ max = fragoff + ip_len;
+
+ if ((r->rule_flag & (PFRULE_FRAGCROP|PFRULE_FRAGDROP)) == 0) {
+
+ /* Fully buffer all of the fragments */
+ PF_FRAG_LOCK();
+ frag = pf_find_fragment(h, &V_pf_frag_tree);
+
+ /* Check if we saw the last fragment already */
+ if (frag != NULL && (frag->fr_flags & PFFRAG_SEENLAST) &&
+ max > frag->fr_max)
+ goto bad;
+
+ /* Get an entry for the fragment queue */
+ frent = uma_zalloc(V_pf_frent_z, M_NOWAIT);
+ if (frent == NULL) {
+ PF_FRAG_UNLOCK();
+ REASON_SET(reason, PFRES_MEMORY);
+ return (PF_DROP);
+ }
+ frent->fr_ip = h;
+ frent->fr_m = m;
+
+ /* Might return a completely reassembled mbuf, or NULL */
+ DPFPRINTF(("reass frag %d @ %d-%d\n", h->ip_id, fragoff, max));
+ *m0 = m = pf_reassemble(m0, &frag, frent, mff);
+ PF_FRAG_UNLOCK();
+
+ if (m == NULL)
+ return (PF_DROP);
+
+ /* use mtag from concatenated mbuf chain */
+ pd->pf_mtag = pf_find_mtag(m);
+#ifdef DIAGNOSTIC
+ if (pd->pf_mtag == NULL) {
+ printf("%s: pf_find_mtag returned NULL(1)\n", __func__);
+ if ((pd->pf_mtag = pf_get_mtag(m)) == NULL) {
+ m_freem(m);
+ *m0 = NULL;
+ goto no_mem;
+ }
+ }
+#endif
+ if (frag != NULL && (frag->fr_flags & PFFRAG_DROP))
+ goto drop;
+
+ h = mtod(m, struct ip *);
+ } else {
+ /* non-buffering fragment cache (drops or masks overlaps) */
+ int nomem = 0;
+
+ if (dir == PF_OUT && pd->pf_mtag->flags & PF_TAG_FRAGCACHE) {
+ /*
+ * Already passed the fragment cache in the
+ * input direction. If we continued, it would
+ * appear to be a dup and would be dropped.
+ */
+ goto fragment_pass;
+ }
+
+ PF_FRAG_LOCK();
+ frag = pf_find_fragment(h, &V_pf_cache_tree);
+
+ /* Check if we saw the last fragment already */
+ if (frag != NULL && (frag->fr_flags & PFFRAG_SEENLAST) &&
+ max > frag->fr_max) {
+ if (r->rule_flag & PFRULE_FRAGDROP)
+ frag->fr_flags |= PFFRAG_DROP;
+ goto bad;
+ }
+
+ *m0 = m = pf_fragcache(m0, h, &frag, mff,
+ (r->rule_flag & PFRULE_FRAGDROP) ? 1 : 0, &nomem);
+ PF_FRAG_UNLOCK();
+ if (m == NULL) {
+ if (nomem)
+ goto no_mem;
+ goto drop;
+ }
+
+ /* use mtag from copied and trimmed mbuf chain */
+ pd->pf_mtag = pf_find_mtag(m);
+#ifdef DIAGNOSTIC
+ if (pd->pf_mtag == NULL) {
+ printf("%s: pf_find_mtag returned NULL(2)\n", __func__);
+ if ((pd->pf_mtag = pf_get_mtag(m)) == NULL) {
+ m_freem(m);
+ *m0 = NULL;
+ goto no_mem;
+ }
+ }
+#endif
+ if (dir == PF_IN)
+ pd->pf_mtag->flags |= PF_TAG_FRAGCACHE;
+
+ if (frag != NULL && (frag->fr_flags & PFFRAG_DROP))
+ goto drop;
+ goto fragment_pass;
+ }
+
+ no_fragment:
+ /* At this point, only IP_DF is allowed in ip_off */
+ if (h->ip_off & ~htons(IP_DF)) {
+ u_int16_t ip_off = h->ip_off;
+
+ h->ip_off &= htons(IP_DF);
+ h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0);
+ }
+
+ /* not missing a return here */
+
+ fragment_pass:
+ pf_scrub_ip(&m, r->rule_flag, r->min_ttl, r->set_tos);
+
+ if ((r->rule_flag & (PFRULE_FRAGCROP|PFRULE_FRAGDROP)) == 0)
+ pd->flags |= PFDESC_IP_REAS;
+ return (PF_PASS);
+
+ no_mem:
+ REASON_SET(reason, PFRES_MEMORY);
+ if (r != NULL && r->log)
+ PFLOG_PACKET(kif, m, AF_INET, dir, *reason, r, NULL, NULL, pd,
+ 1);
+ return (PF_DROP);
+
+ drop:
+ REASON_SET(reason, PFRES_NORM);
+ if (r != NULL && r->log)
+ PFLOG_PACKET(kif, m, AF_INET, dir, *reason, r, NULL, NULL, pd,
+ 1);
+ return (PF_DROP);
+
+ bad:
+ DPFPRINTF(("dropping bad fragment\n"));
+
+ /* Free associated fragments */
+ if (frag != NULL) {
+ pf_free_fragment(frag);
+ PF_FRAG_UNLOCK();
+ }
+
+ REASON_SET(reason, PFRES_FRAG);
+ if (r != NULL && r->log)
+ PFLOG_PACKET(kif, m, AF_INET, dir, *reason, r, NULL, NULL, pd,
+ 1);
+
+ return (PF_DROP);
+}
+#endif
+
+#ifdef INET6
+int
+pf_normalize_ip6(struct mbuf **m0, int dir, struct pfi_kif *kif,
+ u_short *reason, struct pf_pdesc *pd)
+{
+ struct mbuf *m = *m0;
+ struct pf_rule *r;
+ struct ip6_hdr *h = mtod(m, struct ip6_hdr *);
+ int off;
+ struct ip6_ext ext;
+ struct ip6_opt opt;
+ struct ip6_opt_jumbo jumbo;
+ struct ip6_frag frag;
+ u_int32_t jumbolen = 0, plen;
+ u_int16_t fragoff = 0;
+ int optend;
+ int ooff;
+ u_int8_t proto;
+ int terminal;
+
+ PF_RULES_RASSERT();
+
+ r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr);
+ while (r != NULL) {
+ r->evaluations++;
+ if (pfi_kif_match(r->kif, kif) == r->ifnot)
+ r = r->skip[PF_SKIP_IFP].ptr;
+ else if (r->direction && r->direction != dir)
+ r = r->skip[PF_SKIP_DIR].ptr;
+ else if (r->af && r->af != AF_INET6)
+ r = r->skip[PF_SKIP_AF].ptr;
+#if 0 /* header chain! */
+ else if (r->proto && r->proto != h->ip6_nxt)
+ r = r->skip[PF_SKIP_PROTO].ptr;
+#endif
+ else if (PF_MISMATCHAW(&r->src.addr,
+ (struct pf_addr *)&h->ip6_src, AF_INET6,
+ r->src.neg, kif, M_GETFIB(m)))
+ r = r->skip[PF_SKIP_SRC_ADDR].ptr;
+ else if (PF_MISMATCHAW(&r->dst.addr,
+ (struct pf_addr *)&h->ip6_dst, AF_INET6,
+ r->dst.neg, NULL, M_GETFIB(m)))
+ r = r->skip[PF_SKIP_DST_ADDR].ptr;
+ else
+ break;
+ }
+
+ if (r == NULL || r->action == PF_NOSCRUB)
+ return (PF_PASS);
+ else {
+ r->packets[dir == PF_OUT]++;
+ r->bytes[dir == PF_OUT] += pd->tot_len;
+ }
+
+ /* Check for illegal packets */
+ if (sizeof(struct ip6_hdr) + IPV6_MAXPACKET < m->m_pkthdr.len)
+ goto drop;
+
+ off = sizeof(struct ip6_hdr);
+ proto = h->ip6_nxt;
+ terminal = 0;
+ do {
+ switch (proto) {
+ case IPPROTO_FRAGMENT:
+ goto fragment;
+ break;
+ case IPPROTO_AH:
+ case IPPROTO_ROUTING:
+ case IPPROTO_DSTOPTS:
+ if (!pf_pull_hdr(m, off, &ext, sizeof(ext), NULL,
+ NULL, AF_INET6))
+ goto shortpkt;
+ if (proto == IPPROTO_AH)
+ off += (ext.ip6e_len + 2) * 4;
+ else
+ off += (ext.ip6e_len + 1) * 8;
+ proto = ext.ip6e_nxt;
+ break;
+ case IPPROTO_HOPOPTS:
+ if (!pf_pull_hdr(m, off, &ext, sizeof(ext), NULL,
+ NULL, AF_INET6))
+ goto shortpkt;
+ optend = off + (ext.ip6e_len + 1) * 8;
+ ooff = off + sizeof(ext);
+ do {
+ if (!pf_pull_hdr(m, ooff, &opt.ip6o_type,
+ sizeof(opt.ip6o_type), NULL, NULL,
+ AF_INET6))
+ goto shortpkt;
+ if (opt.ip6o_type == IP6OPT_PAD1) {
+ ooff++;
+ continue;
+ }
+ if (!pf_pull_hdr(m, ooff, &opt, sizeof(opt),
+ NULL, NULL, AF_INET6))
+ goto shortpkt;
+ if (ooff + sizeof(opt) + opt.ip6o_len > optend)
+ goto drop;
+ switch (opt.ip6o_type) {
+ case IP6OPT_JUMBO:
+ if (h->ip6_plen != 0)
+ goto drop;
+ if (!pf_pull_hdr(m, ooff, &jumbo,
+ sizeof(jumbo), NULL, NULL,
+ AF_INET6))
+ goto shortpkt;
+ memcpy(&jumbolen, jumbo.ip6oj_jumbo_len,
+ sizeof(jumbolen));
+ jumbolen = ntohl(jumbolen);
+ if (jumbolen <= IPV6_MAXPACKET)
+ goto drop;
+ if (sizeof(struct ip6_hdr) + jumbolen !=
+ m->m_pkthdr.len)
+ goto drop;
+ break;
+ default:
+ break;
+ }
+ ooff += sizeof(opt) + opt.ip6o_len;
+ } while (ooff < optend);
+
+ off = optend;
+ proto = ext.ip6e_nxt;
+ break;
+ default:
+ terminal = 1;
+ break;
+ }
+ } while (!terminal);
+
+ /* jumbo payload option must be present, or plen > 0 */
+ if (ntohs(h->ip6_plen) == 0)
+ plen = jumbolen;
+ else
+ plen = ntohs(h->ip6_plen);
+ if (plen == 0)
+ goto drop;
+ if (sizeof(struct ip6_hdr) + plen > m->m_pkthdr.len)
+ goto shortpkt;
+
+ pf_scrub_ip6(&m, r->min_ttl);
+
+ return (PF_PASS);
+
+ fragment:
+ if (ntohs(h->ip6_plen) == 0 || jumbolen)
+ goto drop;
+ plen = ntohs(h->ip6_plen);
+
+ if (!pf_pull_hdr(m, off, &frag, sizeof(frag), NULL, NULL, AF_INET6))
+ goto shortpkt;
+ fragoff = ntohs(frag.ip6f_offlg & IP6F_OFF_MASK);
+ if (fragoff + (plen - off - sizeof(frag)) > IPV6_MAXPACKET)
+ goto badfrag;
+
+ /* do something about it */
+ /* remember to set pd->flags |= PFDESC_IP_REAS */
+ return (PF_PASS);
+
+ shortpkt:
+ REASON_SET(reason, PFRES_SHORT);
+ if (r != NULL && r->log)
+ PFLOG_PACKET(kif, m, AF_INET6, dir, *reason, r, NULL, NULL, pd,
+ 1);
+ return (PF_DROP);
+
+ drop:
+ REASON_SET(reason, PFRES_NORM);
+ if (r != NULL && r->log)
+ PFLOG_PACKET(kif, m, AF_INET6, dir, *reason, r, NULL, NULL, pd,
+ 1);
+ return (PF_DROP);
+
+ badfrag:
+ REASON_SET(reason, PFRES_FRAG);
+ if (r != NULL && r->log)
+ PFLOG_PACKET(kif, m, AF_INET6, dir, *reason, r, NULL, NULL, pd,
+ 1);
+ return (PF_DROP);
+}
+#endif /* INET6 */
+
+int
+pf_normalize_tcp(int dir, struct pfi_kif *kif, struct mbuf *m, int ipoff,
+ int off, void *h, struct pf_pdesc *pd)
+{
+ struct pf_rule *r, *rm = NULL;
+ struct tcphdr *th = pd->hdr.tcp;
+ int rewrite = 0;
+ u_short reason;
+ u_int8_t flags;
+ sa_family_t af = pd->af;
+
+ PF_RULES_RASSERT();
+
+ r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr);
+ while (r != NULL) {
+ r->evaluations++;
+ if (pfi_kif_match(r->kif, kif) == r->ifnot)
+ r = r->skip[PF_SKIP_IFP].ptr;
+ else if (r->direction && r->direction != dir)
+ r = r->skip[PF_SKIP_DIR].ptr;
+ else if (r->af && r->af != af)
+ r = r->skip[PF_SKIP_AF].ptr;
+ else if (r->proto && r->proto != pd->proto)
+ r = r->skip[PF_SKIP_PROTO].ptr;
+ else if (PF_MISMATCHAW(&r->src.addr, pd->src, af,
+ r->src.neg, kif, M_GETFIB(m)))
+ r = r->skip[PF_SKIP_SRC_ADDR].ptr;
+ else if (r->src.port_op && !pf_match_port(r->src.port_op,
+ r->src.port[0], r->src.port[1], th->th_sport))
+ r = r->skip[PF_SKIP_SRC_PORT].ptr;
+ else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af,
+ r->dst.neg, NULL, M_GETFIB(m)))
+ r = r->skip[PF_SKIP_DST_ADDR].ptr;
+ else if (r->dst.port_op && !pf_match_port(r->dst.port_op,
+ r->dst.port[0], r->dst.port[1], th->th_dport))
+ r = r->skip[PF_SKIP_DST_PORT].ptr;
+ else if (r->os_fingerprint != PF_OSFP_ANY && !pf_osfp_match(
+ pf_osfp_fingerprint(pd, m, off, th),
+ r->os_fingerprint))
+ r = TAILQ_NEXT(r, entries);
+ else {
+ rm = r;
+ break;
+ }
+ }
+
+ if (rm == NULL || rm->action == PF_NOSCRUB)
+ return (PF_PASS);
+ else {
+ r->packets[dir == PF_OUT]++;
+ r->bytes[dir == PF_OUT] += pd->tot_len;
+ }
+
+ if (rm->rule_flag & PFRULE_REASSEMBLE_TCP)
+ pd->flags |= PFDESC_TCP_NORM;
+
+ flags = th->th_flags;
+ if (flags & TH_SYN) {
+ /* Illegal packet */
+ if (flags & TH_RST)
+ goto tcp_drop;
+
+ if (flags & TH_FIN)
+ flags &= ~TH_FIN;
+ } else {
+ /* Illegal packet */
+ if (!(flags & (TH_ACK|TH_RST)))
+ goto tcp_drop;
+ }
+
+ if (!(flags & TH_ACK)) {
+ /* These flags are only valid if ACK is set */
+ if ((flags & TH_FIN) || (flags & TH_PUSH) || (flags & TH_URG))
+ goto tcp_drop;
+ }
+
+ /* Check for illegal header length */
+ if (th->th_off < (sizeof(struct tcphdr) >> 2))
+ goto tcp_drop;
+
+ /* If flags changed, or reserved data set, then adjust */
+ if (flags != th->th_flags || th->th_x2 != 0) {
+ u_int16_t ov, nv;
+
+ ov = *(u_int16_t *)(&th->th_ack + 1);
+ th->th_flags = flags;
+ th->th_x2 = 0;
+ nv = *(u_int16_t *)(&th->th_ack + 1);
+
+ th->th_sum = pf_cksum_fixup(th->th_sum, ov, nv, 0);
+ rewrite = 1;
+ }
+
+ /* Remove urgent pointer, if TH_URG is not set */
+ if (!(flags & TH_URG) && th->th_urp) {
+ th->th_sum = pf_cksum_fixup(th->th_sum, th->th_urp, 0, 0);
+ th->th_urp = 0;
+ rewrite = 1;
+ }
+
+ /* Process options */
+ if (r->max_mss && pf_normalize_tcpopt(r, m, th, off, pd->af))
+ rewrite = 1;
+
+ /* copy back packet headers if we sanitized */
+ if (rewrite)
+ m_copyback(m, off, sizeof(*th), (caddr_t)th);
+
+ return (PF_PASS);
+
+ tcp_drop:
+ REASON_SET(&reason, PFRES_NORM);
+ if (rm != NULL && r->log)
+ PFLOG_PACKET(kif, m, AF_INET, dir, reason, r, NULL, NULL, pd,
+ 1);
+ return (PF_DROP);
+}
+
+int
+pf_normalize_tcp_init(struct mbuf *m, int off, struct pf_pdesc *pd,
+ struct tcphdr *th, struct pf_state_peer *src, struct pf_state_peer *dst)
+{
+ u_int32_t tsval, tsecr;
+ u_int8_t hdr[60];
+ u_int8_t *opt;
+
+ KASSERT((src->scrub == NULL),
+ ("pf_normalize_tcp_init: src->scrub != NULL"));
+
+ src->scrub = uma_zalloc(V_pf_state_scrub_z, M_ZERO | M_NOWAIT);
+ if (src->scrub == NULL)
+ return (1);
+
+ switch (pd->af) {
+#ifdef INET
+ case AF_INET: {
+ struct ip *h = mtod(m, struct ip *);
+ src->scrub->pfss_ttl = h->ip_ttl;
+ break;
+ }
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6: {
+ struct ip6_hdr *h = mtod(m, struct ip6_hdr *);
+ src->scrub->pfss_ttl = h->ip6_hlim;
+ break;
+ }
+#endif /* INET6 */
+ }
+
+
+ /*
+ * All normalizations below are only begun if we see the start of
+ * the connection. They must all set an enabled bit in pfss_flags.
+ */
+ if ((th->th_flags & TH_SYN) == 0)
+ return (0);
+
+
+ if (th->th_off > (sizeof(struct tcphdr) >> 2) && src->scrub &&
+ pf_pull_hdr(m, off, hdr, th->th_off << 2, NULL, NULL, pd->af)) {
+ /* Diddle with TCP options */
+ int hlen;
+ opt = hdr + sizeof(struct tcphdr);
+ hlen = (th->th_off << 2) - sizeof(struct tcphdr);
+ while (hlen >= TCPOLEN_TIMESTAMP) {
+ switch (*opt) {
+ case TCPOPT_EOL: /* FALLTHROUGH */
+ case TCPOPT_NOP:
+ opt++;
+ hlen--;
+ break;
+ case TCPOPT_TIMESTAMP:
+ if (opt[1] >= TCPOLEN_TIMESTAMP) {
+ src->scrub->pfss_flags |=
+ PFSS_TIMESTAMP;
+ src->scrub->pfss_ts_mod =
+ htonl(arc4random());
+
+ /* note PFSS_PAWS not set yet */
+ memcpy(&tsval, &opt[2],
+ sizeof(u_int32_t));
+ memcpy(&tsecr, &opt[6],
+ sizeof(u_int32_t));
+ src->scrub->pfss_tsval0 = ntohl(tsval);
+ src->scrub->pfss_tsval = ntohl(tsval);
+ src->scrub->pfss_tsecr = ntohl(tsecr);
+ getmicrouptime(&src->scrub->pfss_last);
+ }
+ /* FALLTHROUGH */
+ default:
+ hlen -= MAX(opt[1], 2);
+ opt += MAX(opt[1], 2);
+ break;
+ }
+ }
+ }
+
+ return (0);
+}
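+
+/*
+ * How the pfss_ts_mod chosen above is used (illustrative numbers): if
+ * the SYN sender gets pfss_ts_mod = 1000, pf_normalize_tcp_stateful()
+ * rewrites its outgoing tsval 5 to 1005 on the wire, and subtracts the
+ * same 1000 from the peer's echoed tsecr of 1005 on the way back, so
+ * neither endpoint ever sees the modulation.
+ */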
+
+void
+pf_normalize_tcp_cleanup(struct pf_state *state)
+{
+ if (state->src.scrub)
+ uma_zfree(V_pf_state_scrub_z, state->src.scrub);
+ if (state->dst.scrub)
+ uma_zfree(V_pf_state_scrub_z, state->dst.scrub);
+
+ /* Someday... flush the TCP segment reassembly descriptors. */
+}
+
+int
+pf_normalize_tcp_stateful(struct mbuf *m, int off, struct pf_pdesc *pd,
+ u_short *reason, struct tcphdr *th, struct pf_state *state,
+ struct pf_state_peer *src, struct pf_state_peer *dst, int *writeback)
+{
+ struct timeval uptime;
+ u_int32_t tsval, tsecr;
+ u_int tsval_from_last;
+ u_int8_t hdr[60];
+ u_int8_t *opt;
+ int copyback = 0;
+ int got_ts = 0;
+
+ KASSERT((src->scrub || dst->scrub),
+ ("%s: src->scrub && dst->scrub!", __func__));
+
+ /*
+ * Enforce the minimum TTL seen for this connection. Negate a common
+ * technique to evade an intrusion detection system and confuse
+ * firewall state code.
+ */
+ switch (pd->af) {
+#ifdef INET
+ case AF_INET: {
+ if (src->scrub) {
+ struct ip *h = mtod(m, struct ip *);
+ if (h->ip_ttl > src->scrub->pfss_ttl)
+ src->scrub->pfss_ttl = h->ip_ttl;
+ h->ip_ttl = src->scrub->pfss_ttl;
+ }
+ break;
+ }
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6: {
+ if (src->scrub) {
+ struct ip6_hdr *h = mtod(m, struct ip6_hdr *);
+ if (h->ip6_hlim > src->scrub->pfss_ttl)
+ src->scrub->pfss_ttl = h->ip6_hlim;
+ h->ip6_hlim = src->scrub->pfss_ttl;
+ }
+ break;
+ }
+#endif /* INET6 */
+ }
+
+ if (th->th_off > (sizeof(struct tcphdr) >> 2) &&
+ ((src->scrub && (src->scrub->pfss_flags & PFSS_TIMESTAMP)) ||
+ (dst->scrub && (dst->scrub->pfss_flags & PFSS_TIMESTAMP))) &&
+ pf_pull_hdr(m, off, hdr, th->th_off << 2, NULL, NULL, pd->af)) {
+ /* Diddle with TCP options */
+ int hlen;
+ opt = hdr + sizeof(struct tcphdr);
+ hlen = (th->th_off << 2) - sizeof(struct tcphdr);
+ while (hlen >= TCPOLEN_TIMESTAMP) {
+ switch (*opt) {
+ case TCPOPT_EOL: /* FALLTHROUGH */
+ case TCPOPT_NOP:
+ opt++;
+ hlen--;
+ break;
+ case TCPOPT_TIMESTAMP:
+ /* Modulate the timestamps. Can be used for
+ * NAT detection, OS uptime determination or
+ * reboot detection.
+ */
+
+ if (got_ts) {
+ /* Huh? Multiple timestamps!? */
+ if (V_pf_status.debug >= PF_DEBUG_MISC) {
+ DPFPRINTF(("multiple TS??"));
+ pf_print_state(state);
+ printf("\n");
+ }
+ REASON_SET(reason, PFRES_TS);
+ return (PF_DROP);
+ }
+ if (opt[1] >= TCPOLEN_TIMESTAMP) {
+ memcpy(&tsval, &opt[2],
+ sizeof(u_int32_t));
+ if (tsval && src->scrub &&
+ (src->scrub->pfss_flags &
+ PFSS_TIMESTAMP)) {
+ tsval = ntohl(tsval);
+ pf_change_a(&opt[2],
+ &th->th_sum,
+ htonl(tsval +
+ src->scrub->pfss_ts_mod),
+ 0);
+ copyback = 1;
+ }
+
+ /* Modulate TS reply iff valid (!0) */
+ memcpy(&tsecr, &opt[6],
+ sizeof(u_int32_t));
+ if (tsecr && dst->scrub &&
+ (dst->scrub->pfss_flags &
+ PFSS_TIMESTAMP)) {
+ tsecr = ntohl(tsecr)
+ - dst->scrub->pfss_ts_mod;
+ pf_change_a(&opt[6],
+ &th->th_sum, htonl(tsecr),
+ 0);
+ copyback = 1;
+ }
+ got_ts = 1;
+ }
+ /* FALLTHROUGH */
+ default:
+ hlen -= MAX(opt[1], 2);
+ opt += MAX(opt[1], 2);
+ break;
+ }
+ }
+ if (copyback) {
+ /* Copy back the options; the caller copies back the header */
+ *writeback = 1;
+ m_copyback(m, off + sizeof(struct tcphdr),
+ (th->th_off << 2) - sizeof(struct tcphdr), hdr +
+ sizeof(struct tcphdr));
+ }
+ }
+
+
+ /*
+ * Must invalidate PAWS checks on connections idle for too long.
+ * The fastest allowed timestamp clock is 1ms. That turns out to
+ * be about 24 days before it wraps. XXX Right now our lowerbound
+ * TS echo check only works for the first 12 days of a connection
+ * when the TS has exhausted half its 32bit space
+ */
+#define TS_MAX_IDLE (24*24*60*60)
+#define TS_MAX_CONN (12*24*60*60) /* XXX remove when better tsecr check */
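+ /*
+ * Where the 24 day figure comes from: at the fastest allowed tick of
+ * 1 ms, half of the 32-bit timestamp space is 2^31 ms, roughly 24.8
+ * days, presumably the point where the SEQ_*() comparisons become
+ * ambiguous; the 12 day figure is the stopgap for the weak tsecr
+ * lowerbound noted above.
+ */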
+
+ getmicrouptime(&uptime);
+ if (src->scrub && (src->scrub->pfss_flags & PFSS_PAWS) &&
+ (uptime.tv_sec - src->scrub->pfss_last.tv_sec > TS_MAX_IDLE ||
+ time_uptime - state->creation > TS_MAX_CONN)) {
+ if (V_pf_status.debug >= PF_DEBUG_MISC) {
+ DPFPRINTF(("src idled out of PAWS\n"));
+ pf_print_state(state);
+ printf("\n");
+ }
+ src->scrub->pfss_flags = (src->scrub->pfss_flags & ~PFSS_PAWS)
+ | PFSS_PAWS_IDLED;
+ }
+ if (dst->scrub && (dst->scrub->pfss_flags & PFSS_PAWS) &&
+ uptime.tv_sec - dst->scrub->pfss_last.tv_sec > TS_MAX_IDLE) {
+ if (V_pf_status.debug >= PF_DEBUG_MISC) {
+ DPFPRINTF(("dst idled out of PAWS\n"));
+ pf_print_state(state);
+ printf("\n");
+ }
+ dst->scrub->pfss_flags = (dst->scrub->pfss_flags & ~PFSS_PAWS)
+ | PFSS_PAWS_IDLED;
+ }
+
+ if (got_ts && src->scrub && dst->scrub &&
+ (src->scrub->pfss_flags & PFSS_PAWS) &&
+ (dst->scrub->pfss_flags & PFSS_PAWS)) {
+ /* Validate that the timestamps are "in-window".
+ * RFC1323 describes TCP Timestamp options that allow
+ * measurement of RTT (round trip time) and PAWS
+ * (protection against wrapped sequence numbers). PAWS
+ * gives us a set of rules for rejecting packets on
+ * long fat pipes (packets that were somehow delayed
+ * in transit longer than the time it took to send the
+ * full TCP sequence space of 4Gb). We can use these
+ * rules and infer a few others that will let us treat
+ * the 32bit timestamp and the 32bit echoed timestamp
+ * as sequence numbers to prevent a blind attacker from
+ * inserting packets into a connection.
+ *
+ * RFC1323 tells us:
+ * - The timestamp on this packet must be greater than
+ * or equal to the last value echoed by the other
+ * endpoint. The RFC says those will be discarded
+ * since it is a dup that has already been acked.
+ * This gives us a lowerbound on the timestamp.
+ * timestamp >= other last echoed timestamp
+ * - The timestamp will be less than or equal to
+ * the last timestamp plus the time between the
+ * last packet and now. The RFC defines the max
+ * clock rate as 1ms. We will allow clocks to be
+ * up to 10% fast and will allow a total difference
+ * of 30 seconds due to a route change. And this
+ * gives us an upperbound on the timestamp.
+ * timestamp <= last timestamp + max ticks
+ * We have to be careful here. Windows will send an
+ * initial timestamp of zero and then initialize it
+ * to a random value after the 3whs; presumably to
+ * avoid a DoS from having to call an expensive RNG
+ * during a SYN flood. Proof that MS has at least one
+ * good security geek.
+ *
+ * - The TCP timestamp option must also echo the other
+ * endpoints timestamp. The timestamp echoed is the
+ * one carried on the earliest unacknowledged segment
+ * on the left edge of the sequence window. The RFC
+ * states that the host will reject any echoed
+ * timestamps that were larger than any ever sent.
+ * This gives us an upperbound on the TS echo.
+ * tsecr <= largest_tsval
+ * - The lowerbound on the TS echo is a little more
+ * tricky to determine. The other endpoint's echoed
+ * values will not decrease. But there may be
+ * network conditions that re-order packets and
+ * cause our view of them to decrease. For now the
+ * only lowerbound we can safely determine is that
+ * the TS echo will never be less than the original
+ * TS. XXX There is probably a better lowerbound.
+ * Remove TS_MAX_CONN with better lowerbound check.
+ * tsecr >= other original TS
+ *
+ * It is also important to note that the fastest
+ * timestamp clock of 1ms will wrap its 32bit space in
+ * 24 days. So we just disable TS checking after 24
+ * days of idle time. We actually must use a 12d
+ * connection limit until we can come up with a better
+ * lowerbound to the TS echo check.
+ */
+ struct timeval delta_ts;
+ int ts_fudge;
+
+
+ /*
+ * PFTM_TS_DIFF is how many seconds of leeway to allow
+ * a host's timestamp. This can happen if the previous
+ * packet got delayed in transit for much longer than
+ * this packet.
+ */
+ if ((ts_fudge = state->rule.ptr->timeout[PFTM_TS_DIFF]) == 0)
+ ts_fudge = V_pf_default_rule.timeout[PFTM_TS_DIFF];
+
+ /* Calculate max ticks since the last timestamp */
+#define TS_MAXFREQ 1100 /* RFC max TS freq of 1Khz + 10% skew */
+#define TS_MICROSECS 1000000 /* microseconds per second */
+ delta_ts = uptime;
+ timevalsub(&delta_ts, &src->scrub->pfss_last);
+ tsval_from_last = (delta_ts.tv_sec + ts_fudge) * TS_MAXFREQ;
+ tsval_from_last += delta_ts.tv_usec / (TS_MICROSECS/TS_MAXFREQ);
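+ /*
+ * Worked example (made-up numbers): with 2.5 s of idle time and a
+ * ts_fudge of 30 s this allows (2 + 30) * 1100 + 500000 / 909 =
+ * 35200 + 550 = 35750 ticks of timestamp advance since the last
+ * packet.
+ */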
+
+ if ((src->state >= TCPS_ESTABLISHED &&
+ dst->state >= TCPS_ESTABLISHED) &&
+ (SEQ_LT(tsval, dst->scrub->pfss_tsecr) ||
+ SEQ_GT(tsval, src->scrub->pfss_tsval + tsval_from_last) ||
+ (tsecr && (SEQ_GT(tsecr, dst->scrub->pfss_tsval) ||
+ SEQ_LT(tsecr, dst->scrub->pfss_tsval0))))) {
+ /* Bad RFC1323 implementation or an insertion attack.
+ *
+ * - Solaris 2.6 and 2.7 are known to send another ACK
+ * after the FIN,FIN|ACK,ACK closing that carries
+ * an old timestamp.
+ */
+
+ DPFPRINTF(("Timestamp failed %c%c%c%c\n",
+ SEQ_LT(tsval, dst->scrub->pfss_tsecr) ? '0' : ' ',
+ SEQ_GT(tsval, src->scrub->pfss_tsval +
+ tsval_from_last) ? '1' : ' ',
+ SEQ_GT(tsecr, dst->scrub->pfss_tsval) ? '2' : ' ',
+ SEQ_LT(tsecr, dst->scrub->pfss_tsval0)? '3' : ' '));
+ DPFPRINTF((" tsval: %u tsecr: %u +ticks: %u "
+ "idle: %jus %lums\n",
+ tsval, tsecr, tsval_from_last,
+ (uintmax_t)delta_ts.tv_sec,
+ delta_ts.tv_usec / 1000));
+ DPFPRINTF((" src->tsval: %u tsecr: %u\n",
+ src->scrub->pfss_tsval, src->scrub->pfss_tsecr));
+ DPFPRINTF((" dst->tsval: %u tsecr: %u tsval0: %u"
+ "\n", dst->scrub->pfss_tsval,
+ dst->scrub->pfss_tsecr, dst->scrub->pfss_tsval0));
+ if (V_pf_status.debug >= PF_DEBUG_MISC) {
+ pf_print_state(state);
+ pf_print_flags(th->th_flags);
+ printf("\n");
+ }
+ REASON_SET(reason, PFRES_TS);
+ return (PF_DROP);
+ }
+
+ /* XXX I'd really like to require tsecr but it's optional */
+
+ } else if (!got_ts && (th->th_flags & TH_RST) == 0 &&
+ ((src->state == TCPS_ESTABLISHED && dst->state == TCPS_ESTABLISHED)
+ || pd->p_len > 0 || (th->th_flags & TH_SYN)) &&
+ src->scrub && dst->scrub &&
+ (src->scrub->pfss_flags & PFSS_PAWS) &&
+ (dst->scrub->pfss_flags & PFSS_PAWS)) {
+ /* Didn't send a timestamp. Timestamps aren't really useful
+ * when:
+ * - connection opening or closing (often not even sent).
+ * but we must not let an attacker put a FIN on a
+ * data packet to sneak it through our ESTABLISHED check.
+ * - on a TCP reset. RFC suggests not even looking at TS.
+ * - on an empty ACK. The TS will not be echoed so it will
+ * probably not help keep the RTT calculation in sync and
+ * there isn't as much danger when the sequence numbers
+ * got wrapped. So some stacks don't include TS on empty
+ * ACKs :-(
+ *
+ * To minimize the disruption to mostly RFC1323 conformant
+ * stacks, we will only require timestamps on data packets.
+ *
+ * And what do ya know, we cannot require timestamps on data
+ * packets. There appear to be devices that do legitimate
+ * TCP connection hijacking. There are HTTP devices that allow
+ * a 3whs (with timestamps) and then buffer the HTTP request.
+ * If the intermediate device has the HTTP response cache, it
+ * will spoof the response but not bother timestamping its
+ * packets. So we can look for the presence of a timestamp in
+ * the first data packet and if there, require it in all future
+ * packets.
+ */
+
+ if (pd->p_len > 0 && (src->scrub->pfss_flags & PFSS_DATA_TS)) {
+ /*
+ * Hey! Someone tried to sneak a packet in. Or the
+ * stack changed its RFC1323 behavior?!?!
+ */
+ if (V_pf_status.debug >= PF_DEBUG_MISC) {
+ DPFPRINTF(("Did not receive expected RFC1323 "
+ "timestamp\n"));
+ pf_print_state(state);
+ pf_print_flags(th->th_flags);
+ printf("\n");
+ }
+ REASON_SET(reason, PFRES_TS);
+ return (PF_DROP);
+ }
+ }
+
+
+ /*
+ * We will note whether a host sends its data packets with or without
+ * timestamps, and require all data packets to contain a timestamp
+ * if the first one does. PAWS implicitly requires that all data packets be
+ * timestamped. But I think there are middle-man devices that hijack
+ * TCP streams immediately after the 3whs and don't timestamp their
+ * packets (seen in a WWW accelerator or cache).
+ */
+ if (pd->p_len > 0 && src->scrub && (src->scrub->pfss_flags &
+ (PFSS_TIMESTAMP|PFSS_DATA_TS|PFSS_DATA_NOTS)) == PFSS_TIMESTAMP) {
+ if (got_ts)
+ src->scrub->pfss_flags |= PFSS_DATA_TS;
+ else {
+ src->scrub->pfss_flags |= PFSS_DATA_NOTS;
+ if (V_pf_status.debug >= PF_DEBUG_MISC && dst->scrub &&
+ (dst->scrub->pfss_flags & PFSS_TIMESTAMP)) {
+ /* Don't warn if other host rejected RFC1323 */
+ DPFPRINTF(("Broken RFC1323 stack did not "
+ "timestamp data packet. Disabled PAWS "
+ "security.\n"));
+ pf_print_state(state);
+ pf_print_flags(th->th_flags);
+ printf("\n");
+ }
+ }
+ }
+
+
+ /*
+ * Update PAWS values
+ */
+ if (got_ts && src->scrub && PFSS_TIMESTAMP == (src->scrub->pfss_flags &
+ (PFSS_PAWS_IDLED|PFSS_TIMESTAMP))) {
+ getmicrouptime(&src->scrub->pfss_last);
+ if (SEQ_GEQ(tsval, src->scrub->pfss_tsval) ||
+ (src->scrub->pfss_flags & PFSS_PAWS) == 0)
+ src->scrub->pfss_tsval = tsval;
+
+ if (tsecr) {
+ if (SEQ_GEQ(tsecr, src->scrub->pfss_tsecr) ||
+ (src->scrub->pfss_flags & PFSS_PAWS) == 0)
+ src->scrub->pfss_tsecr = tsecr;
+
+ if ((src->scrub->pfss_flags & PFSS_PAWS) == 0 &&
+ (SEQ_LT(tsval, src->scrub->pfss_tsval0) ||
+ src->scrub->pfss_tsval0 == 0)) {
+ /* tsval0 MUST be the lowest timestamp */
+ src->scrub->pfss_tsval0 = tsval;
+ }
+
+ /* Only fully initialized after a TS gets echoed */
+ if ((src->scrub->pfss_flags & PFSS_PAWS) == 0)
+ src->scrub->pfss_flags |= PFSS_PAWS;
+ }
+ }
+
+ /* I have a dream.... TCP segment reassembly.... */
+ return (0);
+}
+
+static int
+pf_normalize_tcpopt(struct pf_rule *r, struct mbuf *m, struct tcphdr *th,
+ int off, sa_family_t af)
+{
+ u_int16_t *mss;
+ int thoff;
+ int opt, cnt, optlen = 0;
+ int rewrite = 0;
+ u_char opts[TCP_MAXOLEN];
+ u_char *optp = opts;
+
+ thoff = th->th_off << 2;
+ cnt = thoff - sizeof(struct tcphdr);
+
+ if (cnt > 0 && !pf_pull_hdr(m, off + sizeof(*th), opts, cnt,
+ NULL, NULL, af))
+ return (rewrite);
+
+ for (; cnt > 0; cnt -= optlen, optp += optlen) {
+ opt = optp[0];
+ if (opt == TCPOPT_EOL)
+ break;
+ if (opt == TCPOPT_NOP)
+ optlen = 1;
+ else {
+ if (cnt < 2)
+ break;
+ optlen = optp[1];
+ if (optlen < 2 || optlen > cnt)
+ break;
+ }
+ switch (opt) {
+ case TCPOPT_MAXSEG:
+ mss = (u_int16_t *)(optp + 2);
+ if ((ntohs(*mss)) > r->max_mss) {
+ th->th_sum = pf_cksum_fixup(th->th_sum,
+ *mss, htons(r->max_mss), 0);
+ *mss = htons(r->max_mss);
+ rewrite = 1;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (rewrite)
+ m_copyback(m, off + sizeof(*th), thoff - sizeof(*th), opts);
+
+ return (rewrite);
+}
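+
+/*
+ * Effect of the MSS clamp above (illustrative rule, not from this
+ * file): with "scrub ... max-mss 1440", a SYN advertising an MSS of
+ * 1460 leaves with MSS 1440 and th_sum fixed up accordingly.
+ */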
+
+#ifdef INET
+static void
+pf_scrub_ip(struct mbuf **m0, u_int32_t flags, u_int8_t min_ttl, u_int8_t tos)
+{
+ struct mbuf *m = *m0;
+ struct ip *h = mtod(m, struct ip *);
+
+ /* Clear IP_DF if no-df was requested */
+ if (flags & PFRULE_NODF && h->ip_off & htons(IP_DF)) {
+ u_int16_t ip_off = h->ip_off;
+
+ h->ip_off &= htons(~IP_DF);
+ h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0);
+ }
+
+ /* Enforce a minimum ttl, may cause endless packet loops */
+ if (min_ttl && h->ip_ttl < min_ttl) {
+ u_int16_t ip_ttl = h->ip_ttl;
+
+ h->ip_ttl = min_ttl;
+ h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_ttl, h->ip_ttl, 0);
+ }
+
+ /* Enforce tos */
+ if (flags & PFRULE_SET_TOS) {
+ u_int16_t ov, nv;
+
+ ov = *(u_int16_t *)h;
+ h->ip_tos = tos;
+ nv = *(u_int16_t *)h;
+
+ h->ip_sum = pf_cksum_fixup(h->ip_sum, ov, nv, 0);
+ }
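+ /*
+ * The ov/nv pair above snapshots the first 16-bit word of the IP
+ * header (the version/header-length byte together with ip_tos), so
+ * the incremental checksum fixup sees the tos change at its natural
+ * 16-bit alignment.
+ */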
+
+ /* random-id, but not for fragments */
+ if (flags & PFRULE_RANDOMID && !(h->ip_off & ~htons(IP_DF))) {
+ u_int16_t ip_id = h->ip_id;
+
+ h->ip_id = ip_randomid();
+ h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_id, h->ip_id, 0);
+ }
+}
+#endif /* INET */
+
+#ifdef INET6
+static void
+pf_scrub_ip6(struct mbuf **m0, u_int8_t min_ttl)
+{
+ struct mbuf *m = *m0;
+ struct ip6_hdr *h = mtod(m, struct ip6_hdr *);
+
+ /* Enforce a minimum ttl, may cause endless packet loops */
+ if (min_ttl && h->ip6_hlim < min_ttl)
+ h->ip6_hlim = min_ttl;
+}
+#endif
diff --git a/sys/netpfil/pf/pf_osfp.c b/sys/netpfil/pf/pf_osfp.c
new file mode 100644
index 0000000..29d4a40
--- /dev/null
+++ b/sys/netpfil/pf/pf_osfp.c
@@ -0,0 +1,526 @@
+/* $OpenBSD: pf_osfp.c,v 1.14 2008/06/12 18:17:01 henning Exp $ */
+
+/*
+ * Copyright (c) 2003 Mike Frantzen <frantzen@w4g.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/socket.h>
+
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/tcp.h>
+
+#include <net/if.h>
+#include <net/pfvar.h>
+
+#include <netinet/ip6.h>
+#include <netinet6/in6_var.h>
+
+static MALLOC_DEFINE(M_PFOSFP, "pf_osfp", "pf(4) operating system fingerprints");
+#define DPFPRINTF(format, x...) \
+ if (V_pf_status.debug >= PF_DEBUG_NOISY) \
+ printf(format , ##x)
+
+SLIST_HEAD(pf_osfp_list, pf_os_fingerprint);
+static VNET_DEFINE(struct pf_osfp_list, pf_osfp_list) =
+ SLIST_HEAD_INITIALIZER();
+#define V_pf_osfp_list VNET(pf_osfp_list)
+
+static struct pf_osfp_enlist *pf_osfp_fingerprint_hdr(const struct ip *,
+ const struct ip6_hdr *,
+ const struct tcphdr *);
+static struct pf_os_fingerprint *pf_osfp_find(struct pf_osfp_list *,
+ struct pf_os_fingerprint *, u_int8_t);
+static struct pf_os_fingerprint *pf_osfp_find_exact(struct pf_osfp_list *,
+ struct pf_os_fingerprint *);
+static void pf_osfp_insert(struct pf_osfp_list *,
+ struct pf_os_fingerprint *);
+#ifdef PFDEBUG
+static struct pf_os_fingerprint *pf_osfp_validate(void);
+#endif
+
+/*
+ * Passively fingerprint the OS of the host (TCP SYN packets only).
+ * Returns the list of possible OSes.
+ */
+struct pf_osfp_enlist *
+pf_osfp_fingerprint(struct pf_pdesc *pd, struct mbuf *m, int off,
+ const struct tcphdr *tcp)
+{
+ struct ip *ip;
+ struct ip6_hdr *ip6;
+ char hdr[60];
+
+ if ((pd->af != PF_INET && pd->af != PF_INET6) ||
+ pd->proto != IPPROTO_TCP || (tcp->th_off << 2) < sizeof(*tcp))
+ return (NULL);
+
+ if (pd->af == PF_INET) {
+ ip = mtod(m, struct ip *);
+ ip6 = (struct ip6_hdr *)NULL;
+ } else {
+ ip = (struct ip *)NULL;
+ ip6 = mtod(m, struct ip6_hdr *);
+ }
+ if (!pf_pull_hdr(m, off, hdr, tcp->th_off << 2, NULL, NULL,
+ pd->af)) return (NULL);
+
+ return (pf_osfp_fingerprint_hdr(ip, ip6, (struct tcphdr *)hdr));
+}
+
+static struct pf_osfp_enlist *
+pf_osfp_fingerprint_hdr(const struct ip *ip, const struct ip6_hdr *ip6, const struct tcphdr *tcp)
+{
+ struct pf_os_fingerprint fp, *fpresult;
+ int cnt, optlen = 0;
+ const u_int8_t *optp;
+ char srcname[128];
+
+ if ((tcp->th_flags & (TH_SYN|TH_ACK)) != TH_SYN)
+ return (NULL);
+ if (ip) {
+ if ((ip->ip_off & htons(IP_OFFMASK)) != 0)
+ return (NULL);
+ }
+
+ memset(&fp, 0, sizeof(fp));
+
+ if (ip) {
+ fp.fp_psize = ntohs(ip->ip_len);
+ fp.fp_ttl = ip->ip_ttl;
+ if (ip->ip_off & htons(IP_DF))
+ fp.fp_flags |= PF_OSFP_DF;
+ strlcpy(srcname, inet_ntoa(ip->ip_src), sizeof(srcname));
+ }
+#ifdef INET6
+ else if (ip6) {
+ /* jumbo payload? */
+ fp.fp_psize = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen);
+ fp.fp_ttl = ip6->ip6_hlim;
+ fp.fp_flags |= PF_OSFP_DF;
+ fp.fp_flags |= PF_OSFP_INET6;
+ strlcpy(srcname, ip6_sprintf((struct in6_addr *)&ip6->ip6_src),
+ sizeof(srcname));
+ }
+#endif
+ else
+ return (NULL);
+ fp.fp_wsize = ntohs(tcp->th_win);
+
+
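+	/*
+	 * Walk the TCP options and fold them into the fingerprint: each
+	 * option code is shifted into fp_tcpopts (PF_OSFP_TCPOPT_BITS bits
+	 * per option); MSS and window scale are recorded and an all-zero
+	 * timestamp sets PF_OSFP_TS0.  An unknown option aborts the
+	 * fingerprint.
+	 */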
+ cnt = (tcp->th_off << 2) - sizeof(*tcp);
+ optp = (const u_int8_t *)((const char *)tcp + sizeof(*tcp));
+ for (; cnt > 0; cnt -= optlen, optp += optlen) {
+ if (*optp == TCPOPT_EOL)
+ break;
+
+ fp.fp_optcnt++;
+ if (*optp == TCPOPT_NOP) {
+ fp.fp_tcpopts = (fp.fp_tcpopts << PF_OSFP_TCPOPT_BITS) |
+ PF_OSFP_TCPOPT_NOP;
+ optlen = 1;
+ } else {
+ if (cnt < 2)
+ return (NULL);
+ optlen = optp[1];
+ if (optlen > cnt || optlen < 2)
+ return (NULL);
+ switch (*optp) {
+ case TCPOPT_MAXSEG:
+ if (optlen >= TCPOLEN_MAXSEG)
+ memcpy(&fp.fp_mss, &optp[2],
+ sizeof(fp.fp_mss));
+ fp.fp_tcpopts = (fp.fp_tcpopts <<
+ PF_OSFP_TCPOPT_BITS) | PF_OSFP_TCPOPT_MSS;
+ NTOHS(fp.fp_mss);
+ break;
+ case TCPOPT_WINDOW:
+ if (optlen >= TCPOLEN_WINDOW)
+ memcpy(&fp.fp_wscale, &optp[2],
+ sizeof(fp.fp_wscale));
+ NTOHS(fp.fp_wscale);
+ fp.fp_tcpopts = (fp.fp_tcpopts <<
+ PF_OSFP_TCPOPT_BITS) |
+ PF_OSFP_TCPOPT_WSCALE;
+ break;
+ case TCPOPT_SACK_PERMITTED:
+ fp.fp_tcpopts = (fp.fp_tcpopts <<
+ PF_OSFP_TCPOPT_BITS) | PF_OSFP_TCPOPT_SACK;
+ break;
+ case TCPOPT_TIMESTAMP:
+ if (optlen >= TCPOLEN_TIMESTAMP) {
+ u_int32_t ts;
+ memcpy(&ts, &optp[2], sizeof(ts));
+ if (ts == 0)
+ fp.fp_flags |= PF_OSFP_TS0;
+
+ }
+ fp.fp_tcpopts = (fp.fp_tcpopts <<
+ PF_OSFP_TCPOPT_BITS) | PF_OSFP_TCPOPT_TS;
+ break;
+ default:
+ return (NULL);
+ }
+ }
+ optlen = MAX(optlen, 1); /* paranoia */
+ }
+
+ DPFPRINTF("fingerprinted %s:%d %d:%d:%d:%d:%llx (%d) "
+ "(TS=%s,M=%s%d,W=%s%d)\n",
+ srcname, ntohs(tcp->th_sport),
+ fp.fp_wsize, fp.fp_ttl, (fp.fp_flags & PF_OSFP_DF) != 0,
+ fp.fp_psize, (long long int)fp.fp_tcpopts, fp.fp_optcnt,
+ (fp.fp_flags & PF_OSFP_TS0) ? "0" : "",
+ (fp.fp_flags & PF_OSFP_MSS_MOD) ? "%" :
+ (fp.fp_flags & PF_OSFP_MSS_DC) ? "*" : "",
+ fp.fp_mss,
+ (fp.fp_flags & PF_OSFP_WSCALE_MOD) ? "%" :
+ (fp.fp_flags & PF_OSFP_WSCALE_DC) ? "*" : "",
+ fp.fp_wscale);
+
+ if ((fpresult = pf_osfp_find(&V_pf_osfp_list, &fp,
+ PF_OSFP_MAXTTL_OFFSET)))
+ return (&fpresult->fp_oses);
+ return (NULL);
+}
+
+/* Match a fingerprint ID against a list of OSes */
+int
+pf_osfp_match(struct pf_osfp_enlist *list, pf_osfp_t os)
+{
+ struct pf_osfp_entry *entry;
+ int os_class, os_version, os_subtype;
+ int en_class, en_version, en_subtype;
+
+ if (os == PF_OSFP_ANY)
+ return (1);
+ if (list == NULL) {
+ DPFPRINTF("osfp no match against %x\n", os);
+ return (os == PF_OSFP_UNKNOWN);
+ }
+ PF_OSFP_UNPACK(os, os_class, os_version, os_subtype);
+ SLIST_FOREACH(entry, list, fp_entry) {
+ PF_OSFP_UNPACK(entry->fp_os, en_class, en_version, en_subtype);
+ if ((os_class == PF_OSFP_ANY || en_class == os_class) &&
+ (os_version == PF_OSFP_ANY || en_version == os_version) &&
+ (os_subtype == PF_OSFP_ANY || en_subtype == os_subtype)) {
+ DPFPRINTF("osfp matched %s %s %s %x==%x\n",
+ entry->fp_class_nm, entry->fp_version_nm,
+ entry->fp_subtype_nm, os, entry->fp_os);
+ return (1);
+ }
+ }
+ DPFPRINTF("fingerprint 0x%x didn't match\n", os);
+ return (0);
+}
+
+/* Flush the fingerprint list */
+void
+pf_osfp_flush(void)
+{
+ struct pf_os_fingerprint *fp;
+ struct pf_osfp_entry *entry;
+
+ while ((fp = SLIST_FIRST(&V_pf_osfp_list))) {
+ SLIST_REMOVE_HEAD(&V_pf_osfp_list, fp_next);
+ while ((entry = SLIST_FIRST(&fp->fp_oses))) {
+ SLIST_REMOVE_HEAD(&fp->fp_oses, fp_entry);
+ free(entry, M_PFOSFP);
+ }
+ free(fp, M_PFOSFP);
+ }
+}
+
+
+/* Add a fingerprint */
+int
+pf_osfp_add(struct pf_osfp_ioctl *fpioc)
+{
+ struct pf_os_fingerprint *fp, fpadd;
+ struct pf_osfp_entry *entry;
+
+ PF_RULES_WASSERT();
+
+ memset(&fpadd, 0, sizeof(fpadd));
+ fpadd.fp_tcpopts = fpioc->fp_tcpopts;
+ fpadd.fp_wsize = fpioc->fp_wsize;
+ fpadd.fp_psize = fpioc->fp_psize;
+ fpadd.fp_mss = fpioc->fp_mss;
+ fpadd.fp_flags = fpioc->fp_flags;
+ fpadd.fp_optcnt = fpioc->fp_optcnt;
+ fpadd.fp_wscale = fpioc->fp_wscale;
+ fpadd.fp_ttl = fpioc->fp_ttl;
+
+#if 0 /* XXX RYAN wants to fix logging */
+ DPFPRINTF("adding osfp %s %s %s = %s%d:%d:%d:%s%d:0x%llx %d "
+ "(TS=%s,M=%s%d,W=%s%d) %x\n",
+ fpioc->fp_os.fp_class_nm, fpioc->fp_os.fp_version_nm,
+ fpioc->fp_os.fp_subtype_nm,
+ (fpadd.fp_flags & PF_OSFP_WSIZE_MOD) ? "%" :
+ (fpadd.fp_flags & PF_OSFP_WSIZE_MSS) ? "S" :
+ (fpadd.fp_flags & PF_OSFP_WSIZE_MTU) ? "T" :
+ (fpadd.fp_flags & PF_OSFP_WSIZE_DC) ? "*" : "",
+ fpadd.fp_wsize,
+ fpadd.fp_ttl,
+ (fpadd.fp_flags & PF_OSFP_DF) ? 1 : 0,
+ (fpadd.fp_flags & PF_OSFP_PSIZE_MOD) ? "%" :
+ (fpadd.fp_flags & PF_OSFP_PSIZE_DC) ? "*" : "",
+ fpadd.fp_psize,
+ (long long int)fpadd.fp_tcpopts, fpadd.fp_optcnt,
+ (fpadd.fp_flags & PF_OSFP_TS0) ? "0" : "",
+ (fpadd.fp_flags & PF_OSFP_MSS_MOD) ? "%" :
+ (fpadd.fp_flags & PF_OSFP_MSS_DC) ? "*" : "",
+ fpadd.fp_mss,
+ (fpadd.fp_flags & PF_OSFP_WSCALE_MOD) ? "%" :
+ (fpadd.fp_flags & PF_OSFP_WSCALE_DC) ? "*" : "",
+ fpadd.fp_wscale,
+ fpioc->fp_os.fp_os);
+#endif
+
+ if ((fp = pf_osfp_find_exact(&V_pf_osfp_list, &fpadd))) {
+ SLIST_FOREACH(entry, &fp->fp_oses, fp_entry) {
+ if (PF_OSFP_ENTRY_EQ(entry, &fpioc->fp_os))
+ return (EEXIST);
+ }
+ if ((entry = malloc(sizeof(*entry), M_PFOSFP, M_NOWAIT))
+ == NULL)
+ return (ENOMEM);
+ } else {
+ if ((fp = malloc(sizeof(*fp), M_PFOSFP, M_ZERO | M_NOWAIT))
+ == NULL)
+ return (ENOMEM);
+ fp->fp_tcpopts = fpioc->fp_tcpopts;
+ fp->fp_wsize = fpioc->fp_wsize;
+ fp->fp_psize = fpioc->fp_psize;
+ fp->fp_mss = fpioc->fp_mss;
+ fp->fp_flags = fpioc->fp_flags;
+ fp->fp_optcnt = fpioc->fp_optcnt;
+ fp->fp_wscale = fpioc->fp_wscale;
+ fp->fp_ttl = fpioc->fp_ttl;
+ SLIST_INIT(&fp->fp_oses);
+ if ((entry = malloc(sizeof(*entry), M_PFOSFP, M_NOWAIT))
+ == NULL) {
+ free(fp, M_PFOSFP);
+ return (ENOMEM);
+ }
+ pf_osfp_insert(&V_pf_osfp_list, fp);
+ }
+ memcpy(entry, &fpioc->fp_os, sizeof(*entry));
+
+ /* Make sure the strings are NUL terminated */
+ entry->fp_class_nm[sizeof(entry->fp_class_nm)-1] = '\0';
+ entry->fp_version_nm[sizeof(entry->fp_version_nm)-1] = '\0';
+ entry->fp_subtype_nm[sizeof(entry->fp_subtype_nm)-1] = '\0';
+
+ SLIST_INSERT_HEAD(&fp->fp_oses, entry, fp_entry);
+
+#ifdef PFDEBUG
+ if ((fp = pf_osfp_validate()))
+ printf("Invalid fingerprint list\n");
+#endif /* PFDEBUG */
+ return (0);
+}
+
+
+/* Find a fingerprint in the list */
+static struct pf_os_fingerprint *
+pf_osfp_find(struct pf_osfp_list *list, struct pf_os_fingerprint *find,
+ u_int8_t ttldiff)
+{
+ struct pf_os_fingerprint *f;
+
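+/*
+ * MATCH_INT compares one fingerprint field: with the "don't care" flag the
+ * field always matches, with the "modulus" flag the packet's value must be
+ * divisible by the entry's (non-zero) value, otherwise the values must be
+ * equal.
+ */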
+#define MATCH_INT(_MOD, _DC, _field) \
+ if ((f->fp_flags & _DC) == 0) { \
+ if ((f->fp_flags & _MOD) == 0) { \
+ if (f->_field != find->_field) \
+ continue; \
+ } else { \
+ if (f->_field == 0 || find->_field % f->_field) \
+ continue; \
+ } \
+ }
+
+ SLIST_FOREACH(f, list, fp_next) {
+ if (f->fp_tcpopts != find->fp_tcpopts ||
+ f->fp_optcnt != find->fp_optcnt ||
+ f->fp_ttl < find->fp_ttl ||
+ f->fp_ttl - find->fp_ttl > ttldiff ||
+ (f->fp_flags & (PF_OSFP_DF|PF_OSFP_TS0)) !=
+ (find->fp_flags & (PF_OSFP_DF|PF_OSFP_TS0)))
+ continue;
+
+ MATCH_INT(PF_OSFP_PSIZE_MOD, PF_OSFP_PSIZE_DC, fp_psize)
+ MATCH_INT(PF_OSFP_MSS_MOD, PF_OSFP_MSS_DC, fp_mss)
+ MATCH_INT(PF_OSFP_WSCALE_MOD, PF_OSFP_WSCALE_DC, fp_wscale)
+ if ((f->fp_flags & PF_OSFP_WSIZE_DC) == 0) {
+ if (f->fp_flags & PF_OSFP_WSIZE_MSS) {
+ if (find->fp_mss == 0)
+ continue;
+
+/*
+ * Some "smart" NAT devices and DSL routers will tweak the MSS and set it
+ * to whatever is suitable for the link type.
+ */
+#define SMART_MSS 1460
+ if ((find->fp_wsize % find->fp_mss ||
+ find->fp_wsize / find->fp_mss !=
+ f->fp_wsize) &&
+ (find->fp_wsize % SMART_MSS ||
+ find->fp_wsize / SMART_MSS !=
+ f->fp_wsize))
+ continue;
+ } else if (f->fp_flags & PF_OSFP_WSIZE_MTU) {
+ if (find->fp_mss == 0)
+ continue;
+
+#define MTUOFF (sizeof(struct ip) + sizeof(struct tcphdr))
+#define SMART_MTU (SMART_MSS + MTUOFF)
+ if ((find->fp_wsize % (find->fp_mss + MTUOFF) ||
+ find->fp_wsize / (find->fp_mss + MTUOFF) !=
+ f->fp_wsize) &&
+ (find->fp_wsize % SMART_MTU ||
+ find->fp_wsize / SMART_MTU !=
+ f->fp_wsize))
+ continue;
+ } else if (f->fp_flags & PF_OSFP_WSIZE_MOD) {
+ if (f->fp_wsize == 0 || find->fp_wsize %
+ f->fp_wsize)
+ continue;
+ } else {
+ if (f->fp_wsize != find->fp_wsize)
+ continue;
+ }
+ }
+ return (f);
+ }
+
+ return (NULL);
+}
+
+/* Find an exact fingerprint in the list */
+static struct pf_os_fingerprint *
+pf_osfp_find_exact(struct pf_osfp_list *list, struct pf_os_fingerprint *find)
+{
+ struct pf_os_fingerprint *f;
+
+ SLIST_FOREACH(f, list, fp_next) {
+ if (f->fp_tcpopts == find->fp_tcpopts &&
+ f->fp_wsize == find->fp_wsize &&
+ f->fp_psize == find->fp_psize &&
+ f->fp_mss == find->fp_mss &&
+ f->fp_flags == find->fp_flags &&
+ f->fp_optcnt == find->fp_optcnt &&
+ f->fp_wscale == find->fp_wscale &&
+ f->fp_ttl == find->fp_ttl)
+ return (f);
+ }
+
+ return (NULL);
+}
+
+/* Insert a fingerprint into the list */
+static void
+pf_osfp_insert(struct pf_osfp_list *list, struct pf_os_fingerprint *ins)
+{
+ struct pf_os_fingerprint *f, *prev = NULL;
+
+ /* XXX need to go semi tree based. can key on tcp options */
+
+ SLIST_FOREACH(f, list, fp_next)
+ prev = f;
+ if (prev)
+ SLIST_INSERT_AFTER(prev, ins, fp_next);
+ else
+ SLIST_INSERT_HEAD(list, ins, fp_next);
+}
+
+/* Fill a fingerprint by its number (from an ioctl) */
+int
+pf_osfp_get(struct pf_osfp_ioctl *fpioc)
+{
+ struct pf_os_fingerprint *fp;
+ struct pf_osfp_entry *entry;
+ int num = fpioc->fp_getnum;
+ int i = 0;
+
+
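+	/*
+	 * Linear scan: the ioctl passes an index and the matching
+	 * fingerprint/OS pair is copied out; EBUSY means the index is
+	 * past the end of the list.
+	 */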
+ memset(fpioc, 0, sizeof(*fpioc));
+ SLIST_FOREACH(fp, &V_pf_osfp_list, fp_next) {
+ SLIST_FOREACH(entry, &fp->fp_oses, fp_entry) {
+ if (i++ == num) {
+ fpioc->fp_mss = fp->fp_mss;
+ fpioc->fp_wsize = fp->fp_wsize;
+ fpioc->fp_flags = fp->fp_flags;
+ fpioc->fp_psize = fp->fp_psize;
+ fpioc->fp_ttl = fp->fp_ttl;
+ fpioc->fp_wscale = fp->fp_wscale;
+ fpioc->fp_getnum = num;
+ memcpy(&fpioc->fp_os, entry,
+ sizeof(fpioc->fp_os));
+ return (0);
+ }
+ }
+ }
+
+ return (EBUSY);
+}
+
+
+#ifdef PFDEBUG
+/* Validate that each signature is reachable */
+static struct pf_os_fingerprint *
+pf_osfp_validate(void)
+{
+ struct pf_os_fingerprint *f, *f2, find;
+
+ SLIST_FOREACH(f, &V_pf_osfp_list, fp_next) {
+ memcpy(&find, f, sizeof(find));
+
+ /* We do a few MSS/th_win percolations to make things unique */
+ if (find.fp_mss == 0)
+ find.fp_mss = 128;
+ if (f->fp_flags & PF_OSFP_WSIZE_MSS)
+ find.fp_wsize *= find.fp_mss;
+ else if (f->fp_flags & PF_OSFP_WSIZE_MTU)
+ find.fp_wsize *= (find.fp_mss + 40);
+ else if (f->fp_flags & PF_OSFP_WSIZE_MOD)
+ find.fp_wsize *= 2;
+ if (f != (f2 = pf_osfp_find(&V_pf_osfp_list, &find, 0))) {
+ if (f2)
+ printf("Found \"%s %s %s\" instead of "
+ "\"%s %s %s\"\n",
+ SLIST_FIRST(&f2->fp_oses)->fp_class_nm,
+ SLIST_FIRST(&f2->fp_oses)->fp_version_nm,
+ SLIST_FIRST(&f2->fp_oses)->fp_subtype_nm,
+ SLIST_FIRST(&f->fp_oses)->fp_class_nm,
+ SLIST_FIRST(&f->fp_oses)->fp_version_nm,
+ SLIST_FIRST(&f->fp_oses)->fp_subtype_nm);
+ else
+ printf("Couldn't find \"%s %s %s\"\n",
+ SLIST_FIRST(&f->fp_oses)->fp_class_nm,
+ SLIST_FIRST(&f->fp_oses)->fp_version_nm,
+ SLIST_FIRST(&f->fp_oses)->fp_subtype_nm);
+ return (f);
+ }
+ }
+ return (NULL);
+}
+#endif /* PFDEBUG */
diff --git a/sys/netpfil/pf/pf_ruleset.c b/sys/netpfil/pf/pf_ruleset.c
new file mode 100644
index 0000000..77652a6
--- /dev/null
+++ b/sys/netpfil/pf/pf_ruleset.c
@@ -0,0 +1,424 @@
+/* $OpenBSD: pf_ruleset.c,v 1.2 2008/12/18 15:31:37 dhill Exp $ */
+
+/*
+ * Copyright (c) 2001 Daniel Hartmeier
+ * Copyright (c) 2002,2003 Henning Brauer
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Effort sponsored in part by the Defense Advanced Research Projects
+ * Agency (DARPA) and Air Force Research Laboratory, Air Force
+ * Materiel Command, USAF, under agreement number F30602-01-2-0537.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/socket.h>
+#ifdef _KERNEL
+# include <sys/systm.h>
+# include <sys/refcount.h>
+#endif /* _KERNEL */
+#include <sys/mbuf.h>
+
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#include <netinet/tcp.h>
+
+#include <net/if.h>
+#include <net/pfvar.h>
+
+#ifdef INET6
+#include <netinet/ip6.h>
+#endif /* INET6 */
+
+
+#ifdef _KERNEL
+#define DPFPRINTF(format, x...) \
+ if (V_pf_status.debug >= PF_DEBUG_NOISY) \
+ printf(format , ##x)
+#define rs_malloc(x) malloc(x, M_TEMP, M_NOWAIT|M_ZERO)
+#define rs_free(x) free(x, M_TEMP)
+
+#else
+/* Userland equivalents so we can lend code to pfctl et al. */
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#define rs_malloc(x) calloc(1, x)
+#define rs_free(x) free(x)
+
+#ifdef PFDEBUG
+#include <sys/stdarg.h>
+#define DPFPRINTF(format, x...) fprintf(stderr, format , ##x)
+#else
+#define DPFPRINTF(format, x...) ((void)0)
+#endif /* PFDEBUG */
+#endif /* _KERNEL */
+
+#ifdef _KERNEL
+VNET_DEFINE(struct pf_anchor_global, pf_anchors);
+VNET_DEFINE(struct pf_anchor, pf_main_anchor);
+#else /* ! _KERNEL */
+struct pf_anchor_global pf_anchors;
+struct pf_anchor pf_main_anchor;
+#undef V_pf_anchors
+#define V_pf_anchors pf_anchors
+#undef pf_main_ruleset
+#define pf_main_ruleset pf_main_anchor.ruleset
+#endif /* _KERNEL */
+
+static __inline int pf_anchor_compare(struct pf_anchor *, struct pf_anchor *);
+
+static struct pf_anchor *pf_find_anchor(const char *);
+
+RB_GENERATE(pf_anchor_global, pf_anchor, entry_global, pf_anchor_compare);
+RB_GENERATE(pf_anchor_node, pf_anchor, entry_node, pf_anchor_compare);
+
+static __inline int
+pf_anchor_compare(struct pf_anchor *a, struct pf_anchor *b)
+{
+ int c = strcmp(a->path, b->path);
+
+ return (c ? (c < 0 ? -1 : 1) : 0);
+}
+
+int
+pf_get_ruleset_number(u_int8_t action)
+{
+ switch (action) {
+ case PF_SCRUB:
+ case PF_NOSCRUB:
+ return (PF_RULESET_SCRUB);
+ break;
+ case PF_PASS:
+ case PF_DROP:
+ return (PF_RULESET_FILTER);
+ break;
+ case PF_NAT:
+ case PF_NONAT:
+ return (PF_RULESET_NAT);
+ break;
+ case PF_BINAT:
+ case PF_NOBINAT:
+ return (PF_RULESET_BINAT);
+ break;
+ case PF_RDR:
+ case PF_NORDR:
+ return (PF_RULESET_RDR);
+ break;
+ default:
+ return (PF_RULESET_MAX);
+ break;
+ }
+}
+
+void
+pf_init_ruleset(struct pf_ruleset *ruleset)
+{
+ int i;
+
+ memset(ruleset, 0, sizeof(struct pf_ruleset));
+ for (i = 0; i < PF_RULESET_MAX; i++) {
+ TAILQ_INIT(&ruleset->rules[i].queues[0]);
+ TAILQ_INIT(&ruleset->rules[i].queues[1]);
+ ruleset->rules[i].active.ptr = &ruleset->rules[i].queues[0];
+ ruleset->rules[i].inactive.ptr = &ruleset->rules[i].queues[1];
+ }
+}
+
+static struct pf_anchor *
+pf_find_anchor(const char *path)
+{
+ struct pf_anchor *key, *found;
+
+ key = (struct pf_anchor *)rs_malloc(sizeof(*key));
+ if (key == NULL)
+ return (NULL);
+ strlcpy(key->path, path, sizeof(key->path));
+ found = RB_FIND(pf_anchor_global, &V_pf_anchors, key);
+ rs_free(key);
+ return (found);
+}
+
+struct pf_ruleset *
+pf_find_ruleset(const char *path)
+{
+ struct pf_anchor *anchor;
+
+ while (*path == '/')
+ path++;
+ if (!*path)
+ return (&pf_main_ruleset);
+ anchor = pf_find_anchor(path);
+ if (anchor == NULL)
+ return (NULL);
+ else
+ return (&anchor->ruleset);
+}
+
+struct pf_ruleset *
+pf_find_or_create_ruleset(const char *path)
+{
+ char *p, *q, *r;
+ struct pf_ruleset *ruleset;
+ struct pf_anchor *anchor = NULL, *dup, *parent = NULL;
+
+ if (path[0] == 0)
+ return (&pf_main_ruleset);
+ while (*path == '/')
+ path++;
+ ruleset = pf_find_ruleset(path);
+ if (ruleset != NULL)
+ return (ruleset);
+ p = (char *)rs_malloc(MAXPATHLEN);
+ if (p == NULL)
+ return (NULL);
+ strlcpy(p, path, MAXPATHLEN);
+ while (parent == NULL && (q = strrchr(p, '/')) != NULL) {
+ *q = 0;
+ if ((ruleset = pf_find_ruleset(p)) != NULL) {
+ parent = ruleset->anchor;
+ break;
+ }
+ }
+ if (q == NULL)
+ q = p;
+ else
+ q++;
+ strlcpy(p, path, MAXPATHLEN);
+ if (!*q) {
+ rs_free(p);
+ return (NULL);
+ }
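+	/*
+	 * Walk the remaining path one '/'-separated component at a time,
+	 * creating an anchor for each missing component and linking it
+	 * into both the global anchor tree and its parent's children.
+	 */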
+ while ((r = strchr(q, '/')) != NULL || *q) {
+ if (r != NULL)
+ *r = 0;
+ if (!*q || strlen(q) >= PF_ANCHOR_NAME_SIZE ||
+ (parent != NULL && strlen(parent->path) >=
+ MAXPATHLEN - PF_ANCHOR_NAME_SIZE - 1)) {
+ rs_free(p);
+ return (NULL);
+ }
+ anchor = (struct pf_anchor *)rs_malloc(sizeof(*anchor));
+ if (anchor == NULL) {
+ rs_free(p);
+ return (NULL);
+ }
+ RB_INIT(&anchor->children);
+ strlcpy(anchor->name, q, sizeof(anchor->name));
+ if (parent != NULL) {
+ strlcpy(anchor->path, parent->path,
+ sizeof(anchor->path));
+ strlcat(anchor->path, "/", sizeof(anchor->path));
+ }
+ strlcat(anchor->path, anchor->name, sizeof(anchor->path));
+ if ((dup = RB_INSERT(pf_anchor_global, &V_pf_anchors, anchor)) !=
+ NULL) {
+ printf("pf_find_or_create_ruleset: RB_INSERT1 "
+ "'%s' '%s' collides with '%s' '%s'\n",
+ anchor->path, anchor->name, dup->path, dup->name);
+ rs_free(anchor);
+ rs_free(p);
+ return (NULL);
+ }
+ if (parent != NULL) {
+ anchor->parent = parent;
+ if ((dup = RB_INSERT(pf_anchor_node, &parent->children,
+ anchor)) != NULL) {
+ printf("pf_find_or_create_ruleset: "
+ "RB_INSERT2 '%s' '%s' collides with "
+ "'%s' '%s'\n", anchor->path, anchor->name,
+ dup->path, dup->name);
+ RB_REMOVE(pf_anchor_global, &V_pf_anchors,
+ anchor);
+ rs_free(anchor);
+ rs_free(p);
+ return (NULL);
+ }
+ }
+ pf_init_ruleset(&anchor->ruleset);
+ anchor->ruleset.anchor = anchor;
+ parent = anchor;
+ if (r != NULL)
+ q = r + 1;
+ else
+ *q = 0;
+ }
+ rs_free(p);
+ return (&anchor->ruleset);
+}
+
+void
+pf_remove_if_empty_ruleset(struct pf_ruleset *ruleset)
+{
+ struct pf_anchor *parent;
+ int i;
+
+ while (ruleset != NULL) {
+ if (ruleset == &pf_main_ruleset || ruleset->anchor == NULL ||
+ !RB_EMPTY(&ruleset->anchor->children) ||
+ ruleset->anchor->refcnt > 0 || ruleset->tables > 0 ||
+ ruleset->topen)
+ return;
+ for (i = 0; i < PF_RULESET_MAX; ++i)
+ if (!TAILQ_EMPTY(ruleset->rules[i].active.ptr) ||
+ !TAILQ_EMPTY(ruleset->rules[i].inactive.ptr) ||
+ ruleset->rules[i].inactive.open)
+ return;
+ RB_REMOVE(pf_anchor_global, &V_pf_anchors, ruleset->anchor);
+ if ((parent = ruleset->anchor->parent) != NULL)
+ RB_REMOVE(pf_anchor_node, &parent->children,
+ ruleset->anchor);
+ rs_free(ruleset->anchor);
+ if (parent == NULL)
+ return;
+ ruleset = &parent->ruleset;
+ }
+}
+
+int
+pf_anchor_setup(struct pf_rule *r, const struct pf_ruleset *s,
+ const char *name)
+{
+ char *p, *path;
+ struct pf_ruleset *ruleset;
+
+ r->anchor = NULL;
+ r->anchor_relative = 0;
+ r->anchor_wildcard = 0;
+ if (!name[0])
+ return (0);
+ path = (char *)rs_malloc(MAXPATHLEN);
+ if (path == NULL)
+ return (1);
+ if (name[0] == '/')
+ strlcpy(path, name + 1, MAXPATHLEN);
+ else {
+ /* relative path */
+ r->anchor_relative = 1;
+ if (s->anchor == NULL || !s->anchor->path[0])
+ path[0] = 0;
+ else
+ strlcpy(path, s->anchor->path, MAXPATHLEN);
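+		/*
+		 * Each leading "../" strips one component off the current
+		 * anchor path and is counted in anchor_relative so that
+		 * pf_anchor_copyout() can reconstruct the relative path.
+		 */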
+ while (name[0] == '.' && name[1] == '.' && name[2] == '/') {
+ if (!path[0]) {
+ printf("pf_anchor_setup: .. beyond root\n");
+ rs_free(path);
+ return (1);
+ }
+ if ((p = strrchr(path, '/')) != NULL)
+ *p = 0;
+ else
+ path[0] = 0;
+ r->anchor_relative++;
+ name += 3;
+ }
+ if (path[0])
+ strlcat(path, "/", MAXPATHLEN);
+ strlcat(path, name, MAXPATHLEN);
+ }
+ if ((p = strrchr(path, '/')) != NULL && !strcmp(p, "/*")) {
+ r->anchor_wildcard = 1;
+ *p = 0;
+ }
+ ruleset = pf_find_or_create_ruleset(path);
+ rs_free(path);
+ if (ruleset == NULL || ruleset->anchor == NULL) {
+ printf("pf_anchor_setup: ruleset\n");
+ return (1);
+ }
+ r->anchor = ruleset->anchor;
+ r->anchor->refcnt++;
+ return (0);
+}
+
+int
+pf_anchor_copyout(const struct pf_ruleset *rs, const struct pf_rule *r,
+ struct pfioc_rule *pr)
+{
+ pr->anchor_call[0] = 0;
+ if (r->anchor == NULL)
+ return (0);
+ if (!r->anchor_relative) {
+ strlcpy(pr->anchor_call, "/", sizeof(pr->anchor_call));
+ strlcat(pr->anchor_call, r->anchor->path,
+ sizeof(pr->anchor_call));
+ } else {
+ char *a, *p;
+ int i;
+
+ a = (char *)rs_malloc(MAXPATHLEN);
+ if (a == NULL)
+ return (1);
+ if (rs->anchor == NULL)
+ a[0] = 0;
+ else
+ strlcpy(a, rs->anchor->path, MAXPATHLEN);
+ for (i = 1; i < r->anchor_relative; ++i) {
+ if ((p = strrchr(a, '/')) == NULL)
+ p = a;
+ *p = 0;
+ strlcat(pr->anchor_call, "../",
+ sizeof(pr->anchor_call));
+ }
+ if (strncmp(a, r->anchor->path, strlen(a))) {
+ printf("pf_anchor_copyout: '%s' '%s'\n", a,
+ r->anchor->path);
+ rs_free(a);
+ return (1);
+ }
+ if (strlen(r->anchor->path) > strlen(a))
+ strlcat(pr->anchor_call, r->anchor->path + (a[0] ?
+ strlen(a) + 1 : 0), sizeof(pr->anchor_call));
+ rs_free(a);
+ }
+ if (r->anchor_wildcard)
+ strlcat(pr->anchor_call, pr->anchor_call[0] ? "/*" : "*",
+ sizeof(pr->anchor_call));
+ return (0);
+}
+
+void
+pf_anchor_remove(struct pf_rule *r)
+{
+ if (r->anchor == NULL)
+ return;
+ if (r->anchor->refcnt <= 0) {
+ printf("pf_anchor_remove: broken refcount\n");
+ r->anchor = NULL;
+ return;
+ }
+ if (!--r->anchor->refcnt)
+ pf_remove_if_empty_ruleset(&r->anchor->ruleset);
+ r->anchor = NULL;
+}
diff --git a/sys/netpfil/pf/pf_table.c b/sys/netpfil/pf/pf_table.c
new file mode 100644
index 0000000..fa88045
--- /dev/null
+++ b/sys/netpfil/pf/pf_table.c
@@ -0,0 +1,2191 @@
+/* $OpenBSD: pf_table.c,v 1.79 2008/10/08 06:24:50 mcbride Exp $ */
+
+/*
+ * Copyright (c) 2002 Cedric Berger
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/refcount.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <vm/uma.h>
+
+#include <net/if.h>
+#include <net/vnet.h>
+#include <net/pfvar.h>
+
+#define ACCEPT_FLAGS(flags, oklist) \
+ do { \
+ if ((flags & ~(oklist)) & \
+ PFR_FLAG_ALLMASK) \
+ return (EINVAL); \
+ } while (0)
+
+#define FILLIN_SIN(sin, addr) \
+ do { \
+ (sin).sin_len = sizeof(sin); \
+ (sin).sin_family = AF_INET; \
+ (sin).sin_addr = (addr); \
+ } while (0)
+
+#define FILLIN_SIN6(sin6, addr) \
+ do { \
+ (sin6).sin6_len = sizeof(sin6); \
+ (sin6).sin6_family = AF_INET6; \
+ (sin6).sin6_addr = (addr); \
+ } while (0)
+
+#define SWAP(type, a1, a2) \
+ do { \
+ type tmp = a1; \
+ a1 = a2; \
+ a2 = tmp; \
+ } while (0)
+
+#define SUNION2PF(su, af) (((af)==AF_INET) ? \
+ (struct pf_addr *)&(su)->sin.sin_addr : \
+ (struct pf_addr *)&(su)->sin6.sin6_addr)
+
+#define AF_BITS(af) (((af)==AF_INET)?32:128)
+#define ADDR_NETWORK(ad) ((ad)->pfra_net < AF_BITS((ad)->pfra_af))
+#define KENTRY_NETWORK(ke) ((ke)->pfrke_net < AF_BITS((ke)->pfrke_af))
+#define KENTRY_RNF_ROOT(ke) \
+ ((((struct radix_node *)(ke))->rn_flags & RNF_ROOT) != 0)
+
+#define NO_ADDRESSES (-1)
+#define ENQUEUE_UNMARKED_ONLY (1)
+#define INVERT_NEG_FLAG (1)
+
+struct pfr_walktree {
+ enum pfrw_op {
+ PFRW_MARK,
+ PFRW_SWEEP,
+ PFRW_ENQUEUE,
+ PFRW_GET_ADDRS,
+ PFRW_GET_ASTATS,
+ PFRW_POOL_GET,
+ PFRW_DYNADDR_UPDATE
+ } pfrw_op;
+ union {
+ struct pfr_addr *pfrw1_addr;
+ struct pfr_astats *pfrw1_astats;
+ struct pfr_kentryworkq *pfrw1_workq;
+ struct pfr_kentry *pfrw1_kentry;
+ struct pfi_dynaddr *pfrw1_dyn;
+ } pfrw_1;
+ int pfrw_free;
+};
+#define pfrw_addr pfrw_1.pfrw1_addr
+#define pfrw_astats pfrw_1.pfrw1_astats
+#define pfrw_workq pfrw_1.pfrw1_workq
+#define pfrw_kentry pfrw_1.pfrw1_kentry
+#define pfrw_dyn pfrw_1.pfrw1_dyn
+#define pfrw_cnt pfrw_free
+
+#define senderr(e) do { rv = (e); goto _bad; } while (0)
+
+static MALLOC_DEFINE(M_PFTABLE, "pf_table", "pf(4) tables structures");
+static VNET_DEFINE(uma_zone_t, pfr_kentry_z);
+#define V_pfr_kentry_z VNET(pfr_kentry_z)
+static VNET_DEFINE(uma_zone_t, pfr_kcounters_z);
+#define V_pfr_kcounters_z VNET(pfr_kcounters_z)
+
+static struct pf_addr pfr_ffaddr = {
+ .addr32 = { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }
+};
+
+static void pfr_copyout_addr(struct pfr_addr *,
+ struct pfr_kentry *ke);
+static int pfr_validate_addr(struct pfr_addr *);
+static void pfr_enqueue_addrs(struct pfr_ktable *,
+ struct pfr_kentryworkq *, int *, int);
+static void pfr_mark_addrs(struct pfr_ktable *);
+static struct pfr_kentry
+ *pfr_lookup_addr(struct pfr_ktable *,
+ struct pfr_addr *, int);
+static struct pfr_kentry *pfr_create_kentry(struct pfr_addr *);
+static void pfr_destroy_kentries(struct pfr_kentryworkq *);
+static void pfr_destroy_kentry(struct pfr_kentry *);
+static void pfr_insert_kentries(struct pfr_ktable *,
+ struct pfr_kentryworkq *, long);
+static void pfr_remove_kentries(struct pfr_ktable *,
+ struct pfr_kentryworkq *);
+static void pfr_clstats_kentries(struct pfr_kentryworkq *, long,
+ int);
+static void pfr_reset_feedback(struct pfr_addr *, int);
+static void pfr_prepare_network(union sockaddr_union *, int, int);
+static int pfr_route_kentry(struct pfr_ktable *,
+ struct pfr_kentry *);
+static int pfr_unroute_kentry(struct pfr_ktable *,
+ struct pfr_kentry *);
+static int pfr_walktree(struct radix_node *, void *);
+static int pfr_validate_table(struct pfr_table *, int, int);
+static int pfr_fix_anchor(char *);
+static void pfr_commit_ktable(struct pfr_ktable *, long);
+static void pfr_insert_ktables(struct pfr_ktableworkq *);
+static void pfr_insert_ktable(struct pfr_ktable *);
+static void pfr_setflags_ktables(struct pfr_ktableworkq *);
+static void pfr_setflags_ktable(struct pfr_ktable *, int);
+static void pfr_clstats_ktables(struct pfr_ktableworkq *, long,
+ int);
+static void pfr_clstats_ktable(struct pfr_ktable *, long, int);
+static struct pfr_ktable
+ *pfr_create_ktable(struct pfr_table *, long, int);
+static void pfr_destroy_ktables(struct pfr_ktableworkq *, int);
+static void pfr_destroy_ktable(struct pfr_ktable *, int);
+static int pfr_ktable_compare(struct pfr_ktable *,
+ struct pfr_ktable *);
+static struct pfr_ktable
+ *pfr_lookup_table(struct pfr_table *);
+static void pfr_clean_node_mask(struct pfr_ktable *,
+ struct pfr_kentryworkq *);
+static int pfr_table_count(struct pfr_table *, int);
+static int pfr_skip_table(struct pfr_table *,
+ struct pfr_ktable *, int);
+static struct pfr_kentry
+ *pfr_kentry_byidx(struct pfr_ktable *, int, int);
+
+static RB_PROTOTYPE(pfr_ktablehead, pfr_ktable, pfrkt_tree, pfr_ktable_compare);
+static RB_GENERATE(pfr_ktablehead, pfr_ktable, pfrkt_tree, pfr_ktable_compare);
+
+struct pfr_ktablehead pfr_ktables;
+struct pfr_table pfr_nulltable;
+int pfr_ktable_cnt;
+
+void
+pfr_initialize(void)
+{
+
+ V_pfr_kentry_z = uma_zcreate("pf table entries",
+ sizeof(struct pfr_kentry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
+ 0);
+ V_pfr_kcounters_z = uma_zcreate("pf table counters",
+ sizeof(struct pfr_kcounters), NULL, NULL, NULL, NULL,
+ UMA_ALIGN_PTR, 0);
+ V_pf_limits[PF_LIMIT_TABLE_ENTRIES].zone = V_pfr_kentry_z;
+ V_pf_limits[PF_LIMIT_TABLE_ENTRIES].limit = PFR_KENTRY_HIWAT;
+}
+
+void
+pfr_cleanup(void)
+{
+
+ uma_zdestroy(V_pfr_kentry_z);
+ uma_zdestroy(V_pfr_kcounters_z);
+}
+
+int
+pfr_clr_addrs(struct pfr_table *tbl, int *ndel, int flags)
+{
+ struct pfr_ktable *kt;
+ struct pfr_kentryworkq workq;
+
+ PF_RULES_WASSERT();
+
+ ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY);
+ if (pfr_validate_table(tbl, 0, flags & PFR_FLAG_USERIOCTL))
+ return (EINVAL);
+ kt = pfr_lookup_table(tbl);
+ if (kt == NULL || !(kt->pfrkt_flags & PFR_TFLAG_ACTIVE))
+ return (ESRCH);
+ if (kt->pfrkt_flags & PFR_TFLAG_CONST)
+ return (EPERM);
+ pfr_enqueue_addrs(kt, &workq, ndel, 0);
+
+ if (!(flags & PFR_FLAG_DUMMY)) {
+ pfr_remove_kentries(kt, &workq);
+ KASSERT(kt->pfrkt_cnt == 0, ("%s: non-null pfrkt_cnt", __func__));
+ }
+ return (0);
+}
+
+int
+pfr_add_addrs(struct pfr_table *tbl, struct pfr_addr *addr, int size,
+ int *nadd, int flags)
+{
+ struct pfr_ktable *kt, *tmpkt;
+ struct pfr_kentryworkq workq;
+ struct pfr_kentry *p, *q;
+ struct pfr_addr *ad;
+ int i, rv, xadd = 0;
+ long tzero = time_second;
+
+ PF_RULES_WASSERT();
+
+ ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY | PFR_FLAG_FEEDBACK);
+ if (pfr_validate_table(tbl, 0, flags & PFR_FLAG_USERIOCTL))
+ return (EINVAL);
+ kt = pfr_lookup_table(tbl);
+ if (kt == NULL || !(kt->pfrkt_flags & PFR_TFLAG_ACTIVE))
+ return (ESRCH);
+ if (kt->pfrkt_flags & PFR_TFLAG_CONST)
+ return (EPERM);
+ tmpkt = pfr_create_ktable(&pfr_nulltable, 0, 0);
+ if (tmpkt == NULL)
+ return (ENOMEM);
+ SLIST_INIT(&workq);
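+	/*
+	 * tmpkt is a scratch table: accepted addresses are routed into it
+	 * first, so duplicates within the submitted buffer are caught
+	 * before the real table is modified.
+	 */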
+ for (i = 0, ad = addr; i < size; i++, ad++) {
+ if (pfr_validate_addr(ad))
+ senderr(EINVAL);
+ p = pfr_lookup_addr(kt, ad, 1);
+ q = pfr_lookup_addr(tmpkt, ad, 1);
+ if (flags & PFR_FLAG_FEEDBACK) {
+ if (q != NULL)
+ ad->pfra_fback = PFR_FB_DUPLICATE;
+ else if (p == NULL)
+ ad->pfra_fback = PFR_FB_ADDED;
+ else if (p->pfrke_not != ad->pfra_not)
+ ad->pfra_fback = PFR_FB_CONFLICT;
+ else
+ ad->pfra_fback = PFR_FB_NONE;
+ }
+ if (p == NULL && q == NULL) {
+ p = pfr_create_kentry(ad);
+ if (p == NULL)
+ senderr(ENOMEM);
+ if (pfr_route_kentry(tmpkt, p)) {
+ pfr_destroy_kentry(p);
+ ad->pfra_fback = PFR_FB_NONE;
+ } else {
+ SLIST_INSERT_HEAD(&workq, p, pfrke_workq);
+ xadd++;
+ }
+ }
+ }
+ pfr_clean_node_mask(tmpkt, &workq);
+ if (!(flags & PFR_FLAG_DUMMY))
+ pfr_insert_kentries(kt, &workq, tzero);
+ else
+ pfr_destroy_kentries(&workq);
+ if (nadd != NULL)
+ *nadd = xadd;
+ pfr_destroy_ktable(tmpkt, 0);
+ return (0);
+_bad:
+ pfr_clean_node_mask(tmpkt, &workq);
+ pfr_destroy_kentries(&workq);
+ if (flags & PFR_FLAG_FEEDBACK)
+ pfr_reset_feedback(addr, size);
+ pfr_destroy_ktable(tmpkt, 0);
+ return (rv);
+}
+
+int
+pfr_del_addrs(struct pfr_table *tbl, struct pfr_addr *addr, int size,
+ int *ndel, int flags)
+{
+ struct pfr_ktable *kt;
+ struct pfr_kentryworkq workq;
+ struct pfr_kentry *p;
+ struct pfr_addr *ad;
+ int i, rv, xdel = 0, log = 1;
+
+ PF_RULES_WASSERT();
+
+ ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY | PFR_FLAG_FEEDBACK);
+ if (pfr_validate_table(tbl, 0, flags & PFR_FLAG_USERIOCTL))
+ return (EINVAL);
+ kt = pfr_lookup_table(tbl);
+ if (kt == NULL || !(kt->pfrkt_flags & PFR_TFLAG_ACTIVE))
+ return (ESRCH);
+ if (kt->pfrkt_flags & PFR_TFLAG_CONST)
+ return (EPERM);
+	/*
+	 * There are two algorithms to choose from here.
+	 * With:
+	 *   n: number of addresses to delete
+	 *   N: number of addresses in the table
+	 *
+	 * one is O(N) and is better for large 'n',
+	 * one is O(n*LOG(N)) and is better for small 'n'.
+	 *
+	 * The following code tries to decide which one is best.
+	 */
+ for (i = kt->pfrkt_cnt; i > 0; i >>= 1)
+ log++;
+ if (size > kt->pfrkt_cnt/log) {
+ /* full table scan */
+ pfr_mark_addrs(kt);
+ } else {
+ /* iterate over addresses to delete */
+ for (i = 0, ad = addr; i < size; i++, ad++) {
+ if (pfr_validate_addr(ad))
+ return (EINVAL);
+ p = pfr_lookup_addr(kt, ad, 1);
+ if (p != NULL)
+ p->pfrke_mark = 0;
+ }
+ }
+ SLIST_INIT(&workq);
+ for (i = 0, ad = addr; i < size; i++, ad++) {
+ if (pfr_validate_addr(ad))
+ senderr(EINVAL);
+ p = pfr_lookup_addr(kt, ad, 1);
+ if (flags & PFR_FLAG_FEEDBACK) {
+ if (p == NULL)
+ ad->pfra_fback = PFR_FB_NONE;
+ else if (p->pfrke_not != ad->pfra_not)
+ ad->pfra_fback = PFR_FB_CONFLICT;
+ else if (p->pfrke_mark)
+ ad->pfra_fback = PFR_FB_DUPLICATE;
+ else
+ ad->pfra_fback = PFR_FB_DELETED;
+ }
+ if (p != NULL && p->pfrke_not == ad->pfra_not &&
+ !p->pfrke_mark) {
+ p->pfrke_mark = 1;
+ SLIST_INSERT_HEAD(&workq, p, pfrke_workq);
+ xdel++;
+ }
+ }
+ if (!(flags & PFR_FLAG_DUMMY))
+ pfr_remove_kentries(kt, &workq);
+ if (ndel != NULL)
+ *ndel = xdel;
+ return (0);
+_bad:
+ if (flags & PFR_FLAG_FEEDBACK)
+ pfr_reset_feedback(addr, size);
+ return (rv);
+}
+
+int
+pfr_set_addrs(struct pfr_table *tbl, struct pfr_addr *addr, int size,
+ int *size2, int *nadd, int *ndel, int *nchange, int flags,
+ u_int32_t ignore_pfrt_flags)
+{
+ struct pfr_ktable *kt, *tmpkt;
+ struct pfr_kentryworkq addq, delq, changeq;
+ struct pfr_kentry *p, *q;
+ struct pfr_addr ad;
+ int i, rv, xadd = 0, xdel = 0, xchange = 0;
+ long tzero = time_second;
+
+ PF_RULES_WASSERT();
+
+ ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY | PFR_FLAG_FEEDBACK);
+ if (pfr_validate_table(tbl, ignore_pfrt_flags, flags &
+ PFR_FLAG_USERIOCTL))
+ return (EINVAL);
+ kt = pfr_lookup_table(tbl);
+ if (kt == NULL || !(kt->pfrkt_flags & PFR_TFLAG_ACTIVE))
+ return (ESRCH);
+ if (kt->pfrkt_flags & PFR_TFLAG_CONST)
+ return (EPERM);
+ tmpkt = pfr_create_ktable(&pfr_nulltable, 0, 0);
+ if (tmpkt == NULL)
+ return (ENOMEM);
+ pfr_mark_addrs(kt);
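+	/*
+	 * Every mark was just cleared; entries named in the request are
+	 * marked again, anything still unmarked afterwards is queued on
+	 * delq, new addresses go to addq and entries whose negation flag
+	 * flips go to changeq.
+	 */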
+ SLIST_INIT(&addq);
+ SLIST_INIT(&delq);
+ SLIST_INIT(&changeq);
+ for (i = 0; i < size; i++) {
+ /*
+		 * XXXGL: understand pf_if usage of this function
+ * and make ad a moving pointer
+ */
+ bcopy(addr + i, &ad, sizeof(ad));
+ if (pfr_validate_addr(&ad))
+ senderr(EINVAL);
+ ad.pfra_fback = PFR_FB_NONE;
+ p = pfr_lookup_addr(kt, &ad, 1);
+ if (p != NULL) {
+ if (p->pfrke_mark) {
+ ad.pfra_fback = PFR_FB_DUPLICATE;
+ goto _skip;
+ }
+ p->pfrke_mark = 1;
+ if (p->pfrke_not != ad.pfra_not) {
+ SLIST_INSERT_HEAD(&changeq, p, pfrke_workq);
+ ad.pfra_fback = PFR_FB_CHANGED;
+ xchange++;
+ }
+ } else {
+ q = pfr_lookup_addr(tmpkt, &ad, 1);
+ if (q != NULL) {
+ ad.pfra_fback = PFR_FB_DUPLICATE;
+ goto _skip;
+ }
+ p = pfr_create_kentry(&ad);
+ if (p == NULL)
+ senderr(ENOMEM);
+ if (pfr_route_kentry(tmpkt, p)) {
+ pfr_destroy_kentry(p);
+ ad.pfra_fback = PFR_FB_NONE;
+ } else {
+ SLIST_INSERT_HEAD(&addq, p, pfrke_workq);
+ ad.pfra_fback = PFR_FB_ADDED;
+ xadd++;
+ }
+ }
+_skip:
+ if (flags & PFR_FLAG_FEEDBACK)
+ bcopy(&ad, addr + i, sizeof(ad));
+ }
+ pfr_enqueue_addrs(kt, &delq, &xdel, ENQUEUE_UNMARKED_ONLY);
+ if ((flags & PFR_FLAG_FEEDBACK) && *size2) {
+ if (*size2 < size+xdel) {
+ *size2 = size+xdel;
+ senderr(0);
+ }
+ i = 0;
+ SLIST_FOREACH(p, &delq, pfrke_workq) {
+ pfr_copyout_addr(&ad, p);
+ ad.pfra_fback = PFR_FB_DELETED;
+ bcopy(&ad, addr + size + i, sizeof(ad));
+ i++;
+ }
+ }
+ pfr_clean_node_mask(tmpkt, &addq);
+ if (!(flags & PFR_FLAG_DUMMY)) {
+ pfr_insert_kentries(kt, &addq, tzero);
+ pfr_remove_kentries(kt, &delq);
+ pfr_clstats_kentries(&changeq, tzero, INVERT_NEG_FLAG);
+ } else
+ pfr_destroy_kentries(&addq);
+ if (nadd != NULL)
+ *nadd = xadd;
+ if (ndel != NULL)
+ *ndel = xdel;
+ if (nchange != NULL)
+ *nchange = xchange;
+ if ((flags & PFR_FLAG_FEEDBACK) && size2)
+ *size2 = size+xdel;
+ pfr_destroy_ktable(tmpkt, 0);
+ return (0);
+_bad:
+ pfr_clean_node_mask(tmpkt, &addq);
+ pfr_destroy_kentries(&addq);
+ if (flags & PFR_FLAG_FEEDBACK)
+ pfr_reset_feedback(addr, size);
+ pfr_destroy_ktable(tmpkt, 0);
+ return (rv);
+}
+
+int
+pfr_tst_addrs(struct pfr_table *tbl, struct pfr_addr *addr, int size,
+ int *nmatch, int flags)
+{
+ struct pfr_ktable *kt;
+ struct pfr_kentry *p;
+ struct pfr_addr *ad;
+ int i, xmatch = 0;
+
+ PF_RULES_RASSERT();
+
+ ACCEPT_FLAGS(flags, PFR_FLAG_REPLACE);
+ if (pfr_validate_table(tbl, 0, 0))
+ return (EINVAL);
+ kt = pfr_lookup_table(tbl);
+ if (kt == NULL || !(kt->pfrkt_flags & PFR_TFLAG_ACTIVE))
+ return (ESRCH);
+
+ for (i = 0, ad = addr; i < size; i++, ad++) {
+ if (pfr_validate_addr(ad))
+ return (EINVAL);
+ if (ADDR_NETWORK(ad))
+ return (EINVAL);
+ p = pfr_lookup_addr(kt, ad, 0);
+ if (flags & PFR_FLAG_REPLACE)
+ pfr_copyout_addr(ad, p);
+ ad->pfra_fback = (p == NULL) ? PFR_FB_NONE :
+ (p->pfrke_not ? PFR_FB_NOTMATCH : PFR_FB_MATCH);
+ if (p != NULL && !p->pfrke_not)
+ xmatch++;
+ }
+ if (nmatch != NULL)
+ *nmatch = xmatch;
+ return (0);
+}
+
+int
+pfr_get_addrs(struct pfr_table *tbl, struct pfr_addr *addr, int *size,
+ int flags)
+{
+ struct pfr_ktable *kt;
+ struct pfr_walktree w;
+ int rv;
+
+ PF_RULES_RASSERT();
+
+ ACCEPT_FLAGS(flags, 0);
+ if (pfr_validate_table(tbl, 0, 0))
+ return (EINVAL);
+ kt = pfr_lookup_table(tbl);
+ if (kt == NULL || !(kt->pfrkt_flags & PFR_TFLAG_ACTIVE))
+ return (ESRCH);
+ if (kt->pfrkt_cnt > *size) {
+ *size = kt->pfrkt_cnt;
+ return (0);
+ }
+
+ bzero(&w, sizeof(w));
+ w.pfrw_op = PFRW_GET_ADDRS;
+ w.pfrw_addr = addr;
+ w.pfrw_free = kt->pfrkt_cnt;
+ rv = kt->pfrkt_ip4->rnh_walktree(kt->pfrkt_ip4, pfr_walktree, &w);
+ if (!rv)
+ rv = kt->pfrkt_ip6->rnh_walktree(kt->pfrkt_ip6, pfr_walktree,
+ &w);
+ if (rv)
+ return (rv);
+
+ KASSERT(w.pfrw_free == 0, ("%s: corruption detected (%d)", __func__,
+ w.pfrw_free));
+
+ *size = kt->pfrkt_cnt;
+ return (0);
+}
+
+int
+pfr_get_astats(struct pfr_table *tbl, struct pfr_astats *addr, int *size,
+ int flags)
+{
+ struct pfr_ktable *kt;
+ struct pfr_walktree w;
+ struct pfr_kentryworkq workq;
+ int rv;
+ long tzero = time_second;
+
+ PF_RULES_RASSERT();
+
+ /* XXX PFR_FLAG_CLSTATS disabled */
+ ACCEPT_FLAGS(flags, 0);
+ if (pfr_validate_table(tbl, 0, 0))
+ return (EINVAL);
+ kt = pfr_lookup_table(tbl);
+ if (kt == NULL || !(kt->pfrkt_flags & PFR_TFLAG_ACTIVE))
+ return (ESRCH);
+ if (kt->pfrkt_cnt > *size) {
+ *size = kt->pfrkt_cnt;
+ return (0);
+ }
+
+ bzero(&w, sizeof(w));
+ w.pfrw_op = PFRW_GET_ASTATS;
+ w.pfrw_astats = addr;
+ w.pfrw_free = kt->pfrkt_cnt;
+ rv = kt->pfrkt_ip4->rnh_walktree(kt->pfrkt_ip4, pfr_walktree, &w);
+ if (!rv)
+ rv = kt->pfrkt_ip6->rnh_walktree(kt->pfrkt_ip6, pfr_walktree,
+ &w);
+ if (!rv && (flags & PFR_FLAG_CLSTATS)) {
+ pfr_enqueue_addrs(kt, &workq, NULL, 0);
+ pfr_clstats_kentries(&workq, tzero, 0);
+ }
+ if (rv)
+ return (rv);
+
+ if (w.pfrw_free) {
+ printf("pfr_get_astats: corruption detected (%d).\n",
+ w.pfrw_free);
+ return (ENOTTY);
+ }
+ *size = kt->pfrkt_cnt;
+ return (0);
+}
+
+int
+pfr_clr_astats(struct pfr_table *tbl, struct pfr_addr *addr, int size,
+ int *nzero, int flags)
+{
+ struct pfr_ktable *kt;
+ struct pfr_kentryworkq workq;
+ struct pfr_kentry *p;
+ struct pfr_addr *ad;
+ int i, rv, xzero = 0;
+
+ PF_RULES_WASSERT();
+
+ ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY | PFR_FLAG_FEEDBACK);
+ if (pfr_validate_table(tbl, 0, 0))
+ return (EINVAL);
+ kt = pfr_lookup_table(tbl);
+ if (kt == NULL || !(kt->pfrkt_flags & PFR_TFLAG_ACTIVE))
+ return (ESRCH);
+ SLIST_INIT(&workq);
+ for (i = 0, ad = addr; i < size; i++, ad++) {
+ if (pfr_validate_addr(ad))
+ senderr(EINVAL);
+ p = pfr_lookup_addr(kt, ad, 1);
+ if (flags & PFR_FLAG_FEEDBACK) {
+ ad->pfra_fback = (p != NULL) ?
+ PFR_FB_CLEARED : PFR_FB_NONE;
+ }
+ if (p != NULL) {
+ SLIST_INSERT_HEAD(&workq, p, pfrke_workq);
+ xzero++;
+ }
+ }
+
+ if (!(flags & PFR_FLAG_DUMMY))
+ pfr_clstats_kentries(&workq, 0, 0);
+ if (nzero != NULL)
+ *nzero = xzero;
+ return (0);
+_bad:
+ if (flags & PFR_FLAG_FEEDBACK)
+ pfr_reset_feedback(addr, size);
+ return (rv);
+}
+
+static int
+pfr_validate_addr(struct pfr_addr *ad)
+{
+ int i;
+
+ switch (ad->pfra_af) {
+#ifdef INET
+ case AF_INET:
+ if (ad->pfra_net > 32)
+ return (-1);
+ break;
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ if (ad->pfra_net > 128)
+ return (-1);
+ break;
+#endif /* INET6 */
+ default:
+ return (-1);
+ }
+ if (ad->pfra_net < 128 &&
+ (((caddr_t)ad)[ad->pfra_net/8] & (0xFF >> (ad->pfra_net%8))))
+ return (-1);
+ for (i = (ad->pfra_net+7)/8; i < sizeof(ad->pfra_u); i++)
+ if (((caddr_t)ad)[i])
+ return (-1);
+ if (ad->pfra_not && ad->pfra_not != 1)
+ return (-1);
+ if (ad->pfra_fback)
+ return (-1);
+ return (0);
+}
+
+static void
+pfr_enqueue_addrs(struct pfr_ktable *kt, struct pfr_kentryworkq *workq,
+ int *naddr, int sweep)
+{
+ struct pfr_walktree w;
+
+ SLIST_INIT(workq);
+ bzero(&w, sizeof(w));
+ w.pfrw_op = sweep ? PFRW_SWEEP : PFRW_ENQUEUE;
+ w.pfrw_workq = workq;
+ if (kt->pfrkt_ip4 != NULL)
+ if (kt->pfrkt_ip4->rnh_walktree(kt->pfrkt_ip4, pfr_walktree,
+ &w))
+ printf("pfr_enqueue_addrs: IPv4 walktree failed.\n");
+ if (kt->pfrkt_ip6 != NULL)
+ if (kt->pfrkt_ip6->rnh_walktree(kt->pfrkt_ip6, pfr_walktree,
+ &w))
+ printf("pfr_enqueue_addrs: IPv6 walktree failed.\n");
+ if (naddr != NULL)
+ *naddr = w.pfrw_cnt;
+}
+
+static void
+pfr_mark_addrs(struct pfr_ktable *kt)
+{
+ struct pfr_walktree w;
+
+ bzero(&w, sizeof(w));
+ w.pfrw_op = PFRW_MARK;
+ if (kt->pfrkt_ip4->rnh_walktree(kt->pfrkt_ip4, pfr_walktree, &w))
+ printf("pfr_mark_addrs: IPv4 walktree failed.\n");
+ if (kt->pfrkt_ip6->rnh_walktree(kt->pfrkt_ip6, pfr_walktree, &w))
+ printf("pfr_mark_addrs: IPv6 walktree failed.\n");
+}
+
+
+static struct pfr_kentry *
+pfr_lookup_addr(struct pfr_ktable *kt, struct pfr_addr *ad, int exact)
+{
+ union sockaddr_union sa, mask;
+ struct radix_node_head *head = NULL;
+ struct pfr_kentry *ke;
+
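+	/*
+	 * Host addresses use a best-match lookup (rn_match), network
+	 * entries an exact prefix lookup (rn_lookup); with 'exact' set a
+	 * host query refuses a covering network entry.
+	 */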
+ bzero(&sa, sizeof(sa));
+ if (ad->pfra_af == AF_INET) {
+ FILLIN_SIN(sa.sin, ad->pfra_ip4addr);
+ head = kt->pfrkt_ip4;
+ } else if ( ad->pfra_af == AF_INET6 ) {
+ FILLIN_SIN6(sa.sin6, ad->pfra_ip6addr);
+ head = kt->pfrkt_ip6;
+ }
+ if (ADDR_NETWORK(ad)) {
+ pfr_prepare_network(&mask, ad->pfra_af, ad->pfra_net);
+ ke = (struct pfr_kentry *)rn_lookup(&sa, &mask, head);
+ if (ke && KENTRY_RNF_ROOT(ke))
+ ke = NULL;
+ } else {
+ ke = (struct pfr_kentry *)rn_match(&sa, head);
+ if (ke && KENTRY_RNF_ROOT(ke))
+ ke = NULL;
+ if (exact && ke && KENTRY_NETWORK(ke))
+ ke = NULL;
+ }
+ return (ke);
+}
+
+static struct pfr_kentry *
+pfr_create_kentry(struct pfr_addr *ad)
+{
+ struct pfr_kentry *ke;
+
+ ke = uma_zalloc(V_pfr_kentry_z, M_NOWAIT | M_ZERO);
+ if (ke == NULL)
+ return (NULL);
+
+ if (ad->pfra_af == AF_INET)
+ FILLIN_SIN(ke->pfrke_sa.sin, ad->pfra_ip4addr);
+ else if (ad->pfra_af == AF_INET6)
+ FILLIN_SIN6(ke->pfrke_sa.sin6, ad->pfra_ip6addr);
+ ke->pfrke_af = ad->pfra_af;
+ ke->pfrke_net = ad->pfra_net;
+ ke->pfrke_not = ad->pfra_not;
+ return (ke);
+}
+
+static void
+pfr_destroy_kentries(struct pfr_kentryworkq *workq)
+{
+ struct pfr_kentry *p, *q;
+
+ for (p = SLIST_FIRST(workq); p != NULL; p = q) {
+ q = SLIST_NEXT(p, pfrke_workq);
+ pfr_destroy_kentry(p);
+ }
+}
+
+static void
+pfr_destroy_kentry(struct pfr_kentry *ke)
+{
+ if (ke->pfrke_counters)
+ uma_zfree(V_pfr_kcounters_z, ke->pfrke_counters);
+ uma_zfree(V_pfr_kentry_z, ke);
+}
+
+static void
+pfr_insert_kentries(struct pfr_ktable *kt,
+ struct pfr_kentryworkq *workq, long tzero)
+{
+ struct pfr_kentry *p;
+ int rv, n = 0;
+
+ SLIST_FOREACH(p, workq, pfrke_workq) {
+ rv = pfr_route_kentry(kt, p);
+ if (rv) {
+ printf("pfr_insert_kentries: cannot route entry "
+ "(code=%d).\n", rv);
+ break;
+ }
+ p->pfrke_tzero = tzero;
+ n++;
+ }
+ kt->pfrkt_cnt += n;
+}
+
+int
+pfr_insert_kentry(struct pfr_ktable *kt, struct pfr_addr *ad, long tzero)
+{
+ struct pfr_kentry *p;
+ int rv;
+
+ p = pfr_lookup_addr(kt, ad, 1);
+ if (p != NULL)
+ return (0);
+ p = pfr_create_kentry(ad);
+ if (p == NULL)
+ return (EINVAL);
+
+ rv = pfr_route_kentry(kt, p);
+ if (rv)
+ return (rv);
+
+ p->pfrke_tzero = tzero;
+ kt->pfrkt_cnt++;
+
+ return (0);
+}
+
+static void
+pfr_remove_kentries(struct pfr_ktable *kt,
+ struct pfr_kentryworkq *workq)
+{
+ struct pfr_kentry *p;
+ int n = 0;
+
+ SLIST_FOREACH(p, workq, pfrke_workq) {
+ pfr_unroute_kentry(kt, p);
+ n++;
+ }
+ kt->pfrkt_cnt -= n;
+ pfr_destroy_kentries(workq);
+}
+
+static void
+pfr_clean_node_mask(struct pfr_ktable *kt,
+ struct pfr_kentryworkq *workq)
+{
+ struct pfr_kentry *p;
+
+ SLIST_FOREACH(p, workq, pfrke_workq)
+ pfr_unroute_kentry(kt, p);
+}
+
+static void
+pfr_clstats_kentries(struct pfr_kentryworkq *workq, long tzero, int negchange)
+{
+ struct pfr_kentry *p;
+
+ SLIST_FOREACH(p, workq, pfrke_workq) {
+ if (negchange)
+ p->pfrke_not = !p->pfrke_not;
+ if (p->pfrke_counters) {
+ uma_zfree(V_pfr_kcounters_z, p->pfrke_counters);
+ p->pfrke_counters = NULL;
+ }
+ p->pfrke_tzero = tzero;
+ }
+}
+
+static void
+pfr_reset_feedback(struct pfr_addr *addr, int size)
+{
+ struct pfr_addr *ad;
+ int i;
+
+ for (i = 0, ad = addr; i < size; i++, ad++)
+ ad->pfra_fback = PFR_FB_NONE;
+}
+
+static void
+pfr_prepare_network(union sockaddr_union *sa, int af, int net)
+{
+ int i;
+
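+	/*
+	 * Build a netmask sockaddr for a 'net'-bit prefix.  For IPv6 the
+	 * mask is filled one 32-bit word at a time until the remaining
+	 * prefix fits into a single word.
+	 */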
+ bzero(sa, sizeof(*sa));
+ if (af == AF_INET) {
+ sa->sin.sin_len = sizeof(sa->sin);
+ sa->sin.sin_family = AF_INET;
+ sa->sin.sin_addr.s_addr = net ? htonl(-1 << (32-net)) : 0;
+ } else if (af == AF_INET6) {
+ sa->sin6.sin6_len = sizeof(sa->sin6);
+ sa->sin6.sin6_family = AF_INET6;
+ for (i = 0; i < 4; i++) {
+ if (net <= 32) {
+ sa->sin6.sin6_addr.s6_addr32[i] =
+ net ? htonl(-1 << (32-net)) : 0;
+ break;
+ }
+ sa->sin6.sin6_addr.s6_addr32[i] = 0xFFFFFFFF;
+ net -= 32;
+ }
+ }
+}
+
+static int
+pfr_route_kentry(struct pfr_ktable *kt, struct pfr_kentry *ke)
+{
+ union sockaddr_union mask;
+ struct radix_node *rn;
+ struct radix_node_head *head = NULL;
+
+ bzero(ke->pfrke_node, sizeof(ke->pfrke_node));
+ if (ke->pfrke_af == AF_INET)
+ head = kt->pfrkt_ip4;
+ else if (ke->pfrke_af == AF_INET6)
+ head = kt->pfrkt_ip6;
+
+ if (KENTRY_NETWORK(ke)) {
+ pfr_prepare_network(&mask, ke->pfrke_af, ke->pfrke_net);
+ rn = rn_addroute(&ke->pfrke_sa, &mask, head, ke->pfrke_node);
+ } else
+ rn = rn_addroute(&ke->pfrke_sa, NULL, head, ke->pfrke_node);
+
+ return (rn == NULL ? -1 : 0);
+}
+
+static int
+pfr_unroute_kentry(struct pfr_ktable *kt, struct pfr_kentry *ke)
+{
+ union sockaddr_union mask;
+ struct radix_node *rn;
+ struct radix_node_head *head = NULL;
+
+ if (ke->pfrke_af == AF_INET)
+ head = kt->pfrkt_ip4;
+ else if (ke->pfrke_af == AF_INET6)
+ head = kt->pfrkt_ip6;
+
+ if (KENTRY_NETWORK(ke)) {
+ pfr_prepare_network(&mask, ke->pfrke_af, ke->pfrke_net);
+ rn = rn_delete(&ke->pfrke_sa, &mask, head);
+ } else
+ rn = rn_delete(&ke->pfrke_sa, NULL, head);
+
+ if (rn == NULL) {
+ printf("pfr_unroute_kentry: delete failed.\n");
+ return (-1);
+ }
+ return (0);
+}
+
+static void
+pfr_copyout_addr(struct pfr_addr *ad, struct pfr_kentry *ke)
+{
+ bzero(ad, sizeof(*ad));
+ if (ke == NULL)
+ return;
+ ad->pfra_af = ke->pfrke_af;
+ ad->pfra_net = ke->pfrke_net;
+ ad->pfra_not = ke->pfrke_not;
+ if (ad->pfra_af == AF_INET)
+ ad->pfra_ip4addr = ke->pfrke_sa.sin.sin_addr;
+ else if (ad->pfra_af == AF_INET6)
+ ad->pfra_ip6addr = ke->pfrke_sa.sin6.sin6_addr;
+}
+
+static int
+pfr_walktree(struct radix_node *rn, void *arg)
+{
+ struct pfr_kentry *ke = (struct pfr_kentry *)rn;
+ struct pfr_walktree *w = arg;
+
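+	/*
+	 * Radix-tree walker callback: called once per entry, it carries
+	 * out the operation requested in the pfr_walktree descriptor
+	 * (mark, sweep, enqueue, copy out addresses or stats, pool-get,
+	 * dynaddr update).
+	 */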
+ switch (w->pfrw_op) {
+ case PFRW_MARK:
+ ke->pfrke_mark = 0;
+ break;
+ case PFRW_SWEEP:
+ if (ke->pfrke_mark)
+ break;
+ /* FALLTHROUGH */
+ case PFRW_ENQUEUE:
+ SLIST_INSERT_HEAD(w->pfrw_workq, ke, pfrke_workq);
+ w->pfrw_cnt++;
+ break;
+ case PFRW_GET_ADDRS:
+ if (w->pfrw_free-- > 0) {
+ pfr_copyout_addr(w->pfrw_addr, ke);
+ w->pfrw_addr++;
+ }
+ break;
+ case PFRW_GET_ASTATS:
+ if (w->pfrw_free-- > 0) {
+ struct pfr_astats as;
+
+ pfr_copyout_addr(&as.pfras_a, ke);
+
+ if (ke->pfrke_counters) {
+ bcopy(ke->pfrke_counters->pfrkc_packets,
+ as.pfras_packets, sizeof(as.pfras_packets));
+ bcopy(ke->pfrke_counters->pfrkc_bytes,
+ as.pfras_bytes, sizeof(as.pfras_bytes));
+ } else {
+ bzero(as.pfras_packets, sizeof(as.pfras_packets));
+ bzero(as.pfras_bytes, sizeof(as.pfras_bytes));
+ as.pfras_a.pfra_fback = PFR_FB_NOCOUNT;
+ }
+ as.pfras_tzero = ke->pfrke_tzero;
+
+ bcopy(&as, w->pfrw_astats, sizeof(as));
+ w->pfrw_astats++;
+ }
+ break;
+ case PFRW_POOL_GET:
+ if (ke->pfrke_not)
+ break; /* negative entries are ignored */
+ if (!w->pfrw_cnt--) {
+ w->pfrw_kentry = ke;
+ return (1); /* finish search */
+ }
+ break;
+ case PFRW_DYNADDR_UPDATE:
+ {
+ union sockaddr_union pfr_mask;
+
+ if (ke->pfrke_af == AF_INET) {
+ if (w->pfrw_dyn->pfid_acnt4++ > 0)
+ break;
+ pfr_prepare_network(&pfr_mask, AF_INET, ke->pfrke_net);
+ w->pfrw_dyn->pfid_addr4 = *SUNION2PF(&ke->pfrke_sa,
+ AF_INET);
+ w->pfrw_dyn->pfid_mask4 = *SUNION2PF(&pfr_mask,
+ AF_INET);
+ } else if (ke->pfrke_af == AF_INET6){
+ if (w->pfrw_dyn->pfid_acnt6++ > 0)
+ break;
+ pfr_prepare_network(&pfr_mask, AF_INET6, ke->pfrke_net);
+ w->pfrw_dyn->pfid_addr6 = *SUNION2PF(&ke->pfrke_sa,
+ AF_INET6);
+ w->pfrw_dyn->pfid_mask6 = *SUNION2PF(&pfr_mask,
+ AF_INET6);
+ }
+ break;
+ }
+ }
+ return (0);
+}
+
+int
+pfr_clr_tables(struct pfr_table *filter, int *ndel, int flags)
+{
+ struct pfr_ktableworkq workq;
+ struct pfr_ktable *p;
+ int xdel = 0;
+
+ ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY | PFR_FLAG_ALLRSETS);
+ if (pfr_fix_anchor(filter->pfrt_anchor))
+ return (EINVAL);
+ if (pfr_table_count(filter, flags) < 0)
+ return (ENOENT);
+
+ SLIST_INIT(&workq);
+ RB_FOREACH(p, pfr_ktablehead, &pfr_ktables) {
+ if (pfr_skip_table(filter, p, flags))
+ continue;
+ if (!strcmp(p->pfrkt_anchor, PF_RESERVED_ANCHOR))
+ continue;
+ if (!(p->pfrkt_flags & PFR_TFLAG_ACTIVE))
+ continue;
+ p->pfrkt_nflags = p->pfrkt_flags & ~PFR_TFLAG_ACTIVE;
+ SLIST_INSERT_HEAD(&workq, p, pfrkt_workq);
+ xdel++;
+ }
+ if (!(flags & PFR_FLAG_DUMMY))
+ pfr_setflags_ktables(&workq);
+ if (ndel != NULL)
+ *ndel = xdel;
+ return (0);
+}
+
+int
+pfr_add_tables(struct pfr_table *tbl, int size, int *nadd, int flags)
+{
+ struct pfr_ktableworkq addq, changeq;
+ struct pfr_ktable *p, *q, *r, key;
+ int i, rv, xadd = 0;
+ long tzero = time_second;
+
+ ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY);
+ SLIST_INIT(&addq);
+ SLIST_INIT(&changeq);
+ for (i = 0; i < size; i++) {
+ bcopy(tbl+i, &key.pfrkt_t, sizeof(key.pfrkt_t));
+ if (pfr_validate_table(&key.pfrkt_t, PFR_TFLAG_USRMASK,
+ flags & PFR_FLAG_USERIOCTL))
+ senderr(EINVAL);
+ key.pfrkt_flags |= PFR_TFLAG_ACTIVE;
+ p = RB_FIND(pfr_ktablehead, &pfr_ktables, &key);
+ if (p == NULL) {
+ p = pfr_create_ktable(&key.pfrkt_t, tzero, 1);
+ if (p == NULL)
+ senderr(ENOMEM);
+ SLIST_FOREACH(q, &addq, pfrkt_workq) {
+ if (!pfr_ktable_compare(p, q))
+ goto _skip;
+ }
+ SLIST_INSERT_HEAD(&addq, p, pfrkt_workq);
+ xadd++;
+ if (!key.pfrkt_anchor[0])
+ goto _skip;
+
+ /* find or create root table */
+ bzero(key.pfrkt_anchor, sizeof(key.pfrkt_anchor));
+ r = RB_FIND(pfr_ktablehead, &pfr_ktables, &key);
+ if (r != NULL) {
+ p->pfrkt_root = r;
+ goto _skip;
+ }
+ SLIST_FOREACH(q, &addq, pfrkt_workq) {
+ if (!pfr_ktable_compare(&key, q)) {
+ p->pfrkt_root = q;
+ goto _skip;
+ }
+ }
+ key.pfrkt_flags = 0;
+ r = pfr_create_ktable(&key.pfrkt_t, 0, 1);
+ if (r == NULL)
+ senderr(ENOMEM);
+ SLIST_INSERT_HEAD(&addq, r, pfrkt_workq);
+ p->pfrkt_root = r;
+ } else if (!(p->pfrkt_flags & PFR_TFLAG_ACTIVE)) {
+ SLIST_FOREACH(q, &changeq, pfrkt_workq)
+ if (!pfr_ktable_compare(&key, q))
+ goto _skip;
+ p->pfrkt_nflags = (p->pfrkt_flags &
+ ~PFR_TFLAG_USRMASK) | key.pfrkt_flags;
+ SLIST_INSERT_HEAD(&changeq, p, pfrkt_workq);
+ xadd++;
+ }
+_skip:
+ ;
+ }
+ if (!(flags & PFR_FLAG_DUMMY)) {
+ pfr_insert_ktables(&addq);
+ pfr_setflags_ktables(&changeq);
+ } else
+ pfr_destroy_ktables(&addq, 0);
+ if (nadd != NULL)
+ *nadd = xadd;
+ return (0);
+_bad:
+ pfr_destroy_ktables(&addq, 0);
+ return (rv);
+}
+
+int
+pfr_del_tables(struct pfr_table *tbl, int size, int *ndel, int flags)
+{
+ struct pfr_ktableworkq workq;
+ struct pfr_ktable *p, *q, key;
+ int i, xdel = 0;
+
+ ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY);
+ SLIST_INIT(&workq);
+ for (i = 0; i < size; i++) {
+ bcopy(tbl + i, &key.pfrkt_t, sizeof(key.pfrkt_t));
+ if (pfr_validate_table(&key.pfrkt_t, 0,
+ flags & PFR_FLAG_USERIOCTL))
+ return (EINVAL);
+ p = RB_FIND(pfr_ktablehead, &pfr_ktables, &key);
+ if (p != NULL && (p->pfrkt_flags & PFR_TFLAG_ACTIVE)) {
+ SLIST_FOREACH(q, &workq, pfrkt_workq)
+ if (!pfr_ktable_compare(p, q))
+ goto _skip;
+ p->pfrkt_nflags = p->pfrkt_flags & ~PFR_TFLAG_ACTIVE;
+ SLIST_INSERT_HEAD(&workq, p, pfrkt_workq);
+ xdel++;
+ }
+_skip:
+ ;
+ }
+
+ if (!(flags & PFR_FLAG_DUMMY))
+ pfr_setflags_ktables(&workq);
+ if (ndel != NULL)
+ *ndel = xdel;
+ return (0);
+}
+
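+/*
+ * Copy out the headers of all tables matched by the filter.  If the caller's
+ * buffer is too small, no copy is done and the required count is returned
+ * in *size.
+ */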
+int
+pfr_get_tables(struct pfr_table *filter, struct pfr_table *tbl, int *size,
+ int flags)
+{
+ struct pfr_ktable *p;
+ int n, nn;
+
+ PF_RULES_RASSERT();
+
+ ACCEPT_FLAGS(flags, PFR_FLAG_ALLRSETS);
+ if (pfr_fix_anchor(filter->pfrt_anchor))
+ return (EINVAL);
+ n = nn = pfr_table_count(filter, flags);
+ if (n < 0)
+ return (ENOENT);
+ if (n > *size) {
+ *size = n;
+ return (0);
+ }
+ RB_FOREACH(p, pfr_ktablehead, &pfr_ktables) {
+ if (pfr_skip_table(filter, p, flags))
+ continue;
+ if (n-- <= 0)
+ continue;
+ bcopy(&p->pfrkt_t, tbl++, sizeof(*tbl));
+ }
+
+ KASSERT(n == 0, ("%s: corruption detected (%d)", __func__, n));
+
+ *size = nn;
+ return (0);
+}
+
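+/*
+ * Copy out per-table statistics for all tables matched by the filter.
+ * Clearing the stats afterwards (PFR_FLAG_CLSTATS) is currently disabled,
+ * since ACCEPT_FLAGS() does not admit that flag here.
+ */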
+int
+pfr_get_tstats(struct pfr_table *filter, struct pfr_tstats *tbl, int *size,
+ int flags)
+{
+ struct pfr_ktable *p;
+ struct pfr_ktableworkq workq;
+ int n, nn;
+ long tzero = time_second;
+
+ /* XXX PFR_FLAG_CLSTATS disabled */
+ ACCEPT_FLAGS(flags, PFR_FLAG_ALLRSETS);
+ if (pfr_fix_anchor(filter->pfrt_anchor))
+ return (EINVAL);
+ n = nn = pfr_table_count(filter, flags);
+ if (n < 0)
+ return (ENOENT);
+ if (n > *size) {
+ *size = n;
+ return (0);
+ }
+ SLIST_INIT(&workq);
+ RB_FOREACH(p, pfr_ktablehead, &pfr_ktables) {
+ if (pfr_skip_table(filter, p, flags))
+ continue;
+ if (n-- <= 0)
+ continue;
+ bcopy(&p->pfrkt_ts, tbl++, sizeof(*tbl));
+ SLIST_INSERT_HEAD(&workq, p, pfrkt_workq);
+ }
+ if (flags & PFR_FLAG_CLSTATS)
+ pfr_clstats_ktables(&workq, tzero,
+ flags & PFR_FLAG_ADDRSTOO);
+
+ KASSERT(n == 0, ("%s: corruption detected (%d)", __func__, n));
+
+ *size = nn;
+ return (0);
+}
+
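+/*
+ * Zero the statistics of the listed tables and, when PFR_FLAG_ADDRSTOO is
+ * set, of their addresses as well.
+ */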
+int
+pfr_clr_tstats(struct pfr_table *tbl, int size, int *nzero, int flags)
+{
+ struct pfr_ktableworkq workq;
+ struct pfr_ktable *p, key;
+ int i, xzero = 0;
+ long tzero = time_second;
+
+ ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY | PFR_FLAG_ADDRSTOO);
+ SLIST_INIT(&workq);
+ for (i = 0; i < size; i++) {
+ bcopy(tbl + i, &key.pfrkt_t, sizeof(key.pfrkt_t));
+ if (pfr_validate_table(&key.pfrkt_t, 0, 0))
+ return (EINVAL);
+ p = RB_FIND(pfr_ktablehead, &pfr_ktables, &key);
+ if (p != NULL) {
+ SLIST_INSERT_HEAD(&workq, p, pfrkt_workq);
+ xzero++;
+ }
+ }
+ if (!(flags & PFR_FLAG_DUMMY))
+ pfr_clstats_ktables(&workq, tzero, flags & PFR_FLAG_ADDRSTOO);
+ if (nzero != NULL)
+ *nzero = xzero;
+ return (0);
+}
+
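+/*
+ * Set and/or clear user-settable flags on the listed tables.  Clearing
+ * PFR_TFLAG_PERSIST on a table that is not referenced counts as a deletion,
+ * since pfr_setflags_ktable() will then destroy it.
+ */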
+int
+pfr_set_tflags(struct pfr_table *tbl, int size, int setflag, int clrflag,
+ int *nchange, int *ndel, int flags)
+{
+ struct pfr_ktableworkq workq;
+ struct pfr_ktable *p, *q, key;
+ int i, xchange = 0, xdel = 0;
+
+ ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY);
+ if ((setflag & ~PFR_TFLAG_USRMASK) ||
+ (clrflag & ~PFR_TFLAG_USRMASK) ||
+ (setflag & clrflag))
+ return (EINVAL);
+ SLIST_INIT(&workq);
+ for (i = 0; i < size; i++) {
+ bcopy(tbl + i, &key.pfrkt_t, sizeof(key.pfrkt_t));
+ if (pfr_validate_table(&key.pfrkt_t, 0,
+ flags & PFR_FLAG_USERIOCTL))
+ return (EINVAL);
+ p = RB_FIND(pfr_ktablehead, &pfr_ktables, &key);
+ if (p != NULL && (p->pfrkt_flags & PFR_TFLAG_ACTIVE)) {
+ p->pfrkt_nflags = (p->pfrkt_flags | setflag) &
+ ~clrflag;
+ if (p->pfrkt_nflags == p->pfrkt_flags)
+ goto _skip;
+ SLIST_FOREACH(q, &workq, pfrkt_workq)
+ if (!pfr_ktable_compare(p, q))
+ goto _skip;
+ SLIST_INSERT_HEAD(&workq, p, pfrkt_workq);
+ if ((p->pfrkt_flags & PFR_TFLAG_PERSIST) &&
+ (clrflag & PFR_TFLAG_PERSIST) &&
+ !(p->pfrkt_flags & PFR_TFLAG_REFERENCED))
+ xdel++;
+ else
+ xchange++;
+ }
+_skip:
+ ;
+ }
+ if (!(flags & PFR_FLAG_DUMMY))
+ pfr_setflags_ktables(&workq);
+ if (nchange != NULL)
+ *nchange = xchange;
+ if (ndel != NULL)
+ *ndel = xdel;
+ return (0);
+}
+
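+/*
+ * Begin an inactive (transaction) load for the anchor: drop any inactive
+ * tables left over from a previous transaction and hand back a fresh ticket.
+ */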
+int
+pfr_ina_begin(struct pfr_table *trs, u_int32_t *ticket, int *ndel, int flags)
+{
+ struct pfr_ktableworkq workq;
+ struct pfr_ktable *p;
+ struct pf_ruleset *rs;
+ int xdel = 0;
+
+ ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY);
+ rs = pf_find_or_create_ruleset(trs->pfrt_anchor);
+ if (rs == NULL)
+ return (ENOMEM);
+ SLIST_INIT(&workq);
+ RB_FOREACH(p, pfr_ktablehead, &pfr_ktables) {
+ if (!(p->pfrkt_flags & PFR_TFLAG_INACTIVE) ||
+ pfr_skip_table(trs, p, 0))
+ continue;
+ p->pfrkt_nflags = p->pfrkt_flags & ~PFR_TFLAG_INACTIVE;
+ SLIST_INSERT_HEAD(&workq, p, pfrkt_workq);
+ xdel++;
+ }
+ if (!(flags & PFR_FLAG_DUMMY)) {
+ pfr_setflags_ktables(&workq);
+ if (ticket != NULL)
+ *ticket = ++rs->tticket;
+ rs->topen = 1;
+ } else
+ pf_remove_if_empty_ruleset(rs);
+ if (ndel != NULL)
+ *ndel = xdel;
+ return (0);
+}
+
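+/*
+ * Define a table within an open transaction: create the table (and its root
+ * table) if needed and build a shadow copy holding the supplied addresses,
+ * to be swapped in by pfr_ina_commit().
+ */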
+int
+pfr_ina_define(struct pfr_table *tbl, struct pfr_addr *addr, int size,
+ int *nadd, int *naddr, u_int32_t ticket, int flags)
+{
+ struct pfr_ktableworkq tableq;
+ struct pfr_kentryworkq addrq;
+ struct pfr_ktable *kt, *rt, *shadow, key;
+ struct pfr_kentry *p;
+ struct pfr_addr *ad;
+ struct pf_ruleset *rs;
+ int i, rv, xadd = 0, xaddr = 0;
+
+ PF_RULES_WASSERT();
+
+ ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY | PFR_FLAG_ADDRSTOO);
+ if (size && !(flags & PFR_FLAG_ADDRSTOO))
+ return (EINVAL);
+ if (pfr_validate_table(tbl, PFR_TFLAG_USRMASK,
+ flags & PFR_FLAG_USERIOCTL))
+ return (EINVAL);
+ rs = pf_find_ruleset(tbl->pfrt_anchor);
+ if (rs == NULL || !rs->topen || ticket != rs->tticket)
+ return (EBUSY);
+ tbl->pfrt_flags |= PFR_TFLAG_INACTIVE;
+ SLIST_INIT(&tableq);
+ kt = RB_FIND(pfr_ktablehead, &pfr_ktables, (struct pfr_ktable *)tbl);
+ if (kt == NULL) {
+ kt = pfr_create_ktable(tbl, 0, 1);
+ if (kt == NULL)
+ return (ENOMEM);
+ SLIST_INSERT_HEAD(&tableq, kt, pfrkt_workq);
+ xadd++;
+ if (!tbl->pfrt_anchor[0])
+ goto _skip;
+
+ /* find or create root table */
+ bzero(&key, sizeof(key));
+ strlcpy(key.pfrkt_name, tbl->pfrt_name, sizeof(key.pfrkt_name));
+ rt = RB_FIND(pfr_ktablehead, &pfr_ktables, &key);
+ if (rt != NULL) {
+ kt->pfrkt_root = rt;
+ goto _skip;
+ }
+ rt = pfr_create_ktable(&key.pfrkt_t, 0, 1);
+ if (rt == NULL) {
+ pfr_destroy_ktables(&tableq, 0);
+ return (ENOMEM);
+ }
+ SLIST_INSERT_HEAD(&tableq, rt, pfrkt_workq);
+ kt->pfrkt_root = rt;
+ } else if (!(kt->pfrkt_flags & PFR_TFLAG_INACTIVE))
+ xadd++;
+_skip:
+ shadow = pfr_create_ktable(tbl, 0, 0);
+ if (shadow == NULL) {
+ pfr_destroy_ktables(&tableq, 0);
+ return (ENOMEM);
+ }
+ SLIST_INIT(&addrq);
+ for (i = 0, ad = addr; i < size; i++, ad++) {
+ if (pfr_validate_addr(ad))
+ senderr(EINVAL);
+ if (pfr_lookup_addr(shadow, ad, 1) != NULL)
+ continue;
+ p = pfr_create_kentry(ad);
+ if (p == NULL)
+ senderr(ENOMEM);
+ if (pfr_route_kentry(shadow, p)) {
+ pfr_destroy_kentry(p);
+ continue;
+ }
+ SLIST_INSERT_HEAD(&addrq, p, pfrke_workq);
+ xaddr++;
+ }
+ if (!(flags & PFR_FLAG_DUMMY)) {
+ if (kt->pfrkt_shadow != NULL)
+ pfr_destroy_ktable(kt->pfrkt_shadow, 1);
+ kt->pfrkt_flags |= PFR_TFLAG_INACTIVE;
+ pfr_insert_ktables(&tableq);
+ shadow->pfrkt_cnt = (flags & PFR_FLAG_ADDRSTOO) ?
+ xaddr : NO_ADDRESSES;
+ kt->pfrkt_shadow = shadow;
+ } else {
+ pfr_clean_node_mask(shadow, &addrq);
+ pfr_destroy_ktable(shadow, 0);
+ pfr_destroy_ktables(&tableq, 0);
+ pfr_destroy_kentries(&addrq);
+ }
+ if (nadd != NULL)
+ *nadd = xadd;
+ if (naddr != NULL)
+ *naddr = xaddr;
+ return (0);
+_bad:
+ pfr_destroy_ktable(shadow, 0);
+ pfr_destroy_ktables(&tableq, 0);
+ pfr_destroy_kentries(&addrq);
+ return (rv);
+}
+
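+/*
+ * Abort an open transaction, discarding every inactive table created for it
+ * along with its shadow addresses.
+ */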
+int
+pfr_ina_rollback(struct pfr_table *trs, u_int32_t ticket, int *ndel, int flags)
+{
+ struct pfr_ktableworkq workq;
+ struct pfr_ktable *p;
+ struct pf_ruleset *rs;
+ int xdel = 0;
+
+ PF_RULES_WASSERT();
+
+ ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY);
+ rs = pf_find_ruleset(trs->pfrt_anchor);
+ if (rs == NULL || !rs->topen || ticket != rs->tticket)
+ return (0);
+ SLIST_INIT(&workq);
+ RB_FOREACH(p, pfr_ktablehead, &pfr_ktables) {
+ if (!(p->pfrkt_flags & PFR_TFLAG_INACTIVE) ||
+ pfr_skip_table(trs, p, 0))
+ continue;
+ p->pfrkt_nflags = p->pfrkt_flags & ~PFR_TFLAG_INACTIVE;
+ SLIST_INSERT_HEAD(&workq, p, pfrkt_workq);
+ xdel++;
+ }
+ if (!(flags & PFR_FLAG_DUMMY)) {
+ pfr_setflags_ktables(&workq);
+ rs->topen = 0;
+ pf_remove_if_empty_ruleset(rs);
+ }
+ if (ndel != NULL)
+ *ndel = xdel;
+ return (0);
+}
+
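+/*
+ * Commit an open transaction: every inactive table of the anchor replaces
+ * (or becomes) the corresponding active table via pfr_commit_ktable().
+ */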
+int
+pfr_ina_commit(struct pfr_table *trs, u_int32_t ticket, int *nadd,
+ int *nchange, int flags)
+{
+ struct pfr_ktable *p, *q;
+ struct pfr_ktableworkq workq;
+ struct pf_ruleset *rs;
+ int xadd = 0, xchange = 0;
+ long tzero = time_second;
+
+ PF_RULES_WASSERT();
+
+ ACCEPT_FLAGS(flags, PFR_FLAG_DUMMY);
+ rs = pf_find_ruleset(trs->pfrt_anchor);
+ if (rs == NULL || !rs->topen || ticket != rs->tticket)
+ return (EBUSY);
+
+ SLIST_INIT(&workq);
+ RB_FOREACH(p, pfr_ktablehead, &pfr_ktables) {
+ if (!(p->pfrkt_flags & PFR_TFLAG_INACTIVE) ||
+ pfr_skip_table(trs, p, 0))
+ continue;
+ SLIST_INSERT_HEAD(&workq, p, pfrkt_workq);
+ if (p->pfrkt_flags & PFR_TFLAG_ACTIVE)
+ xchange++;
+ else
+ xadd++;
+ }
+
+ if (!(flags & PFR_FLAG_DUMMY)) {
+ for (p = SLIST_FIRST(&workq); p != NULL; p = q) {
+ q = SLIST_NEXT(p, pfrkt_workq);
+ pfr_commit_ktable(p, tzero);
+ }
+ rs->topen = 0;
+ pf_remove_if_empty_ruleset(rs);
+ }
+ if (nadd != NULL)
+ *nadd = xadd;
+ if (nchange != NULL)
+ *nchange = xchange;
+
+ return (0);
+}
+
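+/*
+ * Merge a shadow table into its active counterpart: addresses present in
+ * both are kept (negation changes are recorded), new ones are inserted,
+ * missing ones are removed.  If the active table holds no addresses yet,
+ * the radix heads are simply swapped wholesale.
+ */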
+static void
+pfr_commit_ktable(struct pfr_ktable *kt, long tzero)
+{
+ struct pfr_ktable *shadow = kt->pfrkt_shadow;
+ int nflags;
+
+ PF_RULES_WASSERT();
+
+ if (shadow->pfrkt_cnt == NO_ADDRESSES) {
+ if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE))
+ pfr_clstats_ktable(kt, tzero, 1);
+ } else if (kt->pfrkt_flags & PFR_TFLAG_ACTIVE) {
+ /* kt might contain addresses */
+ struct pfr_kentryworkq addrq, addq, changeq, delq, garbageq;
+ struct pfr_kentry *p, *q, *next;
+ struct pfr_addr ad;
+
+ pfr_enqueue_addrs(shadow, &addrq, NULL, 0);
+ pfr_mark_addrs(kt);
+ SLIST_INIT(&addq);
+ SLIST_INIT(&changeq);
+ SLIST_INIT(&delq);
+ SLIST_INIT(&garbageq);
+ pfr_clean_node_mask(shadow, &addrq);
+ for (p = SLIST_FIRST(&addrq); p != NULL; p = next) {
+ next = SLIST_NEXT(p, pfrke_workq); /* XXX */
+ pfr_copyout_addr(&ad, p);
+ q = pfr_lookup_addr(kt, &ad, 1);
+ if (q != NULL) {
+ if (q->pfrke_not != p->pfrke_not)
+ SLIST_INSERT_HEAD(&changeq, q,
+ pfrke_workq);
+ q->pfrke_mark = 1;
+ SLIST_INSERT_HEAD(&garbageq, p, pfrke_workq);
+ } else {
+ p->pfrke_tzero = tzero;
+ SLIST_INSERT_HEAD(&addq, p, pfrke_workq);
+ }
+ }
+ pfr_enqueue_addrs(kt, &delq, NULL, ENQUEUE_UNMARKED_ONLY);
+ pfr_insert_kentries(kt, &addq, tzero);
+ pfr_remove_kentries(kt, &delq);
+ pfr_clstats_kentries(&changeq, tzero, INVERT_NEG_FLAG);
+ pfr_destroy_kentries(&garbageq);
+ } else {
+ /* kt cannot contain addresses */
+ SWAP(struct radix_node_head *, kt->pfrkt_ip4,
+ shadow->pfrkt_ip4);
+ SWAP(struct radix_node_head *, kt->pfrkt_ip6,
+ shadow->pfrkt_ip6);
+ SWAP(int, kt->pfrkt_cnt, shadow->pfrkt_cnt);
+ pfr_clstats_ktable(kt, tzero, 1);
+ }
+ nflags = ((shadow->pfrkt_flags & PFR_TFLAG_USRMASK) |
+ (kt->pfrkt_flags & PFR_TFLAG_SETMASK) | PFR_TFLAG_ACTIVE)
+ & ~PFR_TFLAG_INACTIVE;
+ pfr_destroy_ktable(shadow, 0);
+ kt->pfrkt_shadow = NULL;
+ pfr_setflags_ktable(kt, nflags);
+}
+
+static int
+pfr_validate_table(struct pfr_table *tbl, int allowedflags, int no_reserved)
+{
+ int i;
+
+ if (!tbl->pfrt_name[0])
+ return (-1);
+ if (no_reserved && !strcmp(tbl->pfrt_anchor, PF_RESERVED_ANCHOR))
+ return (-1);
+ if (tbl->pfrt_name[PF_TABLE_NAME_SIZE-1])
+ return (-1);
+ for (i = strlen(tbl->pfrt_name); i < PF_TABLE_NAME_SIZE; i++)
+ if (tbl->pfrt_name[i])
+ return (-1);
+ if (pfr_fix_anchor(tbl->pfrt_anchor))
+ return (-1);
+ if (tbl->pfrt_flags & ~allowedflags)
+ return (-1);
+ return (0);
+}
+
+/*
+ * Rewrite anchors referenced by tables to remove slashes
+ * and check for validity.
+ */
+static int
+pfr_fix_anchor(char *anchor)
+{
+ size_t siz = MAXPATHLEN;
+ int i;
+
+ if (anchor[0] == '/') {
+ char *path;
+ int off;
+
+ path = anchor;
+ off = 1;
+ while (*++path == '/')
+ off++;
+ bcopy(path, anchor, siz - off);
+ memset(anchor + siz - off, 0, off);
+ }
+ if (anchor[siz - 1])
+ return (-1);
+ for (i = strlen(anchor); i < siz; i++)
+ if (anchor[i])
+ return (-1);
+ return (0);
+}
+
+static int
+pfr_table_count(struct pfr_table *filter, int flags)
+{
+ struct pf_ruleset *rs;
+
+ PF_RULES_ASSERT();
+
+ if (flags & PFR_FLAG_ALLRSETS)
+ return (pfr_ktable_cnt);
+ if (filter->pfrt_anchor[0]) {
+ rs = pf_find_ruleset(filter->pfrt_anchor);
+ return ((rs != NULL) ? rs->tables : -1);
+ }
+ return (pf_main_ruleset.tables);
+}
+
+static int
+pfr_skip_table(struct pfr_table *filter, struct pfr_ktable *kt, int flags)
+{
+ if (flags & PFR_FLAG_ALLRSETS)
+ return (0);
+ if (strcmp(filter->pfrt_anchor, kt->pfrkt_anchor))
+ return (1);
+ return (0);
+}
+
+static void
+pfr_insert_ktables(struct pfr_ktableworkq *workq)
+{
+ struct pfr_ktable *p;
+
+ SLIST_FOREACH(p, workq, pfrkt_workq)
+ pfr_insert_ktable(p);
+}
+
+static void
+pfr_insert_ktable(struct pfr_ktable *kt)
+{
+
+ PF_RULES_WASSERT();
+
+ RB_INSERT(pfr_ktablehead, &pfr_ktables, kt);
+ pfr_ktable_cnt++;
+ if (kt->pfrkt_root != NULL)
+ if (!kt->pfrkt_root->pfrkt_refcnt[PFR_REFCNT_ANCHOR]++)
+ pfr_setflags_ktable(kt->pfrkt_root,
+ kt->pfrkt_root->pfrkt_flags|PFR_TFLAG_REFDANCHOR);
+}
+
+static void
+pfr_setflags_ktables(struct pfr_ktableworkq *workq)
+{
+ struct pfr_ktable *p, *q;
+
+ for (p = SLIST_FIRST(workq); p; p = q) {
+ q = SLIST_NEXT(p, pfrkt_workq);
+ pfr_setflags_ktable(p, p->pfrkt_nflags);
+ }
+}
+
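+/*
+ * Apply a new flag word to a table.  Dropping the last PFR_TFLAG_SETMASK
+ * flag removes and destroys the table; deactivation flushes its addresses,
+ * and clearing PFR_TFLAG_INACTIVE discards its shadow.
+ */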
+static void
+pfr_setflags_ktable(struct pfr_ktable *kt, int newf)
+{
+ struct pfr_kentryworkq addrq;
+
+ PF_RULES_WASSERT();
+
+ if (!(newf & PFR_TFLAG_REFERENCED) &&
+ !(newf & PFR_TFLAG_PERSIST))
+ newf &= ~PFR_TFLAG_ACTIVE;
+ if (!(newf & PFR_TFLAG_ACTIVE))
+ newf &= ~PFR_TFLAG_USRMASK;
+ if (!(newf & PFR_TFLAG_SETMASK)) {
+ RB_REMOVE(pfr_ktablehead, &pfr_ktables, kt);
+ if (kt->pfrkt_root != NULL)
+ if (!--kt->pfrkt_root->pfrkt_refcnt[PFR_REFCNT_ANCHOR])
+ pfr_setflags_ktable(kt->pfrkt_root,
+ kt->pfrkt_root->pfrkt_flags &
+ ~PFR_TFLAG_REFDANCHOR);
+ pfr_destroy_ktable(kt, 1);
+ pfr_ktable_cnt--;
+ return;
+ }
+ if (!(newf & PFR_TFLAG_ACTIVE) && kt->pfrkt_cnt) {
+ pfr_enqueue_addrs(kt, &addrq, NULL, 0);
+ pfr_remove_kentries(kt, &addrq);
+ }
+ if (!(newf & PFR_TFLAG_INACTIVE) && kt->pfrkt_shadow != NULL) {
+ pfr_destroy_ktable(kt->pfrkt_shadow, 1);
+ kt->pfrkt_shadow = NULL;
+ }
+ kt->pfrkt_flags = newf;
+}
+
+static void
+pfr_clstats_ktables(struct pfr_ktableworkq *workq, long tzero, int recurse)
+{
+ struct pfr_ktable *p;
+
+ SLIST_FOREACH(p, workq, pfrkt_workq)
+ pfr_clstats_ktable(p, tzero, recurse);
+}
+
+static void
+pfr_clstats_ktable(struct pfr_ktable *kt, long tzero, int recurse)
+{
+ struct pfr_kentryworkq addrq;
+
+ if (recurse) {
+ pfr_enqueue_addrs(kt, &addrq, NULL, 0);
+ pfr_clstats_kentries(&addrq, tzero, 0);
+ }
+ bzero(kt->pfrkt_packets, sizeof(kt->pfrkt_packets));
+ bzero(kt->pfrkt_bytes, sizeof(kt->pfrkt_bytes));
+ kt->pfrkt_match = kt->pfrkt_nomatch = 0;
+ kt->pfrkt_tzero = tzero;
+}
+
+static struct pfr_ktable *
+pfr_create_ktable(struct pfr_table *tbl, long tzero, int attachruleset)
+{
+ struct pfr_ktable *kt;
+ struct pf_ruleset *rs;
+
+ PF_RULES_WASSERT();
+
+ kt = malloc(sizeof(*kt), M_PFTABLE, M_NOWAIT|M_ZERO);
+ if (kt == NULL)
+ return (NULL);
+ kt->pfrkt_t = *tbl;
+
+ if (attachruleset) {
+ rs = pf_find_or_create_ruleset(tbl->pfrt_anchor);
+ if (!rs) {
+ pfr_destroy_ktable(kt, 0);
+ return (NULL);
+ }
+ kt->pfrkt_rs = rs;
+ rs->tables++;
+ }
+
+ if (!rn_inithead((void **)&kt->pfrkt_ip4,
+ offsetof(struct sockaddr_in, sin_addr) * 8) ||
+ !rn_inithead((void **)&kt->pfrkt_ip6,
+ offsetof(struct sockaddr_in6, sin6_addr) * 8)) {
+ pfr_destroy_ktable(kt, 0);
+ return (NULL);
+ }
+ kt->pfrkt_tzero = tzero;
+
+ return (kt);
+}
+
+static void
+pfr_destroy_ktables(struct pfr_ktableworkq *workq, int flushaddr)
+{
+ struct pfr_ktable *p, *q;
+
+ for (p = SLIST_FIRST(workq); p; p = q) {
+ q = SLIST_NEXT(p, pfrkt_workq);
+ pfr_destroy_ktable(p, flushaddr);
+ }
+}
+
+static void
+pfr_destroy_ktable(struct pfr_ktable *kt, int flushaddr)
+{
+ struct pfr_kentryworkq addrq;
+
+ if (flushaddr) {
+ pfr_enqueue_addrs(kt, &addrq, NULL, 0);
+ pfr_clean_node_mask(kt, &addrq);
+ pfr_destroy_kentries(&addrq);
+ }
+ if (kt->pfrkt_ip4 != NULL) {
+ RADIX_NODE_HEAD_DESTROY(kt->pfrkt_ip4);
+ free((caddr_t)kt->pfrkt_ip4, M_RTABLE);
+ }
+ if (kt->pfrkt_ip6 != NULL) {
+ RADIX_NODE_HEAD_DESTROY(kt->pfrkt_ip6);
+ free((caddr_t)kt->pfrkt_ip6, M_RTABLE);
+ }
+ if (kt->pfrkt_shadow != NULL)
+ pfr_destroy_ktable(kt->pfrkt_shadow, flushaddr);
+ if (kt->pfrkt_rs != NULL) {
+ kt->pfrkt_rs->tables--;
+ pf_remove_if_empty_ruleset(kt->pfrkt_rs);
+ }
+ free(kt, M_PFTABLE);
+}
+
+static int
+pfr_ktable_compare(struct pfr_ktable *p, struct pfr_ktable *q)
+{
+ int d;
+
+ if ((d = strncmp(p->pfrkt_name, q->pfrkt_name, PF_TABLE_NAME_SIZE)))
+ return (d);
+ return (strcmp(p->pfrkt_anchor, q->pfrkt_anchor));
+}
+
+static struct pfr_ktable *
+pfr_lookup_table(struct pfr_table *tbl)
+{
+ /* struct pfr_ktable starts like a struct pfr_table */
+ return (RB_FIND(pfr_ktablehead, &pfr_ktables,
+ (struct pfr_ktable *)tbl));
+}
+
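+/*
+ * Radix lookup of address 'a' in table 'kt' (or in its root table when 'kt'
+ * itself is inactive).  Returns 1 on a non-negated match, 0 otherwise, and
+ * updates the table's match/nomatch counters.
+ */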
+int
+pfr_match_addr(struct pfr_ktable *kt, struct pf_addr *a, sa_family_t af)
+{
+ struct pfr_kentry *ke = NULL;
+ int match;
+
+ PF_RULES_RASSERT();
+
+ if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL)
+ kt = kt->pfrkt_root;
+ if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE))
+ return (0);
+
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ {
+ struct sockaddr_in sin;
+
+ bzero(&sin, sizeof(sin));
+ sin.sin_len = sizeof(sin);
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = a->addr32[0];
+ ke = (struct pfr_kentry *)rn_match(&sin, kt->pfrkt_ip4);
+ if (ke && KENTRY_RNF_ROOT(ke))
+ ke = NULL;
+ break;
+ }
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ {
+ struct sockaddr_in6 sin6;
+
+ bzero(&sin6, sizeof(sin6));
+ sin6.sin6_len = sizeof(sin6);
+ sin6.sin6_family = AF_INET6;
+ bcopy(a, &sin6.sin6_addr, sizeof(sin6.sin6_addr));
+ ke = (struct pfr_kentry *)rn_match(&sin6, kt->pfrkt_ip6);
+ if (ke && KENTRY_RNF_ROOT(ke))
+ ke = NULL;
+ break;
+ }
+#endif /* INET6 */
+ }
+ match = (ke && !ke->pfrke_not);
+ if (match)
+ kt->pfrkt_match++;
+ else
+ kt->pfrkt_nomatch++;
+ return (match);
+}
+
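+/*
+ * Account a packet of 'len' bytes against the table's packet/byte counters
+ * and, when PFR_TFLAG_COUNTERS is set, against the per-address counters of
+ * the matching entry (allocated on first use).
+ */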
+void
+pfr_update_stats(struct pfr_ktable *kt, struct pf_addr *a, sa_family_t af,
+ u_int64_t len, int dir_out, int op_pass, int notrule)
+{
+ struct pfr_kentry *ke = NULL;
+
+ if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL)
+ kt = kt->pfrkt_root;
+ if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE))
+ return;
+
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ {
+ struct sockaddr_in sin;
+
+ sin.sin_len = sizeof(sin);
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = a->addr32[0];
+ ke = (struct pfr_kentry *)rn_match(&sin, kt->pfrkt_ip4);
+ if (ke && KENTRY_RNF_ROOT(ke))
+ ke = NULL;
+ break;
+ }
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ {
+ struct sockaddr_in6 sin6;
+
+ sin6.sin6_len = sizeof(sin6);
+ sin6.sin6_family = AF_INET6;
+ bcopy(a, &sin6.sin6_addr, sizeof(sin6.sin6_addr));
+ ke = (struct pfr_kentry *)rn_match(&sin6, kt->pfrkt_ip6);
+ if (ke && KENTRY_RNF_ROOT(ke))
+ ke = NULL;
+ break;
+ }
+#endif /* INET6 */
+ default:
+ ;
+ }
+ if ((ke == NULL || ke->pfrke_not) != notrule) {
+ if (op_pass != PFR_OP_PASS)
+ printf("pfr_update_stats: assertion failed.\n");
+ op_pass = PFR_OP_XPASS;
+ }
+ kt->pfrkt_packets[dir_out][op_pass]++;
+ kt->pfrkt_bytes[dir_out][op_pass] += len;
+ if (ke != NULL && op_pass != PFR_OP_XPASS &&
+ (kt->pfrkt_flags & PFR_TFLAG_COUNTERS)) {
+ if (ke->pfrke_counters == NULL)
+ ke->pfrke_counters = uma_zalloc(V_pfr_kcounters_z,
+ M_NOWAIT | M_ZERO);
+ if (ke->pfrke_counters != NULL) {
+ ke->pfrke_counters->pfrkc_packets[dir_out][op_pass]++;
+ ke->pfrke_counters->pfrkc_bytes[dir_out][op_pass] += len;
+ }
+ }
+}
+
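+/*
+ * Look up (or create) the table referenced by a rule and take a rule
+ * reference on it, creating the root table of the anchor if needed.
+ */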
+struct pfr_ktable *
+pfr_attach_table(struct pf_ruleset *rs, char *name)
+{
+ struct pfr_ktable *kt, *rt;
+ struct pfr_table tbl;
+ struct pf_anchor *ac = rs->anchor;
+
+ PF_RULES_WASSERT();
+
+ bzero(&tbl, sizeof(tbl));
+ strlcpy(tbl.pfrt_name, name, sizeof(tbl.pfrt_name));
+ if (ac != NULL)
+ strlcpy(tbl.pfrt_anchor, ac->path, sizeof(tbl.pfrt_anchor));
+ kt = pfr_lookup_table(&tbl);
+ if (kt == NULL) {
+ kt = pfr_create_ktable(&tbl, time_second, 1);
+ if (kt == NULL)
+ return (NULL);
+ if (ac != NULL) {
+ bzero(tbl.pfrt_anchor, sizeof(tbl.pfrt_anchor));
+ rt = pfr_lookup_table(&tbl);
+ if (rt == NULL) {
+ rt = pfr_create_ktable(&tbl, 0, 1);
+ if (rt == NULL) {
+ pfr_destroy_ktable(kt, 0);
+ return (NULL);
+ }
+ pfr_insert_ktable(rt);
+ }
+ kt->pfrkt_root = rt;
+ }
+ pfr_insert_ktable(kt);
+ }
+ if (!kt->pfrkt_refcnt[PFR_REFCNT_RULE]++)
+ pfr_setflags_ktable(kt, kt->pfrkt_flags|PFR_TFLAG_REFERENCED);
+ return (kt);
+}
+
+void
+pfr_detach_table(struct pfr_ktable *kt)
+{
+
+ PF_RULES_WASSERT();
+ KASSERT(kt->pfrkt_refcnt[PFR_REFCNT_RULE] > 0, ("%s: refcount %d\n",
+ __func__, kt->pfrkt_refcnt[PFR_REFCNT_RULE]));
+
+ if (!--kt->pfrkt_refcnt[PFR_REFCNT_RULE])
+ pfr_setflags_ktable(kt, kt->pfrkt_flags&~PFR_TFLAG_REFERENCED);
+}
+
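+/*
+ * Pick the next address from a table used as an address pool, starting at
+ * block *pidx/counter and stepping past nested (more specific) blocks.
+ * Returns 0 on success, 1 when the end of the table is reached, -1 if the
+ * table is unusable.
+ */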
+int
+pfr_pool_get(struct pfr_ktable *kt, int *pidx, struct pf_addr *counter,
+ sa_family_t af)
+{
+ struct pf_addr *addr, *cur, *mask;
+ union sockaddr_union uaddr, umask;
+ struct pfr_kentry *ke, *ke2 = NULL;
+ int idx = -1, use_counter = 0;
+
+ switch (af) {
+ case AF_INET:
+ uaddr.sin.sin_len = sizeof(struct sockaddr_in);
+ uaddr.sin.sin_family = AF_INET;
+ break;
+ case AF_INET6:
+ uaddr.sin6.sin6_len = sizeof(struct sockaddr_in6);
+ uaddr.sin6.sin6_family = AF_INET6;
+ break;
+ }
+ addr = SUNION2PF(&uaddr, af);
+
+ if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL)
+ kt = kt->pfrkt_root;
+ if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE))
+ return (-1);
+
+ if (pidx != NULL)
+ idx = *pidx;
+ if (counter != NULL && idx >= 0)
+ use_counter = 1;
+ if (idx < 0)
+ idx = 0;
+
+_next_block:
+ ke = pfr_kentry_byidx(kt, idx, af);
+ if (ke == NULL) {
+ kt->pfrkt_nomatch++;
+ return (1);
+ }
+ pfr_prepare_network(&umask, af, ke->pfrke_net);
+ cur = SUNION2PF(&ke->pfrke_sa, af);
+ mask = SUNION2PF(&umask, af);
+
+ if (use_counter) {
+ /* is supplied address within block? */
+ if (!PF_MATCHA(0, cur, mask, counter, af)) {
+ /* no, go to next block in table */
+ idx++;
+ use_counter = 0;
+ goto _next_block;
+ }
+ PF_ACPY(addr, counter, af);
+ } else {
+ /* use first address of block */
+ PF_ACPY(addr, cur, af);
+ }
+
+ if (!KENTRY_NETWORK(ke)) {
+ /* this is a single IP address - no possible nested block */
+ PF_ACPY(counter, addr, af);
+ *pidx = idx;
+ kt->pfrkt_match++;
+ return (0);
+ }
+ for (;;) {
+ /* we don't want to use a nested block */
+ switch (af) {
+ case AF_INET:
+ ke2 = (struct pfr_kentry *)rn_match(&uaddr,
+ kt->pfrkt_ip4);
+ break;
+ case AF_INET6:
+ ke2 = (struct pfr_kentry *)rn_match(&uaddr,
+ kt->pfrkt_ip6);
+ break;
+ }
+ /* no need to check KENTRY_RNF_ROOT() here */
+ if (ke2 == ke) {
+ /* lookup returned the same block - perfect */
+ PF_ACPY(counter, addr, af);
+ *pidx = idx;
+ kt->pfrkt_match++;
+ return (0);
+ }
+
+ /* we need to increase the counter past the nested block */
+ pfr_prepare_network(&umask, AF_INET, ke2->pfrke_net);
+ PF_POOLMASK(addr, addr, SUNION2PF(&umask, af), &pfr_ffaddr, af);
+ PF_AINC(addr, af);
+ if (!PF_MATCHA(0, cur, mask, addr, af)) {
+ /* ok, we reached the end of our main block */
+ /* go to next block in table */
+ idx++;
+ use_counter = 0;
+ goto _next_block;
+ }
+ }
+}
+
+static struct pfr_kentry *
+pfr_kentry_byidx(struct pfr_ktable *kt, int idx, int af)
+{
+ struct pfr_walktree w;
+
+ bzero(&w, sizeof(w));
+ w.pfrw_op = PFRW_POOL_GET;
+ w.pfrw_cnt = idx;
+
+ switch (af) {
+#ifdef INET
+ case AF_INET:
+ kt->pfrkt_ip4->rnh_walktree(kt->pfrkt_ip4, pfr_walktree, &w);
+ return (w.pfrw_kentry);
+#endif /* INET */
+#ifdef INET6
+ case AF_INET6:
+ kt->pfrkt_ip6->rnh_walktree(kt->pfrkt_ip6, pfr_walktree, &w);
+ return (w.pfrw_kentry);
+#endif /* INET6 */
+ default:
+ return (NULL);
+ }
+}
+
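+/*
+ * Refresh a dynamic address from the table: reset the per-AF address counts
+ * and record the first IPv4/IPv6 address and mask found while walking the
+ * radix trees.
+ */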
+void
+pfr_dynaddr_update(struct pfr_ktable *kt, struct pfi_dynaddr *dyn)
+{
+ struct pfr_walktree w;
+
+ bzero(&w, sizeof(w));
+ w.pfrw_op = PFRW_DYNADDR_UPDATE;
+ w.pfrw_dyn = dyn;
+
+ dyn->pfid_acnt4 = 0;
+ dyn->pfid_acnt6 = 0;
+ if (!dyn->pfid_af || dyn->pfid_af == AF_INET)
+ kt->pfrkt_ip4->rnh_walktree(kt->pfrkt_ip4, pfr_walktree, &w);
+ if (!dyn->pfid_af || dyn->pfid_af == AF_INET6)
+ kt->pfrkt_ip6->rnh_walktree(kt->pfrkt_ip6, pfr_walktree, &w);
+}