author     glebius <glebius@FreeBSD.org>    2012-09-14 11:51:49 +0000
committer  glebius <glebius@FreeBSD.org>    2012-09-14 11:51:49 +0000
commit     0ccf4838d7a8b4da2c3beaac7ea1fd977aa0ed11 (patch)
tree       ec60da6e90cde2e87aa91ac9450c84ce3446233a /sys/netinet
parent     f99fc207edf21e7c05c1147864077ce3fe1f3e2c (diff)
o Create directory sys/netpfil, where all packet filters should
  reside, and move ipfw(4) and pf(4) there.
o Move most modified parts of pf out of contrib.
Actual movements:
sys/contrib/pf/net/*.c -> sys/netpfil/pf/
sys/contrib/pf/net/*.h -> sys/net/
contrib/pf/pfctl/*.c -> sbin/pfctl
contrib/pf/pfctl/*.h -> sbin/pfctl
contrib/pf/pfctl/pfctl.8 -> sbin/pfctl
contrib/pf/pfctl/*.4 -> share/man/man4
contrib/pf/pfctl/*.5 -> share/man/man5
sys/netinet/ipfw -> sys/netpfil/ipfw
The debatable move is pf/net/*.h -> sys/net. There are
plans to refactor the pf includes in the future, so I decided not to
break things twice.
Unmodified parts of pf are left in contrib: authpf, ftp-proxy,
tftp-proxy, pflogd.
The ipfw(4) move is planned to be merged to stable/9,
so that head and stable match.
Discussed with: bz, luigi
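For code that includes the ipfw/dummynet headers, the directory move implies an include-path change. A minimal sketch, assuming the headers keep their names under the new sys/netpfil location; the old paths are taken verbatim from the deleted files below, while the netpfil forms are inferred from the move and do not appear in this diff:

```c
/* Old include paths, as used by the files deleted in this diff: */
#include <netinet/ipfw/dn_heap.h>
#include <netinet/ipfw/dn_sched.h>
#include <netinet/ipfw/ip_dn_private.h>

/* After this commit the same headers live under sys/netpfil, so consumers
 * would presumably include them as (inferred from the move, not shown here): */
#include <netpfil/ipfw/dn_heap.h>
#include <netpfil/ipfw/dn_sched.h>
#include <netpfil/ipfw/ip_dn_private.h>
```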
Diffstat (limited to 'sys/netinet')
27 files changed, 0 insertions, 17622 deletions
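The first deleted file, dn_heap.c (with its header dn_heap.h), implements the binary heap and hash tables used by dummynet. As an orientation aid before the raw diff, here is a minimal userland sketch of the heap API documented in dn_heap.h (heap_init/heap_insert/heap_extract, with an optional per-object index field enabling extraction from the middle). It assumes dn_heap.c is built in its !_KERNEL mode; the struct event type and its field names are illustrative and not part of the source:

```c
#include <stdio.h>
#include <stddef.h>
#include "dn_heap.h"   /* the deleted header below, userland (!_KERNEL) build assumed */

struct event {                  /* illustrative object type, not from the source */
	uint64_t deadline;
	int32_t heap_pos;       /* position kept by the heap when ofs > 0 */
};

int
main(void)
{
	struct dn_heap h = { 0 };
	struct event a = { .deadline = 10 }, b = { .deadline = 20 };

	/* room for 16 entries; remember each object's index at 'heap_pos' */
	if (heap_init(&h, 16, offsetof(struct event, heap_pos)))
		return 1;
	heap_insert(&h, a.deadline, &a);
	heap_insert(&h, b.deadline, &b);

	/* the smallest key sits at the top */
	printf("top key %llu\n", (unsigned long long)HEAP_TOP(&h)->key);

	heap_extract(&h, NULL);   /* pop the top element (a) */
	heap_extract(&h, &b);     /* random extraction via the stored index */
	heap_free(&h);
	return 0;
}
```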
diff --git a/sys/netinet/ipfw/dn_heap.c b/sys/netinet/ipfw/dn_heap.c deleted file mode 100644 index 3bdfd9d..0000000 --- a/sys/netinet/ipfw/dn_heap.c +++ /dev/null @@ -1,552 +0,0 @@ -/*- - * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa - * All rights reserved - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -/* - * Binary heap and hash tables, used in dummynet - * - * $FreeBSD$ - */ - -#include <sys/cdefs.h> -#include <sys/param.h> -#ifdef _KERNEL -__FBSDID("$FreeBSD$"); -#include <sys/systm.h> -#include <sys/malloc.h> -#include <sys/kernel.h> -#include <netinet/ipfw/dn_heap.h> -#ifndef log -#define log(x, arg...) -#endif - -#else /* !_KERNEL */ - -#include <stdio.h> -#include <dn_test.h> -#include <strings.h> -#include <stdlib.h> - -#include "dn_heap.h" -#define log(x, arg...) fprintf(stderr, ## arg) -#define panic(x...) fprintf(stderr, ## x), exit(1) -#define MALLOC_DEFINE(a, b, c) -static void *my_malloc(int s) { return malloc(s); } -static void my_free(void *p) { free(p); } -#define malloc(s, t, w) my_malloc(s) -#define free(p, t) my_free(p) -#endif /* !_KERNEL */ - -static MALLOC_DEFINE(M_DN_HEAP, "dummynet", "dummynet heap"); - -/* - * Heap management functions. - * - * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2. - * Some macros help finding parent/children so we can optimize them. - * - * heap_init() is called to expand the heap when needed. - * Increment size in blocks of 16 entries. 
- * Returns 1 on error, 0 on success - */ -#define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 ) -#define HEAP_LEFT(x) ( (x)+(x) + 1 ) -#define HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; } -#define HEAP_INCREMENT 15 - -static int -heap_resize(struct dn_heap *h, unsigned int new_size) -{ - struct dn_heap_entry *p; - - if (h->size >= new_size ) /* have enough room */ - return 0; -#if 1 /* round to the next power of 2 */ - new_size |= new_size >> 1; - new_size |= new_size >> 2; - new_size |= new_size >> 4; - new_size |= new_size >> 8; - new_size |= new_size >> 16; -#else - new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT; -#endif - p = malloc(new_size * sizeof(*p), M_DN_HEAP, M_NOWAIT); - if (p == NULL) { - printf("--- %s, resize %d failed\n", __func__, new_size ); - return 1; /* error */ - } - if (h->size > 0) { - bcopy(h->p, p, h->size * sizeof(*p) ); - free(h->p, M_DN_HEAP); - } - h->p = p; - h->size = new_size; - return 0; -} - -int -heap_init(struct dn_heap *h, int size, int ofs) -{ - if (heap_resize(h, size)) - return 1; - h->elements = 0; - h->ofs = ofs; - return 0; -} - -/* - * Insert element in heap. Normally, p != NULL, we insert p in - * a new position and bubble up. If p == NULL, then the element is - * already in place, and key is the position where to start the - * bubble-up. - * Returns 1 on failure (cannot allocate new heap entry) - * - * If ofs > 0 the position (index, int) of the element in the heap is - * also stored in the element itself at the given offset in bytes. - */ -#define SET_OFFSET(h, i) do { \ - if (h->ofs > 0) \ - *((int32_t *)((char *)(h->p[i].object) + h->ofs)) = i; \ - } while (0) -/* - * RESET_OFFSET is used for sanity checks. It sets ofs - * to an invalid value. - */ -#define RESET_OFFSET(h, i) do { \ - if (h->ofs > 0) \ - *((int32_t *)((char *)(h->p[i].object) + h->ofs)) = -16; \ - } while (0) - -int -heap_insert(struct dn_heap *h, uint64_t key1, void *p) -{ - int son = h->elements; - - //log("%s key %llu p %p\n", __FUNCTION__, key1, p); - if (p == NULL) { /* data already there, set starting point */ - son = key1; - } else { /* insert new element at the end, possibly resize */ - son = h->elements; - if (son == h->size) /* need resize... */ - // XXX expand by 16 or so - if (heap_resize(h, h->elements+16) ) - return 1; /* failure... 
*/ - h->p[son].object = p; - h->p[son].key = key1; - h->elements++; - } - /* make sure that son >= father along the path */ - while (son > 0) { - int father = HEAP_FATHER(son); - struct dn_heap_entry tmp; - - if (DN_KEY_LT( h->p[father].key, h->p[son].key ) ) - break; /* found right position */ - /* son smaller than father, swap and repeat */ - HEAP_SWAP(h->p[son], h->p[father], tmp); - SET_OFFSET(h, son); - son = father; - } - SET_OFFSET(h, son); - return 0; -} - -/* - * remove top element from heap, or obj if obj != NULL - */ -void -heap_extract(struct dn_heap *h, void *obj) -{ - int child, father, max = h->elements - 1; - - if (max < 0) { - printf("--- %s: empty heap 0x%p\n", __FUNCTION__, h); - return; - } - if (obj == NULL) - father = 0; /* default: move up smallest child */ - else { /* extract specific element, index is at offset */ - if (h->ofs <= 0) - panic("%s: extract from middle not set on %p\n", - __FUNCTION__, h); - father = *((int *)((char *)obj + h->ofs)); - if (father < 0 || father >= h->elements) { - panic("%s: father %d out of bound 0..%d\n", - __FUNCTION__, father, h->elements); - } - } - /* - * below, father is the index of the empty element, which - * we replace at each step with the smallest child until we - * reach the bottom level. - */ - // XXX why removing RESET_OFFSET increases runtime by 10% ? - RESET_OFFSET(h, father); - while ( (child = HEAP_LEFT(father)) <= max ) { - if (child != max && - DN_KEY_LT(h->p[child+1].key, h->p[child].key) ) - child++; /* take right child, otherwise left */ - h->p[father] = h->p[child]; - SET_OFFSET(h, father); - father = child; - } - h->elements--; - if (father != max) { - /* - * Fill hole with last entry and bubble up, - * reusing the insert code - */ - h->p[father] = h->p[max]; - heap_insert(h, father, NULL); - } -} - -#if 0 -/* - * change object position and update references - * XXX this one is never used! - */ -static void -heap_move(struct dn_heap *h, uint64_t new_key, void *object) -{ - int temp, i, max = h->elements-1; - struct dn_heap_entry *p, buf; - - if (h->ofs <= 0) - panic("cannot move items on this heap"); - p = h->p; /* shortcut */ - - i = *((int *)((char *)object + h->ofs)); - if (DN_KEY_LT(new_key, p[i].key) ) { /* must move up */ - p[i].key = new_key; - for (; i>0 && - DN_KEY_LT(new_key, p[(temp = HEAP_FATHER(i))].key); - i = temp ) { /* bubble up */ - HEAP_SWAP(p[i], p[temp], buf); - SET_OFFSET(h, i); - } - } else { /* must move down */ - p[i].key = new_key; - while ( (temp = HEAP_LEFT(i)) <= max ) { - /* found left child */ - if (temp != max && - DN_KEY_LT(p[temp+1].key, p[temp].key)) - temp++; /* select child with min key */ - if (DN_KEY_LT(>p[temp].key, new_key)) { - /* go down */ - HEAP_SWAP(p[i], p[temp], buf); - SET_OFFSET(h, i); - } else - break; - i = temp; - } - } - SET_OFFSET(h, i); -} -#endif /* heap_move, unused */ - -/* - * heapify() will reorganize data inside an array to maintain the - * heap property. It is needed when we delete a bunch of entries. 
- */ -static void -heapify(struct dn_heap *h) -{ - int i; - - for (i = 0; i < h->elements; i++ ) - heap_insert(h, i , NULL); -} - -int -heap_scan(struct dn_heap *h, int (*fn)(void *, uintptr_t), - uintptr_t arg) -{ - int i, ret, found; - - for (i = found = 0 ; i < h->elements ;) { - ret = fn(h->p[i].object, arg); - if (ret & HEAP_SCAN_DEL) { - h->elements-- ; - h->p[i] = h->p[h->elements] ; - found++ ; - } else - i++ ; - if (ret & HEAP_SCAN_END) - break; - } - if (found) - heapify(h); - return found; -} - -/* - * cleanup the heap and free data structure - */ -void -heap_free(struct dn_heap *h) -{ - if (h->size >0 ) - free(h->p, M_DN_HEAP); - bzero(h, sizeof(*h) ); -} - -/* - * hash table support. - */ - -struct dn_ht { - int buckets; /* how many buckets, really buckets - 1*/ - int entries; /* how many entries */ - int ofs; /* offset of link field */ - uint32_t (*hash)(uintptr_t, int, void *arg); - int (*match)(void *_el, uintptr_t key, int, void *); - void *(*newh)(uintptr_t, int, void *); - void **ht; /* bucket heads */ -}; -/* - * Initialize, allocating bucket pointers inline. - * Recycle previous record if possible. - * If the 'newh' function is not supplied, we assume that the - * key passed to ht_find is the same object to be stored in. - */ -struct dn_ht * -dn_ht_init(struct dn_ht *ht, int buckets, int ofs, - uint32_t (*h)(uintptr_t, int, void *), - int (*match)(void *, uintptr_t, int, void *), - void *(*newh)(uintptr_t, int, void *)) -{ - int l; - - /* - * Notes about rounding bucket size to a power of two. - * Given the original bucket size, we compute the nearest lower and - * higher power of two, minus 1 (respectively b_min and b_max) because - * this value will be used to do an AND with the index returned - * by hash function. - * To choice between these two values, the original bucket size is - * compared with b_min. If the original size is greater than 4/3 b_min, - * we round the bucket size to b_max, else to b_min. - * This ratio try to round to the nearest power of two, advantaging - * the greater size if the different between two power is relatively - * big. - * Rounding the bucket size to a power of two avoid the use of - * module when calculating the correct bucket. - * The ht->buckets variable store the bucket size - 1 to simply - * do an AND between the index returned by hash function and ht->bucket - * instead of a module. 
- */ - int b_min; /* min buckets */ - int b_max; /* max buckets */ - int b_ori; /* original buckets */ - - if (h == NULL || match == NULL) { - printf("--- missing hash or match function"); - return NULL; - } - if (buckets < 1 || buckets > 65536) - return NULL; - - b_ori = buckets; - /* calculate next power of 2, - 1*/ - buckets |= buckets >> 1; - buckets |= buckets >> 2; - buckets |= buckets >> 4; - buckets |= buckets >> 8; - buckets |= buckets >> 16; - - b_max = buckets; /* Next power */ - b_min = buckets >> 1; /* Previous power */ - - /* Calculate the 'nearest' bucket size */ - if (b_min * 4000 / 3000 < b_ori) - buckets = b_max; - else - buckets = b_min; - - if (ht) { /* see if we can reuse */ - if (buckets <= ht->buckets) { - ht->buckets = buckets; - } else { - /* free pointers if not allocated inline */ - if (ht->ht != (void *)(ht + 1)) - free(ht->ht, M_DN_HEAP); - free(ht, M_DN_HEAP); - ht = NULL; - } - } - if (ht == NULL) { - /* Allocate buckets + 1 entries because buckets is use to - * do the AND with the index returned by hash function - */ - l = sizeof(*ht) + (buckets + 1) * sizeof(void **); - ht = malloc(l, M_DN_HEAP, M_NOWAIT | M_ZERO); - } - if (ht) { - ht->ht = (void **)(ht + 1); - ht->buckets = buckets; - ht->ofs = ofs; - ht->hash = h; - ht->match = match; - ht->newh = newh; - } - return ht; -} - -/* dummy callback for dn_ht_free to unlink all */ -static int -do_del(void *obj, void *arg) -{ - return DNHT_SCAN_DEL; -} - -void -dn_ht_free(struct dn_ht *ht, int flags) -{ - if (ht == NULL) - return; - if (flags & DNHT_REMOVE) { - (void)dn_ht_scan(ht, do_del, NULL); - } else { - if (ht->ht && ht->ht != (void *)(ht + 1)) - free(ht->ht, M_DN_HEAP); - free(ht, M_DN_HEAP); - } -} - -int -dn_ht_entries(struct dn_ht *ht) -{ - return ht ? ht->entries : 0; -} - -/* lookup and optionally create or delete element */ -void * -dn_ht_find(struct dn_ht *ht, uintptr_t key, int flags, void *arg) -{ - int i; - void **pp, *p; - - if (ht == NULL) /* easy on an empty hash */ - return NULL; - i = (ht->buckets == 1) ? 0 : - (ht->hash(key, flags, arg) & ht->buckets); - - for (pp = &ht->ht[i]; (p = *pp); pp = (void **)((char *)p + ht->ofs)) { - if (flags & DNHT_MATCH_PTR) { - if (key == (uintptr_t)p) - break; - } else if (ht->match(p, key, flags, arg)) /* found match */ - break; - } - if (p) { - if (flags & DNHT_REMOVE) { - /* link in the next element */ - *pp = *(void **)((char *)p + ht->ofs); - *(void **)((char *)p + ht->ofs) = NULL; - ht->entries--; - } - } else if (flags & DNHT_INSERT) { - // printf("%s before calling new, bucket %d ofs %d\n", - // __FUNCTION__, i, ht->ofs); - p = ht->newh ? ht->newh(key, flags, arg) : (void *)key; - // printf("%s newh returns %p\n", __FUNCTION__, p); - if (p) { - ht->entries++; - *(void **)((char *)p + ht->ofs) = ht->ht[i]; - ht->ht[i] = p; - } - } - return p; -} - -/* - * do a scan with the option to delete the object. Extract next before - * running the callback because the element may be destroyed there. 
- */ -int -dn_ht_scan(struct dn_ht *ht, int (*fn)(void *, void *), void *arg) -{ - int i, ret, found = 0; - void **curp, *cur, *next; - - if (ht == NULL || fn == NULL) - return 0; - for (i = 0; i <= ht->buckets; i++) { - curp = &ht->ht[i]; - while ( (cur = *curp) != NULL) { - next = *(void **)((char *)cur + ht->ofs); - ret = fn(cur, arg); - if (ret & DNHT_SCAN_DEL) { - found++; - ht->entries--; - *curp = next; - } else { - curp = (void **)((char *)cur + ht->ofs); - } - if (ret & DNHT_SCAN_END) - return found; - } - } - return found; -} - -/* - * Similar to dn_ht_scan(), except that the scan is performed only - * in the bucket 'bucket'. The function returns a correct bucket number if - * the original is invalid. - * If the callback returns DNHT_SCAN_END, the function move the ht->ht[i] - * pointer to the last entry processed. Moreover, the bucket number passed - * by caller is decremented, because usually the caller increment it. - */ -int -dn_ht_scan_bucket(struct dn_ht *ht, int *bucket, int (*fn)(void *, void *), - void *arg) -{ - int i, ret, found = 0; - void **curp, *cur, *next; - - if (ht == NULL || fn == NULL) - return 0; - if (*bucket > ht->buckets) - *bucket = 0; - i = *bucket; - - curp = &ht->ht[i]; - while ( (cur = *curp) != NULL) { - next = *(void **)((char *)cur + ht->ofs); - ret = fn(cur, arg); - if (ret & DNHT_SCAN_DEL) { - found++; - ht->entries--; - *curp = next; - } else { - curp = (void **)((char *)cur + ht->ofs); - } - if (ret & DNHT_SCAN_END) - return found; - } - return found; -} diff --git a/sys/netinet/ipfw/dn_heap.h b/sys/netinet/ipfw/dn_heap.h deleted file mode 100644 index c95473a..0000000 --- a/sys/netinet/ipfw/dn_heap.h +++ /dev/null @@ -1,191 +0,0 @@ -/*- - * Copyright (c) 1998-2010 Luigi Rizzo, Universita` di Pisa - * All rights reserved - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -/* - * Binary heap and hash tables, header file - * - * $FreeBSD$ - */ - -#ifndef _IP_DN_HEAP_H -#define _IP_DN_HEAP_H - -#define DN_KEY_LT(a,b) ((int64_t)((a)-(b)) < 0) -#define DN_KEY_LEQ(a,b) ((int64_t)((a)-(b)) <= 0) - -/* - * This module implements a binary heap supporting random extraction. - * - * A heap entry contains an uint64_t key and a pointer to object. 
- * DN_KEY_LT(a,b) returns true if key 'a' is smaller than 'b' - * - * The heap is a struct dn_heap plus a dynamically allocated - * array of dn_heap_entry entries. 'size' represents the size of - * the array, 'elements' count entries in use. The topmost - * element has the smallest key. - * The heap supports ordered insert, and extract from the top. - * To extract an object from the middle of the heap, we the object - * must reserve an 'int32_t' to store the position of the object - * in the heap itself, and the location of this field must be - * passed as an argument to heap_init() -- use -1 if the feature - * is not used. - */ -struct dn_heap_entry { - uint64_t key; /* sorting key, smallest comes first */ - void *object; /* object pointer */ -}; - -struct dn_heap { - int size; /* the size of the array */ - int elements; /* elements in use */ - int ofs; /* offset in the object of heap index */ - struct dn_heap_entry *p; /* array of "size" entries */ -}; - -enum { - HEAP_SCAN_DEL = 1, - HEAP_SCAN_END = 2, -}; - -/* - * heap_init() reinitializes the heap setting the size and the offset - * of the index for random extraction (use -1 if not used). - * The 'elements' counter is set to 0. - * - * SET_HEAP_OFS() indicates where, in the object, is stored the index - * for random extractions from the heap. - * - * heap_free() frees the memory associated to a heap. - * - * heap_insert() adds a key-pointer pair to the heap - * - * HEAP_TOP() returns a pointer to the top element of the heap, - * but makes no checks on its existance (XXX should we change ?) - * - * heap_extract() removes the entry at the top, returing the pointer. - * (the key should have been read before). - * - * heap_scan() invokes a callback on each entry of the heap. - * The callback can return a combination of HEAP_SCAN_DEL and - * HEAP_SCAN_END. HEAP_SCAN_DEL means the current element must - * be removed, and HEAP_SCAN_END means to terminate the scan. - * heap_scan() returns the number of elements removed. - * Because the order is not guaranteed, we should use heap_scan() - * only as a last resort mechanism. - */ -#define HEAP_TOP(h) ((h)->p) -#define SET_HEAP_OFS(h, n) do { (h)->ofs = n; } while (0) -int heap_init(struct dn_heap *h, int size, int ofs); -int heap_insert(struct dn_heap *h, uint64_t key1, void *p); -void heap_extract(struct dn_heap *h, void *obj); -void heap_free(struct dn_heap *h); -int heap_scan(struct dn_heap *, int (*)(void *, uintptr_t), uintptr_t); - -/*------------------------------------------------------ - * This module implements a generic hash table with support for - * running callbacks on the entire table. To avoid allocating - * memory during hash table operations, objects must reserve - * space for a link field. XXX if the heap is moderately full, - * an SLIST suffices, and we can tolerate the cost of a hash - * computation on each removal. - * - * dn_ht_init() initializes the table, setting the number of - * buckets, the offset of the link field, the main callbacks. - * Callbacks are: - * - * hash(key, flags, arg) called to return a bucket index. - * match(obj, key, flags, arg) called to determine if key - * matches the current 'obj' in the heap - * newh(key, flags, arg) optional, used to allocate a new - * object during insertions. - * - * dn_ht_free() frees the heap or unlink elements. - * DNHT_REMOVE unlink elements, 0 frees the heap. - * You need two calls to do both. - * - * dn_ht_find() is the main lookup function, which can also be - * used to insert or delete elements in the hash table. 
- * The final 'arg' is passed to all callbacks. - * - * dn_ht_scan() is used to invoke a callback on all entries of - * the heap, or possibly on just one bucket. The callback - * is invoked with a pointer to the object, and must return - * one of DNHT_SCAN_DEL or DNHT_SCAN_END to request the - * removal of the object from the heap and the end of the - * scan, respectively. - * - * dn_ht_scan_bucket() is similar to dn_ht_scan(), except that it scans - * only the specific bucket of the table. The bucket is a in-out - * parameter and return a valid bucket number if the original - * is invalid. - * - * A combination of flags can be used to modify the operation - * of the dn_ht_find(), and of the callbacks: - * - * DNHT_KEY_IS_OBJ means the key is the object pointer. - * It is usally of interest for the hash and match functions. - * - * DNHT_MATCH_PTR during a lookup, match pointers instead - * of calling match(). Normally used when removing specific - * entries. Does not imply KEY_IS_OBJ as the latter _is_ used - * by the match function. - * - * DNHT_INSERT insert the element if not found. - * Calls new() to allocates a new object unless - * DNHT_KEY_IS_OBJ is set. - * - * DNHT_UNIQUE only insert if object not found. - * XXX should it imply DNHT_INSERT ? - * - * DNHT_REMOVE remove objects if we find them. - */ -struct dn_ht; /* should be opaque */ - -struct dn_ht *dn_ht_init(struct dn_ht *, int buckets, int ofs, - uint32_t (*hash)(uintptr_t, int, void *), - int (*match)(void *, uintptr_t, int, void *), - void *(*newh)(uintptr_t, int, void *)); -void dn_ht_free(struct dn_ht *, int flags); - -void *dn_ht_find(struct dn_ht *, uintptr_t, int, void *); -int dn_ht_scan(struct dn_ht *, int (*)(void *, void *), void *); -int dn_ht_scan_bucket(struct dn_ht *, int * , int (*)(void *, void *), void *); -int dn_ht_entries(struct dn_ht *); - -enum { /* flags values. - * first two are returned by the scan callback to indicate - * to delete the matching element or to end the scan - */ - DNHT_SCAN_DEL = 0x0001, - DNHT_SCAN_END = 0x0002, - DNHT_KEY_IS_OBJ = 0x0004, /* key is the obj pointer */ - DNHT_MATCH_PTR = 0x0008, /* match by pointer, not match() */ - DNHT_INSERT = 0x0010, /* insert if not found */ - DNHT_UNIQUE = 0x0020, /* report error if already there */ - DNHT_REMOVE = 0x0040, /* remove on find or dn_ht_free */ -}; - -#endif /* _IP_DN_HEAP_H */ diff --git a/sys/netinet/ipfw/dn_sched.h b/sys/netinet/ipfw/dn_sched.h deleted file mode 100644 index ab823fe..0000000 --- a/sys/netinet/ipfw/dn_sched.h +++ /dev/null @@ -1,191 +0,0 @@ -/* - * Copyright (c) 2010 Riccardo Panicucci, Luigi Rizzo, Universita` di Pisa - * All rights reserved - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -/* - * The API to write a packet scheduling algorithm for dummynet. - * - * $FreeBSD$ - */ - -#ifndef _DN_SCHED_H -#define _DN_SCHED_H - -#define DN_MULTIQUEUE 0x01 -/* - * Descriptor for a scheduling algorithm. - * Contains all function pointers for a given scheduler - * This is typically created when a module is loaded, and stored - * in a global list of schedulers. - */ -struct dn_alg { - uint32_t type; /* the scheduler type */ - const char *name; /* scheduler name */ - uint32_t flags; /* DN_MULTIQUEUE if supports multiple queues */ - - /* - * The following define the size of 3 optional data structures - * that may need to be allocated at runtime, and are appended - * to each of the base data structures: scheduler, sched.inst, - * and queue. We don't have a per-flowset structure. - */ - /* + parameters attached to the template, e.g. - * default queue sizes, weights, quantum size, and so on; - */ - size_t schk_datalen; - - /* + per-instance parameters, such as timestamps, - * containers for queues, etc; - */ - size_t si_datalen; - - size_t q_datalen; /* per-queue parameters (e.g. S,F) */ - - /* - * Methods implemented by the scheduler: - * enqueue enqueue packet 'm' on scheduler 's', queue 'q'. - * q is NULL for !MULTIQUEUE. - * Return 0 on success, 1 on drop (packet consumed anyways). - * Note that q should be interpreted only as a hint - * on the flow that the mbuf belongs to: while a - * scheduler will normally enqueue m into q, it is ok - * to leave q alone and put the mbuf elsewhere. - * This function is called in two cases: - * - when a new packet arrives to the scheduler; - * - when a scheduler is reconfigured. In this case the - * call is issued by the new_queue callback, with a - * non empty queue (q) and m pointing to the first - * mbuf in the queue. For this reason, the function - * should internally check for (m != q->mq.head) - * before calling dn_enqueue(). - * - * dequeue Called when scheduler instance 's' can - * dequeue a packet. Return NULL if none are available. - * XXX what about non work-conserving ? - * - * config called on 'sched X config ...', normally writes - * in the area of size sch_arg - * - * destroy called on 'sched delete', frees everything - * in sch_arg (other parts are handled by more specific - * functions) - * - * new_sched called when a new instance is created, e.g. - * to create the local queue for !MULTIQUEUE, set V or - * copy parameters for WFQ, and so on. - * - * free_sched called when deleting an instance, cleans - * extra data in the per-instance area. - * - * new_fsk called when a flowset is linked to a scheduler, - * e.g. to validate parameters such as weights etc. - * free_fsk when a flowset is unlinked from a scheduler. - * (probably unnecessary) - * - * new_queue called to set the per-queue parameters, - * e.g. S and F, adjust sum of weights in the parent, etc. - * - * The new_queue callback is normally called from when - * creating a new queue. 
In some cases (such as a - * scheduler change or reconfiguration) it can be called - * with a non empty queue. In this case, the queue - * In case of non empty queue, the new_queue callback could - * need to call the enqueue function. In this case, - * the callback should eventually call enqueue() passing - * as m the first element in the queue. - * - * free_queue actions related to a queue removal, e.g. undo - * all the above. If the queue has data in it, also remove - * from the scheduler. This can e.g. happen during a reconfigure. - */ - int (*enqueue)(struct dn_sch_inst *, struct dn_queue *, - struct mbuf *); - struct mbuf * (*dequeue)(struct dn_sch_inst *); - - int (*config)(struct dn_schk *); - int (*destroy)(struct dn_schk*); - int (*new_sched)(struct dn_sch_inst *); - int (*free_sched)(struct dn_sch_inst *); - int (*new_fsk)(struct dn_fsk *f); - int (*free_fsk)(struct dn_fsk *f); - int (*new_queue)(struct dn_queue *q); - int (*free_queue)(struct dn_queue *q); - - /* run-time fields */ - int ref_count; /* XXX number of instances in the system */ - SLIST_ENTRY(dn_alg) next; /* Next scheduler in the list */ -}; - -/* MSVC does not support initializers so we need this ugly macro */ -#ifdef _WIN32 -#define _SI(fld) -#else -#define _SI(fld) fld -#endif - -/* - * Additionally, dummynet exports some functions and macros - * to be used by schedulers: - */ - -void dn_free_pkts(struct mbuf *mnext); -int dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop); -/* bound a variable between min and max */ -int ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg); - -/* - * Extract the head of a queue, update stats. Must be the very last - * thing done on a dequeue as the queue itself may go away. - */ -static __inline struct mbuf* -dn_dequeue(struct dn_queue *q) -{ - struct mbuf *m = q->mq.head; - if (m == NULL) - return NULL; - q->mq.head = m->m_nextpkt; - - /* Update stats for the queue */ - q->ni.length--; - q->ni.len_bytes -= m->m_pkthdr.len; - if (q->_si) { - q->_si->ni.length--; - q->_si->ni.len_bytes -= m->m_pkthdr.len; - } - if (q->ni.length == 0) /* queue is now idle */ - q->q_time = dn_cfg.curr_time; - return m; -} - -int dn_sched_modevent(module_t mod, int cmd, void *arg); - -#define DECLARE_DNSCHED_MODULE(name, dnsched) \ - static moduledata_t name##_mod = { \ - #name, dn_sched_modevent, dnsched \ - }; \ - DECLARE_MODULE(name, name##_mod, \ - SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); \ - MODULE_DEPEND(name, dummynet, 3, 3, 3); -#endif /* _DN_SCHED_H */ diff --git a/sys/netinet/ipfw/dn_sched_fifo.c b/sys/netinet/ipfw/dn_sched_fifo.c deleted file mode 100644 index 0bb3800..0000000 --- a/sys/netinet/ipfw/dn_sched_fifo.c +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa - * All rights reserved - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -/* - * $FreeBSD$ - */ - -#ifdef _KERNEL -#include <sys/malloc.h> -#include <sys/socket.h> -#include <sys/socketvar.h> -#include <sys/kernel.h> -#include <sys/mbuf.h> -#include <sys/module.h> -#include <net/if.h> /* IFNAMSIZ */ -#include <netinet/in.h> -#include <netinet/ip_var.h> /* ipfw_rule_ref */ -#include <netinet/ip_fw.h> /* flow_id */ -#include <netinet/ip_dummynet.h> -#include <netinet/ipfw/dn_heap.h> -#include <netinet/ipfw/ip_dn_private.h> -#include <netinet/ipfw/dn_sched.h> -#else -#include <dn_test.h> -#endif - -/* - * This file implements a FIFO scheduler for a single queue. - * The queue is allocated as part of the scheduler instance, - * and there is a single flowset is in the template which stores - * queue size and policy. - * Enqueue and dequeue use the default library functions. - */ -static int -fifo_enqueue(struct dn_sch_inst *si, struct dn_queue *q, struct mbuf *m) -{ - /* XXX if called with q != NULL and m=NULL, this is a - * re-enqueue from an existing scheduler, which we should - * handle. - */ - return dn_enqueue((struct dn_queue *)(si+1), m, 0); -} - -static struct mbuf * -fifo_dequeue(struct dn_sch_inst *si) -{ - return dn_dequeue((struct dn_queue *)(si + 1)); -} - -static int -fifo_new_sched(struct dn_sch_inst *si) -{ - /* This scheduler instance contains the queue */ - struct dn_queue *q = (struct dn_queue *)(si + 1); - - set_oid(&q->ni.oid, DN_QUEUE, sizeof(*q)); - q->_si = si; - q->fs = si->sched->fs; - return 0; -} - -static int -fifo_free_sched(struct dn_sch_inst *si) -{ - struct dn_queue *q = (struct dn_queue *)(si + 1); - dn_free_pkts(q->mq.head); - bzero(q, sizeof(*q)); - return 0; -} - -/* - * FIFO scheduler descriptor - * contains the type of the scheduler, the name, the size of extra - * data structures, and function pointers. 
- */ -static struct dn_alg fifo_desc = { - _SI( .type = ) DN_SCHED_FIFO, - _SI( .name = ) "FIFO", - _SI( .flags = ) 0, - - _SI( .schk_datalen = ) 0, - _SI( .si_datalen = ) sizeof(struct dn_queue), - _SI( .q_datalen = ) 0, - - _SI( .enqueue = ) fifo_enqueue, - _SI( .dequeue = ) fifo_dequeue, - _SI( .config = ) NULL, - _SI( .destroy = ) NULL, - _SI( .new_sched = ) fifo_new_sched, - _SI( .free_sched = ) fifo_free_sched, - _SI( .new_fsk = ) NULL, - _SI( .free_fsk = ) NULL, - _SI( .new_queue = ) NULL, - _SI( .free_queue = ) NULL, -}; - -DECLARE_DNSCHED_MODULE(dn_fifo, &fifo_desc); diff --git a/sys/netinet/ipfw/dn_sched_prio.c b/sys/netinet/ipfw/dn_sched_prio.c deleted file mode 100644 index 28f6006..0000000 --- a/sys/netinet/ipfw/dn_sched_prio.c +++ /dev/null @@ -1,229 +0,0 @@ -/* - * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa - * All rights reserved - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -/* - * $FreeBSD$ - */ -#ifdef _KERNEL -#include <sys/malloc.h> -#include <sys/socket.h> -#include <sys/socketvar.h> -#include <sys/kernel.h> -#include <sys/mbuf.h> -#include <sys/module.h> -#include <net/if.h> /* IFNAMSIZ */ -#include <netinet/in.h> -#include <netinet/ip_var.h> /* ipfw_rule_ref */ -#include <netinet/ip_fw.h> /* flow_id */ -#include <netinet/ip_dummynet.h> -#include <netinet/ipfw/dn_heap.h> -#include <netinet/ipfw/ip_dn_private.h> -#include <netinet/ipfw/dn_sched.h> -#else -#include <dn_test.h> -#endif - -#define DN_SCHED_PRIO 5 //XXX - -#if !defined(_KERNEL) || !defined(__linux__) -#define test_bit(ix, pData) ((*pData) & (1<<(ix))) -#define __set_bit(ix, pData) (*pData) |= (1<<(ix)) -#define __clear_bit(ix, pData) (*pData) &= ~(1<<(ix)) -#endif - -#ifdef __MIPSEL__ -#define __clear_bit(ix, pData) (*pData) &= ~(1<<(ix)) -#endif - -/* Size of the array of queues pointers. */ -#define BITMAP_T unsigned long -#define MAXPRIO (sizeof(BITMAP_T) * 8) - -/* - * The scheduler instance contains an array of pointers to queues, - * one for each priority, and a bitmap listing backlogged queues. - */ -struct prio_si { - BITMAP_T bitmap; /* array bitmap */ - struct dn_queue *q_array[MAXPRIO]; /* Array of queues pointers */ -}; - -/* - * If a queue with the same priority is already backlogged, use - * that one instead of the queue passed as argument. 
- */ -static int -prio_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m) -{ - struct prio_si *si = (struct prio_si *)(_si + 1); - int prio = q->fs->fs.par[0]; - - if (test_bit(prio, &si->bitmap) == 0) { - /* No queue with this priority, insert */ - __set_bit(prio, &si->bitmap); - si->q_array[prio] = q; - } else { /* use the existing queue */ - q = si->q_array[prio]; - } - if (dn_enqueue(q, m, 0)) - return 1; - return 0; -} - -/* - * Packets are dequeued only from the highest priority queue. - * The function ffs() return the lowest bit in the bitmap that rapresent - * the array index (-1) which contains the pointer to the highest priority - * queue. - * After the dequeue, if this queue become empty, it is index is removed - * from the bitmap. - * Scheduler is idle if the bitmap is empty - * - * NOTE: highest priority is 0, lowest is sched->max_prio_q - */ -static struct mbuf * -prio_dequeue(struct dn_sch_inst *_si) -{ - struct prio_si *si = (struct prio_si *)(_si + 1); - struct mbuf *m; - struct dn_queue *q; - int prio; - - if (si->bitmap == 0) /* scheduler idle */ - return NULL; - - prio = ffs(si->bitmap) - 1; - - /* Take the highest priority queue in the scheduler */ - q = si->q_array[prio]; - // assert(q) - - m = dn_dequeue(q); - if (q->mq.head == NULL) { - /* Queue is now empty, remove from scheduler - * and mark it - */ - si->q_array[prio] = NULL; - __clear_bit(prio, &si->bitmap); - } - return m; -} - -static int -prio_new_sched(struct dn_sch_inst *_si) -{ - struct prio_si *si = (struct prio_si *)(_si + 1); - - bzero(si->q_array, sizeof(si->q_array)); - si->bitmap = 0; - - return 0; -} - -static int -prio_new_fsk(struct dn_fsk *fs) -{ - /* Check if the prioritiy is between 0 and MAXPRIO-1 */ - ipdn_bound_var(&fs->fs.par[0], 0, 0, MAXPRIO - 1, "PRIO priority"); - return 0; -} - -static int -prio_new_queue(struct dn_queue *q) -{ - struct prio_si *si = (struct prio_si *)(q->_si + 1); - int prio = q->fs->fs.par[0]; - struct dn_queue *oldq; - - q->ni.oid.subtype = DN_SCHED_PRIO; - - if (q->mq.head == NULL) - return 0; - - /* Queue already full, must insert in the scheduler or append - * mbufs to existing queue. This partly duplicates prio_enqueue - */ - if (test_bit(prio, &si->bitmap) == 0) { - /* No queue with this priority, insert */ - __set_bit(prio, &si->bitmap); - si->q_array[prio] = q; - } else if ( (oldq = si->q_array[prio]) != q) { - /* must append to the existing queue. - * can simply append q->mq.head to q2->... 
- * and add the counters to those of q2 - */ - oldq->mq.tail->m_nextpkt = q->mq.head; - oldq->mq.tail = q->mq.tail; - oldq->ni.length += q->ni.length; - q->ni.length = 0; - oldq->ni.len_bytes += q->ni.len_bytes; - q->ni.len_bytes = 0; - q->mq.tail = q->mq.head = NULL; - } - return 0; -} - -static int -prio_free_queue(struct dn_queue *q) -{ - int prio = q->fs->fs.par[0]; - struct prio_si *si = (struct prio_si *)(q->_si + 1); - - if (si->q_array[prio] == q) { - si->q_array[prio] = NULL; - __clear_bit(prio, &si->bitmap); - } - return 0; -} - - -static struct dn_alg prio_desc = { - _SI( .type = ) DN_SCHED_PRIO, - _SI( .name = ) "PRIO", - _SI( .flags = ) DN_MULTIQUEUE, - - /* we need extra space in the si and the queue */ - _SI( .schk_datalen = ) 0, - _SI( .si_datalen = ) sizeof(struct prio_si), - _SI( .q_datalen = ) 0, - - _SI( .enqueue = ) prio_enqueue, - _SI( .dequeue = ) prio_dequeue, - - _SI( .config = ) NULL, - _SI( .destroy = ) NULL, - _SI( .new_sched = ) prio_new_sched, - _SI( .free_sched = ) NULL, - - _SI( .new_fsk = ) prio_new_fsk, - _SI( .free_fsk = ) NULL, - - _SI( .new_queue = ) prio_new_queue, - _SI( .free_queue = ) prio_free_queue, -}; - - -DECLARE_DNSCHED_MODULE(dn_prio, &prio_desc); diff --git a/sys/netinet/ipfw/dn_sched_qfq.c b/sys/netinet/ipfw/dn_sched_qfq.c deleted file mode 100644 index be7fba3..0000000 --- a/sys/netinet/ipfw/dn_sched_qfq.c +++ /dev/null @@ -1,864 +0,0 @@ -/* - * Copyright (c) 2010 Fabio Checconi, Luigi Rizzo, Paolo Valente - * All rights reserved - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -/* - * $FreeBSD$ - */ - -#ifdef _KERNEL -#include <sys/malloc.h> -#include <sys/socket.h> -#include <sys/socketvar.h> -#include <sys/kernel.h> -#include <sys/mbuf.h> -#include <sys/module.h> -#include <net/if.h> /* IFNAMSIZ */ -#include <netinet/in.h> -#include <netinet/ip_var.h> /* ipfw_rule_ref */ -#include <netinet/ip_fw.h> /* flow_id */ -#include <netinet/ip_dummynet.h> -#include <netinet/ipfw/dn_heap.h> -#include <netinet/ipfw/ip_dn_private.h> -#include <netinet/ipfw/dn_sched.h> -#else -#include <dn_test.h> -#endif - -#ifdef QFQ_DEBUG -struct qfq_sched; -static void dump_sched(struct qfq_sched *q, const char *msg); -#define NO(x) x -#else -#define NO(x) -#endif -#define DN_SCHED_QFQ 4 // XXX Where? 
-typedef unsigned long bitmap; - -/* - * bitmaps ops are critical. Some linux versions have __fls - * and the bitmap ops. Some machines have ffs - */ -#if defined(_WIN32) || (defined(__MIPSEL__) && defined(LINUX_24)) -int fls(unsigned int n) -{ - int i = 0; - for (i = 0; n > 0; n >>= 1, i++) - ; - return i; -} -#endif - -#if !defined(_KERNEL) || defined( __FreeBSD__ ) || defined(_WIN32) || (defined(__MIPSEL__) && defined(LINUX_24)) -static inline unsigned long __fls(unsigned long word) -{ - return fls(word) - 1; -} -#endif - -#if !defined(_KERNEL) || !defined(__linux__) -#ifdef QFQ_DEBUG -int test_bit(int ix, bitmap *p) -{ - if (ix < 0 || ix > 31) - D("bad index %d", ix); - return *p & (1<<ix); -} -void __set_bit(int ix, bitmap *p) -{ - if (ix < 0 || ix > 31) - D("bad index %d", ix); - *p |= (1<<ix); -} -void __clear_bit(int ix, bitmap *p) -{ - if (ix < 0 || ix > 31) - D("bad index %d", ix); - *p &= ~(1<<ix); -} -#else /* !QFQ_DEBUG */ -/* XXX do we have fast version, or leave it to the compiler ? */ -#define test_bit(ix, pData) ((*pData) & (1<<(ix))) -#define __set_bit(ix, pData) (*pData) |= (1<<(ix)) -#define __clear_bit(ix, pData) (*pData) &= ~(1<<(ix)) -#endif /* !QFQ_DEBUG */ -#endif /* !__linux__ */ - -#ifdef __MIPSEL__ -#define __clear_bit(ix, pData) (*pData) &= ~(1<<(ix)) -#endif - -/*-------------------------------------------*/ -/* - -Virtual time computations. - -S, F and V are all computed in fixed point arithmetic with -FRAC_BITS decimal bits. - - QFQ_MAX_INDEX is the maximum index allowed for a group. We need - one bit per index. - QFQ_MAX_WSHIFT is the maximum power of two supported as a weight. - The layout of the bits is as below: - - [ MTU_SHIFT ][ FRAC_BITS ] - [ MAX_INDEX ][ MIN_SLOT_SHIFT ] - ^.__grp->index = 0 - *.__grp->slot_shift - - where MIN_SLOT_SHIFT is derived by difference from the others. - -The max group index corresponds to Lmax/w_min, where -Lmax=1<<MTU_SHIFT, w_min = 1 . -From this, and knowing how many groups (MAX_INDEX) we want, -we can derive the shift corresponding to each group. - -Because we often need to compute - F = S + len/w_i and V = V + len/wsum -instead of storing w_i store the value - inv_w = (1<<FRAC_BITS)/w_i -so we can do F = S + len * inv_w * wsum. -We use W_TOT in the formulas so we can easily move between -static and adaptive weight sum. - -The per-scheduler-instance data contain all the data structures -for the scheduler: bitmaps and bucket lists. - - */ -/* - * Maximum number of consecutive slots occupied by backlogged classes - * inside a group. This is approx lmax/lmin + 5. - * XXX check because it poses constraints on MAX_INDEX - */ -#define QFQ_MAX_SLOTS 32 -/* - * Shifts used for class<->group mapping. Class weights are - * in the range [1, QFQ_MAX_WEIGHT], we to map each class i to the - * group with the smallest index that can support the L_i / r_i - * configured for the class. - * - * grp->index is the index of the group; and grp->slot_shift - * is the shift for the corresponding (scaled) sigma_i. - * - * When computing the group index, we do (len<<FP_SHIFT)/weight, - * then compute an FLS (which is like a log2()), and if the result - * is below the MAX_INDEX region we use 0 (which is the same as - * using a larger len). 
- */ -#define QFQ_MAX_INDEX 19 -#define QFQ_MAX_WSHIFT 16 /* log2(max_weight) */ - -#define QFQ_MAX_WEIGHT (1<<QFQ_MAX_WSHIFT) -#define QFQ_MAX_WSUM (2*QFQ_MAX_WEIGHT) -//#define IWSUM (q->i_wsum) -#define IWSUM ((1<<FRAC_BITS)/QFQ_MAX_WSUM) - -#define FRAC_BITS 30 /* fixed point arithmetic */ -#define ONE_FP (1UL << FRAC_BITS) - -#define QFQ_MTU_SHIFT 11 /* log2(max_len) */ -#define QFQ_MIN_SLOT_SHIFT (FRAC_BITS + QFQ_MTU_SHIFT - QFQ_MAX_INDEX) - -/* - * Possible group states, also indexes for the bitmaps array in - * struct qfq_queue. We rely on ER, IR, EB, IB being numbered 0..3 - */ -enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE }; - -struct qfq_group; -/* - * additional queue info. Some of this info should come from - * the flowset, we copy them here for faster processing. - * This is an overlay of the struct dn_queue - */ -struct qfq_class { - struct dn_queue _q; - uint64_t S, F; /* flow timestamps (exact) */ - struct qfq_class *next; /* Link for the slot list. */ - - /* group we belong to. In principle we would need the index, - * which is log_2(lmax/weight), but we never reference it - * directly, only the group. - */ - struct qfq_group *grp; - - /* these are copied from the flowset. */ - uint32_t inv_w; /* ONE_FP/weight */ - uint32_t lmax; /* Max packet size for this flow. */ -}; - -/* Group descriptor, see the paper for details. - * Basically this contains the bucket lists - */ -struct qfq_group { - uint64_t S, F; /* group timestamps (approx). */ - unsigned int slot_shift; /* Slot shift. */ - unsigned int index; /* Group index. */ - unsigned int front; /* Index of the front slot. */ - bitmap full_slots; /* non-empty slots */ - - /* Array of lists of active classes. */ - struct qfq_class *slots[QFQ_MAX_SLOTS]; -}; - -/* scheduler instance descriptor. */ -struct qfq_sched { - uint64_t V; /* Precise virtual time. */ - uint32_t wsum; /* weight sum */ - NO(uint32_t i_wsum; /* ONE_FP/w_sum */ - uint32_t _queued; /* debugging */ - uint32_t loops; /* debugging */) - bitmap bitmaps[QFQ_MAX_STATE]; /* Group bitmaps. */ - struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */ -}; - -/*---- support functions ----------------------------*/ - -/* Generic comparison function, handling wraparound. */ -static inline int qfq_gt(uint64_t a, uint64_t b) -{ - return (int64_t)(a - b) > 0; -} - -/* Round a precise timestamp to its slotted value. */ -static inline uint64_t qfq_round_down(uint64_t ts, unsigned int shift) -{ - return ts & ~((1ULL << shift) - 1); -} - -/* return the pointer to the group with lowest index in the bitmap */ -static inline struct qfq_group *qfq_ffs(struct qfq_sched *q, - unsigned long bitmap) -{ - int index = ffs(bitmap) - 1; // zero-based - return &q->groups[index]; -} - -/* - * Calculate a flow index, given its weight and maximum packet length. - * index = log_2(maxlen/weight) but we need to apply the scaling. - * This is used only once at flow creation. 
- */ -static int qfq_calc_index(uint32_t inv_w, unsigned int maxlen) -{ - uint64_t slot_size = (uint64_t)maxlen *inv_w; - unsigned long size_map; - int index = 0; - - size_map = (unsigned long)(slot_size >> QFQ_MIN_SLOT_SHIFT); - if (!size_map) - goto out; - - index = __fls(size_map) + 1; // basically a log_2() - index -= !(slot_size - (1ULL << (index + QFQ_MIN_SLOT_SHIFT - 1))); - - if (index < 0) - index = 0; - -out: - ND("W = %d, L = %d, I = %d\n", ONE_FP/inv_w, maxlen, index); - return index; -} -/*---- end support functions ----*/ - -/*-------- API calls --------------------------------*/ -/* - * Validate and copy parameters from flowset. - */ -static int -qfq_new_queue(struct dn_queue *_q) -{ - struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1); - struct qfq_class *cl = (struct qfq_class *)_q; - int i; - uint32_t w; /* approximated weight */ - - /* import parameters from the flowset. They should be correct - * already. - */ - w = _q->fs->fs.par[0]; - cl->lmax = _q->fs->fs.par[1]; - if (!w || w > QFQ_MAX_WEIGHT) { - w = 1; - D("rounding weight to 1"); - } - cl->inv_w = ONE_FP/w; - w = ONE_FP/cl->inv_w; - if (q->wsum + w > QFQ_MAX_WSUM) - return EINVAL; - - i = qfq_calc_index(cl->inv_w, cl->lmax); - cl->grp = &q->groups[i]; - q->wsum += w; - // XXX cl->S = q->V; ? - // XXX compute q->i_wsum - return 0; -} - -/* remove an empty queue */ -static int -qfq_free_queue(struct dn_queue *_q) -{ - struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1); - struct qfq_class *cl = (struct qfq_class *)_q; - if (cl->inv_w) { - q->wsum -= ONE_FP/cl->inv_w; - cl->inv_w = 0; /* reset weight to avoid run twice */ - } - return 0; -} - -/* Calculate a mask to mimic what would be ffs_from(). */ -static inline unsigned long -mask_from(unsigned long bitmap, int from) -{ - return bitmap & ~((1UL << from) - 1); -} - -/* - * The state computation relies on ER=0, IR=1, EB=2, IB=3 - * First compute eligibility comparing grp->S, q->V, - * then check if someone is blocking us and possibly add EB - */ -static inline unsigned int -qfq_calc_state(struct qfq_sched *q, struct qfq_group *grp) -{ - /* if S > V we are not eligible */ - unsigned int state = qfq_gt(grp->S, q->V); - unsigned long mask = mask_from(q->bitmaps[ER], grp->index); - struct qfq_group *next; - - if (mask) { - next = qfq_ffs(q, mask); - if (qfq_gt(grp->F, next->F)) - state |= EB; - } - - return state; -} - -/* - * In principle - * q->bitmaps[dst] |= q->bitmaps[src] & mask; - * q->bitmaps[src] &= ~mask; - * but we should make sure that src != dst - */ -static inline void -qfq_move_groups(struct qfq_sched *q, unsigned long mask, int src, int dst) -{ - q->bitmaps[dst] |= q->bitmaps[src] & mask; - q->bitmaps[src] &= ~mask; -} - -static inline void -qfq_unblock_groups(struct qfq_sched *q, int index, uint64_t old_finish) -{ - unsigned long mask = mask_from(q->bitmaps[ER], index + 1); - struct qfq_group *next; - - if (mask) { - next = qfq_ffs(q, mask); - if (!qfq_gt(next->F, old_finish)) - return; - } - - mask = (1UL << index) - 1; - qfq_move_groups(q, mask, EB, ER); - qfq_move_groups(q, mask, IB, IR); -} - -/* - * perhaps - * - old_V ^= q->V; - old_V >>= QFQ_MIN_SLOT_SHIFT; - if (old_V) { - ... 
- } - * - */ -static inline void -qfq_make_eligible(struct qfq_sched *q, uint64_t old_V) -{ - unsigned long mask, vslot, old_vslot; - - vslot = q->V >> QFQ_MIN_SLOT_SHIFT; - old_vslot = old_V >> QFQ_MIN_SLOT_SHIFT; - - if (vslot != old_vslot) { - mask = (2UL << (__fls(vslot ^ old_vslot))) - 1; - qfq_move_groups(q, mask, IR, ER); - qfq_move_groups(q, mask, IB, EB); - } -} - -/* - * XXX we should make sure that slot becomes less than 32. - * This is guaranteed by the input values. - * roundedS is always cl->S rounded on grp->slot_shift bits. - */ -static inline void -qfq_slot_insert(struct qfq_group *grp, struct qfq_class *cl, uint64_t roundedS) -{ - uint64_t slot = (roundedS - grp->S) >> grp->slot_shift; - unsigned int i = (grp->front + slot) % QFQ_MAX_SLOTS; - - cl->next = grp->slots[i]; - grp->slots[i] = cl; - __set_bit(slot, &grp->full_slots); -} - -/* - * remove the entry from the slot - */ -static inline void -qfq_front_slot_remove(struct qfq_group *grp) -{ - struct qfq_class **h = &grp->slots[grp->front]; - - *h = (*h)->next; - if (!*h) - __clear_bit(0, &grp->full_slots); -} - -/* - * Returns the first full queue in a group. As a side effect, - * adjust the bucket list so the first non-empty bucket is at - * position 0 in full_slots. - */ -static inline struct qfq_class * -qfq_slot_scan(struct qfq_group *grp) -{ - int i; - - ND("grp %d full %x", grp->index, grp->full_slots); - if (!grp->full_slots) - return NULL; - - i = ffs(grp->full_slots) - 1; // zero-based - if (i > 0) { - grp->front = (grp->front + i) % QFQ_MAX_SLOTS; - grp->full_slots >>= i; - } - - return grp->slots[grp->front]; -} - -/* - * adjust the bucket list. When the start time of a group decreases, - * we move the index down (modulo QFQ_MAX_SLOTS) so we don't need to - * move the objects. The mask of occupied slots must be shifted - * because we use ffs() to find the first non-empty slot. - * This covers decreases in the group's start time, but what about - * increases of the start time ? - * Here too we should make sure that i is less than 32 - */ -static inline void -qfq_slot_rotate(struct qfq_sched *q, struct qfq_group *grp, uint64_t roundedS) -{ - unsigned int i = (grp->S - roundedS) >> grp->slot_shift; - - grp->full_slots <<= i; - grp->front = (grp->front - i) % QFQ_MAX_SLOTS; -} - - -static inline void -qfq_update_eligible(struct qfq_sched *q, uint64_t old_V) -{ - bitmap ineligible; - - ineligible = q->bitmaps[IR] | q->bitmaps[IB]; - if (ineligible) { - if (!q->bitmaps[ER]) { - struct qfq_group *grp; - grp = qfq_ffs(q, ineligible); - if (qfq_gt(grp->S, q->V)) - q->V = grp->S; - } - qfq_make_eligible(q, old_V); - } -} - -/* - * Updates the class, returns true if also the group needs to be updated. 
- */ -static inline int -qfq_update_class(struct qfq_sched *q, struct qfq_group *grp, - struct qfq_class *cl) -{ - - cl->S = cl->F; - if (cl->_q.mq.head == NULL) { - qfq_front_slot_remove(grp); - } else { - unsigned int len; - uint64_t roundedS; - - len = cl->_q.mq.head->m_pkthdr.len; - cl->F = cl->S + (uint64_t)len * cl->inv_w; - roundedS = qfq_round_down(cl->S, grp->slot_shift); - if (roundedS == grp->S) - return 0; - - qfq_front_slot_remove(grp); - qfq_slot_insert(grp, cl, roundedS); - } - return 1; -} - -static struct mbuf * -qfq_dequeue(struct dn_sch_inst *si) -{ - struct qfq_sched *q = (struct qfq_sched *)(si + 1); - struct qfq_group *grp; - struct qfq_class *cl; - struct mbuf *m; - uint64_t old_V; - - NO(q->loops++;) - if (!q->bitmaps[ER]) { - NO(if (q->queued) - dump_sched(q, "start dequeue");) - return NULL; - } - - grp = qfq_ffs(q, q->bitmaps[ER]); - - cl = grp->slots[grp->front]; - /* extract from the first bucket in the bucket list */ - m = dn_dequeue(&cl->_q); - - if (!m) { - D("BUG/* non-workconserving leaf */"); - return NULL; - } - NO(q->queued--;) - old_V = q->V; - q->V += (uint64_t)m->m_pkthdr.len * IWSUM; - ND("m is %p F 0x%llx V now 0x%llx", m, cl->F, q->V); - - if (qfq_update_class(q, grp, cl)) { - uint64_t old_F = grp->F; - cl = qfq_slot_scan(grp); - if (!cl) { /* group gone, remove from ER */ - __clear_bit(grp->index, &q->bitmaps[ER]); - // grp->S = grp->F + 1; // XXX debugging only - } else { - uint64_t roundedS = qfq_round_down(cl->S, grp->slot_shift); - unsigned int s; - - if (grp->S == roundedS) - goto skip_unblock; - grp->S = roundedS; - grp->F = roundedS + (2ULL << grp->slot_shift); - /* remove from ER and put in the new set */ - __clear_bit(grp->index, &q->bitmaps[ER]); - s = qfq_calc_state(q, grp); - __set_bit(grp->index, &q->bitmaps[s]); - } - /* we need to unblock even if the group has gone away */ - qfq_unblock_groups(q, grp->index, old_F); - } - -skip_unblock: - qfq_update_eligible(q, old_V); - NO(if (!q->bitmaps[ER] && q->queued) - dump_sched(q, "end dequeue");) - - return m; -} - -/* - * Assign a reasonable start time for a new flow k in group i. - * Admissible values for \hat(F) are multiples of \sigma_i - * no greater than V+\sigma_i . Larger values mean that - * we had a wraparound so we consider the timestamp to be stale. - * - * If F is not stale and F >= V then we set S = F. - * Otherwise we should assign S = V, but this may violate - * the ordering in ER. So, if we have groups in ER, set S to - * the F_j of the first group j which would be blocking us. - * We are guaranteed not to move S backward because - * otherwise our group i would still be blocked. 
- */ -static inline void -qfq_update_start(struct qfq_sched *q, struct qfq_class *cl) -{ - unsigned long mask; - uint64_t limit, roundedF; - int slot_shift = cl->grp->slot_shift; - - roundedF = qfq_round_down(cl->F, slot_shift); - limit = qfq_round_down(q->V, slot_shift) + (1UL << slot_shift); - - if (!qfq_gt(cl->F, q->V) || qfq_gt(roundedF, limit)) { - /* timestamp was stale */ - mask = mask_from(q->bitmaps[ER], cl->grp->index); - if (mask) { - struct qfq_group *next = qfq_ffs(q, mask); - if (qfq_gt(roundedF, next->F)) { - cl->S = next->F; - return; - } - } - cl->S = q->V; - } else { /* timestamp is not stale */ - cl->S = cl->F; - } -} - -static int -qfq_enqueue(struct dn_sch_inst *si, struct dn_queue *_q, struct mbuf *m) -{ - struct qfq_sched *q = (struct qfq_sched *)(si + 1); - struct qfq_group *grp; - struct qfq_class *cl = (struct qfq_class *)_q; - uint64_t roundedS; - int s; - - NO(q->loops++;) - DX(4, "len %d flow %p inv_w 0x%x grp %d", m->m_pkthdr.len, - _q, cl->inv_w, cl->grp->index); - /* XXX verify that the packet obeys the parameters */ - if (m != _q->mq.head) { - if (dn_enqueue(_q, m, 0)) /* packet was dropped */ - return 1; - NO(q->queued++;) - if (m != _q->mq.head) - return 0; - } - /* If reach this point, queue q was idle */ - grp = cl->grp; - qfq_update_start(q, cl); /* adjust start time */ - /* compute new finish time and rounded start. */ - cl->F = cl->S + (uint64_t)(m->m_pkthdr.len) * cl->inv_w; - roundedS = qfq_round_down(cl->S, grp->slot_shift); - - /* - * insert cl in the correct bucket. - * If cl->S >= grp->S we don't need to adjust the - * bucket list and simply go to the insertion phase. - * Otherwise grp->S is decreasing, we must make room - * in the bucket list, and also recompute the group state. - * Finally, if there were no flows in this group and nobody - * was in ER make sure to adjust V. - */ - if (grp->full_slots) { - if (!qfq_gt(grp->S, cl->S)) - goto skip_update; - /* create a slot for this cl->S */ - qfq_slot_rotate(q, grp, roundedS); - /* group was surely ineligible, remove */ - __clear_bit(grp->index, &q->bitmaps[IR]); - __clear_bit(grp->index, &q->bitmaps[IB]); - } else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V)) - q->V = roundedS; - - grp->S = roundedS; - grp->F = roundedS + (2ULL << grp->slot_shift); // i.e. 2\sigma_i - s = qfq_calc_state(q, grp); - __set_bit(grp->index, &q->bitmaps[s]); - ND("new state %d 0x%x", s, q->bitmaps[s]); - ND("S %llx F %llx V %llx", cl->S, cl->F, q->V); -skip_update: - qfq_slot_insert(grp, cl, roundedS); - - return 0; -} - - -#if 0 -static inline void -qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp, - struct qfq_class *cl, struct qfq_class **pprev) -{ - unsigned int i, offset; - uint64_t roundedS; - - roundedS = qfq_round_down(cl->S, grp->slot_shift); - offset = (roundedS - grp->S) >> grp->slot_shift; - i = (grp->front + offset) % QFQ_MAX_SLOTS; - -#ifdef notyet - if (!pprev) { - pprev = &grp->slots[i]; - while (*pprev && *pprev != cl) - pprev = &(*pprev)->next; - } -#endif - - *pprev = cl->next; - if (!grp->slots[i]) - __clear_bit(offset, &grp->full_slots); -} - -/* - * called to forcibly destroy a queue. - * If the queue is not in the front bucket, or if it has - * other queues in the front bucket, we can simply remove - * the queue with no other side effects. - * Otherwise we must propagate the event up. - * XXX description to be completed. 
- */ -static void -qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl, - struct qfq_class **pprev) -{ - struct qfq_group *grp = &q->groups[cl->index]; - unsigned long mask; - uint64_t roundedS; - int s; - - cl->F = cl->S; // not needed if the class goes away. - qfq_slot_remove(q, grp, cl, pprev); - - if (!grp->full_slots) { - /* nothing left in the group, remove from all sets. - * Do ER last because if we were blocking other groups - * we must unblock them. - */ - __clear_bit(grp->index, &q->bitmaps[IR]); - __clear_bit(grp->index, &q->bitmaps[EB]); - __clear_bit(grp->index, &q->bitmaps[IB]); - - if (test_bit(grp->index, &q->bitmaps[ER]) && - !(q->bitmaps[ER] & ~((1UL << grp->index) - 1))) { - mask = q->bitmaps[ER] & ((1UL << grp->index) - 1); - if (mask) - mask = ~((1UL << __fls(mask)) - 1); - else - mask = ~0UL; - qfq_move_groups(q, mask, EB, ER); - qfq_move_groups(q, mask, IB, IR); - } - __clear_bit(grp->index, &q->bitmaps[ER]); - } else if (!grp->slots[grp->front]) { - cl = qfq_slot_scan(grp); - roundedS = qfq_round_down(cl->S, grp->slot_shift); - if (grp->S != roundedS) { - __clear_bit(grp->index, &q->bitmaps[ER]); - __clear_bit(grp->index, &q->bitmaps[IR]); - __clear_bit(grp->index, &q->bitmaps[EB]); - __clear_bit(grp->index, &q->bitmaps[IB]); - grp->S = roundedS; - grp->F = roundedS + (2ULL << grp->slot_shift); - s = qfq_calc_state(q, grp); - __set_bit(grp->index, &q->bitmaps[s]); - } - } - qfq_update_eligible(q, q->V); -} -#endif - -static int -qfq_new_fsk(struct dn_fsk *f) -{ - ipdn_bound_var(&f->fs.par[0], 1, 1, QFQ_MAX_WEIGHT, "qfq weight"); - ipdn_bound_var(&f->fs.par[1], 1500, 1, 2000, "qfq maxlen"); - ND("weight %d len %d\n", f->fs.par[0], f->fs.par[1]); - return 0; -} - -/* - * initialize a new scheduler instance - */ -static int -qfq_new_sched(struct dn_sch_inst *si) -{ - struct qfq_sched *q = (struct qfq_sched *)(si + 1); - struct qfq_group *grp; - int i; - - for (i = 0; i <= QFQ_MAX_INDEX; i++) { - grp = &q->groups[i]; - grp->index = i; - grp->slot_shift = QFQ_MTU_SHIFT + FRAC_BITS - - (QFQ_MAX_INDEX - i); - } - return 0; -} - -/* - * QFQ scheduler descriptor - */ -static struct dn_alg qfq_desc = { - _SI( .type = ) DN_SCHED_QFQ, - _SI( .name = ) "QFQ", - _SI( .flags = ) DN_MULTIQUEUE, - - _SI( .schk_datalen = ) 0, - _SI( .si_datalen = ) sizeof(struct qfq_sched), - _SI( .q_datalen = ) sizeof(struct qfq_class) - sizeof(struct dn_queue), - - _SI( .enqueue = ) qfq_enqueue, - _SI( .dequeue = ) qfq_dequeue, - - _SI( .config = ) NULL, - _SI( .destroy = ) NULL, - _SI( .new_sched = ) qfq_new_sched, - _SI( .free_sched = ) NULL, - _SI( .new_fsk = ) qfq_new_fsk, - _SI( .free_fsk = ) NULL, - _SI( .new_queue = ) qfq_new_queue, - _SI( .free_queue = ) qfq_free_queue, -}; - -DECLARE_DNSCHED_MODULE(dn_qfq, &qfq_desc); - -#ifdef QFQ_DEBUG -static void -dump_groups(struct qfq_sched *q, uint32_t mask) -{ - int i, j; - - for (i = 0; i < QFQ_MAX_INDEX + 1; i++) { - struct qfq_group *g = &q->groups[i]; - - if (0 == (mask & (1<<i))) - continue; - for (j = 0; j < QFQ_MAX_SLOTS; j++) { - if (g->slots[j]) - D(" bucket %d %p", j, g->slots[j]); - } - D("full_slots 0x%x", g->full_slots); - D(" %2d S 0x%20llx F 0x%llx %c", i, - g->S, g->F, - mask & (1<<i) ? 
'1' : '0'); - } -} - -static void -dump_sched(struct qfq_sched *q, const char *msg) -{ - D("--- in %s: ---", msg); - ND("loops %d queued %d V 0x%llx", q->loops, q->queued, q->V); - D(" ER 0x%08x", q->bitmaps[ER]); - D(" EB 0x%08x", q->bitmaps[EB]); - D(" IR 0x%08x", q->bitmaps[IR]); - D(" IB 0x%08x", q->bitmaps[IB]); - dump_groups(q, 0xffffffff); -}; -#endif /* QFQ_DEBUG */ diff --git a/sys/netinet/ipfw/dn_sched_rr.c b/sys/netinet/ipfw/dn_sched_rr.c deleted file mode 100644 index 1bbd800..0000000 --- a/sys/netinet/ipfw/dn_sched_rr.c +++ /dev/null @@ -1,307 +0,0 @@ -/* - * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa - * All rights reserved - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -/* - * $FreeBSD$ - */ - -#ifdef _KERNEL -#include <sys/malloc.h> -#include <sys/socket.h> -#include <sys/socketvar.h> -#include <sys/kernel.h> -#include <sys/mbuf.h> -#include <sys/module.h> -#include <net/if.h> /* IFNAMSIZ */ -#include <netinet/in.h> -#include <netinet/ip_var.h> /* ipfw_rule_ref */ -#include <netinet/ip_fw.h> /* flow_id */ -#include <netinet/ip_dummynet.h> -#include <netinet/ipfw/dn_heap.h> -#include <netinet/ipfw/ip_dn_private.h> -#include <netinet/ipfw/dn_sched.h> -#else -#include <dn_test.h> -#endif - -#define DN_SCHED_RR 3 // XXX Where? 
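/*
 * Note: this module is essentially a deficit round robin. Each active
 * queue sits in a circular list and carries a byte credit that is
 * refilled by 'quantum' whenever the queue is visited; a packet is
 * transmitted only if it fits within the current credit, otherwise the
 * scheduler moves on to the next queue. A minimal sketch of the
 * per-visit decision (the full logic is in rr_dequeue() below), where
 * 'len' is the length of the head packet:
 *
 *	if (len > rrq->credit) {
 *		rrq->credit += rrq->quantum;	// packet too big: refill, defer
 *		next_pointer(si);		// try the next queue
 *	} else {
 *		rrq->credit -= len;		// charge the credit and transmit
 *		return dn_dequeue(&rrq->q);
 *	}
 */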
- -struct rr_queue { - struct dn_queue q; /* Standard queue */ - int status; /* 1: queue is in the list */ - int credit; /* Number of bytes to transmit */ - int quantum; /* quantum * C */ - struct rr_queue *qnext; /* */ -}; - -/* struct rr_schk contains global config parameters - * and is right after dn_schk - */ -struct rr_schk { - int min_q; /* Min quantum */ - int max_q; /* Max quantum */ - int q_bytes; /* Bytes per quantum */ -}; - -/* per-instance round robin list, right after dn_sch_inst */ -struct rr_si { - struct rr_queue *head, *tail; /* Pointer to current queue */ -}; - -/* Append a queue to the rr list */ -static inline void -rr_append(struct rr_queue *q, struct rr_si *si) -{ - q->status = 1; /* mark as in-rr_list */ - q->credit = q->quantum; /* initialize credit */ - - /* append to the tail */ - if (si->head == NULL) - si->head = q; - else - si->tail->qnext = q; - si->tail = q; /* advance the tail pointer */ - q->qnext = si->head; /* make it circular */ -} - -/* Remove the head queue from circular list. */ -static inline void -rr_remove_head(struct rr_si *si) -{ - if (si->head == NULL) - return; /* empty queue */ - si->head->status = 0; - - if (si->head == si->tail) { - si->head = si->tail = NULL; - return; - } - - si->head = si->head->qnext; - si->tail->qnext = si->head; -} - -/* Remove a queue from circular list. - * XXX see if ti can be merge with remove_queue() - */ -static inline void -remove_queue_q(struct rr_queue *q, struct rr_si *si) -{ - struct rr_queue *prev; - - if (q->status != 1) - return; - if (q == si->head) { - rr_remove_head(si); - return; - } - - for (prev = si->head; prev; prev = prev->qnext) { - if (prev->qnext != q) - continue; - prev->qnext = q->qnext; - if (q == si->tail) - si->tail = prev; - q->status = 0; - break; - } -} - - -static inline void -next_pointer(struct rr_si *si) -{ - if (si->head == NULL) - return; /* empty queue */ - - si->head = si->head->qnext; - si->tail = si->tail->qnext; -} - -static int -rr_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m) -{ - struct rr_si *si; - struct rr_queue *rrq; - - if (m != q->mq.head) { - if (dn_enqueue(q, m, 0)) /* packet was dropped */ - return 1; - if (m != q->mq.head) - return 0; - } - - /* If reach this point, queue q was idle */ - si = (struct rr_si *)(_si + 1); - rrq = (struct rr_queue *)q; - - if (rrq->status == 1) /* Queue is already in the queue list */ - return 0; - - /* Insert the queue in the queue list */ - rr_append(rrq, si); - - return 0; -} - -static struct mbuf * -rr_dequeue(struct dn_sch_inst *_si) -{ - /* Access scheduler instance private data */ - struct rr_si *si = (struct rr_si *)(_si + 1); - struct rr_queue *rrq; - uint64_t len; - - while ( (rrq = si->head) ) { - struct mbuf *m = rrq->q.mq.head; - if ( m == NULL) { - /* empty queue, remove from list */ - rr_remove_head(si); - continue; - } - len = m->m_pkthdr.len; - - if (len > rrq->credit) { - /* Packet too big */ - rrq->credit += rrq->quantum; - /* Try next queue */ - next_pointer(si); - } else { - rrq->credit -= len; - return dn_dequeue(&rrq->q); - } - } - - /* no packet to dequeue*/ - return NULL; -} - -static int -rr_config(struct dn_schk *_schk) -{ - struct rr_schk *schk = (struct rr_schk *)(_schk + 1); - ND("called"); - - /* use reasonable quantums (64..2k bytes, default 1500) */ - schk->min_q = 64; - schk->max_q = 2048; - schk->q_bytes = 1500; /* quantum */ - - return 0; -} - -static int -rr_new_sched(struct dn_sch_inst *_si) -{ - struct rr_si *si = (struct rr_si *)(_si + 1); - - ND("called"); - si->head = 
si->tail = NULL; - - return 0; -} - -static int -rr_free_sched(struct dn_sch_inst *_si) -{ - ND("called"); - /* Nothing to do? */ - return 0; -} - -static int -rr_new_fsk(struct dn_fsk *fs) -{ - struct rr_schk *schk = (struct rr_schk *)(fs->sched + 1); - /* par[0] is the weight, par[1] is the quantum step */ - ipdn_bound_var(&fs->fs.par[0], 1, - 1, 65536, "RR weight"); - ipdn_bound_var(&fs->fs.par[1], schk->q_bytes, - schk->min_q, schk->max_q, "RR quantum"); - return 0; -} - -static int -rr_new_queue(struct dn_queue *_q) -{ - struct rr_queue *q = (struct rr_queue *)_q; - - _q->ni.oid.subtype = DN_SCHED_RR; - - q->quantum = _q->fs->fs.par[0] * _q->fs->fs.par[1]; - ND("called, q->quantum %d", q->quantum); - q->credit = q->quantum; - q->status = 0; - - if (_q->mq.head != NULL) { - /* Queue NOT empty, insert in the queue list */ - rr_append(q, (struct rr_si *)(_q->_si + 1)); - } - return 0; -} - -static int -rr_free_queue(struct dn_queue *_q) -{ - struct rr_queue *q = (struct rr_queue *)_q; - - ND("called"); - if (q->status == 1) { - struct rr_si *si = (struct rr_si *)(_q->_si + 1); - remove_queue_q(q, si); - } - return 0; -} - -/* - * RR scheduler descriptor - * contains the type of the scheduler, the name, the size of the - * structures and function pointers. - */ -static struct dn_alg rr_desc = { - _SI( .type = ) DN_SCHED_RR, - _SI( .name = ) "RR", - _SI( .flags = ) DN_MULTIQUEUE, - - _SI( .schk_datalen = ) 0, - _SI( .si_datalen = ) sizeof(struct rr_si), - _SI( .q_datalen = ) sizeof(struct rr_queue) - sizeof(struct dn_queue), - - _SI( .enqueue = ) rr_enqueue, - _SI( .dequeue = ) rr_dequeue, - - _SI( .config = ) rr_config, - _SI( .destroy = ) NULL, - _SI( .new_sched = ) rr_new_sched, - _SI( .free_sched = ) rr_free_sched, - _SI( .new_fsk = ) rr_new_fsk, - _SI( .free_fsk = ) NULL, - _SI( .new_queue = ) rr_new_queue, - _SI( .free_queue = ) rr_free_queue, -}; - - -DECLARE_DNSCHED_MODULE(dn_rr, &rr_desc); diff --git a/sys/netinet/ipfw/dn_sched_wf2q.c b/sys/netinet/ipfw/dn_sched_wf2q.c deleted file mode 100644 index 7f16719..0000000 --- a/sys/netinet/ipfw/dn_sched_wf2q.c +++ /dev/null @@ -1,373 +0,0 @@ -/* - * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa - * Copyright (c) 2000-2002 Luigi Rizzo, Universita` di Pisa - * All rights reserved - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -/* - * $FreeBSD$ - */ - -#ifdef _KERNEL -#include <sys/malloc.h> -#include <sys/socket.h> -#include <sys/socketvar.h> -#include <sys/kernel.h> -#include <sys/mbuf.h> -#include <sys/module.h> -#include <net/if.h> /* IFNAMSIZ */ -#include <netinet/in.h> -#include <netinet/ip_var.h> /* ipfw_rule_ref */ -#include <netinet/ip_fw.h> /* flow_id */ -#include <netinet/ip_dummynet.h> -#include <netinet/ipfw/dn_heap.h> -#include <netinet/ipfw/ip_dn_private.h> -#include <netinet/ipfw/dn_sched.h> -#else -#include <dn_test.h> -#endif - -#ifndef MAX64 -#define MAX64(x,y) (( (int64_t) ( (y)-(x) )) > 0 ) ? (y) : (x) -#endif - -/* - * timestamps are computed on 64 bit using fixed point arithmetic. - * LMAX_BITS, WMAX_BITS are the max number of bits for the packet len - * and sum of weights, respectively. FRAC_BITS is the number of - * fractional bits. We want FRAC_BITS >> WMAX_BITS to avoid too large - * errors when computing the inverse, FRAC_BITS < 32 so we can do 1/w - * using an unsigned 32-bit division, and to avoid wraparounds we need - * LMAX_BITS + WMAX_BITS + FRAC_BITS << 64 - * As an example - * FRAC_BITS = 26, LMAX_BITS=14, WMAX_BITS = 19 - */ -#ifndef FRAC_BITS -#define FRAC_BITS 28 /* shift for fixed point arithmetic */ -#define ONE_FP (1UL << FRAC_BITS) -#endif - -/* - * Private information for the scheduler instance: - * sch_heap (key is Finish time) returns the next queue to serve - * ne_heap (key is Start time) stores not-eligible queues - * idle_heap (key=start/finish time) stores idle flows. It must - * support extract-from-middle. - * A flow is only in 1 of the three heaps. - * XXX todo: use a more efficient data structure, e.g. a tree sorted - * by F with min_subtree(S) in each node - */ -struct wf2qp_si { - struct dn_heap sch_heap; /* top extract - key Finish time */ - struct dn_heap ne_heap; /* top extract - key Start time */ - struct dn_heap idle_heap; /* random extract - key Start=Finish time */ - uint64_t V; /* virtual time */ - uint32_t inv_wsum; /* inverse of sum of weights */ - uint32_t wsum; /* sum of weights */ -}; - -struct wf2qp_queue { - struct dn_queue _q; - uint64_t S, F; /* start time, finish time */ - uint32_t inv_w; /* ONE_FP / weight */ - int32_t heap_pos; /* position (index) of struct in heap */ -}; - -/* - * This file implements a WF2Q+ scheduler as it has been in dummynet - * since 2000. - * The scheduler supports per-flow queues and has O(log N) complexity. - * - * WF2Q+ needs to drain entries from the idle heap so that we - * can keep the sum of weights up to date. We can do it whenever - * we get a chance, or periodically, or following some other - * strategy. The function idle_check() drains at most N elements - * from the idle heap. 
- */ -static void -idle_check(struct wf2qp_si *si, int n, int force) -{ - struct dn_heap *h = &si->idle_heap; - while (n-- > 0 && h->elements > 0 && - (force || DN_KEY_LT(HEAP_TOP(h)->key, si->V))) { - struct dn_queue *q = HEAP_TOP(h)->object; - struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q; - - heap_extract(h, NULL); - /* XXX to let the flowset delete the queue we should - * mark it as 'unused' by the scheduler. - */ - alg_fq->S = alg_fq->F + 1; /* Mark timestamp as invalid. */ - si->wsum -= q->fs->fs.par[0]; /* adjust sum of weights */ - if (si->wsum > 0) - si->inv_wsum = ONE_FP/si->wsum; - } -} - -static int -wf2qp_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m) -{ - struct dn_fsk *fs = q->fs; - struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1); - struct wf2qp_queue *alg_fq; - uint64_t len = m->m_pkthdr.len; - - if (m != q->mq.head) { - if (dn_enqueue(q, m, 0)) /* packet was dropped */ - return 1; - if (m != q->mq.head) /* queue was already busy */ - return 0; - } - - /* If reach this point, queue q was idle */ - alg_fq = (struct wf2qp_queue *)q; - - if (DN_KEY_LT(alg_fq->F, alg_fq->S)) { - /* F<S means timestamps are invalid ->brand new queue. */ - alg_fq->S = si->V; /* init start time */ - si->wsum += fs->fs.par[0]; /* add weight of new queue. */ - si->inv_wsum = ONE_FP/si->wsum; - } else { /* if it was idle then it was in the idle heap */ - heap_extract(&si->idle_heap, q); - alg_fq->S = MAX64(alg_fq->F, si->V); /* compute new S */ - } - alg_fq->F = alg_fq->S + len * alg_fq->inv_w; - - /* if nothing is backlogged, make sure this flow is eligible */ - if (si->ne_heap.elements == 0 && si->sch_heap.elements == 0) - si->V = MAX64(alg_fq->S, si->V); - - /* - * Look at eligibility. A flow is not eligibile if S>V (when - * this happens, it means that there is some other flow already - * scheduled for the same pipe, so the sch_heap cannot be - * empty). If the flow is not eligible we just store it in the - * ne_heap. Otherwise, we store in the sch_heap. - * Note that for all flows in sch_heap (SCH), S_i <= V, - * and for all flows in ne_heap (NEH), S_i > V. - * So when we need to compute max(V, min(S_i)) forall i in - * SCH+NEH, we only need to look into NEH. - */ - if (DN_KEY_LT(si->V, alg_fq->S)) { - /* S>V means flow Not eligible. */ - if (si->sch_heap.elements == 0) - D("++ ouch! not eligible but empty scheduler!"); - heap_insert(&si->ne_heap, alg_fq->S, q); - } else { - heap_insert(&si->sch_heap, alg_fq->F, q); - } - return 0; -} - -/* XXX invariant: sch > 0 || V >= min(S in neh) */ -static struct mbuf * -wf2qp_dequeue(struct dn_sch_inst *_si) -{ - /* Access scheduler instance private data */ - struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1); - struct mbuf *m; - struct dn_queue *q; - struct dn_heap *sch = &si->sch_heap; - struct dn_heap *neh = &si->ne_heap; - struct wf2qp_queue *alg_fq; - - if (sch->elements == 0 && neh->elements == 0) { - /* we have nothing to do. We could kill the idle heap - * altogether and reset V - */ - idle_check(si, 0x7fffffff, 1); - si->V = 0; - si->wsum = 0; /* should be set already */ - return NULL; /* quick return if nothing to do */ - } - idle_check(si, 1, 0); /* drain something from the idle heap */ - - /* make sure at least one element is eligible, bumping V - * and moving entries that have become eligible. - * We need to repeat the first part twice, before and - * after extracting the candidate, or enqueue() will - * find the data structure in a wrong state. 
- */ - m = NULL; - for(;;) { - /* - * Compute V = max(V, min(S_i)). Remember that all elements - * in sch have by definition S_i <= V so if sch is not empty, - * V is surely the max and we must not update it. Conversely, - * if sch is empty we only need to look at neh. - * We don't need to move the queues, as it will be done at the - * next enqueue - */ - if (sch->elements == 0 && neh->elements > 0) { - si->V = MAX64(si->V, HEAP_TOP(neh)->key); - } - while (neh->elements > 0 && - DN_KEY_LEQ(HEAP_TOP(neh)->key, si->V)) { - q = HEAP_TOP(neh)->object; - alg_fq = (struct wf2qp_queue *)q; - heap_extract(neh, NULL); - heap_insert(sch, alg_fq->F, q); - } - if (m) /* pkt found in previous iteration */ - break; - /* ok we have at least one eligible pkt */ - q = HEAP_TOP(sch)->object; - alg_fq = (struct wf2qp_queue *)q; - m = dn_dequeue(q); - heap_extract(sch, NULL); /* Remove queue from heap. */ - si->V += (uint64_t)(m->m_pkthdr.len) * si->inv_wsum; - alg_fq->S = alg_fq->F; /* Update start time. */ - if (q->mq.head == 0) { /* not backlogged any more. */ - heap_insert(&si->idle_heap, alg_fq->F, q); - } else { /* Still backlogged. */ - /* Update F, store in neh or sch */ - uint64_t len = q->mq.head->m_pkthdr.len; - alg_fq->F += len * alg_fq->inv_w; - if (DN_KEY_LEQ(alg_fq->S, si->V)) { - heap_insert(sch, alg_fq->F, q); - } else { - heap_insert(neh, alg_fq->S, q); - } - } - } - return m; -} - -static int -wf2qp_new_sched(struct dn_sch_inst *_si) -{ - struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1); - int ofs = offsetof(struct wf2qp_queue, heap_pos); - - /* all heaps support extract from middle */ - if (heap_init(&si->idle_heap, 16, ofs) || - heap_init(&si->sch_heap, 16, ofs) || - heap_init(&si->ne_heap, 16, ofs)) { - heap_free(&si->ne_heap); - heap_free(&si->sch_heap); - heap_free(&si->idle_heap); - return ENOMEM; - } - return 0; -} - -static int -wf2qp_free_sched(struct dn_sch_inst *_si) -{ - struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1); - - heap_free(&si->sch_heap); - heap_free(&si->ne_heap); - heap_free(&si->idle_heap); - - return 0; -} - -static int -wf2qp_new_fsk(struct dn_fsk *fs) -{ - ipdn_bound_var(&fs->fs.par[0], 1, - 1, 100, "WF2Q+ weight"); - return 0; -} - -static int -wf2qp_new_queue(struct dn_queue *_q) -{ - struct wf2qp_queue *q = (struct wf2qp_queue *)_q; - - _q->ni.oid.subtype = DN_SCHED_WF2QP; - q->F = 0; /* not strictly necessary */ - q->S = q->F + 1; /* mark timestamp as invalid. */ - q->inv_w = ONE_FP / _q->fs->fs.par[0]; - if (_q->mq.head != NULL) { - wf2qp_enqueue(_q->_si, _q, _q->mq.head); - } - return 0; -} - -/* - * Called when the infrastructure removes a queue (e.g. flowset - * is reconfigured). Nothing to do if we did not 'own' the queue, - * otherwise remove it from the right heap and adjust the sum - * of weights. - */ -static int -wf2qp_free_queue(struct dn_queue *q) -{ - struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q; - struct wf2qp_si *si = (struct wf2qp_si *)(q->_si + 1); - - if (alg_fq->S >= alg_fq->F + 1) - return 0; /* nothing to do, not in any heap */ - si->wsum -= q->fs->fs.par[0]; - if (si->wsum > 0) - si->inv_wsum = ONE_FP/si->wsum; - - /* extract from the heap. XXX TODO we may need to adjust V - * to make sure the invariants hold. 
- */ - if (q->mq.head == NULL) { - heap_extract(&si->idle_heap, q); - } else if (DN_KEY_LT(si->V, alg_fq->S)) { - heap_extract(&si->ne_heap, q); - } else { - heap_extract(&si->sch_heap, q); - } - return 0; -} - -/* - * WF2Q+ scheduler descriptor - * contains the type of the scheduler, the name, the size of the - * structures and function pointers. - */ -static struct dn_alg wf2qp_desc = { - _SI( .type = ) DN_SCHED_WF2QP, - _SI( .name = ) "WF2Q+", - _SI( .flags = ) DN_MULTIQUEUE, - - /* we need extra space in the si and the queue */ - _SI( .schk_datalen = ) 0, - _SI( .si_datalen = ) sizeof(struct wf2qp_si), - _SI( .q_datalen = ) sizeof(struct wf2qp_queue) - - sizeof(struct dn_queue), - - _SI( .enqueue = ) wf2qp_enqueue, - _SI( .dequeue = ) wf2qp_dequeue, - - _SI( .config = ) NULL, - _SI( .destroy = ) NULL, - _SI( .new_sched = ) wf2qp_new_sched, - _SI( .free_sched = ) wf2qp_free_sched, - - _SI( .new_fsk = ) wf2qp_new_fsk, - _SI( .free_fsk = ) NULL, - - _SI( .new_queue = ) wf2qp_new_queue, - _SI( .free_queue = ) wf2qp_free_queue, -}; - - -DECLARE_DNSCHED_MODULE(dn_wf2qp, &wf2qp_desc); diff --git a/sys/netinet/ipfw/dummynet.txt b/sys/netinet/ipfw/dummynet.txt deleted file mode 100644 index e8c9725..0000000 --- a/sys/netinet/ipfw/dummynet.txt +++ /dev/null @@ -1,860 +0,0 @@ -# -# $FreeBSD$ -# - -Notes on the internal structure of dummynet (2010 version) -by Riccardo Panicucci and Luigi Rizzo -Work supported by the EC project ONELAB2 - - -********* -* INDEX * -********* -Implementation of new dummynet - Internal structure - Files -Packet arrival - The reconfiguration routine -dummynet_task() -Configuration - Add a pipe - Add a scheduler - Add a flowset -Listing object -Delete of object - Delete a pipe - Delete a flowset - Delete a scheduler -Compatibility with FreeBSD7.2 and FreeBSD 8 ipfw binary - ip_dummynet_glue.c - ip_fw_glue.c -How to configure dummynet -How to implement a new scheduler - - - -OPEN ISSUES ------------------------------- -20100131 deleting RR causes infinite loop - presumably in the rr_free_queue() call -- seems to hang - forever when deleting a live flow ------------------------------- - -Dummynet is a traffic shaper and network emulator. Packets are -selected by an external filter such as ipfw, and passed to the emulator -with a tag such as "pipe 10" or "queue 5" which tells what to -do with the packet. As an example - - ipfw add queue 5 icmp from 10.0.0.2 to all - -All packets with the same tag belong to a "flowset", or a set -of flows which can be further partitioned according to a mask. -Flowsets are then passed to a scheduler for processing. The -association of flowsets and schedulers is configurable e.g. - - ipfw queue 5 config sched 10 weight 3 flow_mask xxxx - ipfw queue 8 config sched 10 weight 1 ... - ipfw queue 3 config sched 20 weight 1 ... - -"sched 10" represents one or more scheduler instances, -selected through a mask on the 5-tuple itself. - - ipfw sched 20 config type FIFO sched_mask yyy ... - -There are in fact two masks applied to each packet: -+ the "sched_mask" sends packets arriving to a scheduler_id to - one of many instances. -+ the "flow_mask" together with the flowset_id is used to - collect packets into independent flows on each scheduler. - -As an example, we can have - ipfw queue 5 config sched 10 flow_mask src-ip 0x000000ff - ipfw sched 10 config type WF2Q+ sched_mask src-ip 0xffffff00 - -means that sched 10 will have one instance per /24 source subnet, -and within that, each individual source will be a flow. 
- -Internal structure ------------------ -Dummynet-related data is split into several data structures, -part of them constituting the userland-kernel API, and others -specific to the kernel. -NOTE: for up-to-date details please look at the relevant source - headers (ip_dummynet.h, ip_dn_private.h, dn_sched.h) - -USERLAND-KERNEL API (ip_dummynet.h) - - struct dn_link: - contains data about the physical link such as - bandwith, delay, burst size; - - struct dn_fs: - describes a flowset, i.e. a template for queues. - Main parameters are the scheduler we attach to, a flow_mask, - buckets, queue size, plr, weight, and other scheduler-specific - parameters. - - struct dn_flow - contains information on a flow, including masks and - statistics - - struct dn_sch: - defines a scheduler (and a link attached to it). - Parameters include scheduler type, sched_mask, number of - buckets, and possibly other scheduler-specific parameters, - - struct dn_profile: - fields to simulate a delay profile - - -KERNEL REPRESENTATION (ip_dn_private.h) - - struct mq - a queue of mbufs with head and tail. - - struct dn_queue - individual queue of packets, created by a flowset using - flow_mask and attached to a scheduler instance selected - through sched_mask. - A dn_queue has a pointer to the dn_fsk (which in turn counts - how many queues point to it), a pointer to the - dn_sch_inst it attaches to, and is in a hash table in the - flowset. scheduler instances also should store queues in - their own containers used for scheduling (lists, trees, etc.) - CREATE: done on packet arrivals when a flow matches a flowset. - DELETE: done only when deleting the parent dn_sch_inst - or draining memory. - - struct dn_fsk - includes a dn_fs; a pointer to the dn_schk; a link field - for the list of dn_fsk attached to the same scheduler, - or for the unlinked list; - a refcount for the number of queues pointing to it; - The dn_fsk is in a hash table, fshash. - CREATE: done on configuration commands. - DELETE: on configuration commands. - - struct dn_sch_inst - a scheduler instance, created from a dn_schk applying sched_mask. - Contains a delay line, a reference to the parent, and scheduler- - specific info. Both dn_sch_inst and its delay line can be in the - evheap if they have events to be processed. - CREATE: created from a dn_schk applying sched_mask - DELETE: configuration command delete a scheduler which in turn - sweeps the hash table of instances deleting them - - struct dn_schk - includes dn_sch, dn_link, a pointer to dn_profile, - a hash table of dn_sch_inst, a list of dn_fsk - attached to it. - CREATE: configuration command. If there are flowsets that - refer to this number, they are attached and moved - to the hash table - DELETE: manual, see dn_sch_inst - - - fshash schedhash - +---------------+ sched +--------------+ - | sched-------------------->| NEW_SCHK| - -<----*sch_chain |<-----------------*fsk_list | - |NEW_FSK |<----. | [dn_link] | - +---------------+ | +--------------+ - |qht (hash) | | | siht(hash) | - | [dn_queue] | | | [dn_si] | - | [dn_queue] | | | [dn_si] | - | ... | | | ... | - | +--------+ | | | +---------+ | - | |dn_queue| | | | |dn_si | | - | | fs *----------' | | | | - | | si *---------------------->| | | - | +---------+ | | +---------+ | - +---------------+ +--------------+ - -The following global data structures contain all -schedulers and flowsets. - -- schedhash[x]: contains all scheduler templates in the system. 
- Looked up only on manual configurations, where flowsets - are attached to matching schedulers. - We have one entry per 'sched X config' command - (plus one for each 'pipe X config'). - -- fshash[x]: contains all flowsets. - We do a lookup on this for each packet. - We have one entry for each 'queue X config' - (plus one for each 'pipe X config'). - -Additionally, a list that contains all unlinked flowset: -- fsu: contains flowset that are not linked with any scheduler. - Flowset are put in this list when they refer to a non - existing scheduler. - We don't need an efficient data structure as we never search - here on a packet arrivals. - -Scheduler instances and the delay lines associated with each scheduler -instance need to be woken up at certain times. Because we have many -such objects, we keep them in a priority heap (system_heap). - -Almost all objects in this implementation are preceded by a structure -(struct dn_id) which makes it easier to identify them. - - -Files ------ -The dummynet code is split in several files. -All kernel code is in sys/netinet/ipfw except ip_dummynet.h -All userland code is in sbin/ipfw. -Files are -- sys/netinet/ip_dummynet.h defines the kernel-userland API -- ip_dn_private.h contains the kernel-specific APIs - and data structures -- dn_sched.h defines the scheduler API -- ip_dummynet.c cointains module glue and sockopt handlers, with all - functions to configure and list objects. -- ip_dn_io.c contains the functions directly related to packet processing, - and run in the critical path. It also contains some functions - exported to the schedulers. -- dn_heap.[ch] implement a binary heap and a generic hash table -- dn_sched_* implement the various scheduler modules - -- dummynet.c is the file used to implement the user side of dummynet. - It contains the function to parsing command line, and functions to - show the output of dummynet objects. -Moreover, there are two new file (ip_dummynet_glue.c and ip_fw_glue.c) that -are used to allow compatibility with the "ipfw" binary from FreeBSD 7.2 and -FreeBSD 8. - -LOCKING -======= -At the moment the entire processing occurs under a single lock -which is expected to be acquired in exclusive mode -DN_BH_WLOCK() / DN_BH_WUNLOCK(). - -In perspective we aim at the following: -- the 'busy' flag, 'pending' list and all structures modified by packet - arrivals and departures are protected by the BH_WLOCK. - This is normally acquired in exclusive mode by the packet processing - functions for short sections of code (exception -- the timer). - If 'busy' is not set, we can do regular packet processing. - If 'busy' is set, no pieces can be accessed. - We must enqueue the packet on 'pending' and return immediately. - -- the 'busy' flag is set/cleared by long sections of code as follows: - UH_WLOCK(); KASSERT(busy == 0); - BH_WLOCK(); busy=1; BH_WUNLOCK(); - ... do processing ... - BH_WLOCK(); busy=0; drain_queue(pending); BH_WUNLOCK(); - UH_WUNLOCK(); - this normally happens when the upper half has something heavy - to do. The prologue and epilogue are not in the critical path. - -- the main containers (fshash, schedhash, ...) are protected by - UH_WLOCK. - -Packet processing -================= -A packet enters dummynet through dummynet_io(). We first lookup -the flowset number in fshash using dn_ht_find(), then find the scheduler -instance using ipdn_si_find(), then possibly identify the correct -queue with ipdn_q_find(). 
-If successful, we call the scheduler's enqueue function(), and -if needed start I/O on the link calling serve_sched(). -If the packet can be returned immediately, this is done by -leaving *m0 set. Otherwise, the packet is absorbed by dummynet -and we simply return, possibly with some appropriate error code. - -Reconfiguration ---------------- -Reconfiguration is the complex part of the system because we need to -keep track of the various objects and containers. -At the moment we do not use reference counts for objects so all -processing must be done under a lock. - -The main entry points for configuration is the ip_dn_ctl() handler -for the IP_DUMMYNET3 sockopt (others are provided only for backward -compatibility). Modifications to the configuration call do_config(). -The argument is a sequence of blocks each starting with a struct dn_id -which specifies its content. -The first dn_id must contain as obj.id the DN_API_VERSION -The obj.type is DN_CMD_CONFIG (followed by actual objects), -DN_CMD_DELETE (with the correct subtype and list of objects), or -DN_CMD_FLUSH. - -DN_CMD_CONFIG is followed by objects to add/reconfigure. In general, -if an object already exists it is reconfigured, otherwise it is -created in a way that keeps the structure consistent. -We have the following objects in the system, normally numbered with -an identifier N between 1 and 65535. For certain objects we have -"shadow" copies numbered I+NMAX and I+ 2*NMAX which are used to -implement certain backward compatibility features. - -In general we have the following linking - - TRADITIONAL DUMMYNET QUEUES "queue N config ... pipe M ..." - corresponds to a dn_fs object numbered N - - TRADITIONAL DUMMYNET PIPES "pipe N config ..." - dn_fs N+2*NMAX --> dn_sch N+NMAX type FIFO --> dn_link N+NMAX - - GENERIC SCHEDULER "sched N config ... " - [dn_fs N+NMAX] --> dn_sch N --> dn_link N - The flowset N+NMAX is created only if the scheduler is not - of type MULTIQUEUE. - - DELAY PROFILE "pipe N config profile ..." - it is always attached to an existing dn_link N - -Because traditional dummynet pipes actually configure both a -'standalone' instance and one that can be used by queues, -we do the following: - - "pipe N config ..." configures: - dn_sched N type WF2Q+ - dn_sched N+NMAX type FIFO - dn_fs N+2NMAX attached to dn_sched N+NMAX - dn_pipe N - dn_pipe N+NMAX - - "queue N config" configures - dn_fs N - - "sched N config" configures - dn_sched N type as desired - dn_fs N+NMAX attached to dn_sched N - - -dummynet_task() -=============== -The dummynet_task() function is the main dummynet processing function and is -called every tick. This function first calculate the new current time, then -it checks if it is the time to wake up object from the system_heap comparing -the current time and the key of the heap. Two types of object (really the -heap contains pointer to objects) are in the -system_heap: - -- scheduler instance: if a scheduler instance is waked up, the dequeue() - function is called until it has credit. If the dequeue() returns packets, - the scheduler instance is inserted in the heap with a new key depending of - the data that will be send out. If the scheduler instance remains with - some credit, it means that is hasn't other packet to send and so the - instance is no longer inserted in the heap. - - If the scheduler instance extracted from the heap has the DELETE flag set, - the dequeue() is not called and the instance is destroyed now. 
- -- delay line: when extracting a delay line, the function transmit_event() is - called to send out packet from delay line. - - If the scheduler instance associated with this delay line doesn't exists, - the delay line will be delete now. - -Configuration -============= -To create a pipe, queue or scheduler, the user should type commands like: -"ipfw pipe x config" -"ipfw queue y config pipe x" -"ipfw pipe x config sched <type>" - -The userland side of dummynet will prepare a buffer contains data to pass to -kernel side. -The buffer contains all struct needed to configure an object. In more detail, -to configure a pipe all three structs (dn_link, dn_sch, dn_fs) are needed, -plus the delay profile struct if the pipe has a delay profile. - -If configuring a scheduler only the struct dn_sch is wrote in the buffer, -while if configuring a flowset only the dn_fs struct is wrote. - -The first struct in the buffer contains the type of command request, that is -if it is configuring a pipe, a queue, or a scheduler. Then there are structs -need to configure the object, and finally there is the struct that mark -the end of the buffer. - -To support the insertion of pipe and queue using the old syntax, when adding -a pipe it's necessary to create a FIFO flowset and a FIFO scheduler, which -have a number x + DN_PIPEOFFSET. - -Add a pipe ----------- -A pipe is only a template for a link. -If the pipe already exists, parameters are updated. If a delay profile exists -it is deleted and a new one is created. -If the pipe doesn't exist a new one is created. After the creation, the -flowset unlinked list is scanned to see if there are some flowset that would -be linked with this pipe. If so, these flowset will be of wf2q+ type (for -compatibility) and a new wf2q+ scheduler is created now. - -Add a scheduler ---------------- -If the scheduler already exists, and the type and the mask are the same, the -scheduler is simply reconfigured calling the config_scheduler() scheduler -function with the RECONFIGURE flag active. -If the type or the mask differ, it is necessary to delete the old scheduler -and create a new one. -If the scheduler doesn't exists, a new one is created. If the scheduler has -a mask, the hash table is created to store pointers to scheduler instances. -When a new scheduler is created, it is necessary to scan the unlinked -flowset list to search eventually flowset that would be linked with this -scheduler number. If some are found, flowsets became of the type of this -scheduler and they are configured properly. - -Add a flowset -------------- -Flowset pointers are store in the system in two list. The unlinked flowset list -contains all flowset that aren't linked with a scheduler, the flowset list -contains flowset linked to a scheduler, and so they have a type. -When adding a new flowset, first it is checked if the flowset exists (that is, -it is in the flowset list) and if it doesn't exists a new flowset is created -and added to unlinked flowset list if the scheduler which the flowset would be -linked doesn't exists, or added in the flowset list and configured properly if -the scheduler exists. If the flowset (before to be created) was in the -unlinked flowset list, it is removed and deleted, and then recreated. -If the flowset exists, to allow reconfiguration of this flowset, the -scheduler number and types must match with the one in memory. If this isn't -so, the flowset is deleted and a new one will be created. 
Really, the flowset -it isn't deleted now, but it is removed from flowset list and it will be -deleted later because there could be some queues that are using it. - -Listing of object -================= -The user can request a list of object present in dummynet through the command -"ipfw [-v] pipe|queue [x] list|show" -The kernel side of dummynet send a buffer to user side that contains all -pipe, all scheduler, all flowset, plus all scheduler instances and all queues. -The dummynet user land will format the output and show only the relevant -information. -The buffer sent start with all pipe from the system. The entire struct dn_link -is passed, except the delay_profile struct that is useless in user space. -After pipes, all flowset are wrote in the buffer. The struct contains -scheduler flowset specific data is linked with the flowset writing the -'obj' id of the extension into the 'alg_fs' pointer. -Then schedulers are wrote. If a scheduler has one or more scheduler instance, -these are linked to the parent scheduler writing the id of the parent in the -'ptr_sched' pointer. If a scheduler instance has queues, there are wrote in -the buffer and linked thorugh the 'obj' and 'sched_inst' pointer. -Finally, flowsets in the unlinked flowset list are write in the buffer, and -then a struct gen in saved in the buffer to mark the last struct in the buffer. - - -Delete of object -================ -An object is usually removed by user through a command like -"ipfw pipe|queue x delete". XXX sched? -ipfw pass to the kernel a struct gen that contains the type and the number -of the object to remove - -Delete of pipe x ----------------- -A pipe can be deleted by the user throught the command 'ipfw pipe x delete'. -To delete a pipe, the pipe is removed from the pipe list, and then deleted. -Also the scheduler associated with this pipe should be deleted. -For compatibility with old dummynet syntax, the associated FIFO scheduler and -FIFO flowset must be deleted. - -Delete of flowset x -------------------- -To remove a flowset, we must be sure that is no loger referenced by any object. -If the flowset to remove is in the unlinked flowset list, there is not any -issue, the flowset can be safely removed calling a free() (the flowset -extension is not yet created if the flowset is in this list). -If the flowset is in the flowset list, first we remove from it so new packet -are discarded when arrive. Next, the flowset is marked as delete. -Now we must check if some queue is using this flowset. -To do this, a counter (active_f) is provided. This counter indicate how many -queues exist using this flowset. -The active_f counter is automatically incremented when a queue is created -and decremented when a queue is deleted. -If the counter is 0, the flowset can be safely deleted, and the delete_alg_fs() -scheduler function is called before deallocate memory. -If the counter is not 0, the flowset remain in memory until the counter become -zero. When a queue is delete (by dn_delete_queue() function) it is checked if -the linked flowset is deleting and if so the counter is decrementing. If the -counter reaches 0, the flowset is deleted. -The deletion of a queue can be done only by the scheduler, or when the scheduler -is destroyed. - -Delete of scheduler x ---------------------- -To delete a scheduler we must be sure that any scheduler instance of this type -are in the system_heap. To do so, a counter (inst_counter) is provided. 
-This counter is managed by the system: it is incremented every time it is -inserted in the system_heap, and decremented every time it is extracted from it. -To delete the scheduler, first we remove it from the scheduler list, so new -packet are discarded when they arrive, and mark the scheduler as deleting. - -If the counter is 0, we can remove the scheduler safely calling the -really_deletescheduler() function. This function will scan all scheduler -instances and call the delete_scheduler_instance() function that will delete -the instance. When all instance are deleted, the scheduler template is -deleted calling the delete_scheduler_template(). If the delay line associate -with the scheduler is empty, it is deleted now, else it will be deleted when -it will became empy. -If the counter was not 0, we wait for it. Every time the dummynet_task() -function extract a scheduler from the system_heap, the counter is decremented. -If the scheduler has the delete flag enabled the dequeue() is not called and -delete_scheduler_instance() is called to delete the instance. -Obviously this scheduler instance is no loger inserted in the system_heap. -If the counter reaches 0, the delete_scheduler_template() function is called -all memory is released. -NOTE: Flowsets that belong to this scheduler are not deleted, so if a new - scheduler with the same number is inserted will use these flowsets. - To do so, the best approach would be insert these flowset in the - unlinked flowset list, but doing this now will be very expensive. - So flowsets will remain in memory and linked with a scheduler that no - longer exists until a packet belonging to this flowset arrives. When - this packet arrives, the reconfigure() function is called because the - generation number mismatch with one contains in the flowset and so - the flowset will be moved into the flowset unlinked list, or will be - linked with the new scheduler if a new one was created. - - -COMPATIBILITY WITH FREEBSD 7.2 AND FREEBSD 8 'IPFW' BINARY -========================================================== -Dummynet is not compatible with old ipfw binary because internal structs are -changed. Moreover, the old ipfw binary is not compatible with new kernels -because the struct that represents a firewall rule has changed. So, if a user -install a new kernel on a FreeBSD 7.2, the ipfw (and possibly many other -commands) will not work. -New dummynet uses a new socket option: IP_DUMMYNET3, used for both set and get. -The old option can be used to allow compatibility with the 'ipfw' binary of -older version (tested with 7.2 and 8.0) of FreeBSD. -Two file are provided for this purpose: -- ip_dummynet_glue.c translates old dummynet requests to the new ones, -- ip_fw_glue.c converts the rule format between 7.2 and 8 versions. -Let see in detail these two files. - -IP_DUMMYNET_GLUE.C ------------------- -The internal structs of new dummynet are very different from the original. -Because of there are some difference from between dummynet in FreeBSD 7.2 and -dummynet in FreeBSD 8 (the FreeBSD 8 version includes support to pipe delay -profile and burst option), I have to include both header files. I copied -the revision 191715 (for version 7.2) and the revision 196045 (for version 8) -and I appended a number to each struct to mark them. - -The main function of this file is ip_dummynet_compat() that is called by -ip_dn_ctl() when it receive a request of old socket option. - -A global variabile ('is7') store the version of 'ipfw' that FreeBSD is using. 
-This variable is set every time a request of configuration is done, because -with this request we receive a buffer of which size depending of ipfw version. -Because of in general the first action is a configuration, this variable is -usually set accordly. If the first action is a request of listing of pipes -or queues, the system cannot know the version of ipfw, and we suppose that -version 7.2 is used. If version is wrong, the output can be senseless, but -the application should not crash. - -There are four request for old dummynet: -- IP_DUMMYNET_FLUSH: the flush options have no parameter, so simply the - dummynet_flush() function is called; -- IP_DUMMYNET_DEL: the delete option need to be translate. - It is only necessary to extract the number and the type of the object - (pipe or queue) to delete from the buffer received and build a new struct - gen contains the right parameters, then call the delete_object() function; -- IP_DUMMYNET_CONFIGURE: the configure command receive a buffer depending of - the ipfw version. After the properly extraction of all data, that depends - by the ipfw version used, new structures are filled and then the dummynet - config_link() function is properly called. Note that the 7.2 version does - not support some parameter as burst or delay profile. -- IP_DUMMYNET_GET: The get command should send to the ipfw the correct buffer - depending of its version. There are two function that build the - corrected buffer, ip_dummynet_get7() and ip_dummynet_get8(). These - functions reproduce the buffer exactly as 'ipfw' expect. The only difference - is that the weight parameter for a queue is no loger sent by dummynet and so - it is set to 0. - Moreover, because of the internal structure has changed, the bucket size - of a queue could not be correct, because now all flowset share the hash - table. - If the version of ipfw is wrong, the output could be senseless or truncated, - but the application should not crash. - -IP_FW_GLUE.C ------------- -The ipfw binary also is used to add rules to FreeBSD firewall. Because of the -struct ip_fw is changed from FreeBsd 7.2 to FreeBSD 8, it is necessary -to write some glue code to allow use ipfw from FreeBSD 7.2 with the kernel -provided with FreeBSD 8. -This file contains two functions to convert a rule from FreeBSD 7.2 format to -FreeBSD 8 format, and viceversa. -The conversion should be done when a rule passes from userspace to kernel space -and viceversa. -I have to modify the ip_fw2.c file to manage these two case, and added a -variable (is7) to store the ipfw version used, using an approach like the -previous file: -- when a new rule is added (option IP_FW_ADD) the is7 variable is set if the - size of the rule received corrispond to FreeBSD 7.2 ipfw version. If so, the - rule is converted to version 8 calling the function convert_rule_to_8(). - Moreover, after the insertion of the rule, the rule is now reconverted to - version 7 because the ipfw binary will print it. -- when the user request a list of rules (option IP_FW_GET) the is7 variable - should be set correctly because we suppose that a configure command was done, - else we suppose that the FreeBSD version is 8. The function ipfw_getrules() - in ip_fw2.c file return all rules, eventually converted to version 7 (if - the is7 is set) to the ipfw binary. -The conversion of a rule is quite simple. The only difference between the -two structures (struct ip_fw) is that in the new there is a new field -(uint32_t id). 
So, I copy the entire rule in a buffer and the copy the rule in -the right position in the new (or old) struct. The size of commands are not -changed, and the copy is done into a cicle. - -How to configure dummynet -========================= -It is possible to configure dummynet through two main commands: -'ipfw pipe' and 'ipfw queue'. -To allow compatibility with old version, it is possible configure dummynet -using the old command syntax. Doing so, obviously, it is only possible to -configure a FIFO scheduler or a wf2q+ scheduler. -A new command, 'ipfw pipe x config sched <type>' is supported to add a new -scheduler to the system. - -- ipfw pipe x config ... - create a new pipe with the link parameters - create a new scheduler fifo (x + offset) - create a new flowset fifo (x + offset) - the mask is eventually stored in the FIFO scheduler - -- ipfw queue y config pipe x ... - create a new flowset y linked to sched x. - The type of flowset depends by the specified scheduler. - If the scheduler does not exist, this flowset is inserted in a special - list and will be not active. - If pipe x exists and sched does not exist, a new wf2q+ scheduler is - created and the flowset will be linked to this new scheduler (this is - done for compatibility with old syntax). - -- ipfw pipe x config sched <type> ... - create a new scheduler x of type <type>. - Search into the flowset unlinked list if there are some flowset that - should be linked with this new scheduler. - -- ipfw pipe x delete - delete the pipe x - delete the scheduler fifo (x + offset) - delete the scheduler x - delete the flowset fifo (x + offset) - -- ipfw queue x delete - delete the flowset x - -- ipfw sched x delete ///XXX - delete the scheduler x - -Follow now some examples to how configure dummynet: -- Ex1: - ipfw pipe 10 config bw 1M delay 15 // create a pipe with band and delay - A FIFO flowset and scheduler is - also created - ipfw queue 5 config pipe 10 weight 56 // create a flowset. This flowset - will be of wf2q+ because a pipe 10 - exists. Moreover, the wf2q+ - scheduler is created now. -- Ex2: - ipfw queue 5 config pipe 10 weight 56 // Create a flowset. Scheduler 10 - does not exist, so this flowset - is inserted in the unlinked - flowset list. - ipfw pipe 10 config bw... // Create a pipe, a FIFO flowset and scheduler. - Because of a flowset with 'pipe 10' exists, - a wf2q+ scheduler is created now and that - flowset is linked with this sceduler. - -- Ex3: - ipfw pipe 10 config bw... // Create a pipe, a FIFO flowset and scheduler. - ipfw pipe 10 config sched rr // Create a scheduler of type RR, linked to - pipe 10 - ipfw queue 5 config pipe 10 weight 56 // Create a flowset 5. This flowset - will belong to scheduler 10 and - it is of type RR - -- Ex4: - ipfw pipe 10 config sched rr // Create a scheduler of type RR, linked to - pipe 10 (not exist yet) - ipfw pipe 10 config bw... // Create a pipe, a FIFO flowset and scheduler. - ipfw queue 5 config pipe 10 weight 56 // Create a flowset 5.This flowset - will belong to scheduler 10 and - it is of type RR - ipfw pipe 10 config sched wf2q+ // Modify the type of scheduler 10. It - becomes a wf2q+ scheduler. - When a new packet of flowset 5 arrives, - the flowset 5 becomes to wf2q+ type. - -How to implement a new scheduler -================================ -In dummynet, a scheduler algorithm is represented by two main structs, some -functions and other minor structs. 
-
-How to implement a new scheduler
-================================
-In dummynet, a scheduler algorithm is represented by two main structs, some
-functions and a few minor structs.
-- A struct dn_sch_xyz (where xyz is the 'type' of the scheduler algorithm
-  implemented) contains data relative to the scheduler, i.e. global
-  parameters that are common to all instances of the scheduler.
-- A struct dn_sch_inst_xyz contains data relative to a single scheduler
-  instance, e.g. local state variables that depend on the flows linked to
-  the scheduler, and so on.
-To add a scheduler to dummynet, the user types a command like:
-'ipfw pipe x config sched <type> [mask ... ...]'
-This command creates a new struct dn_sch_xyz of type <type> and stores the
-optional parameters in that struct.
-
-The mask parameter determines how many instances of this scheduler may
-exist. For example, traffic can be split by source port (or destination
-port, or IP address...), so that every scheduler instance acts as an
-independent scheduler. If the mask is not set, all traffic goes to the
-same instance.
-
-When a packet arrives at a scheduler, the system looks up the correct
-scheduler instance and, if it does not exist, creates it on the spot (the
-struct dn_sch_inst_xyz is allocated by the system, and the scheduler fills
-in its fields). It is the scheduler's task to create the struct that holds
-all the queues of a scheduler instance. Dummynet provides functions to
-create a hash table to store queues, but the scheduling algorithm may
-choose its own data structure.
-
-To link a flow to a scheduler, the user types a command like:
-'ipfw queue z config pipe x [mask... ...]'
-
-This command creates a new 'dn_fs' struct that is inserted in the system.
-If scheduler x exists, this flowset is linked to that scheduler and the
-flowset type becomes the same as the scheduler type. At this point the
-function create_alg_fs_xyz() is called to store the flowset parameters that
-depend on the scheduler (for example the 'weight' parameter for a wf2q+
-scheduler, or some priority...). A mask parameter can also be used for a
-flowset: if it is set, the scheduler instance can separate packets according
-to their flow id (src and dst IP, ports...) and assign them to separate
-queues. This is done by the scheduler, so it may ignore the mask if it
-wants.
-
-These are the two main structs:
-struct dn_sch_xyz {
-	struct gen g; /* important: the member must be named g */
-	/* global params */
-};
-struct dn_sch_inst_xyz {
-	struct gen g; /* important: the member must be named g */
-	/* params of the instance */
-};
-It is important to embed struct gen as the first member. The struct gen
-contains values that the scheduler must fill in (the 'type' of the
-scheduler, the 'len' of the struct...).
-The function create_scheduler_xyz() should be implemented to initialize the
-global parameters in the first struct; if it allocates memory, it is
-mandatory to implement the delete_scheduler_template() function to free it.
-The function create_scheduler_instance_xyz() must be implemented even if the
-scheduler instance does not use extra parameters; in this function the
-struct gen fields must be filled in correctly. The
-delete_scheduler_instance_xyz() function must be implemented if the instance
-allocated memory in the previous function. A sketch of a minimal pair of
-structs and instance-creation routine follows.
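-
-The sketch below shows these structs and the instance-creation hook for a
-hypothetical scheduler called 'ssf' (single static FIFO). The identifier
-DN_SCHED_SSF and the exact field names of struct gen (type, len) are
-assumptions based on the fragments quoted in this document, not a verbatim
-copy of the real API.
-
-struct dn_sch_ssf {
-	struct gen g;		/* must be first and named g */
-	int max_burst;		/* example of a global parameter */
-};
-struct dn_sch_inst_ssf {
-	struct gen g;		/* must be first and named g */
-	struct dn_queue *q;	/* the single queue of this instance (see below) */
-};
-
-static int
-create_scheduler_instance_ssf(void *s)
-{
-	struct dn_sch_inst_ssf *si = s;
-
-	/* the memory is allocated by the system; we only fill in the
-	 * generic header and our private fields */
-	si->g.type = DN_SCHED_SSF;	/* assumed identifier */
-	si->g.len = sizeof(*si);
-	si->q = NULL;			/* created lazily at enqueue time */
-	return 0;
-}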
-
-To store data belonging to a flowset, the following struct is used:
-struct alg_fs_xyz {
-	struct gen g;
-	/* fill the gen struct correctly:
-	   g.subtype = DN_XYZ;
-	   g.len = sizeof(struct alg_fs_xyz)
-	   ...
-	*/
-	/* params for the flow */
-};
-The create_alg_fs_xyz() function is mandatory, because it must fill the
-struct gen; delete_alg_fs_xyz() is mandatory only if the previous function
-has allocated memory.
-
-A struct dn_queue contains the packets belonging to a queue and some
-statistical data. The scheduler may need to store extra data in this struct,
-in which case it must define a dn_queue_xyz struct:
-struct dn_queue_xyz {
-	struct dn_queue q;
-	/* parameters for a queue */
-};
-
-All structures are allocated by the system. To make this possible, the
-scheduler must set the size of its structs in the scheduler descriptor:
-scheduler_size:		sizeof(dn_sch_xyz)
-scheduler_i_size:	sizeof(dn_sch_inst_xyz)
-flowset_size:		sizeof(alg_fs_xyz)
-queue_size:		sizeof(dn_queue_xyz);
-The scheduler_size may be 0, but the other structs must contain at least a
-struct gen.
-
-After the structs have been defined, the scheduler functions must be
-implemented.
-
-- int (*config_scheduler)(char *command, void *sch, int reconfigure);
-    Configure a scheduler, or reconfigure it if 'reconfigure' == 1.
-    This function performs additional allocation and initialization of the
-    global parameters for this scheduler.
-    If memory is allocated here, the delete_scheduler_template() function
-    should be implemented to release it.
-- int (*delete_scheduler_template)(void* sch);
-    Delete a scheduler template. This function is mandatory if the scheduler
-    uses extra data beyond the struct dn_sch.
-- int (*create_scheduler_instance)(void *s);
-    Create a new scheduler instance. The system allocates the necessary
-    memory and the scheduler can access it through the 's' pointer.
-    The scheduler instance stores all its queues; to do so it can use the
-    hash table provided by the system.
-- int (*delete_scheduler_instance)(void *s);
-    Delete a scheduler instance. It is important to free the memory
-    allocated by create_scheduler_instance(); the memory allocated by the
-    system is freed by the system itself. The struct containing all the
-    queues also has to be deleted.
-- int (*enqueue)(void *s, struct gen *f, struct mbuf *m,
-                 struct ipfw_flow_id *id);
-    Called when a packet arrives. The packet 'm' belongs to the scheduler
-    instance 's', has flowset 'f', and the flowid 'id' has already been
-    masked. enqueue() must call dn_queue_packet(q, m) to actually enqueue
-    the packet in the queue q. The queue 'q' is chosen by the scheduler;
-    if it does not exist it should be created by calling dn_create_queue().
-    If the scheduler wants to drop the packet, it must call
-    dn_drop_packet() and then return 1.
-- struct mbuf * (*dequeue)(void *s);
-    Called when the timer expires (or when a packet arrives and the
-    scheduler instance is idle).
-    This function is called when at least one packet can be sent out. The
-    scheduler chooses the packet and returns it; if there are no packets in
-    the scheduler instance, the function must return NULL.
-    Before returning a packet, it is important to call dn_return_packet()
-    to update the queue statistics and counters.
-- int (*drain_queue)(void *s, int flag);
-    The system asks the scheduler to delete all queues that are not in use,
-    in order to free memory. The flag parameter indicates whether a queue
-    must be deleted even if it is active.
-
-A sketch of a minimal enqueue()/dequeue() pair is shown below; the remaining
-flowset- and queue-related callbacks follow after it.
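-
-This sketch continues the hypothetical 'ssf' scheduler introduced above,
-which keeps a single queue per instance. The prototypes of
-dn_create_queue(), dn_queue_packet(), dn_drop_packet() and
-dn_return_packet() are assumed from their descriptions in this document,
-and the direct access to q->mq follows the struct dn_queue layout in
-ip_dn_private.h later in this commit; both may differ from the real
-helpers, so treat this as an illustration only.
-
-static int
-enqueue_ssf(void *s, struct gen *f, struct mbuf *m, struct ipfw_flow_id *id)
-{
-	struct dn_sch_inst_ssf *si = s;
-
-	if (si->q == NULL) {
-		/* create our single queue on first use
-		 * (argument list of dn_create_queue() assumed) */
-		si->q = dn_create_queue(si, f, id);
-		if (si->q == NULL) {
-			dn_drop_packet(m);	/* arguments assumed */
-			return 1;		/* report the drop */
-		}
-	}
-	/* really enqueue the packet; assumed to return 0 on success and
-	 * 1 on drop, like dn_enqueue() in ip_dn_io.c */
-	return dn_queue_packet(si->q, m);
-}
-
-static struct mbuf *
-dequeue_ssf(void *s)
-{
-	struct dn_sch_inst_ssf *si = s;
-	struct mbuf *m;
-
-	if (si->q == NULL || (m = si->q->mq.head) == NULL)
-		return NULL;		/* instance is idle */
-	si->q->mq.head = m->m_nextpkt;	/* unlink the head packet */
-	m->m_nextpkt = NULL;
-	/* update the queue statistics and counters before returning */
-	dn_return_packet(si->q, m);	/* arguments assumed */
-	return m;
-}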
-
-The remaining callbacks deal with flowsets and queues:
-
-- int (*create_alg_fs)(char *command, struct gen *g, int reconfigure);
-    Called when a flowset is linked with a scheduler. This is done when the
-    scheduler is defined, so the type of the flowset is known.
-    The function initializes the flowset parameters by parsing the command
-    line. The parameters are stored in the g struct, for which the system
-    has allocated the right amount of memory. If the reconfigure flag is
-    set, the flowset is being reconfigured.
-- int (*delete_alg_fs)(struct gen *f);
-    Called when a flowset is being deleted. It must free the memory
-    allocated by the create_alg_fs() function.
-
-- int (*create_queue_alg)(struct dn_queue *q, struct gen *f);
-    Called when a queue is created. The function should link the queue to
-    the struct used by the scheduler instance to store all queues.
-- int (*delete_queue_alg)(struct dn_queue *q);
-    Called when a queue is being deleted. The function should remove the
-    extra data and update the struct that holds all the queues in the
-    scheduler instance.
-
-The struct scheduler is the scheduler descriptor that is passed to dummynet
-when a scheduler module is loaded.
-This struct contains the type of the scheduler, the length of all the
-structs, and all the function pointers.
-Functions that are not implemented should be initialized to NULL. Some
-functions are mandatory, others are mandatory only if some memory must be
-freed.
-Mandatory functions:
-- create_scheduler_instance()
-- enqueue()
-- dequeue()
-- create_alg_fs()
-- drain_queue()
-Optional functions:
-- config_scheduler()
-- create_queue_alg()
-Mandatory functions if the corresponding create...() has allocated memory:
-- delete_scheduler_template()
-- delete_scheduler_instance()
-- delete_alg_fs()
-- delete_queue_alg()
-
diff --git a/sys/netinet/ipfw/ip_dn_glue.c b/sys/netinet/ipfw/ip_dn_glue.c
deleted file mode 100644
index 9fc6b23..0000000
--- a/sys/netinet/ipfw/ip_dn_glue.c
+++ /dev/null
@@ -1,845 +0,0 @@
-/*-
- * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
- * All rights reserved
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */ - -/* - * $FreeBSD$ - * - * Binary compatibility support for /sbin/ipfw RELENG_7 and RELENG_8 - */ - -#include "opt_inet6.h" - -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/malloc.h> -#include <sys/mbuf.h> -#include <sys/kernel.h> -#include <sys/lock.h> -#include <sys/module.h> -#include <sys/priv.h> -#include <sys/proc.h> -#include <sys/rwlock.h> -#include <sys/socket.h> -#include <sys/socketvar.h> -#include <sys/time.h> -#include <sys/taskqueue.h> -#include <net/if.h> /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */ -#include <netinet/in.h> -#include <netinet/ip_var.h> /* ip_output(), IP_FORWARDING */ -#include <netinet/ip_fw.h> -#include <netinet/ipfw/ip_fw_private.h> -#include <netinet/ipfw/dn_heap.h> -#include <netinet/ip_dummynet.h> -#include <netinet/ipfw/ip_dn_private.h> -#include <netinet/ipfw/dn_sched.h> - -/* FREEBSD7.2 ip_dummynet.h r191715*/ - -struct dn_heap_entry7 { - int64_t key; /* sorting key. Topmost element is smallest one */ - void *object; /* object pointer */ -}; - -struct dn_heap7 { - int size; - int elements; - int offset; /* XXX if > 0 this is the offset of direct ptr to obj */ - struct dn_heap_entry7 *p; /* really an array of "size" entries */ -}; - -/* Common to 7.2 and 8 */ -struct dn_flow_set { - SLIST_ENTRY(dn_flow_set) next; /* linked list in a hash slot */ - - u_short fs_nr ; /* flow_set number */ - u_short flags_fs; -#define DNOLD_HAVE_FLOW_MASK 0x0001 -#define DNOLD_IS_RED 0x0002 -#define DNOLD_IS_GENTLE_RED 0x0004 -#define DNOLD_QSIZE_IS_BYTES 0x0008 /* queue size is measured in bytes */ -#define DNOLD_NOERROR 0x0010 /* do not report ENOBUFS on drops */ -#define DNOLD_HAS_PROFILE 0x0020 /* the pipe has a delay profile. */ -#define DNOLD_IS_PIPE 0x4000 -#define DNOLD_IS_QUEUE 0x8000 - - struct dn_pipe7 *pipe ; /* pointer to parent pipe */ - u_short parent_nr ; /* parent pipe#, 0 if local to a pipe */ - - int weight ; /* WFQ queue weight */ - int qsize ; /* queue size in slots or bytes */ - int plr ; /* pkt loss rate (2^31-1 means 100%) */ - - struct ipfw_flow_id flow_mask ; - - /* hash table of queues onto this flow_set */ - int rq_size ; /* number of slots */ - int rq_elements ; /* active elements */ - struct dn_flow_queue7 **rq; /* array of rq_size entries */ - - u_int32_t last_expired ; /* do not expire too frequently */ - int backlogged ; /* #active queues for this flowset */ - - /* RED parameters */ -#define SCALE_RED 16 -#define SCALE(x) ( (x) << SCALE_RED ) -#define SCALE_VAL(x) ( (x) >> SCALE_RED ) -#define SCALE_MUL(x,y) ( ( (x) * (y) ) >> SCALE_RED ) - int w_q ; /* queue weight (scaled) */ - int max_th ; /* maximum threshold for queue (scaled) */ - int min_th ; /* minimum threshold for queue (scaled) */ - int max_p ; /* maximum value for p_b (scaled) */ - u_int c_1 ; /* max_p/(max_th-min_th) (scaled) */ - u_int c_2 ; /* max_p*min_th/(max_th-min_th) (scaled) */ - u_int c_3 ; /* for GRED, (1-max_p)/max_th (scaled) */ - u_int c_4 ; /* for GRED, 1 - 2*max_p (scaled) */ - u_int * w_q_lookup ; /* lookup table for computing (1-w_q)^t */ - u_int lookup_depth ; /* depth of lookup table */ - int lookup_step ; /* granularity inside the lookup table */ - int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */ - int avg_pkt_size ; /* medium packet size */ - int max_pkt_size ; /* max packet size */ -}; -SLIST_HEAD(dn_flow_set_head, dn_flow_set); - -#define DN_IS_PIPE 0x4000 -#define DN_IS_QUEUE 0x8000 -struct dn_flow_queue7 { - struct dn_flow_queue7 *next ; - struct ipfw_flow_id id ; - - struct mbuf *head, *tail ; /* queue of 
packets */ - u_int len ; - u_int len_bytes ; - - u_long numbytes; - - u_int64_t tot_pkts ; /* statistics counters */ - u_int64_t tot_bytes ; - u_int32_t drops ; - - int hash_slot ; /* debugging/diagnostic */ - - /* RED parameters */ - int avg ; /* average queue length est. (scaled) */ - int count ; /* arrivals since last RED drop */ - int random ; /* random value (scaled) */ - u_int32_t q_time; /* start of queue idle time */ - - /* WF2Q+ support */ - struct dn_flow_set *fs ; /* parent flow set */ - int heap_pos ; /* position (index) of struct in heap */ - int64_t sched_time ; /* current time when queue enters ready_heap */ - - int64_t S,F ; /* start time, finish time */ -}; - -struct dn_pipe7 { /* a pipe */ - SLIST_ENTRY(dn_pipe7) next; /* linked list in a hash slot */ - - int pipe_nr ; /* number */ - int bandwidth; /* really, bytes/tick. */ - int delay ; /* really, ticks */ - - struct mbuf *head, *tail ; /* packets in delay line */ - - /* WF2Q+ */ - struct dn_heap7 scheduler_heap ; /* top extract - key Finish time*/ - struct dn_heap7 not_eligible_heap; /* top extract- key Start time */ - struct dn_heap7 idle_heap ; /* random extract - key Start=Finish time */ - - int64_t V ; /* virtual time */ - int sum; /* sum of weights of all active sessions */ - - int numbytes; - - int64_t sched_time ; /* time pipe was scheduled in ready_heap */ - - /* - * When the tx clock come from an interface (if_name[0] != '\0'), its name - * is stored below, whereas the ifp is filled when the rule is configured. - */ - char if_name[IFNAMSIZ]; - struct ifnet *ifp ; - int ready ; /* set if ifp != NULL and we got a signal from it */ - - struct dn_flow_set fs ; /* used with fixed-rate flows */ -}; -SLIST_HEAD(dn_pipe_head7, dn_pipe7); - - -/* FREEBSD8 ip_dummynet.h r196045 */ -struct dn_flow_queue8 { - struct dn_flow_queue8 *next ; - struct ipfw_flow_id id ; - - struct mbuf *head, *tail ; /* queue of packets */ - u_int len ; - u_int len_bytes ; - - uint64_t numbytes ; /* credit for transmission (dynamic queues) */ - int64_t extra_bits; /* extra bits simulating unavailable channel */ - - u_int64_t tot_pkts ; /* statistics counters */ - u_int64_t tot_bytes ; - u_int32_t drops ; - - int hash_slot ; /* debugging/diagnostic */ - - /* RED parameters */ - int avg ; /* average queue length est. (scaled) */ - int count ; /* arrivals since last RED drop */ - int random ; /* random value (scaled) */ - int64_t idle_time; /* start of queue idle time */ - - /* WF2Q+ support */ - struct dn_flow_set *fs ; /* parent flow set */ - int heap_pos ; /* position (index) of struct in heap */ - int64_t sched_time ; /* current time when queue enters ready_heap */ - - int64_t S,F ; /* start time, finish time */ -}; - -struct dn_pipe8 { /* a pipe */ - SLIST_ENTRY(dn_pipe8) next; /* linked list in a hash slot */ - - int pipe_nr ; /* number */ - int bandwidth; /* really, bytes/tick. */ - int delay ; /* really, ticks */ - - struct mbuf *head, *tail ; /* packets in delay line */ - - /* WF2Q+ */ - struct dn_heap7 scheduler_heap ; /* top extract - key Finish time*/ - struct dn_heap7 not_eligible_heap; /* top extract- key Start time */ - struct dn_heap7 idle_heap ; /* random extract - key Start=Finish time */ - - int64_t V ; /* virtual time */ - int sum; /* sum of weights of all active sessions */ - - /* Same as in dn_flow_queue, numbytes can become large */ - int64_t numbytes; /* bits I can transmit (more or less). 
*/ - uint64_t burst; /* burst size, scaled: bits * hz */ - - int64_t sched_time ; /* time pipe was scheduled in ready_heap */ - int64_t idle_time; /* start of pipe idle time */ - - char if_name[IFNAMSIZ]; - struct ifnet *ifp ; - int ready ; /* set if ifp != NULL and we got a signal from it */ - - struct dn_flow_set fs ; /* used with fixed-rate flows */ - - /* fields to simulate a delay profile */ -#define ED_MAX_NAME_LEN 32 - char name[ED_MAX_NAME_LEN]; - int loss_level; - int samples_no; - int *samples; -}; - -#define ED_MAX_SAMPLES_NO 1024 -struct dn_pipe_max8 { - struct dn_pipe8 pipe; - int samples[ED_MAX_SAMPLES_NO]; -}; -SLIST_HEAD(dn_pipe_head8, dn_pipe8); - -/* - * Changes from 7.2 to 8: - * dn_pipe: - * numbytes from int to int64_t - * add burst (int64_t) - * add idle_time (int64_t) - * add profile - * add struct dn_pipe_max - * add flag DN_HAS_PROFILE - * - * dn_flow_queue - * numbytes from u_long to int64_t - * add extra_bits (int64_t) - * q_time from u_int32_t to int64_t and name idle_time - * - * dn_flow_set unchanged - * - */ - -/* NOTE:XXX copied from dummynet.c */ -#define O_NEXT(p, len) ((void *)((char *)p + len)) -static void -oid_fill(struct dn_id *oid, int len, int type, uintptr_t id) -{ - oid->len = len; - oid->type = type; - oid->subtype = 0; - oid->id = id; -} -/* make room in the buffer and move the pointer forward */ -static void * -o_next(struct dn_id **o, int len, int type) -{ - struct dn_id *ret = *o; - oid_fill(ret, len, type, 0); - *o = O_NEXT(*o, len); - return ret; -} - - -static size_t pipesize7 = sizeof(struct dn_pipe7); -static size_t pipesize8 = sizeof(struct dn_pipe8); -static size_t pipesizemax8 = sizeof(struct dn_pipe_max8); - -/* Indicate 'ipfw' version - * 1: from FreeBSD 7.2 - * 0: from FreeBSD 8 - * -1: unknow (for now is unused) - * - * It is update when a IP_DUMMYNET_DEL or IP_DUMMYNET_CONFIGURE request arrives - * NOTE: if a IP_DUMMYNET_GET arrives and the 'ipfw' version is unknow, - * it is suppose to be the FreeBSD 8 version. - */ -static int is7 = 0; - -static int -convertflags2new(int src) -{ - int dst = 0; - - if (src & DNOLD_HAVE_FLOW_MASK) - dst |= DN_HAVE_MASK; - if (src & DNOLD_QSIZE_IS_BYTES) - dst |= DN_QSIZE_BYTES; - if (src & DNOLD_NOERROR) - dst |= DN_NOERROR; - if (src & DNOLD_IS_RED) - dst |= DN_IS_RED; - if (src & DNOLD_IS_GENTLE_RED) - dst |= DN_IS_GENTLE_RED; - if (src & DNOLD_HAS_PROFILE) - dst |= DN_HAS_PROFILE; - - return dst; -} - -static int -convertflags2old(int src) -{ - int dst = 0; - - if (src & DN_HAVE_MASK) - dst |= DNOLD_HAVE_FLOW_MASK; - if (src & DN_IS_RED) - dst |= DNOLD_IS_RED; - if (src & DN_IS_GENTLE_RED) - dst |= DNOLD_IS_GENTLE_RED; - if (src & DN_NOERROR) - dst |= DNOLD_NOERROR; - if (src & DN_HAS_PROFILE) - dst |= DNOLD_HAS_PROFILE; - if (src & DN_QSIZE_BYTES) - dst |= DNOLD_QSIZE_IS_BYTES; - - return dst; -} - -static int -dn_compat_del(void *v) -{ - struct dn_pipe7 *p = (struct dn_pipe7 *) v; - struct dn_pipe8 *p8 = (struct dn_pipe8 *) v; - struct { - struct dn_id oid; - uintptr_t a[1]; /* add more if we want a list */ - } cmd; - - /* XXX DN_API_VERSION ??? 
*/ - oid_fill((void *)&cmd, sizeof(cmd), DN_CMD_DELETE, DN_API_VERSION); - - if (is7) { - if (p->pipe_nr == 0 && p->fs.fs_nr == 0) - return EINVAL; - if (p->pipe_nr != 0 && p->fs.fs_nr != 0) - return EINVAL; - } else { - if (p8->pipe_nr == 0 && p8->fs.fs_nr == 0) - return EINVAL; - if (p8->pipe_nr != 0 && p8->fs.fs_nr != 0) - return EINVAL; - } - - if (p->pipe_nr != 0) { /* pipe x delete */ - cmd.a[0] = p->pipe_nr; - cmd.oid.subtype = DN_LINK; - } else { /* queue x delete */ - cmd.oid.subtype = DN_FS; - cmd.a[0] = (is7) ? p->fs.fs_nr : p8->fs.fs_nr; - } - - return do_config(&cmd, cmd.oid.len); -} - -static int -dn_compat_config_queue(struct dn_fs *fs, void* v) -{ - struct dn_pipe7 *p7 = (struct dn_pipe7 *)v; - struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; - struct dn_flow_set *f; - - if (is7) - f = &p7->fs; - else - f = &p8->fs; - - fs->fs_nr = f->fs_nr; - fs->sched_nr = f->parent_nr; - fs->flow_mask = f->flow_mask; - fs->buckets = f->rq_size; - fs->qsize = f->qsize; - fs->plr = f->plr; - fs->par[0] = f->weight; - fs->flags = convertflags2new(f->flags_fs); - if (fs->flags & DN_IS_GENTLE_RED || fs->flags & DN_IS_RED) { - fs->w_q = f->w_q; - fs->max_th = f->max_th; - fs->min_th = f->min_th; - fs->max_p = f->max_p; - } - - return 0; -} - -static int -dn_compat_config_pipe(struct dn_sch *sch, struct dn_link *p, - struct dn_fs *fs, void* v) -{ - struct dn_pipe7 *p7 = (struct dn_pipe7 *)v; - struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; - int i = p7->pipe_nr; - - sch->sched_nr = i; - sch->oid.subtype = 0; - p->link_nr = i; - fs->fs_nr = i + 2*DN_MAX_ID; - fs->sched_nr = i + DN_MAX_ID; - - /* Common to 7 and 8 */ - p->bandwidth = p7->bandwidth; - p->delay = p7->delay; - if (!is7) { - /* FreeBSD 8 has burst */ - p->burst = p8->burst; - } - - /* fill the fifo flowset */ - dn_compat_config_queue(fs, v); - fs->fs_nr = i + 2*DN_MAX_ID; - fs->sched_nr = i + DN_MAX_ID; - - /* Move scheduler related parameter from fs to sch */ - sch->buckets = fs->buckets; /*XXX*/ - fs->buckets = 0; - if (fs->flags & DN_HAVE_MASK) { - sch->flags |= DN_HAVE_MASK; - fs->flags &= ~DN_HAVE_MASK; - sch->sched_mask = fs->flow_mask; - bzero(&fs->flow_mask, sizeof(struct ipfw_flow_id)); - } - - return 0; -} - -static int -dn_compat_config_profile(struct dn_profile *pf, struct dn_link *p, - void *v) -{ - struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; - - p8->samples = &(((struct dn_pipe_max8 *)p8)->samples[0]); - - pf->link_nr = p->link_nr; - pf->loss_level = p8->loss_level; -// pf->bandwidth = p->bandwidth; //XXX bandwidth redundant? 
- pf->samples_no = p8->samples_no; - strncpy(pf->name, p8->name,sizeof(pf->name)); - bcopy(p8->samples, pf->samples, sizeof(pf->samples)); - - return 0; -} - -/* - * If p->pipe_nr != 0 the command is 'pipe x config', so need to create - * the three main struct, else only a flowset is created - */ -static int -dn_compat_configure(void *v) -{ - struct dn_id *buf = NULL, *base; - struct dn_sch *sch = NULL; - struct dn_link *p = NULL; - struct dn_fs *fs = NULL; - struct dn_profile *pf = NULL; - int lmax; - int error; - - struct dn_pipe7 *p7 = (struct dn_pipe7 *)v; - struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; - - int i; /* number of object to configure */ - - lmax = sizeof(struct dn_id); /* command header */ - lmax += sizeof(struct dn_sch) + sizeof(struct dn_link) + - sizeof(struct dn_fs) + sizeof(struct dn_profile); - - base = buf = malloc(lmax, M_DUMMYNET, M_WAIT|M_ZERO); - o_next(&buf, sizeof(struct dn_id), DN_CMD_CONFIG); - base->id = DN_API_VERSION; - - /* pipe_nr is the same in p7 and p8 */ - i = p7->pipe_nr; - if (i != 0) { /* pipe config */ - sch = o_next(&buf, sizeof(*sch), DN_SCH); - p = o_next(&buf, sizeof(*p), DN_LINK); - fs = o_next(&buf, sizeof(*fs), DN_FS); - - error = dn_compat_config_pipe(sch, p, fs, v); - if (error) { - free(buf, M_DUMMYNET); - return error; - } - if (!is7 && p8->samples_no > 0) { - /* Add profiles*/ - pf = o_next(&buf, sizeof(*pf), DN_PROFILE); - error = dn_compat_config_profile(pf, p, v); - if (error) { - free(buf, M_DUMMYNET); - return error; - } - } - } else { /* queue config */ - fs = o_next(&buf, sizeof(*fs), DN_FS); - error = dn_compat_config_queue(fs, v); - if (error) { - free(buf, M_DUMMYNET); - return error; - } - } - error = do_config(base, (char *)buf - (char *)base); - - if (buf) - free(buf, M_DUMMYNET); - return error; -} - -int -dn_compat_calc_size(void) -{ - int need = 0; - /* XXX use FreeBSD 8 struct size */ - /* NOTE: - * - half scheduler: schk_count/2 - * - all flowset: fsk_count - * - all flowset queues: queue_count - * - all pipe queue: si_count - */ - need += dn_cfg.schk_count * sizeof(struct dn_pipe8) / 2; - need += dn_cfg.fsk_count * sizeof(struct dn_flow_set); - need += dn_cfg.si_count * sizeof(struct dn_flow_queue8); - need += dn_cfg.queue_count * sizeof(struct dn_flow_queue8); - - return need; -} - -int -dn_c_copy_q (void *_ni, void *arg) -{ - struct copy_args *a = arg; - struct dn_flow_queue7 *fq7 = (struct dn_flow_queue7 *)*a->start; - struct dn_flow_queue8 *fq8 = (struct dn_flow_queue8 *)*a->start; - struct dn_flow *ni = (struct dn_flow *)_ni; - int size = 0; - - /* XXX hash slot not set */ - /* No difference between 7.2/8 */ - fq7->len = ni->length; - fq7->len_bytes = ni->len_bytes; - fq7->id = ni->fid; - - if (is7) { - size = sizeof(struct dn_flow_queue7); - fq7->tot_pkts = ni->tot_pkts; - fq7->tot_bytes = ni->tot_bytes; - fq7->drops = ni->drops; - } else { - size = sizeof(struct dn_flow_queue8); - fq8->tot_pkts = ni->tot_pkts; - fq8->tot_bytes = ni->tot_bytes; - fq8->drops = ni->drops; - } - - *a->start += size; - return 0; -} - -int -dn_c_copy_pipe(struct dn_schk *s, struct copy_args *a, int nq) -{ - struct dn_link *l = &s->link; - struct dn_fsk *f = s->fs; - - struct dn_pipe7 *pipe7 = (struct dn_pipe7 *)*a->start; - struct dn_pipe8 *pipe8 = (struct dn_pipe8 *)*a->start; - struct dn_flow_set *fs; - int size = 0; - - if (is7) { - fs = &pipe7->fs; - size = sizeof(struct dn_pipe7); - } else { - fs = &pipe8->fs; - size = sizeof(struct dn_pipe8); - } - - /* These 4 field are the same in pipe7 and pipe8 */ - pipe7->next.sle_next = 
(struct dn_pipe7 *)DN_IS_PIPE; - pipe7->bandwidth = l->bandwidth; - pipe7->delay = l->delay * 1000 / hz; - pipe7->pipe_nr = l->link_nr - DN_MAX_ID; - - if (!is7) { - if (s->profile) { - struct dn_profile *pf = s->profile; - strncpy(pipe8->name, pf->name, sizeof(pf->name)); - pipe8->loss_level = pf->loss_level; - pipe8->samples_no = pf->samples_no; - } - pipe8->burst = div64(l->burst , 8 * hz); - } - - fs->flow_mask = s->sch.sched_mask; - fs->rq_size = s->sch.buckets ? s->sch.buckets : 1; - - fs->parent_nr = l->link_nr - DN_MAX_ID; - fs->qsize = f->fs.qsize; - fs->plr = f->fs.plr; - fs->w_q = f->fs.w_q; - fs->max_th = f->max_th; - fs->min_th = f->min_th; - fs->max_p = f->fs.max_p; - fs->rq_elements = nq; - - fs->flags_fs = convertflags2old(f->fs.flags); - - *a->start += size; - return 0; -} - - -int -dn_compat_copy_pipe(struct copy_args *a, void *_o) -{ - int have = a->end - *a->start; - int need = 0; - int pipe_size = sizeof(struct dn_pipe8); - int queue_size = sizeof(struct dn_flow_queue8); - int n_queue = 0; /* number of queues */ - - struct dn_schk *s = (struct dn_schk *)_o; - /* calculate needed space: - * - struct dn_pipe - * - if there are instances, dn_queue * n_instances - */ - n_queue = (s->sch.flags & DN_HAVE_MASK ? dn_ht_entries(s->siht) : - (s->siht ? 1 : 0)); - need = pipe_size + queue_size * n_queue; - if (have < need) { - D("have %d < need %d", have, need); - return 1; - } - /* copy pipe */ - dn_c_copy_pipe(s, a, n_queue); - - /* copy queues */ - if (s->sch.flags & DN_HAVE_MASK) - dn_ht_scan(s->siht, dn_c_copy_q, a); - else if (s->siht) - dn_c_copy_q(s->siht, a); - return 0; -} - -int -dn_c_copy_fs(struct dn_fsk *f, struct copy_args *a, int nq) -{ - struct dn_flow_set *fs = (struct dn_flow_set *)*a->start; - - fs->next.sle_next = (struct dn_flow_set *)DN_IS_QUEUE; - fs->fs_nr = f->fs.fs_nr; - fs->qsize = f->fs.qsize; - fs->plr = f->fs.plr; - fs->w_q = f->fs.w_q; - fs->max_th = f->max_th; - fs->min_th = f->min_th; - fs->max_p = f->fs.max_p; - fs->flow_mask = f->fs.flow_mask; - fs->rq_elements = nq; - fs->rq_size = (f->fs.buckets ? f->fs.buckets : 1); - fs->parent_nr = f->fs.sched_nr; - fs->weight = f->fs.par[0]; - - fs->flags_fs = convertflags2old(f->fs.flags); - *a->start += sizeof(struct dn_flow_set); - return 0; -} - -int -dn_compat_copy_queue(struct copy_args *a, void *_o) -{ - int have = a->end - *a->start; - int need = 0; - int fs_size = sizeof(struct dn_flow_set); - int queue_size = sizeof(struct dn_flow_queue8); - - struct dn_fsk *fs = (struct dn_fsk *)_o; - int n_queue = 0; /* number of queues */ - - n_queue = (fs->fs.flags & DN_HAVE_MASK ? dn_ht_entries(fs->qht) : - (fs->qht ? 1 : 0)); - - need = fs_size + queue_size * n_queue; - if (have < need) { - D("have < need"); - return 1; - } - - /* copy flowset */ - dn_c_copy_fs(fs, a, n_queue); - - /* copy queues */ - if (fs->fs.flags & DN_HAVE_MASK) - dn_ht_scan(fs->qht, dn_c_copy_q, a); - else if (fs->qht) - dn_c_copy_q(fs->qht, a); - - return 0; -} - -int -copy_data_helper_compat(void *_o, void *_arg) -{ - struct copy_args *a = _arg; - - if (a->type == DN_COMPAT_PIPE) { - struct dn_schk *s = _o; - if (s->sch.oid.subtype != 1 || s->sch.sched_nr <= DN_MAX_ID) { - return 0; /* not old type */ - } - /* copy pipe parameters, and if instance exists, copy - * other parameters and eventually queues. 
- */ - if(dn_compat_copy_pipe(a, _o)) - return DNHT_SCAN_END; - } else if (a->type == DN_COMPAT_QUEUE) { - struct dn_fsk *fs = _o; - if (fs->fs.fs_nr >= DN_MAX_ID) - return 0; - if (dn_compat_copy_queue(a, _o)) - return DNHT_SCAN_END; - } - return 0; -} - -/* Main function to manage old requests */ -int -ip_dummynet_compat(struct sockopt *sopt) -{ - int error=0; - void *v = NULL; - struct dn_id oid; - - /* Lenght of data, used to found ipfw version... */ - int len = sopt->sopt_valsize; - - /* len can be 0 if command was dummynet_flush */ - if (len == pipesize7) { - D("setting compatibility with FreeBSD 7.2"); - is7 = 1; - } - else if (len == pipesize8 || len == pipesizemax8) { - D("setting compatibility with FreeBSD 8"); - is7 = 0; - } - - switch (sopt->sopt_name) { - default: - printf("dummynet: -- unknown option %d", sopt->sopt_name); - error = EINVAL; - break; - - case IP_DUMMYNET_FLUSH: - oid_fill(&oid, sizeof(oid), DN_CMD_FLUSH, DN_API_VERSION); - do_config(&oid, oid.len); - break; - - case IP_DUMMYNET_DEL: - v = malloc(len, M_TEMP, M_WAITOK); - error = sooptcopyin(sopt, v, len, len); - if (error) - break; - error = dn_compat_del(v); - free(v, M_TEMP); - break; - - case IP_DUMMYNET_CONFIGURE: - v = malloc(len, M_TEMP, M_WAITOK); - error = sooptcopyin(sopt, v, len, len); - if (error) - break; - error = dn_compat_configure(v); - free(v, M_TEMP); - break; - - case IP_DUMMYNET_GET: { - void *buf; - int ret; - int original_size = sopt->sopt_valsize; - int size; - - ret = dummynet_get(sopt, &buf); - if (ret) - return 0;//XXX ? - size = sopt->sopt_valsize; - sopt->sopt_valsize = original_size; - D("size=%d, buf=%p", size, buf); - ret = sooptcopyout(sopt, buf, size); - if (ret) - printf(" %s ERROR sooptcopyout\n", __FUNCTION__); - if (buf) - free(buf, M_DUMMYNET); - } - } - - return error; -} - - diff --git a/sys/netinet/ipfw/ip_dn_io.c b/sys/netinet/ipfw/ip_dn_io.c deleted file mode 100644 index becd85e..0000000 --- a/sys/netinet/ipfw/ip_dn_io.c +++ /dev/null @@ -1,858 +0,0 @@ -/*- - * Copyright (c) 2010 Luigi Rizzo, Riccardo Panicucci, Universita` di Pisa - * All rights reserved - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -/* - * Dummynet portions related to packet handling. 
- */ -#include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - -#include "opt_inet6.h" - -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/malloc.h> -#include <sys/mbuf.h> -#include <sys/kernel.h> -#include <sys/lock.h> -#include <sys/module.h> -#include <sys/priv.h> -#include <sys/proc.h> -#include <sys/rwlock.h> -#include <sys/socket.h> -#include <sys/time.h> -#include <sys/sysctl.h> - -#include <net/if.h> /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */ -#include <net/netisr.h> -#include <net/vnet.h> - -#include <netinet/in.h> -#include <netinet/ip.h> /* ip_len, ip_off */ -#include <netinet/ip_var.h> /* ip_output(), IP_FORWARDING */ -#include <netinet/ip_fw.h> -#include <netinet/ipfw/ip_fw_private.h> -#include <netinet/ipfw/dn_heap.h> -#include <netinet/ip_dummynet.h> -#include <netinet/ipfw/ip_dn_private.h> -#include <netinet/ipfw/dn_sched.h> - -#include <netinet/if_ether.h> /* various ether_* routines */ - -#include <netinet/ip6.h> /* for ip6_input, ip6_output prototypes */ -#include <netinet6/ip6_var.h> - -/* - * We keep a private variable for the simulation time, but we could - * probably use an existing one ("softticks" in sys/kern/kern_timeout.c) - * instead of dn_cfg.curr_time - */ - -struct dn_parms dn_cfg; -//VNET_DEFINE(struct dn_parms, _base_dn_cfg); - -static long tick_last; /* Last tick duration (usec). */ -static long tick_delta; /* Last vs standard tick diff (usec). */ -static long tick_delta_sum; /* Accumulated tick difference (usec).*/ -static long tick_adjustment; /* Tick adjustments done. */ -static long tick_lost; /* Lost(coalesced) ticks number. */ -/* Adjusted vs non-adjusted curr_time difference (ticks). */ -static long tick_diff; - -static unsigned long io_pkt; -static unsigned long io_pkt_fast; -static unsigned long io_pkt_drop; - -/* - * We use a heap to store entities for which we have pending timer events. - * The heap is checked at every tick and all entities with expired events - * are extracted. - */ - -MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap"); - -extern void (*bridge_dn_p)(struct mbuf *, struct ifnet *); - -#ifdef SYSCTL_NODE - -/* - * Because of the way the SYSBEGIN/SYSEND macros work on other - * platforms, there should not be functions between them. - * So keep the handlers outside the block. 
- */ -static int -sysctl_hash_size(SYSCTL_HANDLER_ARGS) -{ - int error, value; - - value = dn_cfg.hash_size; - error = sysctl_handle_int(oidp, &value, 0, req); - if (error != 0 || req->newptr == NULL) - return (error); - if (value < 16 || value > 65536) - return (EINVAL); - dn_cfg.hash_size = value; - return (0); -} - -static int -sysctl_limits(SYSCTL_HANDLER_ARGS) -{ - int error; - long value; - - if (arg2 != 0) - value = dn_cfg.slot_limit; - else - value = dn_cfg.byte_limit; - error = sysctl_handle_long(oidp, &value, 0, req); - - if (error != 0 || req->newptr == NULL) - return (error); - if (arg2 != 0) { - if (value < 1) - return (EINVAL); - dn_cfg.slot_limit = value; - } else { - if (value < 1500) - return (EINVAL); - dn_cfg.byte_limit = value; - } - return (0); -} - -SYSBEGIN(f4) - -SYSCTL_DECL(_net_inet); -SYSCTL_DECL(_net_inet_ip); -static SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet"); - -/* wrapper to pass dn_cfg fields to SYSCTL_* */ -//#define DC(x) (&(VNET_NAME(_base_dn_cfg).x)) -#define DC(x) (&(dn_cfg.x)) -/* parameters */ - - -SYSCTL_PROC(_net_inet_ip_dummynet, OID_AUTO, hash_size, - CTLTYPE_INT | CTLFLAG_RW, 0, 0, sysctl_hash_size, - "I", "Default hash table size"); - - -SYSCTL_PROC(_net_inet_ip_dummynet, OID_AUTO, pipe_slot_limit, - CTLTYPE_LONG | CTLFLAG_RW, 0, 1, sysctl_limits, - "L", "Upper limit in slots for pipe queue."); -SYSCTL_PROC(_net_inet_ip_dummynet, OID_AUTO, pipe_byte_limit, - CTLTYPE_LONG | CTLFLAG_RW, 0, 0, sysctl_limits, - "L", "Upper limit in bytes for pipe queue."); -SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, io_fast, - CTLFLAG_RW, DC(io_fast), 0, "Enable fast dummynet io."); -SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, - CTLFLAG_RW, DC(debug), 0, "Dummynet debug level"); - -/* RED parameters */ -SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth, - CTLFLAG_RD, DC(red_lookup_depth), 0, "Depth of RED lookup table"); -SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size, - CTLFLAG_RD, DC(red_avg_pkt_size), 0, "RED Medium packet size"); -SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size, - CTLFLAG_RD, DC(red_max_pkt_size), 0, "RED Max packet size"); - -/* time adjustment */ -SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta, - CTLFLAG_RD, &tick_delta, 0, "Last vs standard tick difference (usec)."); -SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta_sum, - CTLFLAG_RD, &tick_delta_sum, 0, "Accumulated tick difference (usec)."); -SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_adjustment, - CTLFLAG_RD, &tick_adjustment, 0, "Tick adjustments done."); -SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_diff, - CTLFLAG_RD, &tick_diff, 0, - "Adjusted vs non-adjusted curr_time difference (ticks)."); -SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_lost, - CTLFLAG_RD, &tick_lost, 0, - "Number of ticks coalesced by dummynet taskqueue."); - -/* Drain parameters */ -SYSCTL_UINT(_net_inet_ip_dummynet, OID_AUTO, expire, - CTLFLAG_RW, DC(expire), 0, "Expire empty queues/pipes"); -SYSCTL_UINT(_net_inet_ip_dummynet, OID_AUTO, expire_cycle, - CTLFLAG_RD, DC(expire_cycle), 0, "Expire cycle for queues/pipes"); - -/* statistics */ -SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, schk_count, - CTLFLAG_RD, DC(schk_count), 0, "Number of schedulers"); -SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, si_count, - CTLFLAG_RD, DC(si_count), 0, "Number of scheduler instances"); -SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, fsk_count, - CTLFLAG_RD, DC(fsk_count), 0, "Number of flowsets"); 
-SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, queue_count, - CTLFLAG_RD, DC(queue_count), 0, "Number of queues"); -SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt, - CTLFLAG_RD, &io_pkt, 0, - "Number of packets passed to dummynet."); -SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_fast, - CTLFLAG_RD, &io_pkt_fast, 0, - "Number of packets bypassed dummynet scheduler."); -SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_drop, - CTLFLAG_RD, &io_pkt_drop, 0, - "Number of packets dropped by dummynet."); -#undef DC -SYSEND - -#endif - -static void dummynet_send(struct mbuf *); - -/* - * Packets processed by dummynet have an mbuf tag associated with - * them that carries their dummynet state. - * Outside dummynet, only the 'rule' field is relevant, and it must - * be at the beginning of the structure. - */ -struct dn_pkt_tag { - struct ipfw_rule_ref rule; /* matching rule */ - - /* second part, dummynet specific */ - int dn_dir; /* action when packet comes out.*/ - /* see ip_fw_private.h */ - uint64_t output_time; /* when the pkt is due for delivery*/ - struct ifnet *ifp; /* interface, for ip_output */ - struct _ip6dn_args ip6opt; /* XXX ipv6 options */ -}; - -/* - * Return the mbuf tag holding the dummynet state (it should - * be the first one on the list). - */ -static struct dn_pkt_tag * -dn_tag_get(struct mbuf *m) -{ - struct m_tag *mtag = m_tag_first(m); - KASSERT(mtag != NULL && - mtag->m_tag_cookie == MTAG_ABI_COMPAT && - mtag->m_tag_id == PACKET_TAG_DUMMYNET, - ("packet on dummynet queue w/o dummynet tag!")); - return (struct dn_pkt_tag *)(mtag+1); -} - -static inline void -mq_append(struct mq *q, struct mbuf *m) -{ - if (q->head == NULL) - q->head = m; - else - q->tail->m_nextpkt = m; - q->tail = m; - m->m_nextpkt = NULL; -} - -/* - * Dispose a list of packet. Use a functions so if we need to do - * more work, this is a central point to do it. - */ -void dn_free_pkts(struct mbuf *mnext) -{ - struct mbuf *m; - - while ((m = mnext) != NULL) { - mnext = m->m_nextpkt; - FREE_PKT(m); - } -} - -static int -red_drops (struct dn_queue *q, int len) -{ - /* - * RED algorithm - * - * RED calculates the average queue size (avg) using a low-pass filter - * with an exponential weighted (w_q) moving average: - * avg <- (1-w_q) * avg + w_q * q_size - * where q_size is the queue length (measured in bytes or * packets). - * - * If q_size == 0, we compute the idle time for the link, and set - * avg = (1 - w_q)^(idle/s) - * where s is the time needed for transmitting a medium-sized packet. - * - * Now, if avg < min_th the packet is enqueued. - * If avg > max_th the packet is dropped. Otherwise, the packet is - * dropped with probability P function of avg. - */ - - struct dn_fsk *fs = q->fs; - int64_t p_b = 0; - - /* Queue in bytes or packets? */ - uint32_t q_size = (fs->fs.flags & DN_QSIZE_BYTES) ? - q->ni.len_bytes : q->ni.length; - - /* Average queue size estimation. */ - if (q_size != 0) { - /* Queue is not empty, avg <- avg + (q_size - avg) * w_q */ - int diff = SCALE(q_size) - q->avg; - int64_t v = SCALE_MUL((int64_t)diff, (int64_t)fs->w_q); - - q->avg += (int)v; - } else { - /* - * Queue is empty, find for how long the queue has been - * empty and use a lookup table for computing - * (1 - * w_q)^(idle_time/s) where s is the time to send a - * (small) packet. - * XXX check wraps... - */ - if (q->avg) { - u_int t = div64((dn_cfg.curr_time - q->q_time), fs->lookup_step); - - q->avg = (t < fs->lookup_depth) ? - SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0; - } - } - - /* Should i drop? 
*/ - if (q->avg < fs->min_th) { - q->count = -1; - return (0); /* accept packet */ - } - if (q->avg >= fs->max_th) { /* average queue >= max threshold */ - if (fs->fs.flags & DN_IS_GENTLE_RED) { - /* - * According to Gentle-RED, if avg is greater than - * max_th the packet is dropped with a probability - * p_b = c_3 * avg - c_4 - * where c_3 = (1 - max_p) / max_th - * c_4 = 1 - 2 * max_p - */ - p_b = SCALE_MUL((int64_t)fs->c_3, (int64_t)q->avg) - - fs->c_4; - } else { - q->count = -1; - return (1); - } - } else if (q->avg > fs->min_th) { - /* - * We compute p_b using the linear dropping function - * p_b = c_1 * avg - c_2 - * where c_1 = max_p / (max_th - min_th) - * c_2 = max_p * min_th / (max_th - min_th) - */ - p_b = SCALE_MUL((int64_t)fs->c_1, (int64_t)q->avg) - fs->c_2; - } - - if (fs->fs.flags & DN_QSIZE_BYTES) - p_b = div64((p_b * len) , fs->max_pkt_size); - if (++q->count == 0) - q->random = random() & 0xffff; - else { - /* - * q->count counts packets arrived since last drop, so a greater - * value of q->count means a greater packet drop probability. - */ - if (SCALE_MUL(p_b, SCALE((int64_t)q->count)) > q->random) { - q->count = 0; - /* After a drop we calculate a new random value. */ - q->random = random() & 0xffff; - return (1); /* drop */ - } - } - /* End of RED algorithm. */ - - return (0); /* accept */ - -} - -/* - * Enqueue a packet in q, subject to space and queue management policy - * (whose parameters are in q->fs). - * Update stats for the queue and the scheduler. - * Return 0 on success, 1 on drop. The packet is consumed anyways. - */ -int -dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop) -{ - struct dn_fs *f; - struct dn_flow *ni; /* stats for scheduler instance */ - uint64_t len; - - if (q->fs == NULL || q->_si == NULL) { - printf("%s fs %p si %p, dropping\n", - __FUNCTION__, q->fs, q->_si); - FREE_PKT(m); - return 1; - } - f = &(q->fs->fs); - ni = &q->_si->ni; - len = m->m_pkthdr.len; - /* Update statistics, then check reasons to drop pkt. */ - q->ni.tot_bytes += len; - q->ni.tot_pkts++; - ni->tot_bytes += len; - ni->tot_pkts++; - if (drop) - goto drop; - if (f->plr && random() < f->plr) - goto drop; - if (f->flags & DN_IS_RED && red_drops(q, m->m_pkthdr.len)) - goto drop; - if (f->flags & DN_QSIZE_BYTES) { - if (q->ni.len_bytes > f->qsize) - goto drop; - } else if (q->ni.length >= f->qsize) { - goto drop; - } - mq_append(&q->mq, m); - q->ni.length++; - q->ni.len_bytes += len; - ni->length++; - ni->len_bytes += len; - return 0; - -drop: - io_pkt_drop++; - q->ni.drops++; - ni->drops++; - FREE_PKT(m); - return 1; -} - -/* - * Fetch packets from the delay line which are due now. If there are - * leftover packets, reinsert the delay line in the heap. - * Runs under scheduler lock. - */ -static void -transmit_event(struct mq *q, struct delay_line *dline, uint64_t now) -{ - struct mbuf *m; - struct dn_pkt_tag *pkt = NULL; - - dline->oid.subtype = 0; /* not in heap */ - while ((m = dline->mq.head) != NULL) { - pkt = dn_tag_get(m); - if (!DN_KEY_LEQ(pkt->output_time, now)) - break; - dline->mq.head = m->m_nextpkt; - mq_append(q, m); - } - if (m != NULL) { - dline->oid.subtype = 1; /* in heap */ - heap_insert(&dn_cfg.evheap, pkt->output_time, dline); - } -} - -/* - * Convert the additional MAC overheads/delays into an equivalent - * number of bits for the given data rate. The samples are - * in milliseconds so we need to divide by 1000. 
- */ -static uint64_t -extra_bits(struct mbuf *m, struct dn_schk *s) -{ - int index; - uint64_t bits; - struct dn_profile *pf = s->profile; - - if (!pf || pf->samples_no == 0) - return 0; - index = random() % pf->samples_no; - bits = div64((uint64_t)pf->samples[index] * s->link.bandwidth, 1000); - if (index >= pf->loss_level) { - struct dn_pkt_tag *dt = dn_tag_get(m); - if (dt) - dt->dn_dir = DIR_DROP; - } - return bits; -} - -/* - * Send traffic from a scheduler instance due by 'now'. - * Return a pointer to the head of the queue. - */ -static struct mbuf * -serve_sched(struct mq *q, struct dn_sch_inst *si, uint64_t now) -{ - struct mq def_q; - struct dn_schk *s = si->sched; - struct mbuf *m = NULL; - int delay_line_idle = (si->dline.mq.head == NULL); - int done, bw; - - if (q == NULL) { - q = &def_q; - q->head = NULL; - } - - bw = s->link.bandwidth; - si->kflags &= ~DN_ACTIVE; - - if (bw > 0) - si->credit += (now - si->sched_time) * bw; - else - si->credit = 0; - si->sched_time = now; - done = 0; - while (si->credit >= 0 && (m = s->fp->dequeue(si)) != NULL) { - uint64_t len_scaled; - - done++; - len_scaled = (bw == 0) ? 0 : hz * - (m->m_pkthdr.len * 8 + extra_bits(m, s)); - si->credit -= len_scaled; - /* Move packet in the delay line */ - dn_tag_get(m)->output_time = dn_cfg.curr_time + s->link.delay ; - mq_append(&si->dline.mq, m); - } - - /* - * If credit >= 0 the instance is idle, mark time. - * Otherwise put back in the heap, and adjust the output - * time of the last inserted packet, m, which was too early. - */ - if (si->credit >= 0) { - si->idle_time = now; - } else { - uint64_t t; - KASSERT (bw > 0, ("bw=0 and credit<0 ?")); - t = div64(bw - 1 - si->credit, bw); - if (m) - dn_tag_get(m)->output_time += t; - si->kflags |= DN_ACTIVE; - heap_insert(&dn_cfg.evheap, now + t, si); - } - if (delay_line_idle && done) - transmit_event(q, &si->dline, now); - return q->head; -} - -/* - * The timer handler for dummynet. Time is computed in ticks, but - * but the code is tolerant to the actual rate at which this is called. - * Once complete, the function reschedules itself for the next tick. - */ -void -dummynet_task(void *context, int pending) -{ - struct timeval t; - struct mq q = { NULL, NULL }; /* queue to accumulate results */ - - CURVNET_SET((struct vnet *)context); - - DN_BH_WLOCK(); - - /* Update number of lost(coalesced) ticks. */ - tick_lost += pending - 1; - - getmicrouptime(&t); - /* Last tick duration (usec). */ - tick_last = (t.tv_sec - dn_cfg.prev_t.tv_sec) * 1000000 + - (t.tv_usec - dn_cfg.prev_t.tv_usec); - /* Last tick vs standard tick difference (usec). */ - tick_delta = (tick_last * hz - 1000000) / hz; - /* Accumulated tick difference (usec). */ - tick_delta_sum += tick_delta; - - dn_cfg.prev_t = t; - - /* - * Adjust curr_time if the accumulated tick difference is - * greater than the 'standard' tick. Since curr_time should - * be monotonically increasing, we do positive adjustments - * as required, and throttle curr_time in case of negative - * adjustment. 
- */ - dn_cfg.curr_time++; - if (tick_delta_sum - tick >= 0) { - int diff = tick_delta_sum / tick; - - dn_cfg.curr_time += diff; - tick_diff += diff; - tick_delta_sum %= tick; - tick_adjustment++; - } else if (tick_delta_sum + tick <= 0) { - dn_cfg.curr_time--; - tick_diff--; - tick_delta_sum += tick; - tick_adjustment++; - } - - /* serve pending events, accumulate in q */ - for (;;) { - struct dn_id *p; /* generic parameter to handler */ - - if (dn_cfg.evheap.elements == 0 || - DN_KEY_LT(dn_cfg.curr_time, HEAP_TOP(&dn_cfg.evheap)->key)) - break; - p = HEAP_TOP(&dn_cfg.evheap)->object; - heap_extract(&dn_cfg.evheap, NULL); - - if (p->type == DN_SCH_I) { - serve_sched(&q, (struct dn_sch_inst *)p, dn_cfg.curr_time); - } else { /* extracted a delay line */ - transmit_event(&q, (struct delay_line *)p, dn_cfg.curr_time); - } - } - if (dn_cfg.expire && ++dn_cfg.expire_cycle >= dn_cfg.expire) { - dn_cfg.expire_cycle = 0; - dn_drain_scheduler(); - dn_drain_queue(); - } - - DN_BH_WUNLOCK(); - dn_reschedule(); - if (q.head != NULL) - dummynet_send(q.head); - CURVNET_RESTORE(); -} - -/* - * forward a chain of packets to the proper destination. - * This runs outside the dummynet lock. - */ -static void -dummynet_send(struct mbuf *m) -{ - struct mbuf *n; - - for (; m != NULL; m = n) { - struct ifnet *ifp = NULL; /* gcc 3.4.6 complains */ - struct m_tag *tag; - int dst; - - n = m->m_nextpkt; - m->m_nextpkt = NULL; - tag = m_tag_first(m); - if (tag == NULL) { /* should not happen */ - dst = DIR_DROP; - } else { - struct dn_pkt_tag *pkt = dn_tag_get(m); - /* extract the dummynet info, rename the tag - * to carry reinject info. - */ - dst = pkt->dn_dir; - ifp = pkt->ifp; - tag->m_tag_cookie = MTAG_IPFW_RULE; - tag->m_tag_id = 0; - } - - switch (dst) { - case DIR_OUT: - SET_HOST_IPLEN(mtod(m, struct ip *)); - ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL); - break ; - - case DIR_IN : - /* put header in network format for ip_input() */ - //SET_NET_IPLEN(mtod(m, struct ip *)); - netisr_dispatch(NETISR_IP, m); - break; - -#ifdef INET6 - case DIR_IN | PROTO_IPV6: - netisr_dispatch(NETISR_IPV6, m); - break; - - case DIR_OUT | PROTO_IPV6: - ip6_output(m, NULL, NULL, IPV6_FORWARDING, NULL, NULL, NULL); - break; -#endif - - case DIR_FWD | PROTO_IFB: /* DN_TO_IFB_FWD: */ - if (bridge_dn_p != NULL) - ((*bridge_dn_p)(m, ifp)); - else - printf("dummynet: if_bridge not loaded\n"); - - break; - - case DIR_IN | PROTO_LAYER2: /* DN_TO_ETH_DEMUX: */ - /* - * The Ethernet code assumes the Ethernet header is - * contiguous in the first mbuf header. - * Insure this is true. - */ - if (m->m_len < ETHER_HDR_LEN && - (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) { - printf("dummynet/ether: pullup failed, " - "dropping packet\n"); - break; - } - ether_demux(m->m_pkthdr.rcvif, m); - break; - - case DIR_OUT | PROTO_LAYER2: /* N_TO_ETH_OUT: */ - ether_output_frame(ifp, m); - break; - - case DIR_DROP: - /* drop the packet after some time */ - FREE_PKT(m); - break; - - default: - printf("dummynet: bad switch %d!\n", dst); - FREE_PKT(m); - break; - } - } -} - -static inline int -tag_mbuf(struct mbuf *m, int dir, struct ip_fw_args *fwa) -{ - struct dn_pkt_tag *dt; - struct m_tag *mtag; - - mtag = m_tag_get(PACKET_TAG_DUMMYNET, - sizeof(*dt), M_NOWAIT | M_ZERO); - if (mtag == NULL) - return 1; /* Cannot allocate packet header. */ - m_tag_prepend(m, mtag); /* Attach to mbuf chain. 
*/ - dt = (struct dn_pkt_tag *)(mtag + 1); - dt->rule = fwa->rule; - dt->rule.info &= IPFW_ONEPASS; /* only keep this info */ - dt->dn_dir = dir; - dt->ifp = fwa->oif; - /* dt->output tame is updated as we move through */ - dt->output_time = dn_cfg.curr_time; - return 0; -} - - -/* - * dummynet hook for packets. - * We use the argument to locate the flowset fs and the sched_set sch - * associated to it. The we apply flow_mask and sched_mask to - * determine the queue and scheduler instances. - * - * dir where shall we send the packet after dummynet. - * *m0 the mbuf with the packet - * ifp the 'ifp' parameter from the caller. - * NULL in ip_input, destination interface in ip_output, - */ -int -dummynet_io(struct mbuf **m0, int dir, struct ip_fw_args *fwa) -{ - struct mbuf *m = *m0; - struct dn_fsk *fs = NULL; - struct dn_sch_inst *si; - struct dn_queue *q = NULL; /* default */ - - int fs_id = (fwa->rule.info & IPFW_INFO_MASK) + - ((fwa->rule.info & IPFW_IS_PIPE) ? 2*DN_MAX_ID : 0); - DN_BH_WLOCK(); - io_pkt++; - /* we could actually tag outside the lock, but who cares... */ - if (tag_mbuf(m, dir, fwa)) - goto dropit; - if (dn_cfg.busy) { - /* if the upper half is busy doing something expensive, - * lets queue the packet and move forward - */ - mq_append(&dn_cfg.pending, m); - m = *m0 = NULL; /* consumed */ - goto done; /* already active, nothing to do */ - } - /* XXX locate_flowset could be optimised with a direct ref. */ - fs = dn_ht_find(dn_cfg.fshash, fs_id, 0, NULL); - if (fs == NULL) - goto dropit; /* This queue/pipe does not exist! */ - if (fs->sched == NULL) /* should not happen */ - goto dropit; - /* find scheduler instance, possibly applying sched_mask */ - si = ipdn_si_find(fs->sched, &(fwa->f_id)); - if (si == NULL) - goto dropit; - /* - * If the scheduler supports multiple queues, find the right one - * (otherwise it will be ignored by enqueue). - */ - if (fs->sched->fp->flags & DN_MULTIQUEUE) { - q = ipdn_q_find(fs, si, &(fwa->f_id)); - if (q == NULL) - goto dropit; - } - if (fs->sched->fp->enqueue(si, q, m)) { - /* packet was dropped by enqueue() */ - m = *m0 = NULL; - goto dropit; - } - - if (si->kflags & DN_ACTIVE) { - m = *m0 = NULL; /* consumed */ - goto done; /* already active, nothing to do */ - } - - /* compute the initial allowance */ - if (si->idle_time < dn_cfg.curr_time) { - /* Do this only on the first packet on an idle pipe */ - struct dn_link *p = &fs->sched->link; - - si->sched_time = dn_cfg.curr_time; - si->credit = dn_cfg.io_fast ? p->bandwidth : 0; - if (p->burst) { - uint64_t burst = (dn_cfg.curr_time - si->idle_time) * p->bandwidth; - if (burst > p->burst) - burst = p->burst; - si->credit += burst; - } - } - /* pass through scheduler and delay line */ - m = serve_sched(NULL, si, dn_cfg.curr_time); - - /* optimization -- pass it back to ipfw for immediate send */ - /* XXX Don't call dummynet_send() if scheduler return the packet - * just enqueued. This avoid a lock order reversal. - * - */ - if (/*dn_cfg.io_fast &&*/ m == *m0 && (dir & PROTO_LAYER2) == 0 ) { - /* fast io, rename the tag * to carry reinject info. 
*/ - struct m_tag *tag = m_tag_first(m); - - tag->m_tag_cookie = MTAG_IPFW_RULE; - tag->m_tag_id = 0; - io_pkt_fast++; - if (m->m_nextpkt != NULL) { - printf("dummynet: fast io: pkt chain detected!\n"); - m->m_nextpkt = NULL; - } - m = NULL; - } else { - *m0 = NULL; - } -done: - DN_BH_WUNLOCK(); - if (m) - dummynet_send(m); - return 0; - -dropit: - io_pkt_drop++; - DN_BH_WUNLOCK(); - if (m) - FREE_PKT(m); - *m0 = NULL; - return (fs && (fs->fs.flags & DN_NOERROR)) ? 0 : ENOBUFS; -} diff --git a/sys/netinet/ipfw/ip_dn_private.h b/sys/netinet/ipfw/ip_dn_private.h deleted file mode 100644 index 159ddc9..0000000 --- a/sys/netinet/ipfw/ip_dn_private.h +++ /dev/null @@ -1,403 +0,0 @@ -/*- - * Copyright (c) 2010 Luigi Rizzo, Riccardo Panicucci, Universita` di Pisa - * All rights reserved - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -/* - * internal dummynet APIs. - * - * $FreeBSD$ - */ - -#ifndef _IP_DN_PRIVATE_H -#define _IP_DN_PRIVATE_H - -/* debugging support - * use ND() to remove debugging, D() to print a line, - * DX(level, ...) to print above a certain level - * If you redefine D() you are expected to redefine all. - */ -#ifndef D -#define ND(fmt, ...) do {} while (0) -#define D1(fmt, ...) do {} while (0) -#define D(fmt, ...) printf("%-10s " fmt "\n", \ - __FUNCTION__, ## __VA_ARGS__) -#define DX(lev, fmt, ...) 
do { \ - if (dn_cfg.debug > lev) D(fmt, ## __VA_ARGS__); } while (0) -#endif - -MALLOC_DECLARE(M_DUMMYNET); - -#ifndef __linux__ -#define div64(a, b) ((int64_t)(a) / (int64_t)(b)) -#endif - -#define DN_LOCK_INIT() do { \ - mtx_init(&dn_cfg.uh_mtx, "dn_uh", NULL, MTX_DEF); \ - mtx_init(&dn_cfg.bh_mtx, "dn_bh", NULL, MTX_DEF); \ - } while (0) -#define DN_LOCK_DESTROY() do { \ - mtx_destroy(&dn_cfg.uh_mtx); \ - mtx_destroy(&dn_cfg.bh_mtx); \ - } while (0) -#if 0 /* not used yet */ -#define DN_UH_RLOCK() mtx_lock(&dn_cfg.uh_mtx) -#define DN_UH_RUNLOCK() mtx_unlock(&dn_cfg.uh_mtx) -#define DN_UH_WLOCK() mtx_lock(&dn_cfg.uh_mtx) -#define DN_UH_WUNLOCK() mtx_unlock(&dn_cfg.uh_mtx) -#define DN_UH_LOCK_ASSERT() mtx_assert(&dn_cfg.uh_mtx, MA_OWNED) -#endif - -#define DN_BH_RLOCK() mtx_lock(&dn_cfg.uh_mtx) -#define DN_BH_RUNLOCK() mtx_unlock(&dn_cfg.uh_mtx) -#define DN_BH_WLOCK() mtx_lock(&dn_cfg.uh_mtx) -#define DN_BH_WUNLOCK() mtx_unlock(&dn_cfg.uh_mtx) -#define DN_BH_LOCK_ASSERT() mtx_assert(&dn_cfg.uh_mtx, MA_OWNED) - -SLIST_HEAD(dn_schk_head, dn_schk); -SLIST_HEAD(dn_sch_inst_head, dn_sch_inst); -SLIST_HEAD(dn_fsk_head, dn_fsk); -SLIST_HEAD(dn_queue_head, dn_queue); -SLIST_HEAD(dn_alg_head, dn_alg); - -struct mq { /* a basic queue of packets*/ - struct mbuf *head, *tail; -}; - -static inline void -set_oid(struct dn_id *o, int type, int len) -{ - o->type = type; - o->len = len; - o->subtype = 0; -}; - -/* - * configuration and global data for a dummynet instance - * - * When a configuration is modified from userland, 'id' is incremented - * so we can use the value to check for stale pointers. - */ -struct dn_parms { - uint32_t id; /* configuration version */ - - /* defaults (sysctl-accessible) */ - int red_lookup_depth; - int red_avg_pkt_size; - int red_max_pkt_size; - int hash_size; - int max_hash_size; - long byte_limit; /* max queue sizes */ - long slot_limit; - - int io_fast; - int debug; - - /* timekeeping */ - struct timeval prev_t; /* last time dummynet_tick ran */ - struct dn_heap evheap; /* scheduled events */ - - /* counters of objects -- used for reporting space */ - int schk_count; - int si_count; - int fsk_count; - int queue_count; - - /* ticks and other stuff */ - uint64_t curr_time; - /* flowsets and schedulers are in hash tables, with 'hash_size' - * buckets. fshash is looked up at every packet arrival - * so better be generous if we expect many entries. - */ - struct dn_ht *fshash; - struct dn_ht *schedhash; - /* list of flowsets without a scheduler -- use sch_chain */ - struct dn_fsk_head fsu; /* list of unlinked flowsets */ - struct dn_alg_head schedlist; /* list of algorithms */ - - /* Store the fs/sch to scan when draining. The value is the - * bucket number of the hash table. Expire can be disabled - * with net.inet.ip.dummynet.expire=0, or it happens every - * expire ticks. - **/ - int drain_fs; - int drain_sch; - uint32_t expire; - uint32_t expire_cycle; /* tick count */ - - int init_done; - - /* if the upper half is busy doing something long, - * can set the busy flag and we will enqueue packets in - * a queue for later processing. - */ - int busy; - struct mq pending; - -#ifdef _KERNEL - /* - * This file is normally used in the kernel, unless we do - * some userland tests, in which case we do not need a mtx. - * uh_mtx arbitrates between system calls and also - * protects fshash, schedhash and fsunlinked. - * These structures are readonly for the lower half. 
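struct mq above is the bare head/tail list used both for dn_cfg.pending and for the delay lines; packets are chained through m_nextpkt. A stand-alone sketch of the append step, with struct pkt and next standing in for struct mbuf and m_nextpkt (the real mq_append() lives in ip_dn_io.c and is not part of this hunk):

#include <stddef.h>

struct pkt { struct pkt *next; };
struct mq_sketch { struct pkt *head, *tail; };

static void
mq_append_sketch(struct mq_sketch *q, struct pkt *p)
{
        p->next = NULL;
        if (q->head == NULL)
                q->head = p;            /* first packet becomes the head */
        else
                q->tail->next = p;      /* otherwise link after the old tail */
        q->tail = p;
}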
- * bh_mtx protects all other structures which may be - * modified upon packet arrivals - */ -#if defined( __linux__ ) || defined( _WIN32 ) - spinlock_t uh_mtx; - spinlock_t bh_mtx; -#else - struct mtx uh_mtx; - struct mtx bh_mtx; -#endif - -#endif /* _KERNEL */ -}; - -/* - * Delay line, contains all packets on output from a link. - * Every scheduler instance has one. - */ -struct delay_line { - struct dn_id oid; - struct dn_sch_inst *si; - struct mq mq; -}; - -/* - * The kernel side of a flowset. It is linked in a hash table - * of flowsets, and in a list of children of their parent scheduler. - * qht is either the queue or (if HAVE_MASK) a hash table queues. - * Note that the mask to use is the (flow_mask|sched_mask), which - * changes as we attach/detach schedulers. So we store it here. - * - * XXX If we want to add scheduler-specific parameters, we need to - * put them in external storage because the scheduler may not be - * available when the fsk is created. - */ -struct dn_fsk { /* kernel side of a flowset */ - struct dn_fs fs; - SLIST_ENTRY(dn_fsk) fsk_next; /* hash chain for fshash */ - - struct ipfw_flow_id fsk_mask; - - /* qht is a hash table of queues, or just a single queue - * a bit in fs.flags tells us which one - */ - struct dn_ht *qht; - struct dn_schk *sched; /* Sched we are linked to */ - SLIST_ENTRY(dn_fsk) sch_chain; /* list of fsk attached to sched */ - - /* bucket index used by drain routine to drain queues for this - * flowset - */ - int drain_bucket; - /* Parameter realted to RED / GRED */ - /* original values are in dn_fs*/ - int w_q ; /* queue weight (scaled) */ - int max_th ; /* maximum threshold for queue (scaled) */ - int min_th ; /* minimum threshold for queue (scaled) */ - int max_p ; /* maximum value for p_b (scaled) */ - - u_int c_1 ; /* max_p/(max_th-min_th) (scaled) */ - u_int c_2 ; /* max_p*min_th/(max_th-min_th) (scaled) */ - u_int c_3 ; /* for GRED, (1-max_p)/max_th (scaled) */ - u_int c_4 ; /* for GRED, 1 - 2*max_p (scaled) */ - u_int * w_q_lookup ; /* lookup table for computing (1-w_q)^t */ - u_int lookup_depth ; /* depth of lookup table */ - int lookup_step ; /* granularity inside the lookup table */ - int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */ - int avg_pkt_size ; /* medium packet size */ - int max_pkt_size ; /* max packet size */ -}; - -/* - * A queue is created as a child of a flowset unless it belongs to - * a !MULTIQUEUE scheduler. It is normally in a hash table in the - * flowset. fs always points to the parent flowset. - * si normally points to the sch_inst, unless the flowset has been - * detached from the scheduler -- in this case si == NULL and we - * should not enqueue. - */ -struct dn_queue { - struct dn_flow ni; /* oid, flow_id, stats */ - struct mq mq; /* packets queue */ - struct dn_sch_inst *_si; /* owner scheduler instance */ - SLIST_ENTRY(dn_queue) q_next; /* hash chain list for qht */ - struct dn_fsk *fs; /* parent flowset. */ - - /* RED parameters */ - int avg; /* average queue length est. (scaled) */ - int count; /* arrivals since last RED drop */ - int random; /* random value (scaled) */ - uint64_t q_time; /* start of queue idle time */ - -}; - -/* - * The kernel side of a scheduler. Contains the userland config, - * a link, pointer to extra config arguments from command line, - * kernel flags, and a pointer to the scheduler methods. - * It is stored in a hash table, and holds a list of all - * flowsets and scheduler instances. - * XXX sch must be at the beginning, see schk_hash(). 
- */ -struct dn_schk { - struct dn_sch sch; - struct dn_alg *fp; /* Pointer to scheduler functions */ - struct dn_link link; /* The link, embedded */ - struct dn_profile *profile; /* delay profile, if any */ - struct dn_id *cfg; /* extra config arguments */ - - SLIST_ENTRY(dn_schk) schk_next; /* hash chain for schedhash */ - - struct dn_fsk_head fsk_list; /* all fsk linked to me */ - struct dn_fsk *fs; /* Flowset for !MULTIQUEUE */ - - /* bucket index used by the drain routine to drain the scheduler - * instance for this flowset. - */ - int drain_bucket; - - /* Hash table of all instances (through sch.sched_mask) - * or single instance if no mask. Always valid. - */ - struct dn_ht *siht; -}; - - -/* - * Scheduler instance. - * Contains variables and all queues relative to a this instance. - * This struct is created a runtime. - */ -struct dn_sch_inst { - struct dn_flow ni; /* oid, flowid and stats */ - SLIST_ENTRY(dn_sch_inst) si_next; /* hash chain for siht */ - struct delay_line dline; - struct dn_schk *sched; /* the template */ - int kflags; /* DN_ACTIVE */ - - int64_t credit; /* bits I can transmit (more or less). */ - uint64_t sched_time; /* time link was scheduled in ready_heap */ - uint64_t idle_time; /* start of scheduler instance idle time */ - - /* q_count is the number of queues that this instance is using. - * The counter is incremented or decremented when - * a reference from the queue is created or deleted. - * It is used to make sure that a scheduler instance can be safely - * deleted by the drain routine. See notes below. - */ - int q_count; - -}; - -/* - * NOTE about object drain. - * The system will automatically (XXX check when) drain queues and - * scheduler instances when they are idle. - * A queue is idle when it has no packets; an instance is idle when - * it is not in the evheap heap, and the corresponding delay line is empty. - * A queue can be safely deleted when it is idle because of the scheduler - * function xxx_free_queue() will remove any references to it. - * An instance can be only deleted when no queues reference it. To be sure - * of that, a counter (q_count) stores the number of queues that are pointing - * to the instance. - * - * XXX - * Order of scan: - * - take all flowset in a bucket for the flowset hash table - * - take all queues in a bucket for the flowset - * - increment the queue bucket - * - scan next flowset bucket - * Nothing is done if a bucket contains no entries. - * - * The same schema is used for sceduler instances - */ - - -/* kernel-side flags. Linux has DN_DELETE in fcntl.h - */ -enum { - /* 1 and 2 are reserved for the SCAN flags */ - DN_DESTROY = 0x0004, /* destroy */ - DN_DELETE_FS = 0x0008, /* destroy flowset */ - DN_DETACH = 0x0010, - DN_ACTIVE = 0x0020, /* object is in evheap */ - DN_F_DLINE = 0x0040, /* object is a delay line */ - DN_DEL_SAFE = 0x0080, /* delete a queue only if no longer needed - * by scheduler */ - DN_QHT_IS_Q = 0x0100, /* in flowset, qht is a single queue */ -}; - -extern struct dn_parms dn_cfg; -//VNET_DECLARE(struct dn_parms, _base_dn_cfg); -//#define dn_cfg VNET(_base_dn_cfg) - -int dummynet_io(struct mbuf **, int , struct ip_fw_args *); -void dummynet_task(void *context, int pending); -void dn_reschedule(void); - -struct dn_queue *ipdn_q_find(struct dn_fsk *, struct dn_sch_inst *, - struct ipfw_flow_id *); -struct dn_sch_inst *ipdn_si_find(struct dn_schk *, struct ipfw_flow_id *); - -/* - * copy_range is a template for requests for ranges of pipes/queues/scheds. 
- * The number of ranges is variable and can be derived by o.len. - * As a default, we use a small number of entries so that the struct - * fits easily on the stack and is sufficient for most common requests. - */ -#define DEFAULT_RANGES 5 -struct copy_range { - struct dn_id o; - uint32_t r[ 2 * DEFAULT_RANGES ]; -}; - -struct copy_args { - char **start; - char *end; - int flags; - int type; - struct copy_range *extra; /* extra filtering */ -}; - -struct sockopt; -int ip_dummynet_compat(struct sockopt *sopt); -int dummynet_get(struct sockopt *sopt, void **compat); -int dn_c_copy_q (void *_ni, void *arg); -int dn_c_copy_pipe(struct dn_schk *s, struct copy_args *a, int nq); -int dn_c_copy_fs(struct dn_fsk *f, struct copy_args *a, int nq); -int dn_compat_copy_queue(struct copy_args *a, void *_o); -int dn_compat_copy_pipe(struct copy_args *a, void *_o); -int copy_data_helper_compat(void *_o, void *_arg); -int dn_compat_calc_size(void); -int do_config(void *p, int l); - -/* function to drain idle object */ -void dn_drain_scheduler(void); -void dn_drain_queue(void); - -#endif /* _IP_DN_PRIVATE_H */ diff --git a/sys/netinet/ipfw/ip_dummynet.c b/sys/netinet/ipfw/ip_dummynet.c deleted file mode 100644 index e1c7a08..0000000 --- a/sys/netinet/ipfw/ip_dummynet.c +++ /dev/null @@ -1,2314 +0,0 @@ -/*- - * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa - * Portions Copyright (c) 2000 Akamba Corp. - * All rights reserved - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - -/* - * Configuration and internal object management for dummynet. 
- */ - -#include "opt_inet6.h" - -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/malloc.h> -#include <sys/mbuf.h> -#include <sys/kernel.h> -#include <sys/lock.h> -#include <sys/module.h> -#include <sys/priv.h> -#include <sys/proc.h> -#include <sys/rwlock.h> -#include <sys/socket.h> -#include <sys/socketvar.h> -#include <sys/time.h> -#include <sys/taskqueue.h> -#include <net/if.h> /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */ -#include <netinet/in.h> -#include <netinet/ip_var.h> /* ip_output(), IP_FORWARDING */ -#include <netinet/ip_fw.h> -#include <netinet/ipfw/ip_fw_private.h> -#include <netinet/ipfw/dn_heap.h> -#include <netinet/ip_dummynet.h> -#include <netinet/ipfw/ip_dn_private.h> -#include <netinet/ipfw/dn_sched.h> - -/* which objects to copy */ -#define DN_C_LINK 0x01 -#define DN_C_SCH 0x02 -#define DN_C_FLOW 0x04 -#define DN_C_FS 0x08 -#define DN_C_QUEUE 0x10 - -/* we use this argument in case of a schk_new */ -struct schk_new_arg { - struct dn_alg *fp; - struct dn_sch *sch; -}; - -/*---- callout hooks. ----*/ -static struct callout dn_timeout; -static struct task dn_task; -static struct taskqueue *dn_tq = NULL; - -static void -dummynet(void *arg) -{ - - (void)arg; /* UNUSED */ - taskqueue_enqueue(dn_tq, &dn_task); -} - -void -dn_reschedule(void) -{ - callout_reset(&dn_timeout, 1, dummynet, NULL); -} -/*----- end of callout hooks -----*/ - -/* Return a scheduler descriptor given the type or name. */ -static struct dn_alg * -find_sched_type(int type, char *name) -{ - struct dn_alg *d; - - SLIST_FOREACH(d, &dn_cfg.schedlist, next) { - if (d->type == type || (name && !strcasecmp(d->name, name))) - return d; - } - return NULL; /* not found */ -} - -int -ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg) -{ - int oldv = *v; - const char *op = NULL; - if (dflt < lo) - dflt = lo; - if (dflt > hi) - dflt = hi; - if (oldv < lo) { - *v = dflt; - op = "Bump"; - } else if (oldv > hi) { - *v = hi; - op = "Clamp"; - } else - return *v; - if (op && msg) - printf("%s %s to %d (was %d)\n", op, msg, *v, oldv); - return *v; -} - -/*---- flow_id mask, hash and compare functions ---*/ -/* - * The flow_id includes the 5-tuple, the queue/pipe number - * which we store in the extra area in host order, - * and for ipv6 also the flow_id6. 
- * XXX see if we want the tos byte (can store in 'flags') - */ -static struct ipfw_flow_id * -flow_id_mask(struct ipfw_flow_id *mask, struct ipfw_flow_id *id) -{ - int is_v6 = IS_IP6_FLOW_ID(id); - - id->dst_port &= mask->dst_port; - id->src_port &= mask->src_port; - id->proto &= mask->proto; - id->extra &= mask->extra; - if (is_v6) { - APPLY_MASK(&id->dst_ip6, &mask->dst_ip6); - APPLY_MASK(&id->src_ip6, &mask->src_ip6); - id->flow_id6 &= mask->flow_id6; - } else { - id->dst_ip &= mask->dst_ip; - id->src_ip &= mask->src_ip; - } - return id; -} - -/* computes an OR of two masks, result in dst and also returned */ -static struct ipfw_flow_id * -flow_id_or(struct ipfw_flow_id *src, struct ipfw_flow_id *dst) -{ - int is_v6 = IS_IP6_FLOW_ID(dst); - - dst->dst_port |= src->dst_port; - dst->src_port |= src->src_port; - dst->proto |= src->proto; - dst->extra |= src->extra; - if (is_v6) { -#define OR_MASK(_d, _s) \ - (_d)->__u6_addr.__u6_addr32[0] |= (_s)->__u6_addr.__u6_addr32[0]; \ - (_d)->__u6_addr.__u6_addr32[1] |= (_s)->__u6_addr.__u6_addr32[1]; \ - (_d)->__u6_addr.__u6_addr32[2] |= (_s)->__u6_addr.__u6_addr32[2]; \ - (_d)->__u6_addr.__u6_addr32[3] |= (_s)->__u6_addr.__u6_addr32[3]; - OR_MASK(&dst->dst_ip6, &src->dst_ip6); - OR_MASK(&dst->src_ip6, &src->src_ip6); -#undef OR_MASK - dst->flow_id6 |= src->flow_id6; - } else { - dst->dst_ip |= src->dst_ip; - dst->src_ip |= src->src_ip; - } - return dst; -} - -static int -nonzero_mask(struct ipfw_flow_id *m) -{ - if (m->dst_port || m->src_port || m->proto || m->extra) - return 1; - if (IS_IP6_FLOW_ID(m)) { - return - m->dst_ip6.__u6_addr.__u6_addr32[0] || - m->dst_ip6.__u6_addr.__u6_addr32[1] || - m->dst_ip6.__u6_addr.__u6_addr32[2] || - m->dst_ip6.__u6_addr.__u6_addr32[3] || - m->src_ip6.__u6_addr.__u6_addr32[0] || - m->src_ip6.__u6_addr.__u6_addr32[1] || - m->src_ip6.__u6_addr.__u6_addr32[2] || - m->src_ip6.__u6_addr.__u6_addr32[3] || - m->flow_id6; - } else { - return m->dst_ip || m->src_ip; - } -} - -/* XXX we may want a better hash function */ -static uint32_t -flow_id_hash(struct ipfw_flow_id *id) -{ - uint32_t i; - - if (IS_IP6_FLOW_ID(id)) { - uint32_t *d = (uint32_t *)&id->dst_ip6; - uint32_t *s = (uint32_t *)&id->src_ip6; - i = (d[0] ) ^ (d[1]) ^ - (d[2] ) ^ (d[3]) ^ - (d[0] >> 15) ^ (d[1] >> 15) ^ - (d[2] >> 15) ^ (d[3] >> 15) ^ - (s[0] << 1) ^ (s[1] << 1) ^ - (s[2] << 1) ^ (s[3] << 1) ^ - (s[0] << 16) ^ (s[1] << 16) ^ - (s[2] << 16) ^ (s[3] << 16) ^ - (id->dst_port << 1) ^ (id->src_port) ^ - (id->extra) ^ - (id->proto ) ^ (id->flow_id6); - } else { - i = (id->dst_ip) ^ (id->dst_ip >> 15) ^ - (id->src_ip << 1) ^ (id->src_ip >> 16) ^ - (id->extra) ^ - (id->dst_port << 1) ^ (id->src_port) ^ (id->proto); - } - return i; -} - -/* Like bcmp, returns 0 if ids match, 1 otherwise. */ -static int -flow_id_cmp(struct ipfw_flow_id *id1, struct ipfw_flow_id *id2) -{ - int is_v6 = IS_IP6_FLOW_ID(id1); - - if (!is_v6) { - if (IS_IP6_FLOW_ID(id2)) - return 1; /* different address families */ - - return (id1->dst_ip == id2->dst_ip && - id1->src_ip == id2->src_ip && - id1->dst_port == id2->dst_port && - id1->src_port == id2->src_port && - id1->proto == id2->proto && - id1->extra == id2->extra) ? 0 : 1; - } - /* the ipv6 case */ - return ( - !bcmp(&id1->dst_ip6,&id2->dst_ip6, sizeof(id1->dst_ip6)) && - !bcmp(&id1->src_ip6,&id2->src_ip6, sizeof(id1->src_ip6)) && - id1->dst_port == id2->dst_port && - id1->src_port == id2->src_port && - id1->proto == id2->proto && - id1->extra == id2->extra && - id1->flow_id6 == id2->flow_id6) ? 
0 : 1; -} -/*--------- end of flow-id mask, hash and compare ---------*/ - -/*--- support functions for the qht hashtable ---- - * Entries are hashed by flow-id - */ -static uint32_t -q_hash(uintptr_t key, int flags, void *arg) -{ - /* compute the hash slot from the flow id */ - struct ipfw_flow_id *id = (flags & DNHT_KEY_IS_OBJ) ? - &((struct dn_queue *)key)->ni.fid : - (struct ipfw_flow_id *)key; - - return flow_id_hash(id); -} - -static int -q_match(void *obj, uintptr_t key, int flags, void *arg) -{ - struct dn_queue *o = (struct dn_queue *)obj; - struct ipfw_flow_id *id2; - - if (flags & DNHT_KEY_IS_OBJ) { - /* compare pointers */ - id2 = &((struct dn_queue *)key)->ni.fid; - } else { - id2 = (struct ipfw_flow_id *)key; - } - return (0 == flow_id_cmp(&o->ni.fid, id2)); -} - -/* - * create a new queue instance for the given 'key'. - */ -static void * -q_new(uintptr_t key, int flags, void *arg) -{ - struct dn_queue *q, *template = arg; - struct dn_fsk *fs = template->fs; - int size = sizeof(*q) + fs->sched->fp->q_datalen; - - q = malloc(size, M_DUMMYNET, M_NOWAIT | M_ZERO); - if (q == NULL) { - D("no memory for new queue"); - return NULL; - } - - set_oid(&q->ni.oid, DN_QUEUE, size); - if (fs->fs.flags & DN_QHT_HASH) - q->ni.fid = *(struct ipfw_flow_id *)key; - q->fs = fs; - q->_si = template->_si; - q->_si->q_count++; - - if (fs->sched->fp->new_queue) - fs->sched->fp->new_queue(q); - dn_cfg.queue_count++; - return q; -} - -/* - * Notify schedulers that a queue is going away. - * If (flags & DN_DESTROY), also free the packets. - * The version for callbacks is called q_delete_cb(). - */ -static void -dn_delete_queue(struct dn_queue *q, int flags) -{ - struct dn_fsk *fs = q->fs; - - // D("fs %p si %p\n", fs, q->_si); - /* notify the parent scheduler that the queue is going away */ - if (fs && fs->sched->fp->free_queue) - fs->sched->fp->free_queue(q); - q->_si->q_count--; - q->_si = NULL; - if (flags & DN_DESTROY) { - if (q->mq.head) - dn_free_pkts(q->mq.head); - bzero(q, sizeof(*q)); // safety - free(q, M_DUMMYNET); - dn_cfg.queue_count--; - } -} - -static int -q_delete_cb(void *q, void *arg) -{ - int flags = (int)(uintptr_t)arg; - dn_delete_queue(q, flags); - return (flags & DN_DESTROY) ? DNHT_SCAN_DEL : 0; -} - -/* - * calls dn_delete_queue/q_delete_cb on all queues, - * which notifies the parent scheduler and possibly drains packets. - * flags & DN_DESTROY: drains queues and destroy qht; - */ -static void -qht_delete(struct dn_fsk *fs, int flags) -{ - ND("fs %d start flags %d qht %p", - fs->fs.fs_nr, flags, fs->qht); - if (!fs->qht) - return; - if (fs->fs.flags & DN_QHT_HASH) { - dn_ht_scan(fs->qht, q_delete_cb, (void *)(uintptr_t)flags); - if (flags & DN_DESTROY) { - dn_ht_free(fs->qht, 0); - fs->qht = NULL; - } - } else { - dn_delete_queue((struct dn_queue *)(fs->qht), flags); - if (flags & DN_DESTROY) - fs->qht = NULL; - } -} - -/* - * Find and possibly create the queue for a MULTIQUEUE scheduler. - * We never call it for !MULTIQUEUE (the queue is in the sch_inst). 
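flow_id_hash() above mixes the 5-tuple plus the 'extra' word into a single value; q_hash() here and si_hash() below reuse it, and the hash tables then reduce it modulo their bucket count. The IPv4 branch extracted into a stand-alone form; struct flow4 is a local stand-in for just the ipfw_flow_id fields involved:

#include <stdint.h>
#include <stdio.h>

struct flow4 {
        uint32_t dst_ip, src_ip, extra;
        uint16_t dst_port, src_port;
        uint8_t  proto;
};

static uint32_t
flow4_hash(const struct flow4 *id)
{
        return (id->dst_ip) ^ (id->dst_ip >> 15) ^
               (id->src_ip << 1) ^ (id->src_ip >> 16) ^
               (id->extra) ^
               (id->dst_port << 1) ^ (id->src_port) ^ (id->proto);
}

int
main(void)
{
        struct flow4 f = { 0x0a000001, 0x0a000002, 0, 80, 12345, 6 };

        /* the caller reduces this modulo the bucket count */
        printf("hash %#x\n", flow4_hash(&f));
        return 0;
}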
- */ -struct dn_queue * -ipdn_q_find(struct dn_fsk *fs, struct dn_sch_inst *si, - struct ipfw_flow_id *id) -{ - struct dn_queue template; - - template._si = si; - template.fs = fs; - - if (fs->fs.flags & DN_QHT_HASH) { - struct ipfw_flow_id masked_id; - if (fs->qht == NULL) { - fs->qht = dn_ht_init(NULL, fs->fs.buckets, - offsetof(struct dn_queue, q_next), - q_hash, q_match, q_new); - if (fs->qht == NULL) - return NULL; - } - masked_id = *id; - flow_id_mask(&fs->fsk_mask, &masked_id); - return dn_ht_find(fs->qht, (uintptr_t)&masked_id, - DNHT_INSERT, &template); - } else { - if (fs->qht == NULL) - fs->qht = q_new(0, 0, &template); - return (struct dn_queue *)fs->qht; - } -} -/*--- end of queue hash table ---*/ - -/*--- support functions for the sch_inst hashtable ---- - * - * These are hashed by flow-id - */ -static uint32_t -si_hash(uintptr_t key, int flags, void *arg) -{ - /* compute the hash slot from the flow id */ - struct ipfw_flow_id *id = (flags & DNHT_KEY_IS_OBJ) ? - &((struct dn_sch_inst *)key)->ni.fid : - (struct ipfw_flow_id *)key; - - return flow_id_hash(id); -} - -static int -si_match(void *obj, uintptr_t key, int flags, void *arg) -{ - struct dn_sch_inst *o = obj; - struct ipfw_flow_id *id2; - - id2 = (flags & DNHT_KEY_IS_OBJ) ? - &((struct dn_sch_inst *)key)->ni.fid : - (struct ipfw_flow_id *)key; - return flow_id_cmp(&o->ni.fid, id2) == 0; -} - -/* - * create a new instance for the given 'key' - * Allocate memory for instance, delay line and scheduler private data. - */ -static void * -si_new(uintptr_t key, int flags, void *arg) -{ - struct dn_schk *s = arg; - struct dn_sch_inst *si; - int l = sizeof(*si) + s->fp->si_datalen; - - si = malloc(l, M_DUMMYNET, M_NOWAIT | M_ZERO); - if (si == NULL) - goto error; - - /* Set length only for the part passed up to userland. */ - set_oid(&si->ni.oid, DN_SCH_I, sizeof(struct dn_flow)); - set_oid(&(si->dline.oid), DN_DELAY_LINE, - sizeof(struct delay_line)); - /* mark si and dline as outside the event queue */ - si->ni.oid.id = si->dline.oid.id = -1; - - si->sched = s; - si->dline.si = si; - - if (s->fp->new_sched && s->fp->new_sched(si)) { - D("new_sched error"); - goto error; - } - if (s->sch.flags & DN_HAVE_MASK) - si->ni.fid = *(struct ipfw_flow_id *)key; - - dn_cfg.si_count++; - return si; - -error: - if (si) { - bzero(si, sizeof(*si)); // safety - free(si, M_DUMMYNET); - } - return NULL; -} - -/* - * Callback from siht to delete all scheduler instances. Remove - * si and delay line from the system heap, destroy all queues. - * We assume that all flowset have been notified and do not - * point to us anymore. - */ -static int -si_destroy(void *_si, void *arg) -{ - struct dn_sch_inst *si = _si; - struct dn_schk *s = si->sched; - struct delay_line *dl = &si->dline; - - if (dl->oid.subtype) /* remove delay line from event heap */ - heap_extract(&dn_cfg.evheap, dl); - dn_free_pkts(dl->mq.head); /* drain delay line */ - if (si->kflags & DN_ACTIVE) /* remove si from event heap */ - heap_extract(&dn_cfg.evheap, si); - if (s->fp->free_sched) - s->fp->free_sched(si); - bzero(si, sizeof(*si)); /* safety */ - free(si, M_DUMMYNET); - dn_cfg.si_count--; - return DNHT_SCAN_DEL; -} - -/* - * Find the scheduler instance for this packet. If we need to apply - * a mask, do on a local copy of the flow_id to preserve the original. - * Assume siht is always initialized if we have a mask. 
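q_new() and si_new() above size their allocations as the public struct plus the scheduler's q_datalen or si_datalen, so per-scheduler private state sits in a trailer right behind the object and is freed together with it. A generic userland sketch of the idiom; the names are illustrative:

#include <stdlib.h>

struct obj_hdr {
        int len;                        /* total object length */
        /* ... public fields ... */
};

static struct obj_hdr *
obj_alloc(size_t priv_len)
{
        struct obj_hdr *o = calloc(1, sizeof(*o) + priv_len);

        if (o != NULL)
                o->len = (int)(sizeof(*o) + priv_len);
        return o;       /* private data starts at (void *)(o + 1) */
}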
- */ -struct dn_sch_inst * -ipdn_si_find(struct dn_schk *s, struct ipfw_flow_id *id) -{ - - if (s->sch.flags & DN_HAVE_MASK) { - struct ipfw_flow_id id_t = *id; - flow_id_mask(&s->sch.sched_mask, &id_t); - return dn_ht_find(s->siht, (uintptr_t)&id_t, - DNHT_INSERT, s); - } - if (!s->siht) - s->siht = si_new(0, 0, s); - return (struct dn_sch_inst *)s->siht; -} - -/* callback to flush credit for the scheduler instance */ -static int -si_reset_credit(void *_si, void *arg) -{ - struct dn_sch_inst *si = _si; - struct dn_link *p = &si->sched->link; - - si->credit = p->burst + (dn_cfg.io_fast ? p->bandwidth : 0); - return 0; -} - -static void -schk_reset_credit(struct dn_schk *s) -{ - if (s->sch.flags & DN_HAVE_MASK) - dn_ht_scan(s->siht, si_reset_credit, NULL); - else if (s->siht) - si_reset_credit(s->siht, NULL); -} -/*---- end of sch_inst hashtable ---------------------*/ - -/*------------------------------------------------------- - * flowset hash (fshash) support. Entries are hashed by fs_nr. - * New allocations are put in the fsunlinked list, from which - * they are removed when they point to a specific scheduler. - */ -static uint32_t -fsk_hash(uintptr_t key, int flags, void *arg) -{ - uint32_t i = !(flags & DNHT_KEY_IS_OBJ) ? key : - ((struct dn_fsk *)key)->fs.fs_nr; - - return ( (i>>8)^(i>>4)^i ); -} - -static int -fsk_match(void *obj, uintptr_t key, int flags, void *arg) -{ - struct dn_fsk *fs = obj; - int i = !(flags & DNHT_KEY_IS_OBJ) ? key : - ((struct dn_fsk *)key)->fs.fs_nr; - - return (fs->fs.fs_nr == i); -} - -static void * -fsk_new(uintptr_t key, int flags, void *arg) -{ - struct dn_fsk *fs; - - fs = malloc(sizeof(*fs), M_DUMMYNET, M_NOWAIT | M_ZERO); - if (fs) { - set_oid(&fs->fs.oid, DN_FS, sizeof(fs->fs)); - dn_cfg.fsk_count++; - fs->drain_bucket = 0; - SLIST_INSERT_HEAD(&dn_cfg.fsu, fs, sch_chain); - } - return fs; -} - -/* - * detach flowset from its current scheduler. Flags as follows: - * DN_DETACH removes from the fsk_list - * DN_DESTROY deletes individual queues - * DN_DELETE_FS destroys the flowset (otherwise goes in unlinked). - */ -static void -fsk_detach(struct dn_fsk *fs, int flags) -{ - if (flags & DN_DELETE_FS) - flags |= DN_DESTROY; - ND("fs %d from sched %d flags %s %s %s", - fs->fs.fs_nr, fs->fs.sched_nr, - (flags & DN_DELETE_FS) ? "DEL_FS":"", - (flags & DN_DESTROY) ? "DEL":"", - (flags & DN_DETACH) ? "DET":""); - if (flags & DN_DETACH) { /* detach from the list */ - struct dn_fsk_head *h; - h = fs->sched ? &fs->sched->fsk_list : &dn_cfg.fsu; - SLIST_REMOVE(h, fs, dn_fsk, sch_chain); - } - /* Free the RED parameters, they will be recomputed on - * subsequent attach if needed. - */ - if (fs->w_q_lookup) - free(fs->w_q_lookup, M_DUMMYNET); - fs->w_q_lookup = NULL; - qht_delete(fs, flags); - if (fs->sched && fs->sched->fp->free_fsk) - fs->sched->fp->free_fsk(fs); - fs->sched = NULL; - if (flags & DN_DELETE_FS) { - bzero(fs, sizeof(fs)); /* safety */ - free(fs, M_DUMMYNET); - dn_cfg.fsk_count--; - } else { - SLIST_INSERT_HEAD(&dn_cfg.fsu, fs, sch_chain); - } -} - -/* - * Detach or destroy all flowsets in a list. 
- * flags specifies what to do: - * DN_DESTROY: flush all queues - * DN_DELETE_FS: DN_DESTROY + destroy flowset - * DN_DELETE_FS implies DN_DESTROY - */ -static void -fsk_detach_list(struct dn_fsk_head *h, int flags) -{ - struct dn_fsk *fs; - int n = 0; /* only for stats */ - - ND("head %p flags %x", h, flags); - while ((fs = SLIST_FIRST(h))) { - SLIST_REMOVE_HEAD(h, sch_chain); - n++; - fsk_detach(fs, flags); - } - ND("done %d flowsets", n); -} - -/* - * called on 'queue X delete' -- removes the flowset from fshash, - * deletes all queues for the flowset, and removes the flowset. - */ -static int -delete_fs(int i, int locked) -{ - struct dn_fsk *fs; - int err = 0; - - if (!locked) - DN_BH_WLOCK(); - fs = dn_ht_find(dn_cfg.fshash, i, DNHT_REMOVE, NULL); - ND("fs %d found %p", i, fs); - if (fs) { - fsk_detach(fs, DN_DETACH | DN_DELETE_FS); - err = 0; - } else - err = EINVAL; - if (!locked) - DN_BH_WUNLOCK(); - return err; -} - -/*----- end of flowset hashtable support -------------*/ - -/*------------------------------------------------------------ - * Scheduler hash. When searching by index we pass sched_nr, - * otherwise we pass struct dn_sch * which is the first field in - * struct dn_schk so we can cast between the two. We use this trick - * because in the create phase (but it should be fixed). - */ -static uint32_t -schk_hash(uintptr_t key, int flags, void *_arg) -{ - uint32_t i = !(flags & DNHT_KEY_IS_OBJ) ? key : - ((struct dn_schk *)key)->sch.sched_nr; - return ( (i>>8)^(i>>4)^i ); -} - -static int -schk_match(void *obj, uintptr_t key, int flags, void *_arg) -{ - struct dn_schk *s = (struct dn_schk *)obj; - int i = !(flags & DNHT_KEY_IS_OBJ) ? key : - ((struct dn_schk *)key)->sch.sched_nr; - return (s->sch.sched_nr == i); -} - -/* - * Create the entry and intialize with the sched hash if needed. - * Leave s->fp unset so we can tell whether a dn_ht_find() returns - * a new object or a previously existing one. - */ -static void * -schk_new(uintptr_t key, int flags, void *arg) -{ - struct schk_new_arg *a = arg; - struct dn_schk *s; - int l = sizeof(*s) +a->fp->schk_datalen; - - s = malloc(l, M_DUMMYNET, M_NOWAIT | M_ZERO); - if (s == NULL) - return NULL; - set_oid(&s->link.oid, DN_LINK, sizeof(s->link)); - s->sch = *a->sch; // copy initial values - s->link.link_nr = s->sch.sched_nr; - SLIST_INIT(&s->fsk_list); - /* initialize the hash table or create the single instance */ - s->fp = a->fp; /* si_new needs this */ - s->drain_bucket = 0; - if (s->sch.flags & DN_HAVE_MASK) { - s->siht = dn_ht_init(NULL, s->sch.buckets, - offsetof(struct dn_sch_inst, si_next), - si_hash, si_match, si_new); - if (s->siht == NULL) { - free(s, M_DUMMYNET); - return NULL; - } - } - s->fp = NULL; /* mark as a new scheduler */ - dn_cfg.schk_count++; - return s; -} - -/* - * Callback for sched delete. Notify all attached flowsets to - * detach from the scheduler, destroy the internal flowset, and - * all instances. The scheduler goes away too. - * arg is 0 (only detach flowsets and destroy instances) - * DN_DESTROY (detach & delete queues, delete schk) - * or DN_DELETE_FS (delete queues and flowsets, delete schk) - */ -static int -schk_delete_cb(void *obj, void *arg) -{ - struct dn_schk *s = obj; -#if 0 - int a = (int)arg; - ND("sched %d arg %s%s", - s->sch.sched_nr, - a&DN_DESTROY ? "DEL ":"", - a&DN_DELETE_FS ? "DEL_FS":""); -#endif - fsk_detach_list(&s->fsk_list, arg ? 
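schk_hash() and schk_match() below accept either a scheduler number or a struct dn_sch pointer and cast the latter straight to struct dn_schk; that is only valid because sch is the first member of dn_schk, which is what the "sch must be at the beginning" note in ip_dn_private.h is about. A tiny illustration of the C guarantee involved, with stand-in types:

#include <assert.h>

struct inner { int nr; };
struct outer { struct inner in; int other; };   /* 'in' must stay first */

int
main(void)
{
        struct outer o = { { 7 }, 0 };
        struct inner *key = &o.in;                /* what the caller passes */
        struct outer *back = (struct outer *)key; /* what the hash code does */

        assert(back == &o && back->in.nr == 7);
        return 0;
}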
DN_DESTROY : 0); - /* no more flowset pointing to us now */ - if (s->sch.flags & DN_HAVE_MASK) { - dn_ht_scan(s->siht, si_destroy, NULL); - dn_ht_free(s->siht, 0); - } else if (s->siht) - si_destroy(s->siht, NULL); - if (s->profile) { - free(s->profile, M_DUMMYNET); - s->profile = NULL; - } - s->siht = NULL; - if (s->fp->destroy) - s->fp->destroy(s); - bzero(s, sizeof(*s)); // safety - free(obj, M_DUMMYNET); - dn_cfg.schk_count--; - return DNHT_SCAN_DEL; -} - -/* - * called on a 'sched X delete' command. Deletes a single scheduler. - * This is done by removing from the schedhash, unlinking all - * flowsets and deleting their traffic. - */ -static int -delete_schk(int i) -{ - struct dn_schk *s; - - s = dn_ht_find(dn_cfg.schedhash, i, DNHT_REMOVE, NULL); - ND("%d %p", i, s); - if (!s) - return EINVAL; - delete_fs(i + DN_MAX_ID, 1); /* first delete internal fs */ - /* then detach flowsets, delete traffic */ - schk_delete_cb(s, (void*)(uintptr_t)DN_DESTROY); - return 0; -} -/*--- end of schk hashtable support ---*/ - -static int -copy_obj(char **start, char *end, void *_o, const char *msg, int i) -{ - struct dn_id *o = _o; - int have = end - *start; - - if (have < o->len || o->len == 0 || o->type == 0) { - D("(WARN) type %d %s %d have %d need %d", - o->type, msg, i, have, o->len); - return 1; - } - ND("type %d %s %d len %d", o->type, msg, i, o->len); - bcopy(_o, *start, o->len); - if (o->type == DN_LINK) { - /* Adjust burst parameter for link */ - struct dn_link *l = (struct dn_link *)*start; - l->burst = div64(l->burst, 8 * hz); - l->delay = l->delay * 1000 / hz; - } else if (o->type == DN_SCH) { - /* Set id->id to the number of instances */ - struct dn_schk *s = _o; - struct dn_id *id = (struct dn_id *)(*start); - id->id = (s->sch.flags & DN_HAVE_MASK) ? - dn_ht_entries(s->siht) : (s->siht ? 1 : 0); - } - *start += o->len; - return 0; -} - -/* Specific function to copy a queue. - * Copies only the user-visible part of a queue (which is in - * a struct dn_flow), and sets len accordingly. - */ -static int -copy_obj_q(char **start, char *end, void *_o, const char *msg, int i) -{ - struct dn_id *o = _o; - int have = end - *start; - int len = sizeof(struct dn_flow); /* see above comment */ - - if (have < len || o->len == 0 || o->type != DN_QUEUE) { - D("ERROR type %d %s %d have %d need %d", - o->type, msg, i, have, len); - return 1; - } - ND("type %d %s %d len %d", o->type, msg, i, len); - bcopy(_o, *start, len); - ((struct dn_id*)(*start))->len = len; - *start += len; - return 0; -} - -static int -copy_q_cb(void *obj, void *arg) -{ - struct dn_queue *q = obj; - struct copy_args *a = arg; - struct dn_flow *ni = (struct dn_flow *)(*a->start); - if (copy_obj_q(a->start, a->end, &q->ni, "queue", -1)) - return DNHT_SCAN_END; - ni->oid.type = DN_FLOW; /* override the DN_QUEUE */ - ni->oid.id = si_hash((uintptr_t)&ni->fid, 0, NULL); - return 0; -} - -static int -copy_q(struct copy_args *a, struct dn_fsk *fs, int flags) -{ - if (!fs->qht) - return 0; - if (fs->fs.flags & DN_QHT_HASH) - dn_ht_scan(fs->qht, copy_q_cb, a); - else - copy_q_cb(fs->qht, a); - return 0; -} - -/* - * This routine only copies the initial part of a profile ? 
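copy_obj() above serializes objects into a caller-supplied linear buffer by checking the remaining room, copying, and advancing a write pointer that is shared across all the copy callbacks. A reduced sketch of that pattern; the type and function names are illustrative:

#include <string.h>

struct obj { int len; /* ... payload follows ... */ };

/* returns 0 on success, 1 when the buffer is too small, as copy_obj() does */
static int
emit_obj(char **start, char *end, const struct obj *o)
{
        if (end - *start < o->len)
                return 1;
        memcpy(*start, o, o->len);
        *start += o->len;               /* next object goes right behind */
        return 0;
}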
XXX - */ -static int -copy_profile(struct copy_args *a, struct dn_profile *p) -{ - int have = a->end - *a->start; - /* XXX here we check for max length */ - int profile_len = sizeof(struct dn_profile) - - ED_MAX_SAMPLES_NO*sizeof(int); - - if (p == NULL) - return 0; - if (have < profile_len) { - D("error have %d need %d", have, profile_len); - return 1; - } - bcopy(p, *a->start, profile_len); - ((struct dn_id *)(*a->start))->len = profile_len; - *a->start += profile_len; - return 0; -} - -static int -copy_flowset(struct copy_args *a, struct dn_fsk *fs, int flags) -{ - struct dn_fs *ufs = (struct dn_fs *)(*a->start); - if (!fs) - return 0; - ND("flowset %d", fs->fs.fs_nr); - if (copy_obj(a->start, a->end, &fs->fs, "flowset", fs->fs.fs_nr)) - return DNHT_SCAN_END; - ufs->oid.id = (fs->fs.flags & DN_QHT_HASH) ? - dn_ht_entries(fs->qht) : (fs->qht ? 1 : 0); - if (flags) { /* copy queues */ - copy_q(a, fs, 0); - } - return 0; -} - -static int -copy_si_cb(void *obj, void *arg) -{ - struct dn_sch_inst *si = obj; - struct copy_args *a = arg; - struct dn_flow *ni = (struct dn_flow *)(*a->start); - if (copy_obj(a->start, a->end, &si->ni, "inst", - si->sched->sch.sched_nr)) - return DNHT_SCAN_END; - ni->oid.type = DN_FLOW; /* override the DN_SCH_I */ - ni->oid.id = si_hash((uintptr_t)si, DNHT_KEY_IS_OBJ, NULL); - return 0; -} - -static int -copy_si(struct copy_args *a, struct dn_schk *s, int flags) -{ - if (s->sch.flags & DN_HAVE_MASK) - dn_ht_scan(s->siht, copy_si_cb, a); - else if (s->siht) - copy_si_cb(s->siht, a); - return 0; -} - -/* - * compute a list of children of a scheduler and copy up - */ -static int -copy_fsk_list(struct copy_args *a, struct dn_schk *s, int flags) -{ - struct dn_fsk *fs; - struct dn_id *o; - uint32_t *p; - - int n = 0, space = sizeof(*o); - SLIST_FOREACH(fs, &s->fsk_list, sch_chain) { - if (fs->fs.fs_nr < DN_MAX_ID) - n++; - } - space += n * sizeof(uint32_t); - DX(3, "sched %d has %d flowsets", s->sch.sched_nr, n); - if (a->end - *(a->start) < space) - return DNHT_SCAN_END; - o = (struct dn_id *)(*(a->start)); - o->len = space; - *a->start += o->len; - o->type = DN_TEXT; - p = (uint32_t *)(o+1); - SLIST_FOREACH(fs, &s->fsk_list, sch_chain) - if (fs->fs.fs_nr < DN_MAX_ID) - *p++ = fs->fs.fs_nr; - return 0; -} - -static int -copy_data_helper(void *_o, void *_arg) -{ - struct copy_args *a = _arg; - uint32_t *r = a->extra->r; /* start of first range */ - uint32_t *lim; /* first invalid pointer */ - int n; - - lim = (uint32_t *)((char *)(a->extra) + a->extra->o.len); - - if (a->type == DN_LINK || a->type == DN_SCH) { - /* pipe|sched show, we receive a dn_schk */ - struct dn_schk *s = _o; - - n = s->sch.sched_nr; - if (a->type == DN_SCH && n >= DN_MAX_ID) - return 0; /* not a scheduler */ - if (a->type == DN_LINK && n <= DN_MAX_ID) - return 0; /* not a pipe */ - - /* see if the object is within one of our ranges */ - for (;r < lim; r += 2) { - if (n < r[0] || n > r[1]) - continue; - /* Found a valid entry, copy and we are done */ - if (a->flags & DN_C_LINK) { - if (copy_obj(a->start, a->end, - &s->link, "link", n)) - return DNHT_SCAN_END; - if (copy_profile(a, s->profile)) - return DNHT_SCAN_END; - if (copy_flowset(a, s->fs, 0)) - return DNHT_SCAN_END; - } - if (a->flags & DN_C_SCH) { - if (copy_obj(a->start, a->end, - &s->sch, "sched", n)) - return DNHT_SCAN_END; - /* list all attached flowsets */ - if (copy_fsk_list(a, s, 0)) - return DNHT_SCAN_END; - } - if (a->flags & DN_C_FLOW) - copy_si(a, s, 0); - break; - } - } else if (a->type == DN_FS) { - /* queue show, skip 
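copy_data_helper() below walks the (lo, hi) pairs that follow the request header and copies an object only when its number falls inside one of them. The test reduced to a stand-alone helper; in_ranges() is an illustrative name:

#include <stdint.h>

static int
in_ranges(uint32_t n, const uint32_t *r, const uint32_t *lim)
{
        for (; r < lim; r += 2)
                if (n >= r[0] && n <= r[1])
                        return 1;
        return 0;
}

/* e.g. "ipfw pipe 3 show" would pass the single pair {3, 3} */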
internal flowsets */ - struct dn_fsk *fs = _o; - - n = fs->fs.fs_nr; - if (n >= DN_MAX_ID) - return 0; - /* see if the object is within one of our ranges */ - for (;r < lim; r += 2) { - if (n < r[0] || n > r[1]) - continue; - if (copy_flowset(a, fs, 0)) - return DNHT_SCAN_END; - copy_q(a, fs, 0); - break; /* we are done */ - } - } - return 0; -} - -static inline struct dn_schk * -locate_scheduler(int i) -{ - return dn_ht_find(dn_cfg.schedhash, i, 0, NULL); -} - -/* - * red parameters are in fixed point arithmetic. - */ -static int -config_red(struct dn_fsk *fs) -{ - int64_t s, idle, weight, w0; - int t, i; - - fs->w_q = fs->fs.w_q; - fs->max_p = fs->fs.max_p; - ND("called"); - /* Doing stuff that was in userland */ - i = fs->sched->link.bandwidth; - s = (i <= 0) ? 0 : - hz * dn_cfg.red_avg_pkt_size * 8 * SCALE(1) / i; - - idle = div64((s * 3) , fs->w_q); /* s, fs->w_q scaled; idle not scaled */ - fs->lookup_step = div64(idle , dn_cfg.red_lookup_depth); - /* fs->lookup_step not scaled, */ - if (!fs->lookup_step) - fs->lookup_step = 1; - w0 = weight = SCALE(1) - fs->w_q; //fs->w_q scaled - - for (t = fs->lookup_step; t > 1; --t) - weight = SCALE_MUL(weight, w0); - fs->lookup_weight = (int)(weight); // scaled - - /* Now doing stuff that was in kerneland */ - fs->min_th = SCALE(fs->fs.min_th); - fs->max_th = SCALE(fs->fs.max_th); - - fs->c_1 = fs->max_p / (fs->fs.max_th - fs->fs.min_th); - fs->c_2 = SCALE_MUL(fs->c_1, SCALE(fs->fs.min_th)); - - if (fs->fs.flags & DN_IS_GENTLE_RED) { - fs->c_3 = (SCALE(1) - fs->max_p) / fs->fs.max_th; - fs->c_4 = SCALE(1) - 2 * fs->max_p; - } - - /* If the lookup table already exist, free and create it again. */ - if (fs->w_q_lookup) { - free(fs->w_q_lookup, M_DUMMYNET); - fs->w_q_lookup = NULL; - } - if (dn_cfg.red_lookup_depth == 0) { - printf("\ndummynet: net.inet.ip.dummynet.red_lookup_depth" - "must be > 0\n"); - fs->fs.flags &= ~DN_IS_RED; - fs->fs.flags &= ~DN_IS_GENTLE_RED; - return (EINVAL); - } - fs->lookup_depth = dn_cfg.red_lookup_depth; - fs->w_q_lookup = (u_int *)malloc(fs->lookup_depth * sizeof(int), - M_DUMMYNET, M_NOWAIT); - if (fs->w_q_lookup == NULL) { - printf("dummynet: sorry, cannot allocate red lookup table\n"); - fs->fs.flags &= ~DN_IS_RED; - fs->fs.flags &= ~DN_IS_GENTLE_RED; - return(ENOSPC); - } - - /* Fill the lookup table with (1 - w_q)^x */ - fs->w_q_lookup[0] = SCALE(1) - fs->w_q; - - for (i = 1; i < fs->lookup_depth; i++) - fs->w_q_lookup[i] = - SCALE_MUL(fs->w_q_lookup[i - 1], fs->lookup_weight); - - if (dn_cfg.red_avg_pkt_size < 1) - dn_cfg.red_avg_pkt_size = 512; - fs->avg_pkt_size = dn_cfg.red_avg_pkt_size; - if (dn_cfg.red_max_pkt_size < 1) - dn_cfg.red_max_pkt_size = 1500; - fs->max_pkt_size = dn_cfg.red_max_pkt_size; - ND("exit"); - return 0; -} - -/* Scan all flowset attached to this scheduler and update red */ -static void -update_red(struct dn_schk *s) -{ - struct dn_fsk *fs; - SLIST_FOREACH(fs, &s->fsk_list, sch_chain) { - if (fs && (fs->fs.flags & DN_IS_RED)) - config_red(fs); - } -} - -/* attach flowset to scheduler s, possibly requeue */ -static void -fsk_attach(struct dn_fsk *fs, struct dn_schk *s) -{ - ND("remove fs %d from fsunlinked, link to sched %d", - fs->fs.fs_nr, s->sch.sched_nr); - SLIST_REMOVE(&dn_cfg.fsu, fs, dn_fsk, sch_chain); - fs->sched = s; - SLIST_INSERT_HEAD(&s->fsk_list, fs, sch_chain); - if (s->fp->new_fsk) - s->fp->new_fsk(fs); - /* XXX compute fsk_mask */ - fs->fsk_mask = fs->fs.flow_mask; - if (fs->sched->sch.flags & DN_HAVE_MASK) - flow_id_or(&fs->sched->sch.sched_mask, &fs->fsk_mask); - if 
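config_red() above precomputes the slope c_1 and offset c_2 of the RED drop-probability line in fixed point, so the per-packet path needs no floating point. A sketch of the same arithmetic and of the line those constants encode; the 16-bit shift and the sample numbers are illustrative:

#include <stdint.h>
#include <stdio.h>

#define FP_SHIFT        16
#define SCALE(x)        ((int64_t)(x) << FP_SHIFT)
#define SCALE_MUL(a, b) (((int64_t)(a) * (b)) >> FP_SHIFT)

int
main(void)
{
        int64_t max_p = SCALE(1) / 10;          /* drop probability 0.1, scaled */
        int64_t min_th = 5, max_th = 15;        /* thresholds, in packets */
        /* slope and offset of the drop-probability line, as in config_red() */
        int64_t c_1 = max_p / (max_th - min_th);
        int64_t c_2 = SCALE_MUL(c_1, SCALE(min_th));
        int64_t avg = SCALE(10);                /* scaled average queue length */
        double p_b = (double)(SCALE_MUL(avg, c_1) - c_2) / (1 << FP_SHIFT);

        printf("p_b = %.3f\n", p_b);    /* ~0.050 when avg is halfway between the thresholds */
        return 0;
}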
(fs->qht) { - /* - * we must drain qht according to the old - * type, and reinsert according to the new one. - * The requeue is complex -- in general we need to - * reclassify every single packet. - * For the time being, let's hope qht is never set - * when we reach this point. - */ - D("XXX TODO requeue from fs %d to sch %d", - fs->fs.fs_nr, s->sch.sched_nr); - fs->qht = NULL; - } - /* set the new type for qht */ - if (nonzero_mask(&fs->fsk_mask)) - fs->fs.flags |= DN_QHT_HASH; - else - fs->fs.flags &= ~DN_QHT_HASH; - - /* XXX config_red() can fail... */ - if (fs->fs.flags & DN_IS_RED) - config_red(fs); -} - -/* update all flowsets which may refer to this scheduler */ -static void -update_fs(struct dn_schk *s) -{ - struct dn_fsk *fs, *tmp; - - SLIST_FOREACH_SAFE(fs, &dn_cfg.fsu, sch_chain, tmp) { - if (s->sch.sched_nr != fs->fs.sched_nr) { - D("fs %d for sch %d not %d still unlinked", - fs->fs.fs_nr, fs->fs.sched_nr, - s->sch.sched_nr); - continue; - } - fsk_attach(fs, s); - } -} - -/* - * Configuration -- to preserve backward compatibility we use - * the following scheme (N is 65536) - * NUMBER SCHED LINK FLOWSET - * 1 .. N-1 (1)WFQ (2)WFQ (3)queue - * N+1 .. 2N-1 (4)FIFO (5)FIFO (6)FIFO for sched 1..N-1 - * 2N+1 .. 3N-1 -- -- (7)FIFO for sched N+1..2N-1 - * - * "pipe i config" configures #1, #2 and #3 - * "sched i config" configures #1 and possibly #6 - * "queue i config" configures #3 - * #1 is configured with 'pipe i config' or 'sched i config' - * #2 is configured with 'pipe i config', and created if not - * existing with 'sched i config' - * #3 is configured with 'queue i config' - * #4 is automatically configured after #1, can only be FIFO - * #5 is automatically configured after #2 - * #6 is automatically created when #1 is !MULTIQUEUE, - * and can be updated. - * #7 is automatically configured after #2 - */ - -/* - * configure a link (and its FIFO instance) - */ -static int -config_link(struct dn_link *p, struct dn_id *arg) -{ - int i; - - if (p->oid.len != sizeof(*p)) { - D("invalid pipe len %d", p->oid.len); - return EINVAL; - } - i = p->link_nr; - if (i <= 0 || i >= DN_MAX_ID) - return EINVAL; - /* - * The config program passes parameters as follows: - * bw = bits/second (0 means no limits), - * delay = ms, must be translated into ticks. - * qsize = slots/bytes - * burst ??? - */ - p->delay = (p->delay * hz) / 1000; - /* Scale burst size: bytes -> bits * hz */ - p->burst *= 8 * hz; - - DN_BH_WLOCK(); - /* do it twice, base link and FIFO link */ - for (; i < 2*DN_MAX_ID; i += DN_MAX_ID) { - struct dn_schk *s = locate_scheduler(i); - if (s == NULL) { - DN_BH_WUNLOCK(); - D("sched %d not found", i); - return EINVAL; - } - /* remove profile if exists */ - if (s->profile) { - free(s->profile, M_DUMMYNET); - s->profile = NULL; - } - /* copy all parameters */ - s->link.oid = p->oid; - s->link.link_nr = i; - s->link.delay = p->delay; - if (s->link.bandwidth != p->bandwidth) { - /* XXX bandwidth changes, need to update red params */ - s->link.bandwidth = p->bandwidth; - update_red(s); - } - s->link.burst = p->burst; - schk_reset_credit(s); - } - dn_cfg.id++; - DN_BH_WUNLOCK(); - return 0; -} - -/* - * configure a flowset. 
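config_link() below converts the user-supplied units into kernel form (delay in ms becomes ticks, burst in bytes becomes bits times hz), and copy_obj() earlier in this file undoes the conversion when exporting the link back to userland. A round-trip sketch, assuming hz = 1000 ticks per second purely for illustration:

#include <stdint.h>
#include <stdio.h>

static const int hz = 1000;

int
main(void)
{
        int delay_ms = 25;
        int64_t burst_bytes = 10000;

        int delay_ticks = delay_ms * hz / 1000;         /* as in config_link() */
        int64_t burst_kern = burst_bytes * 8 * hz;

        printf("delay back to ms: %d\n", delay_ticks * 1000 / hz);
        printf("burst back to bytes: %lld\n",
            (long long)(burst_kern / (8 * hz)));        /* as in copy_obj() */
        return 0;
}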
Can be called from inside with locked=1, - */ -static struct dn_fsk * -config_fs(struct dn_fs *nfs, struct dn_id *arg, int locked) -{ - int i; - struct dn_fsk *fs; - - if (nfs->oid.len != sizeof(*nfs)) { - D("invalid flowset len %d", nfs->oid.len); - return NULL; - } - i = nfs->fs_nr; - if (i <= 0 || i >= 3*DN_MAX_ID) - return NULL; - ND("flowset %d", i); - /* XXX other sanity checks */ - if (nfs->flags & DN_QSIZE_BYTES) { - ipdn_bound_var(&nfs->qsize, 16384, - 1500, dn_cfg.byte_limit, NULL); // "queue byte size"); - } else { - ipdn_bound_var(&nfs->qsize, 50, - 1, dn_cfg.slot_limit, NULL); // "queue slot size"); - } - if (nfs->flags & DN_HAVE_MASK) { - /* make sure we have some buckets */ - ipdn_bound_var((int *)&nfs->buckets, dn_cfg.hash_size, - 1, dn_cfg.max_hash_size, "flowset buckets"); - } else { - nfs->buckets = 1; /* we only need 1 */ - } - if (!locked) - DN_BH_WLOCK(); - do { /* exit with break when done */ - struct dn_schk *s; - int flags = nfs->sched_nr ? DNHT_INSERT : 0; - int j; - int oldc = dn_cfg.fsk_count; - fs = dn_ht_find(dn_cfg.fshash, i, flags, NULL); - if (fs == NULL) { - D("missing sched for flowset %d", i); - break; - } - /* grab some defaults from the existing one */ - if (nfs->sched_nr == 0) /* reuse */ - nfs->sched_nr = fs->fs.sched_nr; - for (j = 0; j < sizeof(nfs->par)/sizeof(nfs->par[0]); j++) { - if (nfs->par[j] == -1) /* reuse */ - nfs->par[j] = fs->fs.par[j]; - } - if (bcmp(&fs->fs, nfs, sizeof(*nfs)) == 0) { - ND("flowset %d unchanged", i); - break; /* no change, nothing to do */ - } - if (oldc != dn_cfg.fsk_count) /* new item */ - dn_cfg.id++; - s = locate_scheduler(nfs->sched_nr); - /* detach from old scheduler if needed, preserving - * queues if we need to reattach. Then update the - * configuration, and possibly attach to the new sched. - */ - DX(2, "fs %d changed sched %d@%p to %d@%p", - fs->fs.fs_nr, - fs->fs.sched_nr, fs->sched, nfs->sched_nr, s); - if (fs->sched) { - int flags = s ? DN_DETACH : (DN_DETACH | DN_DESTROY); - flags |= DN_DESTROY; /* XXX temporary */ - fsk_detach(fs, flags); - } - fs->fs = *nfs; /* copy configuration */ - if (s != NULL) - fsk_attach(fs, s); - } while (0); - if (!locked) - DN_BH_WUNLOCK(); - return fs; -} - -/* - * config/reconfig a scheduler and its FIFO variant. - * For !MULTIQUEUE schedulers, also set up the flowset. - * - * On reconfigurations (detected because s->fp is set), - * detach existing flowsets preserving traffic, preserve link, - * and delete the old scheduler creating a new one. - */ -static int -config_sched(struct dn_sch *_nsch, struct dn_id *arg) -{ - struct dn_schk *s; - struct schk_new_arg a; /* argument for schk_new */ - int i; - struct dn_link p; /* copy of oldlink */ - struct dn_profile *pf = NULL; /* copy of old link profile */ - /* Used to preserv mask parameter */ - struct ipfw_flow_id new_mask; - int new_buckets = 0; - int new_flags = 0; - int pipe_cmd; - int err = ENOMEM; - - a.sch = _nsch; - if (a.sch->oid.len != sizeof(*a.sch)) { - D("bad sched len %d", a.sch->oid.len); - return EINVAL; - } - i = a.sch->sched_nr; - if (i <= 0 || i >= DN_MAX_ID) - return EINVAL; - /* make sure we have some buckets */ - if (a.sch->flags & DN_HAVE_MASK) - ipdn_bound_var((int *)&a.sch->buckets, dn_cfg.hash_size, - 1, dn_cfg.max_hash_size, "sched buckets"); - /* XXX other sanity checks */ - bzero(&p, sizeof(p)); - - pipe_cmd = a.sch->flags & DN_PIPE_CMD; - a.sch->flags &= ~DN_PIPE_CMD; //XXX do it even if is not set? 
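config_fs() and config_sched() above sanity-check user-supplied queue sizes and bucket counts with ipdn_bound_var(), which substitutes a default when the value falls below the allowed range and clamps it when it is above. A small usage sketch; the numbers are made up and the prototype simply repeats the definition earlier in this file:

#include <stdio.h>

int ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg);

static void
example(void)
{
        int buckets = 0;                        /* bogus user value */

        ipdn_bound_var(&buckets, 64, 1, 65536, "flowset buckets");
        printf("buckets clamped to %d\n", buckets);     /* prints 64 */
}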
- if (pipe_cmd) { - /* Copy mask parameter */ - new_mask = a.sch->sched_mask; - new_buckets = a.sch->buckets; - new_flags = a.sch->flags; - } - DN_BH_WLOCK(); -again: /* run twice, for wfq and fifo */ - /* - * lookup the type. If not supplied, use the previous one - * or default to WF2Q+. Otherwise, return an error. - */ - dn_cfg.id++; - a.fp = find_sched_type(a.sch->oid.subtype, a.sch->name); - if (a.fp != NULL) { - /* found. Lookup or create entry */ - s = dn_ht_find(dn_cfg.schedhash, i, DNHT_INSERT, &a); - } else if (a.sch->oid.subtype == 0 && !a.sch->name[0]) { - /* No type. search existing s* or retry with WF2Q+ */ - s = dn_ht_find(dn_cfg.schedhash, i, 0, &a); - if (s != NULL) { - a.fp = s->fp; - /* Scheduler exists, skip to FIFO scheduler - * if command was pipe config... - */ - if (pipe_cmd) - goto next; - } else { - /* New scheduler, create a wf2q+ with no mask - * if command was pipe config... - */ - if (pipe_cmd) { - /* clear mask parameter */ - bzero(&a.sch->sched_mask, sizeof(new_mask)); - a.sch->buckets = 0; - a.sch->flags &= ~DN_HAVE_MASK; - } - a.sch->oid.subtype = DN_SCHED_WF2QP; - goto again; - } - } else { - D("invalid scheduler type %d %s", - a.sch->oid.subtype, a.sch->name); - err = EINVAL; - goto error; - } - /* normalize name and subtype */ - a.sch->oid.subtype = a.fp->type; - bzero(a.sch->name, sizeof(a.sch->name)); - strlcpy(a.sch->name, a.fp->name, sizeof(a.sch->name)); - if (s == NULL) { - D("cannot allocate scheduler %d", i); - goto error; - } - /* restore existing link if any */ - if (p.link_nr) { - s->link = p; - if (!pf || pf->link_nr != p.link_nr) { /* no saved value */ - s->profile = NULL; /* XXX maybe not needed */ - } else { - s->profile = malloc(sizeof(struct dn_profile), - M_DUMMYNET, M_NOWAIT | M_ZERO); - if (s->profile == NULL) { - D("cannot allocate profile"); - goto error; //XXX - } - bcopy(pf, s->profile, sizeof(*pf)); - } - } - p.link_nr = 0; - if (s->fp == NULL) { - DX(2, "sched %d new type %s", i, a.fp->name); - } else if (s->fp != a.fp || - bcmp(a.sch, &s->sch, sizeof(*a.sch)) ) { - /* already existing. */ - DX(2, "sched %d type changed from %s to %s", - i, s->fp->name, a.fp->name); - DX(4, " type/sub %d/%d -> %d/%d", - s->sch.oid.type, s->sch.oid.subtype, - a.sch->oid.type, a.sch->oid.subtype); - if (s->link.link_nr == 0) - D("XXX WARNING link 0 for sched %d", i); - p = s->link; /* preserve link */ - if (s->profile) {/* preserve profile */ - if (!pf) - pf = malloc(sizeof(*pf), - M_DUMMYNET, M_NOWAIT | M_ZERO); - if (pf) /* XXX should issue a warning otherwise */ - bcopy(s->profile, pf, sizeof(*pf)); - } - /* remove from the hash */ - dn_ht_find(dn_cfg.schedhash, i, DNHT_REMOVE, NULL); - /* Detach flowsets, preserve queues. 
*/ - // schk_delete_cb(s, NULL); - // XXX temporarily, kill queues - schk_delete_cb(s, (void *)DN_DESTROY); - goto again; - } else { - DX(4, "sched %d unchanged type %s", i, a.fp->name); - } - /* complete initialization */ - s->sch = *a.sch; - s->fp = a.fp; - s->cfg = arg; - // XXX schk_reset_credit(s); - /* create the internal flowset if needed, - * trying to reuse existing ones if available - */ - if (!(s->fp->flags & DN_MULTIQUEUE) && !s->fs) { - s->fs = dn_ht_find(dn_cfg.fshash, i, 0, NULL); - if (!s->fs) { - struct dn_fs fs; - bzero(&fs, sizeof(fs)); - set_oid(&fs.oid, DN_FS, sizeof(fs)); - fs.fs_nr = i + DN_MAX_ID; - fs.sched_nr = i; - s->fs = config_fs(&fs, NULL, 1 /* locked */); - } - if (!s->fs) { - schk_delete_cb(s, (void *)DN_DESTROY); - D("error creating internal fs for %d", i); - goto error; - } - } - /* call init function after the flowset is created */ - if (s->fp->config) - s->fp->config(s); - update_fs(s); -next: - if (i < DN_MAX_ID) { /* now configure the FIFO instance */ - i += DN_MAX_ID; - if (pipe_cmd) { - /* Restore mask parameter for FIFO */ - a.sch->sched_mask = new_mask; - a.sch->buckets = new_buckets; - a.sch->flags = new_flags; - } else { - /* sched config shouldn't modify the FIFO scheduler */ - if (dn_ht_find(dn_cfg.schedhash, i, 0, &a) != NULL) { - /* FIFO already exist, don't touch it */ - err = 0; /* and this is not an error */ - goto error; - } - } - a.sch->sched_nr = i; - a.sch->oid.subtype = DN_SCHED_FIFO; - bzero(a.sch->name, sizeof(a.sch->name)); - goto again; - } - err = 0; -error: - DN_BH_WUNLOCK(); - if (pf) - free(pf, M_DUMMYNET); - return err; -} - -/* - * attach a profile to a link - */ -static int -config_profile(struct dn_profile *pf, struct dn_id *arg) -{ - struct dn_schk *s; - int i, olen, err = 0; - - if (pf->oid.len < sizeof(*pf)) { - D("short profile len %d", pf->oid.len); - return EINVAL; - } - i = pf->link_nr; - if (i <= 0 || i >= DN_MAX_ID) - return EINVAL; - /* XXX other sanity checks */ - DN_BH_WLOCK(); - for (; i < 2*DN_MAX_ID; i += DN_MAX_ID) { - s = locate_scheduler(i); - - if (s == NULL) { - err = EINVAL; - break; - } - dn_cfg.id++; - /* - * If we had a profile and the new one does not fit, - * or it is deleted, then we need to free memory. - */ - if (s->profile && (pf->samples_no == 0 || - s->profile->oid.len < pf->oid.len)) { - free(s->profile, M_DUMMYNET); - s->profile = NULL; - } - if (pf->samples_no == 0) - continue; - /* - * new profile, possibly allocate memory - * and copy data. - */ - if (s->profile == NULL) - s->profile = malloc(pf->oid.len, - M_DUMMYNET, M_NOWAIT | M_ZERO); - if (s->profile == NULL) { - D("no memory for profile %d", i); - err = ENOMEM; - break; - } - /* preserve larger length XXX double check */ - olen = s->profile->oid.len; - if (olen < pf->oid.len) - olen = pf->oid.len; - bcopy(pf, s->profile, pf->oid.len); - s->profile->oid.len = olen; - } - DN_BH_WUNLOCK(); - return err; -} - -/* - * Delete all objects: - */ -static void -dummynet_flush(void) -{ - - /* delete all schedulers and related links/queues/flowsets */ - dn_ht_scan(dn_cfg.schedhash, schk_delete_cb, - (void *)(uintptr_t)DN_DELETE_FS); - /* delete all remaining (unlinked) flowsets */ - DX(4, "still %d unlinked fs", dn_cfg.fsk_count); - dn_ht_free(dn_cfg.fshash, DNHT_REMOVE); - fsk_detach_list(&dn_cfg.fsu, DN_DELETE_FS); - /* Reinitialize system heap... */ - heap_init(&dn_cfg.evheap, 16, offsetof(struct dn_id, id)); -} - -/* - * Main handler for configuration. We are guaranteed to be called - * with an oid which is at least a dn_id. 
- * - the first object is the command (config, delete, flush, ...) - * - config_link must be issued after the corresponding config_sched - * - parameters (DN_TXT) for an object must preceed the object - * processed on a config_sched. - */ -int -do_config(void *p, int l) -{ - struct dn_id *next, *o; - int err = 0, err2 = 0; - struct dn_id *arg = NULL; - uintptr_t *a; - - o = p; - if (o->id != DN_API_VERSION) { - D("invalid api version got %d need %d", - o->id, DN_API_VERSION); - return EINVAL; - } - for (; l >= sizeof(*o); o = next) { - struct dn_id *prev = arg; - if (o->len < sizeof(*o) || l < o->len) { - D("bad len o->len %d len %d", o->len, l); - err = EINVAL; - break; - } - l -= o->len; - next = (struct dn_id *)((char *)o + o->len); - err = 0; - switch (o->type) { - default: - D("cmd %d not implemented", o->type); - break; - -#ifdef EMULATE_SYSCTL - /* sysctl emulation. - * if we recognize the command, jump to the correct - * handler and return - */ - case DN_SYSCTL_SET: - err = kesysctl_emu_set(p, l); - return err; -#endif - - case DN_CMD_CONFIG: /* simply a header */ - break; - - case DN_CMD_DELETE: - /* the argument is in the first uintptr_t after o */ - a = (uintptr_t *)(o+1); - if (o->len < sizeof(*o) + sizeof(*a)) { - err = EINVAL; - break; - } - switch (o->subtype) { - case DN_LINK: - /* delete base and derived schedulers */ - DN_BH_WLOCK(); - err = delete_schk(*a); - err2 = delete_schk(*a + DN_MAX_ID); - DN_BH_WUNLOCK(); - if (!err) - err = err2; - break; - - default: - D("invalid delete type %d", - o->subtype); - err = EINVAL; - break; - - case DN_FS: - err = (*a <1 || *a >= DN_MAX_ID) ? - EINVAL : delete_fs(*a, 0) ; - break; - } - break; - - case DN_CMD_FLUSH: - DN_BH_WLOCK(); - dummynet_flush(); - DN_BH_WUNLOCK(); - break; - case DN_TEXT: /* store argument the next block */ - prev = NULL; - arg = o; - break; - case DN_LINK: - err = config_link((struct dn_link *)o, arg); - break; - case DN_PROFILE: - err = config_profile((struct dn_profile *)o, arg); - break; - case DN_SCH: - err = config_sched((struct dn_sch *)o, arg); - break; - case DN_FS: - err = (NULL==config_fs((struct dn_fs *)o, arg, 0)); - break; - } - if (prev) - arg = NULL; - if (err != 0) - break; - } - return err; -} - -static int -compute_space(struct dn_id *cmd, struct copy_args *a) -{ - int x = 0, need = 0; - int profile_size = sizeof(struct dn_profile) - - ED_MAX_SAMPLES_NO*sizeof(int); - - /* NOTE about compute space: - * NP = dn_cfg.schk_count - * NSI = dn_cfg.si_count - * NF = dn_cfg.fsk_count - * NQ = dn_cfg.queue_count - * - ipfw pipe show - * (NP/2)*(dn_link + dn_sch + dn_id + dn_fs) only half scheduler - * link, scheduler template, flowset - * integrated in scheduler and header - * for flowset list - * (NSI)*(dn_flow) all scheduler instance (includes - * the queue instance) - * - ipfw sched show - * (NP/2)*(dn_link + dn_sch + dn_id + dn_fs) only half scheduler - * link, scheduler template, flowset - * integrated in scheduler and header - * for flowset list - * (NSI * dn_flow) all scheduler instances - * (NF * sizeof(uint_32)) space for flowset list linked to scheduler - * (NQ * dn_queue) all queue [XXXfor now not listed] - * - ipfw queue show - * (NF * dn_fs) all flowset - * (NQ * dn_queue) all queues - */ - switch (cmd->subtype) { - default: - return -1; - /* XXX where do LINK and SCH differ ? */ - /* 'ipfw sched show' could list all queues associated to - * a scheduler. 
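do_config() above treats the request buffer as a sequence of variable-length objects, each starting with a small header that carries its type and total length, and validates every length before stepping to the next object. A minimal stand-alone walker in the same style; struct hdr is an illustrative stand-in for struct dn_id:

#include <stdint.h>
#include <stdio.h>

struct hdr { uint16_t len; uint8_t type; uint8_t subtype; };

static int
walk(const char *buf, int l)
{
        const struct hdr *o, *next;

        for (o = (const struct hdr *)buf; l >= (int)sizeof(*o); o = next) {
                if (o->len < sizeof(*o) || l < o->len)
                        return -1;              /* malformed object */
                l -= o->len;
                next = (const struct hdr *)((const char *)o + o->len);
                printf("object type %u, %u bytes\n", o->type, o->len);
        }
        return 0;
}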
This feature for now is disabled - */ - case DN_LINK: /* pipe show */ - x = DN_C_LINK | DN_C_SCH | DN_C_FLOW; - need += dn_cfg.schk_count * - (sizeof(struct dn_fs) + profile_size) / 2; - need += dn_cfg.fsk_count * sizeof(uint32_t); - break; - case DN_SCH: /* sched show */ - need += dn_cfg.schk_count * - (sizeof(struct dn_fs) + profile_size) / 2; - need += dn_cfg.fsk_count * sizeof(uint32_t); - x = DN_C_SCH | DN_C_LINK | DN_C_FLOW; - break; - case DN_FS: /* queue show */ - x = DN_C_FS | DN_C_QUEUE; - break; - case DN_GET_COMPAT: /* compatibility mode */ - need = dn_compat_calc_size(); - break; - } - a->flags = x; - if (x & DN_C_SCH) { - need += dn_cfg.schk_count * sizeof(struct dn_sch) / 2; - /* NOT also, each fs might be attached to a sched */ - need += dn_cfg.schk_count * sizeof(struct dn_id) / 2; - } - if (x & DN_C_FS) - need += dn_cfg.fsk_count * sizeof(struct dn_fs); - if (x & DN_C_LINK) { - need += dn_cfg.schk_count * sizeof(struct dn_link) / 2; - } - /* - * When exporting a queue to userland, only pass up the - * struct dn_flow, which is the only visible part. - */ - - if (x & DN_C_QUEUE) - need += dn_cfg.queue_count * sizeof(struct dn_flow); - if (x & DN_C_FLOW) - need += dn_cfg.si_count * (sizeof(struct dn_flow)); - return need; -} - -/* - * If compat != NULL dummynet_get is called in compatibility mode. - * *compat will be the pointer to the buffer to pass to ipfw - */ -int -dummynet_get(struct sockopt *sopt, void **compat) -{ - int have, i, need, error; - char *start = NULL, *buf; - size_t sopt_valsize; - struct dn_id *cmd; - struct copy_args a; - struct copy_range r; - int l = sizeof(struct dn_id); - - bzero(&a, sizeof(a)); - bzero(&r, sizeof(r)); - - /* save and restore original sopt_valsize around copyin */ - sopt_valsize = sopt->sopt_valsize; - - cmd = &r.o; - - if (!compat) { - /* copy at least an oid, and possibly a full object */ - error = sooptcopyin(sopt, cmd, sizeof(r), sizeof(*cmd)); - sopt->sopt_valsize = sopt_valsize; - if (error) - goto done; - l = cmd->len; -#ifdef EMULATE_SYSCTL - /* sysctl emulation. */ - if (cmd->type == DN_SYSCTL_GET) - return kesysctl_emu_get(sopt); -#endif - if (l > sizeof(r)) { - /* request larger than default, allocate buffer */ - cmd = malloc(l, M_DUMMYNET, M_WAITOK); - error = sooptcopyin(sopt, cmd, l, l); - sopt->sopt_valsize = sopt_valsize; - if (error) - goto done; - } - } else { /* compatibility */ - error = 0; - cmd->type = DN_CMD_GET; - cmd->len = sizeof(struct dn_id); - cmd->subtype = DN_GET_COMPAT; - // cmd->id = sopt_valsize; - D("compatibility mode"); - } - a.extra = (struct copy_range *)cmd; - if (cmd->len == sizeof(*cmd)) { /* no range, create a default */ - uint32_t *rp = (uint32_t *)(cmd + 1); - cmd->len += 2* sizeof(uint32_t); - rp[0] = 1; - rp[1] = DN_MAX_ID - 1; - if (cmd->subtype == DN_LINK) { - rp[0] += DN_MAX_ID; - rp[1] += DN_MAX_ID; - } - } - /* Count space (under lock) and allocate (outside lock). - * Exit with lock held if we manage to get enough buffer. - * Try a few times then give up. 
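The comment above describes the pattern the loop that follows implements: estimate the required buffer size while holding the lock, drop the lock to allocate, then re-check because the configuration may have grown in between. A userland sketch of that estimate/allocate/retry idiom, using a pthread mutex in place of the dummynet lock (names are illustrative):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t cfg_lock = PTHREAD_MUTEX_INITIALIZER;
static size_t cfg_size = 1000;	/* pretend this can change under the lock */

/* How many bytes a snapshot of the configuration currently needs. */
static size_t
compute_space_locked(void)
{
	return cfg_size;
}

/*
 * Estimate under the lock, allocate outside it, retry a few times in case
 * the configuration grew while the lock was dropped.  Returns a buffer of
 * *lenp bytes with the lock HELD, or NULL with the lock released.
 */
static char *
snapshot_alloc(size_t *lenp)
{
	char *buf = NULL;
	size_t have = 0, need;
	int i;

	for (i = 0; i < 10; i++) {
		pthread_mutex_lock(&cfg_lock);
		need = compute_space_locked();
		if (have >= need) {	/* enough room: keep the lock */
			*lenp = need;
			return buf;
		}
		pthread_mutex_unlock(&cfg_lock);
		free(buf);
		have = need;
		buf = malloc(have);
		if (buf == NULL)
			return NULL;
	}
	free(buf);
	return NULL;
}

int
main(void)
{
	size_t len;
	char *buf = snapshot_alloc(&len);

	if (buf != NULL) {
		/* ... copy the data out while still holding cfg_lock ... */
		pthread_mutex_unlock(&cfg_lock);
		free(buf);
		printf("snapshotted %zu bytes\n", len);
	}
	return 0;
}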
- */ - for (have = 0, i = 0; i < 10; i++) { - DN_BH_WLOCK(); - need = compute_space(cmd, &a); - - /* if there is a range, ignore value from compute_space() */ - if (l > sizeof(*cmd)) - need = sopt_valsize - sizeof(*cmd); - - if (need < 0) { - DN_BH_WUNLOCK(); - error = EINVAL; - goto done; - } - need += sizeof(*cmd); - cmd->id = need; - if (have >= need) - break; - - DN_BH_WUNLOCK(); - if (start) - free(start, M_DUMMYNET); - start = NULL; - if (need > sopt_valsize) - break; - - have = need; - start = malloc(have, M_DUMMYNET, M_WAITOK | M_ZERO); - } - - if (start == NULL) { - if (compat) { - *compat = NULL; - error = 1; // XXX - } else { - error = sooptcopyout(sopt, cmd, sizeof(*cmd)); - } - goto done; - } - ND("have %d:%d sched %d, %d:%d links %d, %d:%d flowsets %d, " - "%d:%d si %d, %d:%d queues %d", - dn_cfg.schk_count, sizeof(struct dn_sch), DN_SCH, - dn_cfg.schk_count, sizeof(struct dn_link), DN_LINK, - dn_cfg.fsk_count, sizeof(struct dn_fs), DN_FS, - dn_cfg.si_count, sizeof(struct dn_flow), DN_SCH_I, - dn_cfg.queue_count, sizeof(struct dn_queue), DN_QUEUE); - sopt->sopt_valsize = sopt_valsize; - a.type = cmd->subtype; - - if (compat == NULL) { - bcopy(cmd, start, sizeof(*cmd)); - ((struct dn_id*)(start))->len = sizeof(struct dn_id); - buf = start + sizeof(*cmd); - } else - buf = start; - a.start = &buf; - a.end = start + have; - /* start copying other objects */ - if (compat) { - a.type = DN_COMPAT_PIPE; - dn_ht_scan(dn_cfg.schedhash, copy_data_helper_compat, &a); - a.type = DN_COMPAT_QUEUE; - dn_ht_scan(dn_cfg.fshash, copy_data_helper_compat, &a); - } else if (a.type == DN_FS) { - dn_ht_scan(dn_cfg.fshash, copy_data_helper, &a); - } else { - dn_ht_scan(dn_cfg.schedhash, copy_data_helper, &a); - } - DN_BH_WUNLOCK(); - - if (compat) { - *compat = start; - sopt->sopt_valsize = buf - start; - /* free() is done by ip_dummynet_compat() */ - start = NULL; //XXX hack - } else { - error = sooptcopyout(sopt, start, buf - start); - } -done: - if (cmd && cmd != &r.o) - free(cmd, M_DUMMYNET); - if (start) - free(start, M_DUMMYNET); - return error; -} - -/* Callback called on scheduler instance to delete it if idle */ -static int -drain_scheduler_cb(void *_si, void *arg) -{ - struct dn_sch_inst *si = _si; - - if ((si->kflags & DN_ACTIVE) || si->dline.mq.head != NULL) - return 0; - - if (si->sched->fp->flags & DN_MULTIQUEUE) { - if (si->q_count == 0) - return si_destroy(si, NULL); - else - return 0; - } else { /* !DN_MULTIQUEUE */ - if ((si+1)->ni.length == 0) - return si_destroy(si, NULL); - else - return 0; - } - return 0; /* unreachable */ -} - -/* Callback called on scheduler to check if it has instances */ -static int -drain_scheduler_sch_cb(void *_s, void *arg) -{ - struct dn_schk *s = _s; - - if (s->sch.flags & DN_HAVE_MASK) { - dn_ht_scan_bucket(s->siht, &s->drain_bucket, - drain_scheduler_cb, NULL); - s->drain_bucket++; - } else { - if (s->siht) { - if (drain_scheduler_cb(s->siht, NULL) == DNHT_SCAN_DEL) - s->siht = NULL; - } - } - return 0; -} - -/* Called every tick, try to delete a 'bucket' of scheduler */ -void -dn_drain_scheduler(void) -{ - dn_ht_scan_bucket(dn_cfg.schedhash, &dn_cfg.drain_sch, - drain_scheduler_sch_cb, NULL); - dn_cfg.drain_sch++; -} - -/* Callback called on queue to delete if it is idle */ -static int -drain_queue_cb(void *_q, void *arg) -{ - struct dn_queue *q = _q; - - if (q->ni.length == 0) { - dn_delete_queue(q, DN_DESTROY); - return DNHT_SCAN_DEL; /* queue is deleted */ - } - - return 0; /* queue isn't deleted */ -} - -/* Callback called on flowset used to 
check if it has queues */ -static int -drain_queue_fs_cb(void *_fs, void *arg) -{ - struct dn_fsk *fs = _fs; - - if (fs->fs.flags & DN_QHT_HASH) { - /* Flowset has a hash table for queues */ - dn_ht_scan_bucket(fs->qht, &fs->drain_bucket, - drain_queue_cb, NULL); - fs->drain_bucket++; - } else { - /* No hash table for this flowset, null the pointer - * if the queue is deleted - */ - if (fs->qht) { - if (drain_queue_cb(fs->qht, NULL) == DNHT_SCAN_DEL) - fs->qht = NULL; - } - } - return 0; -} - -/* Called every tick, try to delete a 'bucket' of queue */ -void -dn_drain_queue(void) -{ - /* scan a bucket of flowset */ - dn_ht_scan_bucket(dn_cfg.fshash, &dn_cfg.drain_fs, - drain_queue_fs_cb, NULL); - dn_cfg.drain_fs++; -} - -/* - * Handler for the various dummynet socket options - */ -static int -ip_dn_ctl(struct sockopt *sopt) -{ - void *p = NULL; - int error, l; - - error = priv_check(sopt->sopt_td, PRIV_NETINET_DUMMYNET); - if (error) - return (error); - - /* Disallow sets in really-really secure mode. */ - if (sopt->sopt_dir == SOPT_SET) { - error = securelevel_ge(sopt->sopt_td->td_ucred, 3); - if (error) - return (error); - } - - switch (sopt->sopt_name) { - default : - D("dummynet: unknown option %d", sopt->sopt_name); - error = EINVAL; - break; - - case IP_DUMMYNET_FLUSH: - case IP_DUMMYNET_CONFIGURE: - case IP_DUMMYNET_DEL: /* remove a pipe or queue */ - case IP_DUMMYNET_GET: - D("dummynet: compat option %d", sopt->sopt_name); - error = ip_dummynet_compat(sopt); - break; - - case IP_DUMMYNET3 : - if (sopt->sopt_dir == SOPT_GET) { - error = dummynet_get(sopt, NULL); - break; - } - l = sopt->sopt_valsize; - if (l < sizeof(struct dn_id) || l > 12000) { - D("argument len %d invalid", l); - break; - } - p = malloc(l, M_TEMP, M_WAITOK); // XXX can it fail ? - error = sooptcopyin(sopt, p, l, l); - if (error) - break ; - error = do_config(p, l); - break; - } - - if (p != NULL) - free(p, M_TEMP); - - return error ; -} - - -static void -ip_dn_init(void) -{ - if (dn_cfg.init_done) - return; - printf("DUMMYNET %p with IPv6 initialized (100409)\n", curvnet); - dn_cfg.init_done = 1; - /* Set defaults here. MSVC does not accept initializers, - * and this is also useful for vimages - */ - /* queue limits */ - dn_cfg.slot_limit = 100; /* Foot shooting limit for queues. */ - dn_cfg.byte_limit = 1024 * 1024; - dn_cfg.expire = 1; - - /* RED parameters */ - dn_cfg.red_lookup_depth = 256; /* default lookup table depth */ - dn_cfg.red_avg_pkt_size = 512; /* default medium packet size */ - dn_cfg.red_max_pkt_size = 1500; /* default max packet size */ - - /* hash tables */ - dn_cfg.max_hash_size = 65536; /* max in the hash tables */ - dn_cfg.hash_size = 64; /* default hash size */ - - /* create hash tables for schedulers and flowsets. - * In both we search by key and by pointer. 
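The dn_ht_init() calls just below build both tables from the same generic code by passing hash, match and constructor callbacks. A self-contained sketch of that callback-parameterized hash table pattern (a toy table, not dummynet's dn_ht implementation):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define NBUCKETS 8

struct entry {
	struct entry *next;
	uint32_t key;
	char name[16];
};

/* Callbacks supplied by the table's user, in the spirit of dn_ht_init(). */
typedef uint32_t (*hash_fn)(uint32_t key);
typedef int (*match_fn)(const struct entry *e, uint32_t key);

struct htable {
	struct entry *bucket[NBUCKETS];
	hash_fn hash;
	match_fn match;
};

static uint32_t sched_hash(uint32_t key) { return key * 2654435761u % NBUCKETS; }
static int sched_match(const struct entry *e, uint32_t key) { return e->key == key; }

static struct htable *
ht_init(hash_fn h, match_fn m)
{
	struct htable *t = calloc(1, sizeof(*t));

	if (t != NULL) {
		t->hash = h;
		t->match = m;
	}
	return t;
}

static void
ht_insert(struct htable *t, struct entry *e)
{
	uint32_t b = t->hash(e->key);

	e->next = t->bucket[b];
	t->bucket[b] = e;
}

static struct entry *
ht_find(struct htable *t, uint32_t key)
{
	struct entry *e;

	for (e = t->bucket[t->hash(key)]; e != NULL; e = e->next)
		if (t->match(e, key))
			return e;
	return NULL;
}

int
main(void)
{
	struct htable *t = ht_init(sched_hash, sched_match);
	struct entry e = { NULL, 7, "sched-7" };

	if (t == NULL)
		return 1;
	ht_insert(t, &e);
	printf("%s\n", ht_find(t, 7)->name);
	free(t);
	return 0;
}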
- */ - dn_cfg.schedhash = dn_ht_init(NULL, dn_cfg.hash_size, - offsetof(struct dn_schk, schk_next), - schk_hash, schk_match, schk_new); - dn_cfg.fshash = dn_ht_init(NULL, dn_cfg.hash_size, - offsetof(struct dn_fsk, fsk_next), - fsk_hash, fsk_match, fsk_new); - - /* bucket index to drain object */ - dn_cfg.drain_fs = 0; - dn_cfg.drain_sch = 0; - - heap_init(&dn_cfg.evheap, 16, offsetof(struct dn_id, id)); - SLIST_INIT(&dn_cfg.fsu); - SLIST_INIT(&dn_cfg.schedlist); - - DN_LOCK_INIT(); - - TASK_INIT(&dn_task, 0, dummynet_task, curvnet); - dn_tq = taskqueue_create("dummynet", M_WAITOK, - taskqueue_thread_enqueue, &dn_tq); - taskqueue_start_threads(&dn_tq, 1, PI_NET, "dummynet"); - - callout_init(&dn_timeout, CALLOUT_MPSAFE); - callout_reset(&dn_timeout, 1, dummynet, NULL); - - /* Initialize curr_time adjustment mechanics. */ - getmicrouptime(&dn_cfg.prev_t); -} - -#ifdef KLD_MODULE -static void -ip_dn_destroy(int last) -{ - callout_drain(&dn_timeout); - - DN_BH_WLOCK(); - if (last) { - ND("removing last instance\n"); - ip_dn_ctl_ptr = NULL; - ip_dn_io_ptr = NULL; - } - - dummynet_flush(); - DN_BH_WUNLOCK(); - taskqueue_drain(dn_tq, &dn_task); - taskqueue_free(dn_tq); - - dn_ht_free(dn_cfg.schedhash, 0); - dn_ht_free(dn_cfg.fshash, 0); - heap_free(&dn_cfg.evheap); - - DN_LOCK_DESTROY(); -} -#endif /* KLD_MODULE */ - -static int -dummynet_modevent(module_t mod, int type, void *data) -{ - - if (type == MOD_LOAD) { - if (ip_dn_io_ptr) { - printf("DUMMYNET already loaded\n"); - return EEXIST ; - } - ip_dn_init(); - ip_dn_ctl_ptr = ip_dn_ctl; - ip_dn_io_ptr = dummynet_io; - return 0; - } else if (type == MOD_UNLOAD) { -#if !defined(KLD_MODULE) - printf("dummynet statically compiled, cannot unload\n"); - return EINVAL ; -#else - ip_dn_destroy(1 /* last */); - return 0; -#endif - } else - return EOPNOTSUPP; -} - -/* modevent helpers for the modules */ -static int -load_dn_sched(struct dn_alg *d) -{ - struct dn_alg *s; - - if (d == NULL) - return 1; /* error */ - ip_dn_init(); /* just in case, we need the lock */ - - /* Check that mandatory funcs exists */ - if (d->enqueue == NULL || d->dequeue == NULL) { - D("missing enqueue or dequeue for %s", d->name); - return 1; - } - - /* Search if scheduler already exists */ - DN_BH_WLOCK(); - SLIST_FOREACH(s, &dn_cfg.schedlist, next) { - if (strcmp(s->name, d->name) == 0) { - D("%s already loaded", d->name); - break; /* scheduler already exists */ - } - } - if (s == NULL) - SLIST_INSERT_HEAD(&dn_cfg.schedlist, d, next); - DN_BH_WUNLOCK(); - D("dn_sched %s %sloaded", d->name, s ? "not ":""); - return s ? 1 : 0; -} - -static int -unload_dn_sched(struct dn_alg *s) -{ - struct dn_alg *tmp, *r; - int err = EINVAL; - - ND("called for %s", s->name); - - DN_BH_WLOCK(); - SLIST_FOREACH_SAFE(r, &dn_cfg.schedlist, next, tmp) { - if (strcmp(s->name, r->name) != 0) - continue; - ND("ref_count = %d", r->ref_count); - err = (r->ref_count != 0) ? EBUSY : 0; - if (err == 0) - SLIST_REMOVE(&dn_cfg.schedlist, r, dn_alg, next); - break; - } - DN_BH_WUNLOCK(); - D("dn_sched %s %sunloaded", s->name, err ? 
"not ":""); - return err; -} - -int -dn_sched_modevent(module_t mod, int cmd, void *arg) -{ - struct dn_alg *sch = arg; - - if (cmd == MOD_LOAD) - return load_dn_sched(sch); - else if (cmd == MOD_UNLOAD) - return unload_dn_sched(sch); - else - return EINVAL; -} - -static moduledata_t dummynet_mod = { - "dummynet", dummynet_modevent, NULL -}; - -#define DN_SI_SUB SI_SUB_PROTO_IFATTACHDOMAIN -#define DN_MODEV_ORD (SI_ORDER_ANY - 128) /* after ipfw */ -DECLARE_MODULE(dummynet, dummynet_mod, DN_SI_SUB, DN_MODEV_ORD); -MODULE_DEPEND(dummynet, ipfw, 2, 2, 2); -MODULE_VERSION(dummynet, 3); - -/* - * Starting up. Done in order after dummynet_modevent() has been called. - * VNET_SYSINIT is also called for each existing vnet and each new vnet. - */ -//VNET_SYSINIT(vnet_dn_init, DN_SI_SUB, DN_MODEV_ORD+2, ip_dn_init, NULL); - -/* - * Shutdown handlers up shop. These are done in REVERSE ORDER, but still - * after dummynet_modevent() has been called. Not called on reboot. - * VNET_SYSUNINIT is also called for each exiting vnet as it exits. - * or when the module is unloaded. - */ -//VNET_SYSUNINIT(vnet_dn_uninit, DN_SI_SUB, DN_MODEV_ORD+2, ip_dn_destroy, NULL); - -/* end of file */ diff --git a/sys/netinet/ipfw/ip_fw2.c b/sys/netinet/ipfw/ip_fw2.c deleted file mode 100644 index 0dfab1f..0000000 --- a/sys/netinet/ipfw/ip_fw2.c +++ /dev/null @@ -1,2790 +0,0 @@ -/*- - * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- */ - -#include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - -/* - * The FreeBSD IP packet firewall, main file - */ - -#include "opt_ipfw.h" -#include "opt_ipdivert.h" -#include "opt_inet.h" -#ifndef INET -#error "IPFIREWALL requires INET" -#endif /* INET */ -#include "opt_inet6.h" -#include "opt_ipsec.h" - -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/condvar.h> -#include <sys/eventhandler.h> -#include <sys/malloc.h> -#include <sys/mbuf.h> -#include <sys/kernel.h> -#include <sys/lock.h> -#include <sys/jail.h> -#include <sys/module.h> -#include <sys/priv.h> -#include <sys/proc.h> -#include <sys/rwlock.h> -#include <sys/socket.h> -#include <sys/socketvar.h> -#include <sys/sysctl.h> -#include <sys/syslog.h> -#include <sys/ucred.h> -#include <net/ethernet.h> /* for ETHERTYPE_IP */ -#include <net/if.h> -#include <net/route.h> -#include <net/pf_mtag.h> -#include <net/vnet.h> - -#include <netinet/in.h> -#include <netinet/in_var.h> -#include <netinet/in_pcb.h> -#include <netinet/ip.h> -#include <netinet/ip_var.h> -#include <netinet/ip_icmp.h> -#include <netinet/ip_fw.h> -#include <netinet/ipfw/ip_fw_private.h> -#include <netinet/ip_carp.h> -#include <netinet/pim.h> -#include <netinet/tcp_var.h> -#include <netinet/udp.h> -#include <netinet/udp_var.h> -#include <netinet/sctp.h> - -#include <netinet/ip6.h> -#include <netinet/icmp6.h> -#ifdef INET6 -#include <netinet6/in6_pcb.h> -#include <netinet6/scope6_var.h> -#include <netinet6/ip6_var.h> -#endif - -#include <machine/in_cksum.h> /* XXX for in_cksum */ - -#ifdef MAC -#include <security/mac/mac_framework.h> -#endif - -/* - * static variables followed by global ones. - * All ipfw global variables are here. - */ - -/* ipfw_vnet_ready controls when we are open for business */ -static VNET_DEFINE(int, ipfw_vnet_ready) = 0; -#define V_ipfw_vnet_ready VNET(ipfw_vnet_ready) - -static VNET_DEFINE(int, fw_deny_unknown_exthdrs); -#define V_fw_deny_unknown_exthdrs VNET(fw_deny_unknown_exthdrs) - -static VNET_DEFINE(int, fw_permit_single_frag6) = 1; -#define V_fw_permit_single_frag6 VNET(fw_permit_single_frag6) - -#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT -static int default_to_accept = 1; -#else -static int default_to_accept; -#endif - -VNET_DEFINE(int, autoinc_step); -VNET_DEFINE(int, fw_one_pass) = 1; - -VNET_DEFINE(unsigned int, fw_tables_max); -/* Use 128 tables by default */ -static unsigned int default_fw_tables = IPFW_TABLES_DEFAULT; - -/* - * Each rule belongs to one of 32 different sets (0..31). - * The variable set_disable contains one bit per set. - * If the bit is set, all rules in the corresponding set - * are disabled. Set RESVD_SET(31) is reserved for the default rule - * and rules that are not deleted by the flush command, - * and CANNOT be disabled. - * Rules in set RESVD_SET can only be deleted individually. - */ -VNET_DEFINE(u_int32_t, set_disable); -#define V_set_disable VNET(set_disable) - -VNET_DEFINE(int, fw_verbose); -/* counter for ipfw_log(NULL...) 
*/ -VNET_DEFINE(u_int64_t, norule_counter); -VNET_DEFINE(int, verbose_limit); - -/* layer3_chain contains the list of rules for layer 3 */ -VNET_DEFINE(struct ip_fw_chain, layer3_chain); - -ipfw_nat_t *ipfw_nat_ptr = NULL; -struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int); -ipfw_nat_cfg_t *ipfw_nat_cfg_ptr; -ipfw_nat_cfg_t *ipfw_nat_del_ptr; -ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr; -ipfw_nat_cfg_t *ipfw_nat_get_log_ptr; - -#ifdef SYSCTL_NODE -uint32_t dummy_def = IPFW_DEFAULT_RULE; -static int sysctl_ipfw_table_num(SYSCTL_HANDLER_ARGS); - -SYSBEGIN(f3) - -SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); -SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, one_pass, - CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_one_pass), 0, - "Only do a single pass through ipfw when using dummynet(4)"); -SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step, - CTLFLAG_RW, &VNET_NAME(autoinc_step), 0, - "Rule number auto-increment step"); -SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose, - CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_verbose), 0, - "Log matches to ipfw rules"); -SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, - CTLFLAG_RW, &VNET_NAME(verbose_limit), 0, - "Set upper limit of matches of ipfw rules logged"); -SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, default_rule, CTLFLAG_RD, - &dummy_def, 0, - "The default/max possible rule number."); -SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, tables_max, - CTLTYPE_UINT|CTLFLAG_RW, 0, 0, sysctl_ipfw_table_num, "IU", - "Maximum number of tables"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, default_to_accept, CTLFLAG_RDTUN, - &default_to_accept, 0, - "Make the default rule accept all packets."); -TUNABLE_INT("net.inet.ip.fw.default_to_accept", &default_to_accept); -TUNABLE_INT("net.inet.ip.fw.tables_max", (int *)&default_fw_tables); -SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, static_count, - CTLFLAG_RD, &VNET_NAME(layer3_chain.n_rules), 0, - "Number of static rules"); - -#ifdef INET6 -SYSCTL_DECL(_net_inet6_ip6); -SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); -SYSCTL_VNET_INT(_net_inet6_ip6_fw, OID_AUTO, deny_unknown_exthdrs, - CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(fw_deny_unknown_exthdrs), 0, - "Deny packets with unknown IPv6 Extension Headers"); -SYSCTL_VNET_INT(_net_inet6_ip6_fw, OID_AUTO, permit_single_frag6, - CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(fw_permit_single_frag6), 0, - "Permit single packet IPv6 fragments"); -#endif /* INET6 */ - -SYSEND - -#endif /* SYSCTL_NODE */ - - -/* - * Some macros used in the various matching options. 
- * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T - * Other macros just cast void * into the appropriate type - */ -#define L3HDR(T, ip) ((T *)((u_int32_t *)(ip) + (ip)->ip_hl)) -#define TCP(p) ((struct tcphdr *)(p)) -#define SCTP(p) ((struct sctphdr *)(p)) -#define UDP(p) ((struct udphdr *)(p)) -#define ICMP(p) ((struct icmphdr *)(p)) -#define ICMP6(p) ((struct icmp6_hdr *)(p)) - -static __inline int -icmptype_match(struct icmphdr *icmp, ipfw_insn_u32 *cmd) -{ - int type = icmp->icmp_type; - - return (type <= ICMP_MAXTYPE && (cmd->d[0] & (1<<type)) ); -} - -#define TT ( (1 << ICMP_ECHO) | (1 << ICMP_ROUTERSOLICIT) | \ - (1 << ICMP_TSTAMP) | (1 << ICMP_IREQ) | (1 << ICMP_MASKREQ) ) - -static int -is_icmp_query(struct icmphdr *icmp) -{ - int type = icmp->icmp_type; - - return (type <= ICMP_MAXTYPE && (TT & (1<<type)) ); -} -#undef TT - -/* - * The following checks use two arrays of 8 or 16 bits to store the - * bits that we want set or clear, respectively. They are in the - * low and high half of cmd->arg1 or cmd->d[0]. - * - * We scan options and store the bits we find set. We succeed if - * - * (want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear - * - * The code is sometimes optimized not to store additional variables. - */ - -static int -flags_match(ipfw_insn *cmd, u_int8_t bits) -{ - u_char want_clear; - bits = ~bits; - - if ( ((cmd->arg1 & 0xff) & bits) != 0) - return 0; /* some bits we want set were clear */ - want_clear = (cmd->arg1 >> 8) & 0xff; - if ( (want_clear & bits) != want_clear) - return 0; /* some bits we want clear were set */ - return 1; -} - -static int -ipopts_match(struct ip *ip, ipfw_insn *cmd) -{ - int optlen, bits = 0; - u_char *cp = (u_char *)(ip + 1); - int x = (ip->ip_hl << 2) - sizeof (struct ip); - - for (; x > 0; x -= optlen, cp += optlen) { - int opt = cp[IPOPT_OPTVAL]; - - if (opt == IPOPT_EOL) - break; - if (opt == IPOPT_NOP) - optlen = 1; - else { - optlen = cp[IPOPT_OLEN]; - if (optlen <= 0 || optlen > x) - return 0; /* invalid or truncated */ - } - switch (opt) { - - default: - break; - - case IPOPT_LSRR: - bits |= IP_FW_IPOPT_LSRR; - break; - - case IPOPT_SSRR: - bits |= IP_FW_IPOPT_SSRR; - break; - - case IPOPT_RR: - bits |= IP_FW_IPOPT_RR; - break; - - case IPOPT_TS: - bits |= IP_FW_IPOPT_TS; - break; - } - } - return (flags_match(cmd, bits)); -} - -static int -tcpopts_match(struct tcphdr *tcp, ipfw_insn *cmd) -{ - int optlen, bits = 0; - u_char *cp = (u_char *)(tcp + 1); - int x = (tcp->th_off << 2) - sizeof(struct tcphdr); - - for (; x > 0; x -= optlen, cp += optlen) { - int opt = cp[0]; - if (opt == TCPOPT_EOL) - break; - if (opt == TCPOPT_NOP) - optlen = 1; - else { - optlen = cp[1]; - if (optlen <= 0) - break; - } - - switch (opt) { - - default: - break; - - case TCPOPT_MAXSEG: - bits |= IP_FW_TCPOPT_MSS; - break; - - case TCPOPT_WINDOW: - bits |= IP_FW_TCPOPT_WINDOW; - break; - - case TCPOPT_SACK_PERMITTED: - case TCPOPT_SACK: - bits |= IP_FW_TCPOPT_SACK; - break; - - case TCPOPT_TIMESTAMP: - bits |= IP_FW_TCPOPT_TS; - break; - - } - } - return (flags_match(cmd, bits)); -} - -static int -iface_match(struct ifnet *ifp, ipfw_insn_if *cmd, struct ip_fw_chain *chain, uint32_t *tablearg) -{ - if (ifp == NULL) /* no iface with this packet, match fails */ - return 0; - /* Check by name or by IP address */ - if (cmd->name[0] != '\0') { /* match by name */ - if (cmd->name[0] == '\1') /* use tablearg to match */ - return ipfw_lookup_table_extended(chain, cmd->p.glob, - ifp->if_xname, tablearg, IPFW_TABLE_INTERFACE); - /* 
Check name */ - if (cmd->p.glob) { - if (fnmatch(cmd->name, ifp->if_xname, 0) == 0) - return(1); - } else { - if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0) - return(1); - } - } else { -#ifdef __FreeBSD__ /* and OSX too ? */ - struct ifaddr *ia; - - if_addr_rlock(ifp); - TAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) { - if (ia->ifa_addr->sa_family != AF_INET) - continue; - if (cmd->p.ip.s_addr == ((struct sockaddr_in *) - (ia->ifa_addr))->sin_addr.s_addr) { - if_addr_runlock(ifp); - return(1); /* match */ - } - } - if_addr_runlock(ifp); -#endif /* __FreeBSD__ */ - } - return(0); /* no match, fail ... */ -} - -/* - * The verify_path function checks if a route to the src exists and - * if it is reachable via ifp (when provided). - * - * The 'verrevpath' option checks that the interface that an IP packet - * arrives on is the same interface that traffic destined for the - * packet's source address would be routed out of. - * The 'versrcreach' option just checks that the source address is - * reachable via any route (except default) in the routing table. - * These two are a measure to block forged packets. This is also - * commonly known as "anti-spoofing" or Unicast Reverse Path - * Forwarding (Unicast RFP) in Cisco-ese. The name of the knobs - * is purposely reminiscent of the Cisco IOS command, - * - * ip verify unicast reverse-path - * ip verify unicast source reachable-via any - * - * which implements the same functionality. But note that the syntax - * is misleading, and the check may be performed on all IP packets - * whether unicast, multicast, or broadcast. - */ -static int -verify_path(struct in_addr src, struct ifnet *ifp, u_int fib) -{ -#ifndef __FreeBSD__ - return 0; -#else - struct route ro; - struct sockaddr_in *dst; - - bzero(&ro, sizeof(ro)); - - dst = (struct sockaddr_in *)&(ro.ro_dst); - dst->sin_family = AF_INET; - dst->sin_len = sizeof(*dst); - dst->sin_addr = src; - in_rtalloc_ign(&ro, 0, fib); - - if (ro.ro_rt == NULL) - return 0; - - /* - * If ifp is provided, check for equality with rtentry. - * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp, - * in order to pass packets injected back by if_simloop(): - * if useloopback == 1 routing entry (via lo0) for our own address - * may exist, so we need to handle routing assymetry. - */ - if (ifp != NULL && ro.ro_rt->rt_ifa->ifa_ifp != ifp) { - RTFREE(ro.ro_rt); - return 0; - } - - /* if no ifp provided, check if rtentry is not default route */ - if (ifp == NULL && - satosin(rt_key(ro.ro_rt))->sin_addr.s_addr == INADDR_ANY) { - RTFREE(ro.ro_rt); - return 0; - } - - /* or if this is a blackhole/reject route */ - if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { - RTFREE(ro.ro_rt); - return 0; - } - - /* found valid route */ - RTFREE(ro.ro_rt); - return 1; -#endif /* __FreeBSD__ */ -} - -#ifdef INET6 -/* - * ipv6 specific rules here... 
- */ -static __inline int -icmp6type_match (int type, ipfw_insn_u32 *cmd) -{ - return (type <= ICMP6_MAXTYPE && (cmd->d[type/32] & (1<<(type%32)) ) ); -} - -static int -flow6id_match( int curr_flow, ipfw_insn_u32 *cmd ) -{ - int i; - for (i=0; i <= cmd->o.arg1; ++i ) - if (curr_flow == cmd->d[i] ) - return 1; - return 0; -} - -/* support for IP6_*_ME opcodes */ -static int -search_ip6_addr_net (struct in6_addr * ip6_addr) -{ - struct ifnet *mdc; - struct ifaddr *mdc2; - struct in6_ifaddr *fdm; - struct in6_addr copia; - - TAILQ_FOREACH(mdc, &V_ifnet, if_link) { - if_addr_rlock(mdc); - TAILQ_FOREACH(mdc2, &mdc->if_addrhead, ifa_link) { - if (mdc2->ifa_addr->sa_family == AF_INET6) { - fdm = (struct in6_ifaddr *)mdc2; - copia = fdm->ia_addr.sin6_addr; - /* need for leaving scope_id in the sock_addr */ - in6_clearscope(&copia); - if (IN6_ARE_ADDR_EQUAL(ip6_addr, &copia)) { - if_addr_runlock(mdc); - return 1; - } - } - } - if_addr_runlock(mdc); - } - return 0; -} - -static int -verify_path6(struct in6_addr *src, struct ifnet *ifp, u_int fib) -{ - struct route_in6 ro; - struct sockaddr_in6 *dst; - - bzero(&ro, sizeof(ro)); - - dst = (struct sockaddr_in6 * )&(ro.ro_dst); - dst->sin6_family = AF_INET6; - dst->sin6_len = sizeof(*dst); - dst->sin6_addr = *src; - - in6_rtalloc_ign(&ro, 0, fib); - if (ro.ro_rt == NULL) - return 0; - - /* - * if ifp is provided, check for equality with rtentry - * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp, - * to support the case of sending packets to an address of our own. - * (where the former interface is the first argument of if_simloop() - * (=ifp), the latter is lo0) - */ - if (ifp != NULL && ro.ro_rt->rt_ifa->ifa_ifp != ifp) { - RTFREE(ro.ro_rt); - return 0; - } - - /* if no ifp provided, check if rtentry is not default route */ - if (ifp == NULL && - IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(ro.ro_rt))->sin6_addr)) { - RTFREE(ro.ro_rt); - return 0; - } - - /* or if this is a blackhole/reject route */ - if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { - RTFREE(ro.ro_rt); - return 0; - } - - /* found valid route */ - RTFREE(ro.ro_rt); - return 1; - -} - -static int -is_icmp6_query(int icmp6_type) -{ - if ((icmp6_type <= ICMP6_MAXTYPE) && - (icmp6_type == ICMP6_ECHO_REQUEST || - icmp6_type == ICMP6_MEMBERSHIP_QUERY || - icmp6_type == ICMP6_WRUREQUEST || - icmp6_type == ICMP6_FQDN_QUERY || - icmp6_type == ICMP6_NI_QUERY)) - return (1); - - return (0); -} - -static void -send_reject6(struct ip_fw_args *args, int code, u_int hlen, struct ip6_hdr *ip6) -{ - struct mbuf *m; - - m = args->m; - if (code == ICMP6_UNREACH_RST && args->f_id.proto == IPPROTO_TCP) { - struct tcphdr *tcp; - tcp = (struct tcphdr *)((char *)ip6 + hlen); - - if ((tcp->th_flags & TH_RST) == 0) { - struct mbuf *m0; - m0 = ipfw_send_pkt(args->m, &(args->f_id), - ntohl(tcp->th_seq), ntohl(tcp->th_ack), - tcp->th_flags | TH_RST); - if (m0 != NULL) - ip6_output(m0, NULL, NULL, 0, NULL, NULL, - NULL); - } - FREE_PKT(m); - } else if (code != ICMP6_UNREACH_RST) { /* Send an ICMPv6 unreach. */ -#if 0 - /* - * Unlike above, the mbufs need to line up with the ip6 hdr, - * as the contents are read. We need to m_adj() the - * needed amount. - * The mbuf will however be thrown away so we can adjust it. - * Remember we did an m_pullup on it already so we - * can make some assumptions about contiguousness. 
- */ - if (args->L3offset) - m_adj(m, args->L3offset); -#endif - icmp6_error(m, ICMP6_DST_UNREACH, code, 0); - } else - FREE_PKT(m); - - args->m = NULL; -} - -#endif /* INET6 */ - - -/* - * sends a reject message, consuming the mbuf passed as an argument. - */ -static void -send_reject(struct ip_fw_args *args, int code, int iplen, struct ip *ip) -{ - -#if 0 - /* XXX When ip is not guaranteed to be at mtod() we will - * need to account for this */ - * The mbuf will however be thrown away so we can adjust it. - * Remember we did an m_pullup on it already so we - * can make some assumptions about contiguousness. - */ - if (args->L3offset) - m_adj(m, args->L3offset); -#endif - if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */ - /* We need the IP header in host order for icmp_error(). */ - SET_HOST_IPLEN(ip); - icmp_error(args->m, ICMP_UNREACH, code, 0L, 0); - } else if (args->f_id.proto == IPPROTO_TCP) { - struct tcphdr *const tcp = - L3HDR(struct tcphdr, mtod(args->m, struct ip *)); - if ( (tcp->th_flags & TH_RST) == 0) { - struct mbuf *m; - m = ipfw_send_pkt(args->m, &(args->f_id), - ntohl(tcp->th_seq), ntohl(tcp->th_ack), - tcp->th_flags | TH_RST); - if (m != NULL) - ip_output(m, NULL, NULL, 0, NULL, NULL); - } - FREE_PKT(args->m); - } else - FREE_PKT(args->m); - args->m = NULL; -} - -/* - * Support for uid/gid/jail lookup. These tests are expensive - * (because we may need to look into the list of active sockets) - * so we cache the results. ugid_lookupp is 0 if we have not - * yet done a lookup, 1 if we succeeded, and -1 if we tried - * and failed. The function always returns the match value. - * We could actually spare the variable and use *uc, setting - * it to '(void *)check_uidgid if we have no info, NULL if - * we tried and failed, or any other value if successful. - */ -static int -check_uidgid(ipfw_insn_u32 *insn, struct ip_fw_args *args, int *ugid_lookupp, - struct ucred **uc) -{ -#ifndef __FreeBSD__ - /* XXX */ - return cred_check(insn, proto, oif, - dst_ip, dst_port, src_ip, src_port, - (struct bsd_ucred *)uc, ugid_lookupp, ((struct mbuf *)inp)->m_skb); -#else /* FreeBSD */ - struct in_addr src_ip, dst_ip; - struct inpcbinfo *pi; - struct ipfw_flow_id *id; - struct inpcb *pcb, *inp; - struct ifnet *oif; - int lookupflags; - int match; - - id = &args->f_id; - inp = args->inp; - oif = args->oif; - - /* - * Check to see if the UDP or TCP stack supplied us with - * the PCB. If so, rather then holding a lock and looking - * up the PCB, we can use the one that was supplied. - */ - if (inp && *ugid_lookupp == 0) { - INP_LOCK_ASSERT(inp); - if (inp->inp_socket != NULL) { - *uc = crhold(inp->inp_cred); - *ugid_lookupp = 1; - } else - *ugid_lookupp = -1; - } - /* - * If we have already been here and the packet has no - * PCB entry associated with it, then we can safely - * assume that this is a no match. 
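check_uidgid() memoizes the expensive PCB lookup in a tri-state flag: 0 means not tried yet, 1 means credentials were found, -1 means the lookup already failed, so later uid/gid/jail opcodes on the same packet cost almost nothing. A standalone sketch of that caching idiom (the lookup below is a stand-in, not the kernel's PCB lookup):

#include <stdio.h>

/*
 * Expensive lookup we only want to run once per packet.  Returns 1 and
 * fills *uid on success, 0 on failure.  (Stand-in for the PCB lookup.)
 */
static int
expensive_lookup(int key, int *uid)
{
	if (key == 42) {
		*uid = 1001;
		return 1;
	}
	return 0;
}

/*
 * Tri-state cache, as in check_uidgid():
 *   *state == 0   nothing cached yet, do the lookup
 *   *state == 1   lookup succeeded earlier, *uid is valid
 *   *state == -1  lookup failed earlier, don't try again
 */
static int
cached_uid_match(int key, int want_uid, int *state, int *uid)
{
	if (*state == -1)
		return 0;		/* known failure, cheap exit */
	if (*state == 0)
		*state = expensive_lookup(key, uid) ? 1 : -1;
	if (*state != 1)
		return 0;
	return *uid == want_uid;
}

int
main(void)
{
	int state = 0, uid = 0;

	/* Two checks against the same "packet": only the first one pays. */
	printf("uid 1001 match: %d\n", cached_uid_match(42, 1001, &state, &uid));
	printf("uid 0    match: %d\n", cached_uid_match(42, 0, &state, &uid));
	return 0;
}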
- */ - if (*ugid_lookupp == -1) - return (0); - if (id->proto == IPPROTO_TCP) { - lookupflags = 0; - pi = &V_tcbinfo; - } else if (id->proto == IPPROTO_UDP) { - lookupflags = INPLOOKUP_WILDCARD; - pi = &V_udbinfo; - } else - return 0; - lookupflags |= INPLOOKUP_RLOCKPCB; - match = 0; - if (*ugid_lookupp == 0) { - if (id->addr_type == 6) { -#ifdef INET6 - if (oif == NULL) - pcb = in6_pcblookup_mbuf(pi, - &id->src_ip6, htons(id->src_port), - &id->dst_ip6, htons(id->dst_port), - lookupflags, oif, args->m); - else - pcb = in6_pcblookup_mbuf(pi, - &id->dst_ip6, htons(id->dst_port), - &id->src_ip6, htons(id->src_port), - lookupflags, oif, args->m); -#else - *ugid_lookupp = -1; - return (0); -#endif - } else { - src_ip.s_addr = htonl(id->src_ip); - dst_ip.s_addr = htonl(id->dst_ip); - if (oif == NULL) - pcb = in_pcblookup_mbuf(pi, - src_ip, htons(id->src_port), - dst_ip, htons(id->dst_port), - lookupflags, oif, args->m); - else - pcb = in_pcblookup_mbuf(pi, - dst_ip, htons(id->dst_port), - src_ip, htons(id->src_port), - lookupflags, oif, args->m); - } - if (pcb != NULL) { - INP_RLOCK_ASSERT(pcb); - *uc = crhold(pcb->inp_cred); - *ugid_lookupp = 1; - INP_RUNLOCK(pcb); - } - if (*ugid_lookupp == 0) { - /* - * We tried and failed, set the variable to -1 - * so we will not try again on this packet. - */ - *ugid_lookupp = -1; - return (0); - } - } - if (insn->o.opcode == O_UID) - match = ((*uc)->cr_uid == (uid_t)insn->d[0]); - else if (insn->o.opcode == O_GID) - match = groupmember((gid_t)insn->d[0], *uc); - else if (insn->o.opcode == O_JAIL) - match = ((*uc)->cr_prison->pr_id == (int)insn->d[0]); - return (match); -#endif /* __FreeBSD__ */ -} - -/* - * Helper function to set args with info on the rule after the matching - * one. slot is precise, whereas we guess rule_id as they are - * assigned sequentially. - */ -static inline void -set_match(struct ip_fw_args *args, int slot, - struct ip_fw_chain *chain) -{ - args->rule.chain_id = chain->id; - args->rule.slot = slot + 1; /* we use 0 as a marker */ - args->rule.rule_id = 1 + chain->map[slot]->id; - args->rule.rulenum = chain->map[slot]->rulenum; -} - -/* - * The main check routine for the firewall. - * - * All arguments are in args so we can modify them and return them - * back to the caller. - * - * Parameters: - * - * args->m (in/out) The packet; we set to NULL when/if we nuke it. - * Starts with the IP header. - * args->eh (in) Mac header if present, NULL for layer3 packet. - * args->L3offset Number of bytes bypassed if we came from L2. - * e.g. often sizeof(eh) ** NOTYET ** - * args->oif Outgoing interface, NULL if packet is incoming. - * The incoming interface is in the mbuf. (in) - * args->divert_rule (in/out) - * Skip up to the first rule past this rule number; - * upon return, non-zero port number for divert or tee. - * - * args->rule Pointer to the last matching rule (in/out) - * args->next_hop Socket we are forwarding to (out). - * args->next_hop6 IPv6 next hop we are forwarding to (out). - * args->f_id Addresses grabbed from the packet (out) - * args->rule.info a cookie depending on rule action - * - * Return value: - * - * IP_FW_PASS the packet must be accepted - * IP_FW_DENY the packet must be dropped - * IP_FW_DIVERT divert packet, port in m_tag - * IP_FW_TEE tee packet, port in m_tag - * IP_FW_DUMMYNET to dummynet, pipe in args->cookie - * IP_FW_NETGRAPH into netgraph, cookie args->cookie - * args->rule contains the matching rule, - * args->rule.info has additional information. 
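The block comment above is effectively the contract for callers of ipfw_chk(): one verdict per packet, with the extra detail (divert port, pipe number, ...) carried in args. A compact sketch of a caller-side dispatch on such verdicts (the enum and handlers below are local stand-ins, not the kernel's definitions):

#include <stdio.h>

/* Local stand-ins for the documented ipfw_chk() verdicts. */
enum fw_verdict {
	FW_PASS,	/* accept the packet */
	FW_DENY,	/* drop it */
	FW_DIVERT,	/* hand it to a divert socket, port in a tag */
	FW_TEE,		/* copy to a divert socket, keep forwarding */
	FW_DUMMYNET,	/* queue into a dummynet pipe/queue */
	FW_NETGRAPH	/* hand it to netgraph */
};

struct packet { int id; };

static void forward(struct packet *p)  { printf("pkt %d: forwarded\n", p->id); }
static void drop(struct packet *p)     { printf("pkt %d: dropped\n", p->id); }
static void enqueue(struct packet *p)  { printf("pkt %d: to dummynet\n", p->id); }
static void divert(struct packet *p)   { printf("pkt %d: diverted\n", p->id); }

/* What a hook caller does with the single verdict it gets back. */
static void
dispatch(struct packet *p, enum fw_verdict v)
{
	switch (v) {
	case FW_PASS:
		forward(p);
		break;
	case FW_DUMMYNET:
		enqueue(p);	/* pipe number travels in the cookie */
		break;
	case FW_DIVERT:
	case FW_TEE:
		divert(p);	/* port number travels in a packet tag */
		break;
	case FW_DENY:
	default:
		drop(p);	/* unknown verdicts fail closed */
		break;
	}
}

int
main(void)
{
	struct packet p = { 1 };

	dispatch(&p, FW_PASS);
	dispatch(&p, FW_DENY);
	return 0;
}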
- * - */ -int -ipfw_chk(struct ip_fw_args *args) -{ - - /* - * Local variables holding state while processing a packet: - * - * IMPORTANT NOTE: to speed up the processing of rules, there - * are some assumption on the values of the variables, which - * are documented here. Should you change them, please check - * the implementation of the various instructions to make sure - * that they still work. - * - * args->eh The MAC header. It is non-null for a layer2 - * packet, it is NULL for a layer-3 packet. - * **notyet** - * args->L3offset Offset in the packet to the L3 (IP or equiv.) header. - * - * m | args->m Pointer to the mbuf, as received from the caller. - * It may change if ipfw_chk() does an m_pullup, or if it - * consumes the packet because it calls send_reject(). - * XXX This has to change, so that ipfw_chk() never modifies - * or consumes the buffer. - * ip is the beginning of the ip(4 or 6) header. - * Calculated by adding the L3offset to the start of data. - * (Until we start using L3offset, the packet is - * supposed to start with the ip header). - */ - struct mbuf *m = args->m; - struct ip *ip = mtod(m, struct ip *); - - /* - * For rules which contain uid/gid or jail constraints, cache - * a copy of the users credentials after the pcb lookup has been - * executed. This will speed up the processing of rules with - * these types of constraints, as well as decrease contention - * on pcb related locks. - */ -#ifndef __FreeBSD__ - struct bsd_ucred ucred_cache; -#else - struct ucred *ucred_cache = NULL; -#endif - int ucred_lookup = 0; - - /* - * oif | args->oif If NULL, ipfw_chk has been called on the - * inbound path (ether_input, ip_input). - * If non-NULL, ipfw_chk has been called on the outbound path - * (ether_output, ip_output). - */ - struct ifnet *oif = args->oif; - - int f_pos = 0; /* index of current rule in the array */ - int retval = 0; - - /* - * hlen The length of the IP header. - */ - u_int hlen = 0; /* hlen >0 means we have an IP pkt */ - - /* - * offset The offset of a fragment. offset != 0 means that - * we have a fragment at this offset of an IPv4 packet. - * offset == 0 means that (if this is an IPv4 packet) - * this is the first or only fragment. - * For IPv6 offset|ip6f_mf == 0 means there is no Fragment Header - * or there is a single packet fragement (fragement header added - * without needed). We will treat a single packet fragment as if - * there was no fragment header (or log/block depending on the - * V_fw_permit_single_frag6 sysctl setting). - */ - u_short offset = 0; - u_short ip6f_mf = 0; - - /* - * Local copies of addresses. They are only valid if we have - * an IP packet. - * - * proto The protocol. Set to 0 for non-ip packets, - * or to the protocol read from the packet otherwise. - * proto != 0 means that we have an IPv4 packet. - * - * src_port, dst_port port numbers, in HOST format. Only - * valid for TCP and UDP packets. - * - * src_ip, dst_ip ip addresses, in NETWORK format. - * Only valid for IPv4 packets. 
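The convention spelled out above -- ports converted to host order once, addresses left in network order -- is what lets the opcode handlers further down compare values directly. A small userland illustration of the same convention, using the standard arpa/inet.h byte-order helpers:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	/* A TCP destination port as it sits in the packet: network order. */
	uint16_t wire_port = htons(80);

	/* Convert once when parsing ... */
	uint16_t dst_port = ntohs(wire_port);

	/* ... so range checks in rules can use plain integer compares. */
	printf("port %u in 1..1023: %s\n", (unsigned)dst_port,
	    (dst_port >= 1 && dst_port <= 1023) ? "yes" : "no");

	/* Addresses stay in network order and are compared as-is. */
	uint32_t rule_addr = inet_addr("192.0.2.1");	/* network order */
	uint32_t pkt_addr  = inet_addr("192.0.2.1");
	printf("address match: %s\n", rule_addr == pkt_addr ? "yes" : "no");
	return 0;
}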
- */ - uint8_t proto; - uint16_t src_port = 0, dst_port = 0; /* NOTE: host format */ - struct in_addr src_ip, dst_ip; /* NOTE: network format */ - uint16_t iplen=0; - int pktlen; - uint16_t etype = 0; /* Host order stored ether type */ - - /* - * dyn_dir = MATCH_UNKNOWN when rules unchecked, - * MATCH_NONE when checked and not matched (q = NULL), - * MATCH_FORWARD or MATCH_REVERSE otherwise (q != NULL) - */ - int dyn_dir = MATCH_UNKNOWN; - ipfw_dyn_rule *q = NULL; - struct ip_fw_chain *chain = &V_layer3_chain; - - /* - * We store in ulp a pointer to the upper layer protocol header. - * In the ipv4 case this is easy to determine from the header, - * but for ipv6 we might have some additional headers in the middle. - * ulp is NULL if not found. - */ - void *ulp = NULL; /* upper layer protocol pointer. */ - - /* XXX ipv6 variables */ - int is_ipv6 = 0; - uint8_t icmp6_type = 0; - uint16_t ext_hd = 0; /* bits vector for extension header filtering */ - /* end of ipv6 variables */ - - int is_ipv4 = 0; - - int done = 0; /* flag to exit the outer loop */ - - if (m->m_flags & M_SKIP_FIREWALL || (! V_ipfw_vnet_ready)) - return (IP_FW_PASS); /* accept */ - - dst_ip.s_addr = 0; /* make sure it is initialized */ - src_ip.s_addr = 0; /* make sure it is initialized */ - pktlen = m->m_pkthdr.len; - args->f_id.fib = M_GETFIB(m); /* note mbuf not altered) */ - proto = args->f_id.proto = 0; /* mark f_id invalid */ - /* XXX 0 is a valid proto: IP/IPv6 Hop-by-Hop Option */ - -/* - * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous, - * then it sets p to point at the offset "len" in the mbuf. WARNING: the - * pointer might become stale after other pullups (but we never use it - * this way). - */ -#define PULLUP_TO(_len, p, T) PULLUP_LEN(_len, p, sizeof(T)) -#define PULLUP_LEN(_len, p, T) \ -do { \ - int x = (_len) + T; \ - if ((m)->m_len < x) { \ - args->m = m = m_pullup(m, x); \ - if (m == NULL) \ - goto pullup_failed; \ - } \ - p = (mtod(m, char *) + (_len)); \ -} while (0) - - /* - * if we have an ether header, - */ - if (args->eh) - etype = ntohs(args->eh->ether_type); - - /* Identify IP packets and fill up variables. 
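PULLUP_TO()/PULLUP_LEN() above exist to make sure a header is actually present and contiguous before the code casts a pointer at it. Outside the kernel the same discipline is a plain bounds check before the cast; a minimal sketch, assuming a flat packet buffer instead of an mbuf chain:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct udp_hdr {		/* simplified UDP header layout */
	uint16_t sport;
	uint16_t dport;
	uint16_t len;
	uint16_t sum;
};

/*
 * Return a pointer to the header at 'off' only if the buffer really
 * contains 'need' bytes there -- the flat-buffer analogue of PULLUP_LEN().
 */
static const void *
pullup(const uint8_t *pkt, size_t pktlen, size_t off, size_t need)
{
	if (off > pktlen || pktlen - off < need)
		return NULL;	/* truncated packet: caller must bail out */
	return pkt + off;
}

int
main(void)
{
	uint8_t pkt[32] = { 0 };
	struct udp_hdr uh = { 0, 0, 0, 0 };

	memcpy(pkt + 20, &uh, sizeof(uh));	/* pretend 20 bytes of IP header */
	printf("udp header %s\n",
	    pullup(pkt, sizeof(pkt), 20, sizeof(struct udp_hdr)) ?
	    "present" : "truncated");
	printf("short buffer: %s\n",
	    pullup(pkt, 24, 20, sizeof(struct udp_hdr)) ?
	    "present" : "truncated");
	return 0;
}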
*/ - if (pktlen >= sizeof(struct ip6_hdr) && - (args->eh == NULL || etype == ETHERTYPE_IPV6) && ip->ip_v == 6) { - struct ip6_hdr *ip6 = (struct ip6_hdr *)ip; - is_ipv6 = 1; - args->f_id.addr_type = 6; - hlen = sizeof(struct ip6_hdr); - proto = ip6->ip6_nxt; - - /* Search extension headers to find upper layer protocols */ - while (ulp == NULL && offset == 0) { - switch (proto) { - case IPPROTO_ICMPV6: - PULLUP_TO(hlen, ulp, struct icmp6_hdr); - icmp6_type = ICMP6(ulp)->icmp6_type; - break; - - case IPPROTO_TCP: - PULLUP_TO(hlen, ulp, struct tcphdr); - dst_port = TCP(ulp)->th_dport; - src_port = TCP(ulp)->th_sport; - /* save flags for dynamic rules */ - args->f_id._flags = TCP(ulp)->th_flags; - break; - - case IPPROTO_SCTP: - PULLUP_TO(hlen, ulp, struct sctphdr); - src_port = SCTP(ulp)->src_port; - dst_port = SCTP(ulp)->dest_port; - break; - - case IPPROTO_UDP: - PULLUP_TO(hlen, ulp, struct udphdr); - dst_port = UDP(ulp)->uh_dport; - src_port = UDP(ulp)->uh_sport; - break; - - case IPPROTO_HOPOPTS: /* RFC 2460 */ - PULLUP_TO(hlen, ulp, struct ip6_hbh); - ext_hd |= EXT_HOPOPTS; - hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; - proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; - ulp = NULL; - break; - - case IPPROTO_ROUTING: /* RFC 2460 */ - PULLUP_TO(hlen, ulp, struct ip6_rthdr); - switch (((struct ip6_rthdr *)ulp)->ip6r_type) { - case 0: - ext_hd |= EXT_RTHDR0; - break; - case 2: - ext_hd |= EXT_RTHDR2; - break; - default: - if (V_fw_verbose) - printf("IPFW2: IPV6 - Unknown " - "Routing Header type(%d)\n", - ((struct ip6_rthdr *) - ulp)->ip6r_type); - if (V_fw_deny_unknown_exthdrs) - return (IP_FW_DENY); - break; - } - ext_hd |= EXT_ROUTING; - hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3; - proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt; - ulp = NULL; - break; - - case IPPROTO_FRAGMENT: /* RFC 2460 */ - PULLUP_TO(hlen, ulp, struct ip6_frag); - ext_hd |= EXT_FRAGMENT; - hlen += sizeof (struct ip6_frag); - proto = ((struct ip6_frag *)ulp)->ip6f_nxt; - offset = ((struct ip6_frag *)ulp)->ip6f_offlg & - IP6F_OFF_MASK; - ip6f_mf = ((struct ip6_frag *)ulp)->ip6f_offlg & - IP6F_MORE_FRAG; - if (V_fw_permit_single_frag6 == 0 && - offset == 0 && ip6f_mf == 0) { - if (V_fw_verbose) - printf("IPFW2: IPV6 - Invalid " - "Fragment Header\n"); - if (V_fw_deny_unknown_exthdrs) - return (IP_FW_DENY); - break; - } - args->f_id.extra = - ntohl(((struct ip6_frag *)ulp)->ip6f_ident); - ulp = NULL; - break; - - case IPPROTO_DSTOPTS: /* RFC 2460 */ - PULLUP_TO(hlen, ulp, struct ip6_hbh); - ext_hd |= EXT_DSTOPTS; - hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; - proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; - ulp = NULL; - break; - - case IPPROTO_AH: /* RFC 2402 */ - PULLUP_TO(hlen, ulp, struct ip6_ext); - ext_hd |= EXT_AH; - hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2; - proto = ((struct ip6_ext *)ulp)->ip6e_nxt; - ulp = NULL; - break; - - case IPPROTO_ESP: /* RFC 2406 */ - PULLUP_TO(hlen, ulp, uint32_t); /* SPI, Seq# */ - /* Anything past Seq# is variable length and - * data past this ext. header is encrypted. */ - ext_hd |= EXT_ESP; - break; - - case IPPROTO_NONE: /* RFC 2460 */ - /* - * Packet ends here, and IPv6 header has - * already been pulled up. If ip6e_len!=0 - * then octets must be ignored. - */ - ulp = ip; /* non-NULL to get out of loop. */ - break; - - case IPPROTO_OSPFIGP: - /* XXX OSPF header check? */ - PULLUP_TO(hlen, ulp, struct ip6_ext); - break; - - case IPPROTO_PIM: - /* XXX PIM header check? 
*/ - PULLUP_TO(hlen, ulp, struct pim); - break; - - case IPPROTO_CARP: - PULLUP_TO(hlen, ulp, struct carp_header); - if (((struct carp_header *)ulp)->carp_version != - CARP_VERSION) - return (IP_FW_DENY); - if (((struct carp_header *)ulp)->carp_type != - CARP_ADVERTISEMENT) - return (IP_FW_DENY); - break; - - case IPPROTO_IPV6: /* RFC 2893 */ - PULLUP_TO(hlen, ulp, struct ip6_hdr); - break; - - case IPPROTO_IPV4: /* RFC 2893 */ - PULLUP_TO(hlen, ulp, struct ip); - break; - - default: - if (V_fw_verbose) - printf("IPFW2: IPV6 - Unknown " - "Extension Header(%d), ext_hd=%x\n", - proto, ext_hd); - if (V_fw_deny_unknown_exthdrs) - return (IP_FW_DENY); - PULLUP_TO(hlen, ulp, struct ip6_ext); - break; - } /*switch */ - } - ip = mtod(m, struct ip *); - ip6 = (struct ip6_hdr *)ip; - args->f_id.src_ip6 = ip6->ip6_src; - args->f_id.dst_ip6 = ip6->ip6_dst; - args->f_id.src_ip = 0; - args->f_id.dst_ip = 0; - args->f_id.flow_id6 = ntohl(ip6->ip6_flow); - } else if (pktlen >= sizeof(struct ip) && - (args->eh == NULL || etype == ETHERTYPE_IP) && ip->ip_v == 4) { - is_ipv4 = 1; - hlen = ip->ip_hl << 2; - args->f_id.addr_type = 4; - - /* - * Collect parameters into local variables for faster matching. - */ - proto = ip->ip_p; - src_ip = ip->ip_src; - dst_ip = ip->ip_dst; - offset = ntohs(ip->ip_off) & IP_OFFMASK; - iplen = ntohs(ip->ip_len); - pktlen = iplen < pktlen ? iplen : pktlen; - - if (offset == 0) { - switch (proto) { - case IPPROTO_TCP: - PULLUP_TO(hlen, ulp, struct tcphdr); - dst_port = TCP(ulp)->th_dport; - src_port = TCP(ulp)->th_sport; - /* save flags for dynamic rules */ - args->f_id._flags = TCP(ulp)->th_flags; - break; - - case IPPROTO_SCTP: - PULLUP_TO(hlen, ulp, struct sctphdr); - src_port = SCTP(ulp)->src_port; - dst_port = SCTP(ulp)->dest_port; - break; - - case IPPROTO_UDP: - PULLUP_TO(hlen, ulp, struct udphdr); - dst_port = UDP(ulp)->uh_dport; - src_port = UDP(ulp)->uh_sport; - break; - - case IPPROTO_ICMP: - PULLUP_TO(hlen, ulp, struct icmphdr); - //args->f_id.flags = ICMP(ulp)->icmp_type; - break; - - default: - break; - } - } - - ip = mtod(m, struct ip *); - args->f_id.src_ip = ntohl(src_ip.s_addr); - args->f_id.dst_ip = ntohl(dst_ip.s_addr); - } -#undef PULLUP_TO - if (proto) { /* we may have port numbers, store them */ - args->f_id.proto = proto; - args->f_id.src_port = src_port = ntohs(src_port); - args->f_id.dst_port = dst_port = ntohs(dst_port); - } - - IPFW_RLOCK(chain); - if (! V_ipfw_vnet_ready) { /* shutting down, leave NOW. */ - IPFW_RUNLOCK(chain); - return (IP_FW_PASS); /* accept */ - } - if (args->rule.slot) { - /* - * Packet has already been tagged as a result of a previous - * match on rule args->rule aka args->rule_id (PIPE, QUEUE, - * REASS, NETGRAPH, DIVERT/TEE...) - * Validate the slot and continue from the next one - * if still present, otherwise do a lookup. - */ - f_pos = (args->rule.chain_id == chain->id) ? - args->rule.slot : - ipfw_find_rule(chain, args->rule.rulenum, - args->rule.rule_id); - } else { - f_pos = 0; - } - - /* - * Now scan the rules, and parse microinstructions for each rule. - * We have two nested loops and an inner switch. Sometimes we - * need to break out of one or both loops, or re-enter one of - * the loops with updated variables. Loop variables are: - * - * f_pos (outer loop) points to the current rule. - * On output it points to the matching rule. - * done (outer loop) is used as a flag to break the loop. - * l (inner loop) residual length of current rule. - * cmd points to the current microinstruction. 
- * - * We break the inner loop by setting l=0 and possibly - * cmdlen=0 if we don't want to advance cmd. - * We break the outer loop by setting done=1 - * We can restart the inner loop by setting l>0 and f_pos, f, cmd - * as needed. - */ - for (; f_pos < chain->n_rules; f_pos++) { - ipfw_insn *cmd; - uint32_t tablearg = 0; - int l, cmdlen, skip_or; /* skip rest of OR block */ - struct ip_fw *f; - - f = chain->map[f_pos]; - if (V_set_disable & (1 << f->set) ) - continue; - - skip_or = 0; - for (l = f->cmd_len, cmd = f->cmd ; l > 0 ; - l -= cmdlen, cmd += cmdlen) { - int match; - - /* - * check_body is a jump target used when we find a - * CHECK_STATE, and need to jump to the body of - * the target rule. - */ - -/* check_body: */ - cmdlen = F_LEN(cmd); - /* - * An OR block (insn_1 || .. || insn_n) has the - * F_OR bit set in all but the last instruction. - * The first match will set "skip_or", and cause - * the following instructions to be skipped until - * past the one with the F_OR bit clear. - */ - if (skip_or) { /* skip this instruction */ - if ((cmd->len & F_OR) == 0) - skip_or = 0; /* next one is good */ - continue; - } - match = 0; /* set to 1 if we succeed */ - - switch (cmd->opcode) { - /* - * The first set of opcodes compares the packet's - * fields with some pattern, setting 'match' if a - * match is found. At the end of the loop there is - * logic to deal with F_NOT and F_OR flags associated - * with the opcode. - */ - case O_NOP: - match = 1; - break; - - case O_FORWARD_MAC: - printf("ipfw: opcode %d unimplemented\n", - cmd->opcode); - break; - - case O_GID: - case O_UID: - case O_JAIL: - /* - * We only check offset == 0 && proto != 0, - * as this ensures that we have a - * packet with the ports info. - */ - if (offset != 0) - break; - if (proto == IPPROTO_TCP || - proto == IPPROTO_UDP) - match = check_uidgid( - (ipfw_insn_u32 *)cmd, - args, &ucred_lookup, -#ifdef __FreeBSD__ - &ucred_cache); -#else - (void *)&ucred_cache); -#endif - break; - - case O_RECV: - match = iface_match(m->m_pkthdr.rcvif, - (ipfw_insn_if *)cmd, chain, &tablearg); - break; - - case O_XMIT: - match = iface_match(oif, (ipfw_insn_if *)cmd, - chain, &tablearg); - break; - - case O_VIA: - match = iface_match(oif ? oif : - m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd, - chain, &tablearg); - break; - - case O_MACADDR2: - if (args->eh != NULL) { /* have MAC header */ - u_int32_t *want = (u_int32_t *) - ((ipfw_insn_mac *)cmd)->addr; - u_int32_t *mask = (u_int32_t *) - ((ipfw_insn_mac *)cmd)->mask; - u_int32_t *hdr = (u_int32_t *)args->eh; - - match = - ( want[0] == (hdr[0] & mask[0]) && - want[1] == (hdr[1] & mask[1]) && - want[2] == (hdr[2] & mask[2]) ); - } - break; - - case O_MAC_TYPE: - if (args->eh != NULL) { - u_int16_t *p = - ((ipfw_insn_u16 *)cmd)->ports; - int i; - - for (i = cmdlen - 1; !match && i>0; - i--, p += 2) - match = (etype >= p[0] && - etype <= p[1]); - } - break; - - case O_FRAG: - match = (offset != 0); - break; - - case O_IN: /* "out" is "not in" */ - match = (oif == NULL); - break; - - case O_LAYER2: - match = (args->eh != NULL); - break; - - case O_DIVERTED: - { - /* For diverted packets, args->rule.info - * contains the divert port (in host format) - * reason and direction. - */ - uint32_t i = args->rule.info; - match = (i&IPFW_IS_MASK) == IPFW_IS_DIVERT && - cmd->arg1 & ((i & IPFW_INFO_IN) ? 1 : 2); - } - break; - - case O_PROTO: - /* - * We do not allow an arg of 0 so the - * check of "proto" only suffices. 
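The OR-block handling described a little earlier in this loop (F_OR set on every instruction of the block except the last, skip_or short-circuiting the rest once one alternative matches) is manual short-circuit evaluation over a flat instruction array. A toy standalone version of that control flow (flags and structs below are illustrative, not ipfw's real encoding):

#include <stdio.h>

#define F_OR	0x1	/* this check is OR'ed with the next one */
#define F_NOT	0x2	/* invert the result of this check */

struct insn {
	int flags;
	int want;	/* toy predicate: match if packet value == want */
};

/* Evaluate a rule body the way ipfw_chk's inner loop does. */
static int
rule_matches(const struct insn *cmd, int ncmd, int pktval)
{
	int i, match = 0, skip_or = 0;

	for (i = 0; i < ncmd; i++) {
		if (skip_or) {			/* earlier OR alternative hit */
			if (!(cmd[i].flags & F_OR))
				skip_or = 0;	/* block ends here */
			continue;
		}
		match = (pktval == cmd[i].want);
		if (cmd[i].flags & F_NOT)
			match = !match;
		if (cmd[i].flags & F_OR) {
			if (match)
				skip_or = 1;	/* rest of the block is moot */
			continue;		/* block not decided yet */
		}
		if (!match)
			return 0;		/* an AND'ed check failed */
	}
	return 1;
}

int
main(void)
{
	/* (want 1 || want 2) && !(want 9) */
	struct insn body[] = {
		{ F_OR, 1 }, { 0, 2 }, { F_NOT, 9 },
	};
	int n = sizeof(body) / sizeof(body[0]);

	printf("pkt 1: %d\n", rule_matches(body, n, 1));	/* 1 */
	printf("pkt 2: %d\n", rule_matches(body, n, 2));	/* 1 */
	printf("pkt 3: %d\n", rule_matches(body, n, 3));	/* 0 */
	return 0;
}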
- */ - match = (proto == cmd->arg1); - break; - - case O_IP_SRC: - match = is_ipv4 && - (((ipfw_insn_ip *)cmd)->addr.s_addr == - src_ip.s_addr); - break; - - case O_IP_SRC_LOOKUP: - case O_IP_DST_LOOKUP: - if (is_ipv4) { - uint32_t key = - (cmd->opcode == O_IP_DST_LOOKUP) ? - dst_ip.s_addr : src_ip.s_addr; - uint32_t v = 0; - - if (cmdlen > F_INSN_SIZE(ipfw_insn_u32)) { - /* generic lookup. The key must be - * in 32bit big-endian format. - */ - v = ((ipfw_insn_u32 *)cmd)->d[1]; - if (v == 0) - key = dst_ip.s_addr; - else if (v == 1) - key = src_ip.s_addr; - else if (v == 6) /* dscp */ - key = (ip->ip_tos >> 2) & 0x3f; - else if (offset != 0) - break; - else if (proto != IPPROTO_TCP && - proto != IPPROTO_UDP) - break; - else if (v == 2) - key = htonl(dst_port); - else if (v == 3) - key = htonl(src_port); - else if (v == 4 || v == 5) { - check_uidgid( - (ipfw_insn_u32 *)cmd, - args, &ucred_lookup, -#ifdef __FreeBSD__ - &ucred_cache); - if (v == 4 /* O_UID */) - key = ucred_cache->cr_uid; - else if (v == 5 /* O_JAIL */) - key = ucred_cache->cr_prison->pr_id; -#else /* !__FreeBSD__ */ - (void *)&ucred_cache); - if (v ==4 /* O_UID */) - key = ucred_cache.uid; - else if (v == 5 /* O_JAIL */) - key = ucred_cache.xid; -#endif /* !__FreeBSD__ */ - key = htonl(key); - } else - break; - } - match = ipfw_lookup_table(chain, - cmd->arg1, key, &v); - if (!match) - break; - if (cmdlen == F_INSN_SIZE(ipfw_insn_u32)) - match = - ((ipfw_insn_u32 *)cmd)->d[0] == v; - else - tablearg = v; - } else if (is_ipv6) { - uint32_t v = 0; - void *pkey = (cmd->opcode == O_IP_DST_LOOKUP) ? - &args->f_id.dst_ip6: &args->f_id.src_ip6; - match = ipfw_lookup_table_extended(chain, - cmd->arg1, pkey, &v, - IPFW_TABLE_CIDR); - if (cmdlen == F_INSN_SIZE(ipfw_insn_u32)) - match = ((ipfw_insn_u32 *)cmd)->d[0] == v; - if (match) - tablearg = v; - } - break; - - case O_IP_SRC_MASK: - case O_IP_DST_MASK: - if (is_ipv4) { - uint32_t a = - (cmd->opcode == O_IP_DST_MASK) ? - dst_ip.s_addr : src_ip.s_addr; - uint32_t *p = ((ipfw_insn_u32 *)cmd)->d; - int i = cmdlen-1; - - for (; !match && i>0; i-= 2, p+= 2) - match = (p[0] == (a & p[1])); - } - break; - - case O_IP_SRC_ME: - if (is_ipv4) { - struct ifnet *tif; - - INADDR_TO_IFP(src_ip, tif); - match = (tif != NULL); - break; - } -#ifdef INET6 - /* FALLTHROUGH */ - case O_IP6_SRC_ME: - match= is_ipv6 && search_ip6_addr_net(&args->f_id.src_ip6); -#endif - break; - - case O_IP_DST_SET: - case O_IP_SRC_SET: - if (is_ipv4) { - u_int32_t *d = (u_int32_t *)(cmd+1); - u_int32_t addr = - cmd->opcode == O_IP_DST_SET ? - args->f_id.dst_ip : - args->f_id.src_ip; - - if (addr < d[0]) - break; - addr -= d[0]; /* subtract base */ - match = (addr < cmd->arg1) && - ( d[ 1 + (addr>>5)] & - (1<<(addr & 0x1f)) ); - } - break; - - case O_IP_DST: - match = is_ipv4 && - (((ipfw_insn_ip *)cmd)->addr.s_addr == - dst_ip.s_addr); - break; - - case O_IP_DST_ME: - if (is_ipv4) { - struct ifnet *tif; - - INADDR_TO_IFP(dst_ip, tif); - match = (tif != NULL); - break; - } -#ifdef INET6 - /* FALLTHROUGH */ - case O_IP6_DST_ME: - match= is_ipv6 && search_ip6_addr_net(&args->f_id.dst_ip6); -#endif - break; - - - case O_IP_SRCPORT: - case O_IP_DSTPORT: - /* - * offset == 0 && proto != 0 is enough - * to guarantee that we have a - * packet with port info. - */ - if ((proto==IPPROTO_UDP || proto==IPPROTO_TCP) - && offset == 0) { - u_int16_t x = - (cmd->opcode == O_IP_SRCPORT) ? 
- src_port : dst_port ; - u_int16_t *p = - ((ipfw_insn_u16 *)cmd)->ports; - int i; - - for (i = cmdlen - 1; !match && i>0; - i--, p += 2) - match = (x>=p[0] && x<=p[1]); - } - break; - - case O_ICMPTYPE: - match = (offset == 0 && proto==IPPROTO_ICMP && - icmptype_match(ICMP(ulp), (ipfw_insn_u32 *)cmd) ); - break; - -#ifdef INET6 - case O_ICMP6TYPE: - match = is_ipv6 && offset == 0 && - proto==IPPROTO_ICMPV6 && - icmp6type_match( - ICMP6(ulp)->icmp6_type, - (ipfw_insn_u32 *)cmd); - break; -#endif /* INET6 */ - - case O_IPOPT: - match = (is_ipv4 && - ipopts_match(ip, cmd) ); - break; - - case O_IPVER: - match = (is_ipv4 && - cmd->arg1 == ip->ip_v); - break; - - case O_IPID: - case O_IPLEN: - case O_IPTTL: - if (is_ipv4) { /* only for IP packets */ - uint16_t x; - uint16_t *p; - int i; - - if (cmd->opcode == O_IPLEN) - x = iplen; - else if (cmd->opcode == O_IPTTL) - x = ip->ip_ttl; - else /* must be IPID */ - x = ntohs(ip->ip_id); - if (cmdlen == 1) { - match = (cmd->arg1 == x); - break; - } - /* otherwise we have ranges */ - p = ((ipfw_insn_u16 *)cmd)->ports; - i = cmdlen - 1; - for (; !match && i>0; i--, p += 2) - match = (x >= p[0] && x <= p[1]); - } - break; - - case O_IPPRECEDENCE: - match = (is_ipv4 && - (cmd->arg1 == (ip->ip_tos & 0xe0)) ); - break; - - case O_IPTOS: - match = (is_ipv4 && - flags_match(cmd, ip->ip_tos)); - break; - - case O_TCPDATALEN: - if (proto == IPPROTO_TCP && offset == 0) { - struct tcphdr *tcp; - uint16_t x; - uint16_t *p; - int i; - - tcp = TCP(ulp); - x = iplen - - ((ip->ip_hl + tcp->th_off) << 2); - if (cmdlen == 1) { - match = (cmd->arg1 == x); - break; - } - /* otherwise we have ranges */ - p = ((ipfw_insn_u16 *)cmd)->ports; - i = cmdlen - 1; - for (; !match && i>0; i--, p += 2) - match = (x >= p[0] && x <= p[1]); - } - break; - - case O_TCPFLAGS: - match = (proto == IPPROTO_TCP && offset == 0 && - flags_match(cmd, TCP(ulp)->th_flags)); - break; - - case O_TCPOPTS: - PULLUP_LEN(hlen, ulp, (TCP(ulp)->th_off << 2)); - match = (proto == IPPROTO_TCP && offset == 0 && - tcpopts_match(TCP(ulp), cmd)); - break; - - case O_TCPSEQ: - match = (proto == IPPROTO_TCP && offset == 0 && - ((ipfw_insn_u32 *)cmd)->d[0] == - TCP(ulp)->th_seq); - break; - - case O_TCPACK: - match = (proto == IPPROTO_TCP && offset == 0 && - ((ipfw_insn_u32 *)cmd)->d[0] == - TCP(ulp)->th_ack); - break; - - case O_TCPWIN: - if (proto == IPPROTO_TCP && offset == 0) { - uint16_t x; - uint16_t *p; - int i; - - x = ntohs(TCP(ulp)->th_win); - if (cmdlen == 1) { - match = (cmd->arg1 == x); - break; - } - /* Otherwise we have ranges. */ - p = ((ipfw_insn_u16 *)cmd)->ports; - i = cmdlen - 1; - for (; !match && i > 0; i--, p += 2) - match = (x >= p[0] && x <= p[1]); - } - break; - - case O_ESTAB: - /* reject packets which have SYN only */ - /* XXX should i also check for TH_ACK ? */ - match = (proto == IPPROTO_TCP && offset == 0 && - (TCP(ulp)->th_flags & - (TH_RST | TH_ACK | TH_SYN)) != TH_SYN); - break; - - case O_ALTQ: { - struct pf_mtag *at; - struct m_tag *mtag; - ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd; - - /* - * ALTQ uses mbuf tags from another - * packet filtering system - pf(4). - * We allocate a tag in its format - * and fill it in, pretending to be pf(4). - */ - match = 1; - at = pf_find_mtag(m); - if (at != NULL && at->qid != 0) - break; - mtag = m_tag_get(PACKET_TAG_PF, - sizeof(struct pf_mtag), M_NOWAIT | M_ZERO); - if (mtag == NULL) { - /* - * Let the packet fall back to the - * default ALTQ. 
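The O_ALTQ code here follows a common tag pattern: look for an existing tag on the packet and only allocate, fill in and prepend a new one when none is found. A self-contained sketch of that look-up-or-attach idiom on a toy packet structure (not the kernel's m_tag API):

#include <stdio.h>
#include <stdlib.h>

struct tag {
	struct tag *next;
	int cookie;		/* which subsystem owns this tag */
	int qid;		/* payload: queue id for the toy "ALTQ" */
};

struct packet {
	struct tag *tags;	/* singly-linked list of attached tags */
};

static struct tag *
tag_find(struct packet *p, int cookie)
{
	struct tag *t;

	for (t = p->tags; t != NULL; t = t->next)
		if (t->cookie == cookie)
			return t;
	return NULL;
}

/* Find an existing tag or attach a new one, as the O_ALTQ handler does. */
static struct tag *
tag_find_or_attach(struct packet *p, int cookie)
{
	struct tag *t = tag_find(p, cookie);

	if (t != NULL)
		return t;	/* an earlier rule already tagged the packet */
	t = calloc(1, sizeof(*t));
	if (t == NULL)
		return NULL;	/* no memory: caller falls back to defaults */
	t->cookie = cookie;
	t->next = p->tags;	/* prepend, cheapest insertion */
	p->tags = t;
	return t;
}

int
main(void)
{
	struct packet p = { NULL };
	struct tag *t = tag_find_or_attach(&p, 1);

	if (t == NULL)
		return 1;
	t->qid = 5;
	/* A later lookup with the same cookie sees the tag already attached. */
	printf("qid on second lookup: %d\n", tag_find_or_attach(&p, 1)->qid);
	free(t);
	return 0;
}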
- */ - break; - } - m_tag_prepend(m, mtag); - at = (struct pf_mtag *)(mtag + 1); - at->qid = altq->qid; - at->hdr = ip; - break; - } - - case O_LOG: - ipfw_log(f, hlen, args, m, - oif, offset | ip6f_mf, tablearg, ip); - match = 1; - break; - - case O_PROB: - match = (random()<((ipfw_insn_u32 *)cmd)->d[0]); - break; - - case O_VERREVPATH: - /* Outgoing packets automatically pass/match */ - match = ((oif != NULL) || - (m->m_pkthdr.rcvif == NULL) || - ( -#ifdef INET6 - is_ipv6 ? - verify_path6(&(args->f_id.src_ip6), - m->m_pkthdr.rcvif, args->f_id.fib) : -#endif - verify_path(src_ip, m->m_pkthdr.rcvif, - args->f_id.fib))); - break; - - case O_VERSRCREACH: - /* Outgoing packets automatically pass/match */ - match = (hlen > 0 && ((oif != NULL) || -#ifdef INET6 - is_ipv6 ? - verify_path6(&(args->f_id.src_ip6), - NULL, args->f_id.fib) : -#endif - verify_path(src_ip, NULL, args->f_id.fib))); - break; - - case O_ANTISPOOF: - /* Outgoing packets automatically pass/match */ - if (oif == NULL && hlen > 0 && - ( (is_ipv4 && in_localaddr(src_ip)) -#ifdef INET6 - || (is_ipv6 && - in6_localaddr(&(args->f_id.src_ip6))) -#endif - )) - match = -#ifdef INET6 - is_ipv6 ? verify_path6( - &(args->f_id.src_ip6), - m->m_pkthdr.rcvif, - args->f_id.fib) : -#endif - verify_path(src_ip, - m->m_pkthdr.rcvif, - args->f_id.fib); - else - match = 1; - break; - - case O_IPSEC: -#ifdef IPSEC - match = (m_tag_find(m, - PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL); -#endif - /* otherwise no match */ - break; - -#ifdef INET6 - case O_IP6_SRC: - match = is_ipv6 && - IN6_ARE_ADDR_EQUAL(&args->f_id.src_ip6, - &((ipfw_insn_ip6 *)cmd)->addr6); - break; - - case O_IP6_DST: - match = is_ipv6 && - IN6_ARE_ADDR_EQUAL(&args->f_id.dst_ip6, - &((ipfw_insn_ip6 *)cmd)->addr6); - break; - case O_IP6_SRC_MASK: - case O_IP6_DST_MASK: - if (is_ipv6) { - int i = cmdlen - 1; - struct in6_addr p; - struct in6_addr *d = - &((ipfw_insn_ip6 *)cmd)->addr6; - - for (; !match && i > 0; d += 2, - i -= F_INSN_SIZE(struct in6_addr) - * 2) { - p = (cmd->opcode == - O_IP6_SRC_MASK) ? - args->f_id.src_ip6: - args->f_id.dst_ip6; - APPLY_MASK(&p, &d[1]); - match = - IN6_ARE_ADDR_EQUAL(&d[0], - &p); - } - } - break; - - case O_FLOW6ID: - match = is_ipv6 && - flow6id_match(args->f_id.flow_id6, - (ipfw_insn_u32 *) cmd); - break; - - case O_EXT_HDR: - match = is_ipv6 && - (ext_hd & ((ipfw_insn *) cmd)->arg1); - break; - - case O_IP6: - match = is_ipv6; - break; -#endif - - case O_IP4: - match = is_ipv4; - break; - - case O_TAG: { - struct m_tag *mtag; - uint32_t tag = (cmd->arg1 == IP_FW_TABLEARG) ? - tablearg : cmd->arg1; - - /* Packet is already tagged with this tag? */ - mtag = m_tag_locate(m, MTAG_IPFW, tag, NULL); - - /* We have `untag' action when F_NOT flag is - * present. And we must remove this mtag from - * mbuf and reset `match' to zero (`match' will - * be inversed later). - * Otherwise we should allocate new mtag and - * push it into mbuf. - */ - if (cmd->len & F_NOT) { /* `untag' action */ - if (mtag != NULL) - m_tag_delete(m, mtag); - match = 0; - } else { - if (mtag == NULL) { - mtag = m_tag_alloc( MTAG_IPFW, - tag, 0, M_NOWAIT); - if (mtag != NULL) - m_tag_prepend(m, mtag); - } - match = 1; - } - break; - } - - case O_FIB: /* try match the specified fib */ - if (args->f_id.fib == cmd->arg1) - match = 1; - break; - - case O_SOCKARG: { - struct inpcb *inp = args->inp; - struct inpcbinfo *pi; - - if (is_ipv6) /* XXX can we remove this ? 
*/ - break; - - if (proto == IPPROTO_TCP) - pi = &V_tcbinfo; - else if (proto == IPPROTO_UDP) - pi = &V_udbinfo; - else - break; - - /* - * XXXRW: so_user_cookie should almost - * certainly be inp_user_cookie? - */ - - /* For incomming packet, lookup up the - inpcb using the src/dest ip/port tuple */ - if (inp == NULL) { - inp = in_pcblookup(pi, - src_ip, htons(src_port), - dst_ip, htons(dst_port), - INPLOOKUP_RLOCKPCB, NULL); - if (inp != NULL) { - tablearg = - inp->inp_socket->so_user_cookie; - if (tablearg) - match = 1; - INP_RUNLOCK(inp); - } - } else { - if (inp->inp_socket) { - tablearg = - inp->inp_socket->so_user_cookie; - if (tablearg) - match = 1; - } - } - break; - } - - case O_TAGGED: { - struct m_tag *mtag; - uint32_t tag = (cmd->arg1 == IP_FW_TABLEARG) ? - tablearg : cmd->arg1; - - if (cmdlen == 1) { - match = m_tag_locate(m, MTAG_IPFW, - tag, NULL) != NULL; - break; - } - - /* we have ranges */ - for (mtag = m_tag_first(m); - mtag != NULL && !match; - mtag = m_tag_next(m, mtag)) { - uint16_t *p; - int i; - - if (mtag->m_tag_cookie != MTAG_IPFW) - continue; - - p = ((ipfw_insn_u16 *)cmd)->ports; - i = cmdlen - 1; - for(; !match && i > 0; i--, p += 2) - match = - mtag->m_tag_id >= p[0] && - mtag->m_tag_id <= p[1]; - } - break; - } - - /* - * The second set of opcodes represents 'actions', - * i.e. the terminal part of a rule once the packet - * matches all previous patterns. - * Typically there is only one action for each rule, - * and the opcode is stored at the end of the rule - * (but there are exceptions -- see below). - * - * In general, here we set retval and terminate the - * outer loop (would be a 'break 3' in some language, - * but we need to set l=0, done=1) - * - * Exceptions: - * O_COUNT and O_SKIPTO actions: - * instead of terminating, we jump to the next rule - * (setting l=0), or to the SKIPTO target (setting - * f/f_len, cmd and l as needed), respectively. - * - * O_TAG, O_LOG and O_ALTQ action parameters: - * perform some action and set match = 1; - * - * O_LIMIT and O_KEEP_STATE: these opcodes are - * not real 'actions', and are stored right - * before the 'action' part of the rule. - * These opcodes try to install an entry in the - * state tables; if successful, we continue with - * the next opcode (match=1; break;), otherwise - * the packet must be dropped (set retval, - * break loops with l=0, done=1) - * - * O_PROBE_STATE and O_CHECK_STATE: these opcodes - * cause a lookup of the state table, and a jump - * to the 'action' part of the parent rule - * if an entry is found, or - * (CHECK_STATE only) a jump to the next rule if - * the entry is not found. - * The result of the lookup is cached so that - * further instances of these opcodes become NOPs. - * The jump to the next rule is done by setting - * l=0, cmdlen=0. - */ - case O_LIMIT: - case O_KEEP_STATE: - if (ipfw_install_state(f, - (ipfw_insn_limit *)cmd, args, tablearg)) { - /* error or limit violation */ - retval = IP_FW_DENY; - l = 0; /* exit inner loop */ - done = 1; /* exit outer loop */ - } - match = 1; - break; - - case O_PROBE_STATE: - case O_CHECK_STATE: - /* - * dynamic rules are checked at the first - * keep-state or check-state occurrence, - * with the result being stored in dyn_dir. - * The compiler introduces a PROBE_STATE - * instruction for us when we have a - * KEEP_STATE (because PROBE_STATE needs - * to be run first). - */ - if (dyn_dir == MATCH_UNKNOWN && - (q = ipfw_lookup_dyn_rule(&args->f_id, - &dyn_dir, proto == IPPROTO_TCP ? 
- TCP(ulp) : NULL)) - != NULL) { - /* - * Found dynamic entry, update stats - * and jump to the 'action' part of - * the parent rule by setting - * f, cmd, l and clearing cmdlen. - */ - q->pcnt++; - q->bcnt += pktlen; - /* XXX we would like to have f_pos - * readily accessible in the dynamic - * rule, instead of having to - * lookup q->rule. - */ - f = q->rule; - f_pos = ipfw_find_rule(chain, - f->rulenum, f->id); - cmd = ACTION_PTR(f); - l = f->cmd_len - f->act_ofs; - ipfw_dyn_unlock(); - cmdlen = 0; - match = 1; - break; - } - /* - * Dynamic entry not found. If CHECK_STATE, - * skip to next rule, if PROBE_STATE just - * ignore and continue with next opcode. - */ - if (cmd->opcode == O_CHECK_STATE) - l = 0; /* exit inner loop */ - match = 1; - break; - - case O_ACCEPT: - retval = 0; /* accept */ - l = 0; /* exit inner loop */ - done = 1; /* exit outer loop */ - break; - - case O_PIPE: - case O_QUEUE: - set_match(args, f_pos, chain); - args->rule.info = (cmd->arg1 == IP_FW_TABLEARG) ? - tablearg : cmd->arg1; - if (cmd->opcode == O_PIPE) - args->rule.info |= IPFW_IS_PIPE; - if (V_fw_one_pass) - args->rule.info |= IPFW_ONEPASS; - retval = IP_FW_DUMMYNET; - l = 0; /* exit inner loop */ - done = 1; /* exit outer loop */ - break; - - case O_DIVERT: - case O_TEE: - if (args->eh) /* not on layer 2 */ - break; - /* otherwise this is terminal */ - l = 0; /* exit inner loop */ - done = 1; /* exit outer loop */ - retval = (cmd->opcode == O_DIVERT) ? - IP_FW_DIVERT : IP_FW_TEE; - set_match(args, f_pos, chain); - args->rule.info = (cmd->arg1 == IP_FW_TABLEARG) ? - tablearg : cmd->arg1; - break; - - case O_COUNT: - f->pcnt++; /* update stats */ - f->bcnt += pktlen; - f->timestamp = time_uptime; - l = 0; /* exit inner loop */ - break; - - case O_SKIPTO: - f->pcnt++; /* update stats */ - f->bcnt += pktlen; - f->timestamp = time_uptime; - /* If possible use cached f_pos (in f->next_rule), - * whose version is written in f->next_rule - * (horrible hacks to avoid changing the ABI). - */ - if (cmd->arg1 != IP_FW_TABLEARG && - (uintptr_t)f->x_next == chain->id) { - f_pos = (uintptr_t)f->next_rule; - } else { - int i = (cmd->arg1 == IP_FW_TABLEARG) ? - tablearg : cmd->arg1; - /* make sure we do not jump backward */ - if (i <= f->rulenum) - i = f->rulenum + 1; - f_pos = ipfw_find_rule(chain, i, 0); - /* update the cache */ - if (cmd->arg1 != IP_FW_TABLEARG) { - f->next_rule = - (void *)(uintptr_t)f_pos; - f->x_next = - (void *)(uintptr_t)chain->id; - } - } - /* - * Skip disabled rules, and re-enter - * the inner loop with the correct - * f_pos, f, l and cmd. - * Also clear cmdlen and skip_or - */ - for (; f_pos < chain->n_rules - 1 && - (V_set_disable & - (1 << chain->map[f_pos]->set)); - f_pos++) - ; - /* Re-enter the inner loop at the skipto rule. */ - f = chain->map[f_pos]; - l = f->cmd_len; - cmd = f->cmd; - match = 1; - cmdlen = 0; - skip_or = 0; - continue; - break; /* not reached */ - - case O_CALLRETURN: { - /* - * Implementation of `subroutine' call/return, - * in the stack carried in an mbuf tag. This - * is different from `skipto' in that any call - * address is possible (`skipto' must prevent - * backward jumps to avoid endless loops). - * We have `return' action when F_NOT flag is - * present. The `m_tag_id' field is used as - * stack pointer. - */ - struct m_tag *mtag; - uint16_t jmpto, *stack; - -#define IS_CALL ((cmd->len & F_NOT) == 0) -#define IS_RETURN ((cmd->len & F_NOT) != 0) - /* - * Hand-rolled version of m_tag_locate() with - * wildcard `type'. - * If not already tagged, allocate new tag. 
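The call/return mechanism described in the comment above boils down to a small per-packet stack of caller rule numbers, with the tag's m_tag_id acting as the stack pointer. A minimal userland sketch of that idea (hypothetical names, not the kernel code):

#include <stdio.h>
#include <stdint.h>

#define CALLSTACK_SIZE 16		/* stands in for IPFW_CALLSTACK_SIZE */

/* Hypothetical stand-in for the per-packet stack kept in the mbuf tag:
 * sp plays the role of m_tag_id, the slots hold caller rule numbers. */
struct call_stack {
	uint16_t sp;
	uint16_t slot[CALLSTACK_SIZE];
};

/* "call": remember the current rule number, then jump to the target rule. */
static int
do_call(struct call_stack *cs, uint16_t cur_rule, uint16_t target)
{
	if (cs->sp >= CALLSTACK_SIZE)
		return (-1);	/* stack full: fall through to the next rule */
	cs->slot[cs->sp++] = cur_rule;
	return (target);
}

/* "return": resume at the rule just after the one that performed the call. */
static int
do_return(struct call_stack *cs)
{
	if (cs->sp == 0)
		return (-1);	/* empty stack: fall through to the next rule */
	return (cs->slot[--cs->sp] + 1);
}

int
main(void)
{
	struct call_stack cs = { 0, { 0 } };
	int pos;

	pos = do_call(&cs, 100, 5000);	/* rule 100 calls the block at 5000 */
	printf("jump to %d\n", pos);	/* 5000 */
	pos = do_return(&cs);
	printf("return to %d\n", pos);	/* 101, the rule after the caller */
	return (0);
}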
- */ - mtag = m_tag_first(m); - while (mtag != NULL) { - if (mtag->m_tag_cookie == - MTAG_IPFW_CALL) - break; - mtag = m_tag_next(m, mtag); - } - if (mtag == NULL && IS_CALL) { - mtag = m_tag_alloc(MTAG_IPFW_CALL, 0, - IPFW_CALLSTACK_SIZE * - sizeof(uint16_t), M_NOWAIT); - if (mtag != NULL) - m_tag_prepend(m, mtag); - } - - /* - * On error both `call' and `return' just - * continue with next rule. - */ - if (IS_RETURN && (mtag == NULL || - mtag->m_tag_id == 0)) { - l = 0; /* exit inner loop */ - break; - } - if (IS_CALL && (mtag == NULL || - mtag->m_tag_id >= IPFW_CALLSTACK_SIZE)) { - printf("ipfw: call stack error, " - "go to next rule\n"); - l = 0; /* exit inner loop */ - break; - } - - f->pcnt++; /* update stats */ - f->bcnt += pktlen; - f->timestamp = time_uptime; - stack = (uint16_t *)(mtag + 1); - - /* - * The `call' action may use cached f_pos - * (in f->next_rule), whose version is written - * in f->next_rule. - * The `return' action, however, doesn't have - * fixed jump address in cmd->arg1 and can't use - * cache. - */ - if (IS_CALL) { - stack[mtag->m_tag_id] = f->rulenum; - mtag->m_tag_id++; - if (cmd->arg1 != IP_FW_TABLEARG && - (uintptr_t)f->x_next == chain->id) { - f_pos = (uintptr_t)f->next_rule; - } else { - jmpto = (cmd->arg1 == - IP_FW_TABLEARG) ? tablearg: - cmd->arg1; - f_pos = ipfw_find_rule(chain, - jmpto, 0); - /* update the cache */ - if (cmd->arg1 != - IP_FW_TABLEARG) { - f->next_rule = - (void *)(uintptr_t) - f_pos; - f->x_next = - (void *)(uintptr_t) - chain->id; - } - } - } else { /* `return' action */ - mtag->m_tag_id--; - jmpto = stack[mtag->m_tag_id] + 1; - f_pos = ipfw_find_rule(chain, jmpto, 0); - } - - /* - * Skip disabled rules, and re-enter - * the inner loop with the correct - * f_pos, f, l and cmd. - * Also clear cmdlen and skip_or - */ - for (; f_pos < chain->n_rules - 1 && - (V_set_disable & - (1 << chain->map[f_pos]->set)); f_pos++) - ; - /* Re-enter the inner loop at the dest rule. */ - f = chain->map[f_pos]; - l = f->cmd_len; - cmd = f->cmd; - cmdlen = 0; - skip_or = 0; - continue; - break; /* NOTREACHED */ - } -#undef IS_CALL -#undef IS_RETURN - - case O_REJECT: - /* - * Drop the packet and send a reject notice - * if the packet is not ICMP (or is an ICMP - * query), and it is not multicast/broadcast. 
- */ - if (hlen > 0 && is_ipv4 && offset == 0 && - (proto != IPPROTO_ICMP || - is_icmp_query(ICMP(ulp))) && - !(m->m_flags & (M_BCAST|M_MCAST)) && - !IN_MULTICAST(ntohl(dst_ip.s_addr))) { - send_reject(args, cmd->arg1, iplen, ip); - m = args->m; - } - /* FALLTHROUGH */ -#ifdef INET6 - case O_UNREACH6: - if (hlen > 0 && is_ipv6 && - ((offset & IP6F_OFF_MASK) == 0) && - (proto != IPPROTO_ICMPV6 || - (is_icmp6_query(icmp6_type) == 1)) && - !(m->m_flags & (M_BCAST|M_MCAST)) && - !IN6_IS_ADDR_MULTICAST(&args->f_id.dst_ip6)) { - send_reject6( - args, cmd->arg1, hlen, - (struct ip6_hdr *)ip); - m = args->m; - } - /* FALLTHROUGH */ -#endif - case O_DENY: - retval = IP_FW_DENY; - l = 0; /* exit inner loop */ - done = 1; /* exit outer loop */ - break; - - case O_FORWARD_IP: - if (args->eh) /* not valid on layer2 pkts */ - break; - if (q == NULL || q->rule != f || - dyn_dir == MATCH_FORWARD) { - struct sockaddr_in *sa; - sa = &(((ipfw_insn_sa *)cmd)->sa); - if (sa->sin_addr.s_addr == INADDR_ANY) { - bcopy(sa, &args->hopstore, - sizeof(*sa)); - args->hopstore.sin_addr.s_addr = - htonl(tablearg); - args->next_hop = &args->hopstore; - } else { - args->next_hop = sa; - } - } - retval = IP_FW_PASS; - l = 0; /* exit inner loop */ - done = 1; /* exit outer loop */ - break; - -#ifdef INET6 - case O_FORWARD_IP6: - if (args->eh) /* not valid on layer2 pkts */ - break; - if (q == NULL || q->rule != f || - dyn_dir == MATCH_FORWARD) { - struct sockaddr_in6 *sin6; - - sin6 = &(((ipfw_insn_sa6 *)cmd)->sa); - args->next_hop6 = sin6; - } - retval = IP_FW_PASS; - l = 0; /* exit inner loop */ - done = 1; /* exit outer loop */ - break; -#endif - - case O_NETGRAPH: - case O_NGTEE: - set_match(args, f_pos, chain); - args->rule.info = (cmd->arg1 == IP_FW_TABLEARG) ? - tablearg : cmd->arg1; - if (V_fw_one_pass) - args->rule.info |= IPFW_ONEPASS; - retval = (cmd->opcode == O_NETGRAPH) ? - IP_FW_NETGRAPH : IP_FW_NGTEE; - l = 0; /* exit inner loop */ - done = 1; /* exit outer loop */ - break; - - case O_SETFIB: { - uint32_t fib; - - f->pcnt++; /* update stats */ - f->bcnt += pktlen; - f->timestamp = time_uptime; - fib = (cmd->arg1 == IP_FW_TABLEARG) ? tablearg: - cmd->arg1; - if (fib >= rt_numfibs) - fib = 0; - M_SETFIB(m, fib); - args->f_id.fib = fib; - l = 0; /* exit inner loop */ - break; - } - - case O_NAT: - if (!IPFW_NAT_LOADED) { - retval = IP_FW_DENY; - } else { - struct cfg_nat *t; - int nat_id; - - set_match(args, f_pos, chain); - /* Check if this is 'global' nat rule */ - if (cmd->arg1 == 0) { - retval = ipfw_nat_ptr(args, NULL, m); - l = 0; - done = 1; - break; - } - t = ((ipfw_insn_nat *)cmd)->nat; - if (t == NULL) { - nat_id = (cmd->arg1 == IP_FW_TABLEARG) ? - tablearg : cmd->arg1; - t = (*lookup_nat_ptr)(&chain->nat, nat_id); - - if (t == NULL) { - retval = IP_FW_DENY; - l = 0; /* exit inner loop */ - done = 1; /* exit outer loop */ - break; - } - if (cmd->arg1 != IP_FW_TABLEARG) - ((ipfw_insn_nat *)cmd)->nat = t; - } - retval = ipfw_nat_ptr(args, t, m); - } - l = 0; /* exit inner loop */ - done = 1; /* exit outer loop */ - break; - - case O_REASS: { - int ip_off; - - f->pcnt++; - f->bcnt += pktlen; - l = 0; /* in any case exit inner loop */ - ip_off = ntohs(ip->ip_off); - - /* if not fragmented, go to next rule */ - if ((ip_off & (IP_MF | IP_OFFMASK)) == 0) - break; - /* - * ip_reass() expects len & off in host - * byte order. - */ - SET_HOST_IPLEN(ip); - - args->m = m = ip_reass(m); - - /* - * do IP header checksum fixup. 
- */ - if (m == NULL) { /* fragment got swallowed */ - retval = IP_FW_DENY; - } else { /* good, packet complete */ - int hlen; - - ip = mtod(m, struct ip *); - hlen = ip->ip_hl << 2; - SET_NET_IPLEN(ip); - ip->ip_sum = 0; - if (hlen == sizeof(struct ip)) - ip->ip_sum = in_cksum_hdr(ip); - else - ip->ip_sum = in_cksum(m, hlen); - retval = IP_FW_REASS; - set_match(args, f_pos, chain); - } - done = 1; /* exit outer loop */ - break; - } - - default: - panic("-- unknown opcode %d\n", cmd->opcode); - } /* end of switch() on opcodes */ - /* - * if we get here with l=0, then match is irrelevant. - */ - - if (cmd->len & F_NOT) - match = !match; - - if (match) { - if (cmd->len & F_OR) - skip_or = 1; - } else { - if (!(cmd->len & F_OR)) /* not an OR block, */ - break; /* try next rule */ - } - - } /* end of inner loop, scan opcodes */ -#undef PULLUP_LEN - - if (done) - break; - -/* next_rule:; */ /* try next rule */ - - } /* end of outer for, scan rules */ - - if (done) { - struct ip_fw *rule = chain->map[f_pos]; - /* Update statistics */ - rule->pcnt++; - rule->bcnt += pktlen; - rule->timestamp = time_uptime; - } else { - retval = IP_FW_DENY; - printf("ipfw: ouch!, skip past end of rules, denying packet\n"); - } - IPFW_RUNLOCK(chain); -#ifdef __FreeBSD__ - if (ucred_cache != NULL) - crfree(ucred_cache); -#endif - return (retval); - -pullup_failed: - if (V_fw_verbose) - printf("ipfw: pullup failed\n"); - return (IP_FW_DENY); -} - -/* - * Set maximum number of tables that can be used in given VNET ipfw instance. - */ -#ifdef SYSCTL_NODE -static int -sysctl_ipfw_table_num(SYSCTL_HANDLER_ARGS) -{ - int error; - unsigned int ntables; - - ntables = V_fw_tables_max; - - error = sysctl_handle_int(oidp, &ntables, 0, req); - /* Read operation or some error */ - if ((error != 0) || (req->newptr == NULL)) - return (error); - - return (ipfw_resize_tables(&V_layer3_chain, ntables)); -} -#endif -/* - * Module and VNET glue - */ - -/* - * Stuff that must be initialised only on boot or module load - */ -static int -ipfw_init(void) -{ - int error = 0; - - ipfw_dyn_attach(); - /* - * Only print out this stuff the first time around, - * when called from the sysinit code. - */ - printf("ipfw2 " -#ifdef INET6 - "(+ipv6) " -#endif - "initialized, divert %s, nat %s, " - "rule-based forwarding " -#ifdef IPFIREWALL_FORWARD - "enabled, " -#else - "disabled, " -#endif - "default to %s, logging ", -#ifdef IPDIVERT - "enabled", -#else - "loadable", -#endif -#ifdef IPFIREWALL_NAT - "enabled", -#else - "loadable", -#endif - default_to_accept ? "accept" : "deny"); - - /* - * Note: V_xxx variables can be accessed here but the vnet specific - * initializer may not have been called yet for the VIMAGE case. - * Tuneables will have been processed. We will print out values for - * the default vnet. - * XXX This should all be rationalized AFTER 8.0 - */ - if (V_fw_verbose == 0) - printf("disabled\n"); - else if (V_verbose_limit == 0) - printf("unlimited\n"); - else - printf("limited to %d packets/entry by default\n", - V_verbose_limit); - - /* Check user-supplied table count for validness */ - if (default_fw_tables > IPFW_TABLES_MAX) - default_fw_tables = IPFW_TABLES_MAX; - - ipfw_log_bpf(1); /* init */ - return (error); -} - -/* - * Called for the removal of the last instance only on module unload. - */ -static void -ipfw_destroy(void) -{ - - ipfw_log_bpf(0); /* uninit */ - ipfw_dyn_detach(); - printf("IP firewall unloaded\n"); -} - -/* - * Stuff that must be initialized for every instance - * (including the first of course). 
- */ -static int -vnet_ipfw_init(const void *unused) -{ - int error; - struct ip_fw *rule = NULL; - struct ip_fw_chain *chain; - - chain = &V_layer3_chain; - - /* First set up some values that are compile time options */ - V_autoinc_step = 100; /* bounded to 1..1000 in add_rule() */ - V_fw_deny_unknown_exthdrs = 1; -#ifdef IPFIREWALL_VERBOSE - V_fw_verbose = 1; -#endif -#ifdef IPFIREWALL_VERBOSE_LIMIT - V_verbose_limit = IPFIREWALL_VERBOSE_LIMIT; -#endif -#ifdef IPFIREWALL_NAT - LIST_INIT(&chain->nat); -#endif - - /* insert the default rule and create the initial map */ - chain->n_rules = 1; - chain->static_len = sizeof(struct ip_fw); - chain->map = malloc(sizeof(struct ip_fw *), M_IPFW, M_WAITOK | M_ZERO); - if (chain->map) - rule = malloc(chain->static_len, M_IPFW, M_WAITOK | M_ZERO); - - /* Set initial number of tables */ - V_fw_tables_max = default_fw_tables; - error = ipfw_init_tables(chain); - if (error) { - printf("ipfw2: setting up tables failed\n"); - free(chain->map, M_IPFW); - free(rule, M_IPFW); - return (ENOSPC); - } - - /* fill and insert the default rule */ - rule->act_ofs = 0; - rule->rulenum = IPFW_DEFAULT_RULE; - rule->cmd_len = 1; - rule->set = RESVD_SET; - rule->cmd[0].len = 1; - rule->cmd[0].opcode = default_to_accept ? O_ACCEPT : O_DENY; - chain->rules = chain->default_rule = chain->map[0] = rule; - chain->id = rule->id = 1; - - IPFW_LOCK_INIT(chain); - ipfw_dyn_init(); - - /* First set up some values that are compile time options */ - V_ipfw_vnet_ready = 1; /* Open for business */ - - /* - * Hook the sockopt handler and pfil hooks for ipv4 and ipv6. - * Even if the latter two fail we still keep the module alive - * because the sockopt and layer2 paths are still useful. - * ipfw[6]_hook return 0 on success, ENOENT on failure, - * so we can ignore the exact return value and just set a flag. - * - * Note that V_fw[6]_enable are manipulated by a SYSCTL_PROC so - * changes in the underlying (per-vnet) variables trigger - * immediate hook()/unhook() calls. - * In layer2 we have the same behaviour, except that V_ether_ipfw - * is checked on each packet because there are no pfil hooks. - */ - V_ip_fw_ctl_ptr = ipfw_ctl; - error = ipfw_attach_hooks(1); - return (error); -} - -/* - * Called for the removal of each instance. - */ -static int -vnet_ipfw_uninit(const void *unused) -{ - struct ip_fw *reap, *rule; - struct ip_fw_chain *chain = &V_layer3_chain; - int i; - - V_ipfw_vnet_ready = 0; /* tell new callers to go away */ - /* - * disconnect from ipv4, ipv6, layer2 and sockopt. - * Then grab, release and grab again the WLOCK so we make - * sure the update is propagated and nobody will be in. - */ - (void)ipfw_attach_hooks(0 /* detach */); - V_ip_fw_ctl_ptr = NULL; - IPFW_UH_WLOCK(chain); - IPFW_UH_WUNLOCK(chain); - IPFW_UH_WLOCK(chain); - - IPFW_WLOCK(chain); - ipfw_dyn_uninit(0); /* run the callout_drain */ - IPFW_WUNLOCK(chain); - - ipfw_destroy_tables(chain); - reap = NULL; - IPFW_WLOCK(chain); - for (i = 0; i < chain->n_rules; i++) { - rule = chain->map[i]; - rule->x_next = reap; - reap = rule; - } - if (chain->map) - free(chain->map, M_IPFW); - IPFW_WUNLOCK(chain); - IPFW_UH_WUNLOCK(chain); - if (reap != NULL) - ipfw_reap_rules(reap); - IPFW_LOCK_DESTROY(chain); - ipfw_dyn_uninit(1); /* free the remaining parts */ - return 0; -} - -/* - * Module event handler. - * In general we have the choice of handling most of these events by the - * event handler or by the (VNET_)SYS(UN)INIT handlers. 
I have chosen to - * use the SYSINIT handlers as they are more capable of expressing the - * flow of control during module and vnet operations, so this is just - * a skeleton. Note there is no SYSINIT equivalent of the module - * SHUTDOWN handler, but we don't have anything to do in that case anyhow. - */ -static int -ipfw_modevent(module_t mod, int type, void *unused) -{ - int err = 0; - - switch (type) { - case MOD_LOAD: - /* Called once at module load or - * system boot if compiled in. */ - break; - case MOD_QUIESCE: - /* Called before unload. May veto unloading. */ - break; - case MOD_UNLOAD: - /* Called during unload. */ - break; - case MOD_SHUTDOWN: - /* Called during system shutdown. */ - break; - default: - err = EOPNOTSUPP; - break; - } - return err; -} - -static moduledata_t ipfwmod = { - "ipfw", - ipfw_modevent, - 0 -}; - -/* Define startup order. */ -#define IPFW_SI_SUB_FIREWALL SI_SUB_PROTO_IFATTACHDOMAIN -#define IPFW_MODEVENT_ORDER (SI_ORDER_ANY - 255) /* On boot slot in here. */ -#define IPFW_MODULE_ORDER (IPFW_MODEVENT_ORDER + 1) /* A little later. */ -#define IPFW_VNET_ORDER (IPFW_MODEVENT_ORDER + 2) /* Later still. */ - -DECLARE_MODULE(ipfw, ipfwmod, IPFW_SI_SUB_FIREWALL, IPFW_MODEVENT_ORDER); -MODULE_VERSION(ipfw, 2); -/* should declare some dependencies here */ - -/* - * Starting up. Done in order after ipfwmod() has been called. - * VNET_SYSINIT is also called for each existing vnet and each new vnet. - */ -SYSINIT(ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER, - ipfw_init, NULL); -VNET_SYSINIT(vnet_ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER, - vnet_ipfw_init, NULL); - -/* - * Closing up shop. These are done in REVERSE ORDER, but still - * after ipfwmod() has been called. Not called on reboot. - * VNET_SYSUNINIT is also called for each exiting vnet as it exits. - * or when the module is unloaded. - */ -SYSUNINIT(ipfw_destroy, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER, - ipfw_destroy, NULL); -VNET_SYSUNINIT(vnet_ipfw_uninit, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER, - vnet_ipfw_uninit, NULL); -/* end of file */ diff --git a/sys/netinet/ipfw/ip_fw_dynamic.c b/sys/netinet/ipfw/ip_fw_dynamic.c deleted file mode 100644 index edf7639..0000000 --- a/sys/netinet/ipfw/ip_fw_dynamic.c +++ /dev/null @@ -1,1244 +0,0 @@ -/*- - * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - -#define DEB(x) -#define DDB(x) x - -/* - * Dynamic rule support for ipfw - */ - -#include "opt_ipfw.h" -#include "opt_inet.h" -#ifndef INET -#error IPFIREWALL requires INET. -#endif /* INET */ -#include "opt_inet6.h" - -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/malloc.h> -#include <sys/mbuf.h> -#include <sys/kernel.h> -#include <sys/lock.h> -#include <sys/socket.h> -#include <sys/sysctl.h> -#include <sys/syslog.h> -#include <net/ethernet.h> /* for ETHERTYPE_IP */ -#include <net/if.h> -#include <net/vnet.h> - -#include <netinet/in.h> -#include <netinet/ip.h> -#include <netinet/ip_var.h> /* ip_defttl */ -#include <netinet/ip_fw.h> -#include <netinet/ipfw/ip_fw_private.h> -#include <netinet/tcp_var.h> -#include <netinet/udp.h> - -#include <netinet/ip6.h> /* IN6_ARE_ADDR_EQUAL */ -#ifdef INET6 -#include <netinet6/in6_var.h> -#include <netinet6/ip6_var.h> -#endif - -#include <machine/in_cksum.h> /* XXX for in_cksum */ - -#ifdef MAC -#include <security/mac/mac_framework.h> -#endif - -/* - * Description of dynamic rules. - * - * Dynamic rules are stored in lists accessed through a hash table - * (ipfw_dyn_v) whose size is curr_dyn_buckets. This value can - * be modified through the sysctl variable dyn_buckets which is - * updated when the table becomes empty. - * - * XXX currently there is only one list, ipfw_dyn. - * - * When a packet is received, its address fields are first masked - * with the mask defined for the rule, then hashed, then matched - * against the entries in the corresponding list. - * Dynamic rules can be used for different purposes: - * + stateful rules; - * + enforcing limits on the number of sessions; - * + in-kernel NAT (not implemented yet) - * - * The lifetime of dynamic rules is regulated by dyn_*_lifetime, - * measured in seconds and depending on the flags. - * - * The total number of dynamic rules is stored in dyn_count. - * The max number of dynamic rules is dyn_max. When we reach - * the maximum number of rules we do not create anymore. This is - * done to avoid consuming too much memory, but also too much - * time when searching on each packet (ideally, we should try instead - * to put a limit on the length of the list on each bucket...). - * - * Each dynamic rule holds a pointer to the parent ipfw rule so - * we know what action to perform. Dynamic rules are removed when - * the parent rule is deleted. XXX we should make them survive. - * - * There are some limitations with dynamic rules -- we do not - * obey the 'randomized match', and we do not do multiple - * passes through the firewall. XXX check the latter!!! 
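The lookup described above hashes the flow's addresses and ports into a bucket and then walks that bucket's list; since entries are matched in either direction, the hash must give the same bucket for both orientations of a flow, as the code further down notes. A toy, userland-only version of that property:

#include <stdio.h>
#include <stdint.h>

/* Toy version of the bucket hash: XOR is symmetric under swapping the
 * source and destination pair, so both directions of a session land in
 * the same bucket (the bucket count is a power of 2, as in the real code). */
static uint32_t
flow_hash(uint32_t src_ip, uint32_t dst_ip, uint16_t sport, uint16_t dport,
    uint32_t buckets)
{

	return ((src_ip ^ dst_ip ^ sport ^ dport) & (buckets - 1));
}

int
main(void)
{
	uint32_t a = 0xc0a80001, b = 0xc0a80002;	/* 192.168.0.1 / .2 */

	/* The forward and reverse directions hash to the same bucket. */
	printf("%u %u\n",
	    flow_hash(a, b, 12345, 80, 256),
	    flow_hash(b, a, 80, 12345, 256));
	return (0);
}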
- */ - -/* - * Static variables followed by global ones - */ -static VNET_DEFINE(ipfw_dyn_rule **, ipfw_dyn_v); -static VNET_DEFINE(u_int32_t, dyn_buckets); -static VNET_DEFINE(u_int32_t, curr_dyn_buckets); -static VNET_DEFINE(struct callout, ipfw_timeout); -#define V_ipfw_dyn_v VNET(ipfw_dyn_v) -#define V_dyn_buckets VNET(dyn_buckets) -#define V_curr_dyn_buckets VNET(curr_dyn_buckets) -#define V_ipfw_timeout VNET(ipfw_timeout) - -static uma_zone_t ipfw_dyn_rule_zone; -#ifndef __FreeBSD__ -DEFINE_SPINLOCK(ipfw_dyn_mtx); -#else -static struct mtx ipfw_dyn_mtx; /* mutex guarding dynamic rules */ -#endif - -#define IPFW_DYN_LOCK_INIT() \ - mtx_init(&ipfw_dyn_mtx, "IPFW dynamic rules", NULL, MTX_DEF) -#define IPFW_DYN_LOCK_DESTROY() mtx_destroy(&ipfw_dyn_mtx) -#define IPFW_DYN_LOCK() mtx_lock(&ipfw_dyn_mtx) -#define IPFW_DYN_UNLOCK() mtx_unlock(&ipfw_dyn_mtx) -#define IPFW_DYN_LOCK_ASSERT() mtx_assert(&ipfw_dyn_mtx, MA_OWNED) - -void -ipfw_dyn_unlock(void) -{ - IPFW_DYN_UNLOCK(); -} - -/* - * Timeouts for various events in handing dynamic rules. - */ -static VNET_DEFINE(u_int32_t, dyn_ack_lifetime); -static VNET_DEFINE(u_int32_t, dyn_syn_lifetime); -static VNET_DEFINE(u_int32_t, dyn_fin_lifetime); -static VNET_DEFINE(u_int32_t, dyn_rst_lifetime); -static VNET_DEFINE(u_int32_t, dyn_udp_lifetime); -static VNET_DEFINE(u_int32_t, dyn_short_lifetime); - -#define V_dyn_ack_lifetime VNET(dyn_ack_lifetime) -#define V_dyn_syn_lifetime VNET(dyn_syn_lifetime) -#define V_dyn_fin_lifetime VNET(dyn_fin_lifetime) -#define V_dyn_rst_lifetime VNET(dyn_rst_lifetime) -#define V_dyn_udp_lifetime VNET(dyn_udp_lifetime) -#define V_dyn_short_lifetime VNET(dyn_short_lifetime) - -/* - * Keepalives are sent if dyn_keepalive is set. They are sent every - * dyn_keepalive_period seconds, in the last dyn_keepalive_interval - * seconds of lifetime of a rule. - * dyn_rst_lifetime and dyn_fin_lifetime should be strictly lower - * than dyn_keepalive_period. - */ - -static VNET_DEFINE(u_int32_t, dyn_keepalive_interval); -static VNET_DEFINE(u_int32_t, dyn_keepalive_period); -static VNET_DEFINE(u_int32_t, dyn_keepalive); - -#define V_dyn_keepalive_interval VNET(dyn_keepalive_interval) -#define V_dyn_keepalive_period VNET(dyn_keepalive_period) -#define V_dyn_keepalive VNET(dyn_keepalive) - -static VNET_DEFINE(u_int32_t, dyn_count); /* # of dynamic rules */ -static VNET_DEFINE(u_int32_t, dyn_max); /* max # of dynamic rules */ - -#define V_dyn_count VNET(dyn_count) -#define V_dyn_max VNET(dyn_max) - -#ifdef SYSCTL_NODE - -SYSBEGIN(f2) - -SYSCTL_DECL(_net_inet_ip_fw); -SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_buckets, - CTLFLAG_RW, &VNET_NAME(dyn_buckets), 0, - "Number of dyn. buckets"); -SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets, - CTLFLAG_RD, &VNET_NAME(curr_dyn_buckets), 0, - "Current Number of dyn. buckets"); -SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_count, - CTLFLAG_RD, &VNET_NAME(dyn_count), 0, - "Number of dyn. rules"); -SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_max, - CTLFLAG_RW, &VNET_NAME(dyn_max), 0, - "Max number of dyn. rules"); -SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, - CTLFLAG_RW, &VNET_NAME(dyn_ack_lifetime), 0, - "Lifetime of dyn. rules for acks"); -SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, - CTLFLAG_RW, &VNET_NAME(dyn_syn_lifetime), 0, - "Lifetime of dyn. rules for syn"); -SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, - CTLFLAG_RW, &VNET_NAME(dyn_fin_lifetime), 0, - "Lifetime of dyn. 
rules for fin"); -SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, - CTLFLAG_RW, &VNET_NAME(dyn_rst_lifetime), 0, - "Lifetime of dyn. rules for rst"); -SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, - CTLFLAG_RW, &VNET_NAME(dyn_udp_lifetime), 0, - "Lifetime of dyn. rules for UDP"); -SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, - CTLFLAG_RW, &VNET_NAME(dyn_short_lifetime), 0, - "Lifetime of dyn. rules for other situations"); -SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, - CTLFLAG_RW, &VNET_NAME(dyn_keepalive), 0, - "Enable keepalives for dyn. rules"); - -SYSEND - -#endif /* SYSCTL_NODE */ - - -static __inline int -hash_packet6(struct ipfw_flow_id *id) -{ - u_int32_t i; - i = (id->dst_ip6.__u6_addr.__u6_addr32[2]) ^ - (id->dst_ip6.__u6_addr.__u6_addr32[3]) ^ - (id->src_ip6.__u6_addr.__u6_addr32[2]) ^ - (id->src_ip6.__u6_addr.__u6_addr32[3]) ^ - (id->dst_port) ^ (id->src_port); - return i; -} - -/* - * IMPORTANT: the hash function for dynamic rules must be commutative - * in source and destination (ip,port), because rules are bidirectional - * and we want to find both in the same bucket. - */ -static __inline int -hash_packet(struct ipfw_flow_id *id) -{ - u_int32_t i; - -#ifdef INET6 - if (IS_IP6_FLOW_ID(id)) - i = hash_packet6(id); - else -#endif /* INET6 */ - i = (id->dst_ip) ^ (id->src_ip) ^ (id->dst_port) ^ (id->src_port); - i &= (V_curr_dyn_buckets - 1); - return i; -} - -static __inline void -unlink_dyn_rule_print(struct ipfw_flow_id *id) -{ - struct in_addr da; -#ifdef INET6 - char src[INET6_ADDRSTRLEN], dst[INET6_ADDRSTRLEN]; -#else - char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN]; -#endif - -#ifdef INET6 - if (IS_IP6_FLOW_ID(id)) { - ip6_sprintf(src, &id->src_ip6); - ip6_sprintf(dst, &id->dst_ip6); - } else -#endif - { - da.s_addr = htonl(id->src_ip); - inet_ntop(AF_INET, &da, src, sizeof(src)); - da.s_addr = htonl(id->dst_ip); - inet_ntop(AF_INET, &da, dst, sizeof(dst)); - } - printf("ipfw: unlink entry %s %d -> %s %d, %d left\n", - src, id->src_port, dst, id->dst_port, V_dyn_count - 1); -} - -/** - * unlink a dynamic rule from a chain. prev is a pointer to - * the previous one, q is a pointer to the rule to delete, - * head is a pointer to the head of the queue. - * Modifies q and potentially also head. - */ -#define UNLINK_DYN_RULE(prev, head, q) { \ - ipfw_dyn_rule *old_q = q; \ - \ - /* remove a refcount to the parent */ \ - if (q->dyn_type == O_LIMIT) \ - q->parent->count--; \ - DEB(unlink_dyn_rule_print(&q->id);) \ - if (prev != NULL) \ - prev->next = q = q->next; \ - else \ - head = q = q->next; \ - V_dyn_count--; \ - uma_zfree(ipfw_dyn_rule_zone, old_q); } - -#define TIME_LEQ(a,b) ((int)((a)-(b)) <= 0) - -/** - * Remove dynamic rules pointing to "rule", or all of them if rule == NULL. - * - * If keep_me == NULL, rules are deleted even if not expired, - * otherwise only expired rules are removed. - * - * The value of the second parameter is also used to point to identify - * a rule we absolutely do not want to remove (e.g. because we are - * holding a reference to it -- this is the case with O_LIMIT_PARENT - * rules). The pointer is only used for comparison, so any non-null - * value will do. 
- */ -static void -remove_dyn_rule(struct ip_fw *rule, ipfw_dyn_rule *keep_me) -{ - static u_int32_t last_remove = 0; - -#define FORCE (keep_me == NULL) - - ipfw_dyn_rule *prev, *q; - int i, pass = 0, max_pass = 0; - - IPFW_DYN_LOCK_ASSERT(); - - if (V_ipfw_dyn_v == NULL || V_dyn_count == 0) - return; - /* do not expire more than once per second, it is useless */ - if (!FORCE && last_remove == time_uptime) - return; - last_remove = time_uptime; - - /* - * because O_LIMIT refer to parent rules, during the first pass only - * remove child and mark any pending LIMIT_PARENT, and remove - * them in a second pass. - */ -next_pass: - for (i = 0 ; i < V_curr_dyn_buckets ; i++) { - for (prev=NULL, q = V_ipfw_dyn_v[i] ; q ; ) { - /* - * Logic can become complex here, so we split tests. - */ - if (q == keep_me) - goto next; - if (rule != NULL && rule != q->rule) - goto next; /* not the one we are looking for */ - if (q->dyn_type == O_LIMIT_PARENT) { - /* - * handle parent in the second pass, - * record we need one. - */ - max_pass = 1; - if (pass == 0) - goto next; - if (FORCE && q->count != 0 ) { - /* XXX should not happen! */ - printf("ipfw: OUCH! cannot remove rule," - " count %d\n", q->count); - } - } else { - if (!FORCE && - !TIME_LEQ( q->expire, time_uptime )) - goto next; - } - if (q->dyn_type != O_LIMIT_PARENT || !q->count) { - UNLINK_DYN_RULE(prev, V_ipfw_dyn_v[i], q); - continue; - } -next: - prev=q; - q=q->next; - } - } - if (pass++ < max_pass) - goto next_pass; -} - -void -ipfw_remove_dyn_children(struct ip_fw *rule) -{ - IPFW_DYN_LOCK(); - remove_dyn_rule(rule, NULL /* force removal */); - IPFW_DYN_UNLOCK(); -} - -/* - * Lookup a dynamic rule, locked version. - */ -static ipfw_dyn_rule * -lookup_dyn_rule_locked(struct ipfw_flow_id *pkt, int *match_direction, - struct tcphdr *tcp) -{ - /* - * Stateful ipfw extensions. - * Lookup into dynamic session queue. 
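The locked lookup that follows tries each entry in the bucket against the packet's addresses and ports in both orientations and reports which direction matched. Stripped of IPv6, expiry, and locking, the core test looks roughly like this (simplified types, illustrative only):

#include <stdio.h>
#include <stdint.h>

/* Reduced flow id, just enough to show the two-way match. */
struct flow {
	uint32_t src_ip, dst_ip;
	uint16_t sport, dport;
};

enum { MATCH_REVERSE = 0, MATCH_FORWARD = 1, MATCH_NONE = 2 };

/* A stored entry matches a packet travelling either way; the caller only
 * learns which direction it was. */
static int
match_dir(const struct flow *pkt, const struct flow *state)
{

	if (pkt->src_ip == state->src_ip && pkt->dst_ip == state->dst_ip &&
	    pkt->sport == state->sport && pkt->dport == state->dport)
		return (MATCH_FORWARD);
	if (pkt->src_ip == state->dst_ip && pkt->dst_ip == state->src_ip &&
	    pkt->sport == state->dport && pkt->dport == state->sport)
		return (MATCH_REVERSE);
	return (MATCH_NONE);
}

int
main(void)
{
	struct flow state = { 0x0a000001, 0x0a000002, 12345, 443 };
	struct flow reply = { 0x0a000002, 0x0a000001, 443, 12345 };

	printf("%d\n", match_dir(&reply, &state));	/* 0, MATCH_REVERSE */
	return (0);
}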
- */ -#define MATCH_REVERSE 0 -#define MATCH_FORWARD 1 -#define MATCH_NONE 2 -#define MATCH_UNKNOWN 3 - int i, dir = MATCH_NONE; - ipfw_dyn_rule *prev, *q = NULL; - - IPFW_DYN_LOCK_ASSERT(); - - if (V_ipfw_dyn_v == NULL) - goto done; /* not found */ - i = hash_packet(pkt); - for (prev = NULL, q = V_ipfw_dyn_v[i]; q != NULL;) { - if (q->dyn_type == O_LIMIT_PARENT && q->count) - goto next; - if (TIME_LEQ(q->expire, time_uptime)) { /* expire entry */ - UNLINK_DYN_RULE(prev, V_ipfw_dyn_v[i], q); - continue; - } - if (pkt->proto != q->id.proto || q->dyn_type == O_LIMIT_PARENT) - goto next; - - if (IS_IP6_FLOW_ID(pkt)) { - if (IN6_ARE_ADDR_EQUAL(&pkt->src_ip6, &q->id.src_ip6) && - IN6_ARE_ADDR_EQUAL(&pkt->dst_ip6, &q->id.dst_ip6) && - pkt->src_port == q->id.src_port && - pkt->dst_port == q->id.dst_port) { - dir = MATCH_FORWARD; - break; - } - if (IN6_ARE_ADDR_EQUAL(&pkt->src_ip6, &q->id.dst_ip6) && - IN6_ARE_ADDR_EQUAL(&pkt->dst_ip6, &q->id.src_ip6) && - pkt->src_port == q->id.dst_port && - pkt->dst_port == q->id.src_port) { - dir = MATCH_REVERSE; - break; - } - } else { - if (pkt->src_ip == q->id.src_ip && - pkt->dst_ip == q->id.dst_ip && - pkt->src_port == q->id.src_port && - pkt->dst_port == q->id.dst_port) { - dir = MATCH_FORWARD; - break; - } - if (pkt->src_ip == q->id.dst_ip && - pkt->dst_ip == q->id.src_ip && - pkt->src_port == q->id.dst_port && - pkt->dst_port == q->id.src_port) { - dir = MATCH_REVERSE; - break; - } - } -next: - prev = q; - q = q->next; - } - if (q == NULL) - goto done; /* q = NULL, not found */ - - if (prev != NULL) { /* found and not in front */ - prev->next = q->next; - q->next = V_ipfw_dyn_v[i]; - V_ipfw_dyn_v[i] = q; - } - if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */ - uint32_t ack; - u_char flags = pkt->_flags & (TH_FIN | TH_SYN | TH_RST); - -#define BOTH_SYN (TH_SYN | (TH_SYN << 8)) -#define BOTH_FIN (TH_FIN | (TH_FIN << 8)) -#define TCP_FLAGS (TH_FLAGS | (TH_FLAGS << 8)) -#define ACK_FWD 0x10000 /* fwd ack seen */ -#define ACK_REV 0x20000 /* rev ack seen */ - - q->state |= (dir == MATCH_FORWARD) ? flags : (flags << 8); - switch (q->state & TCP_FLAGS) { - case TH_SYN: /* opening */ - q->expire = time_uptime + V_dyn_syn_lifetime; - break; - - case BOTH_SYN: /* move to established */ - case BOTH_SYN | TH_FIN: /* one side tries to close */ - case BOTH_SYN | (TH_FIN << 8): -#define _SEQ_GE(a,b) ((int)(a) - (int)(b) >= 0) - if (tcp == NULL) - break; - - ack = ntohl(tcp->th_ack); - if (dir == MATCH_FORWARD) { - if (q->ack_fwd == 0 || - _SEQ_GE(ack, q->ack_fwd)) { - q->ack_fwd = ack; - q->state |= ACK_FWD; - } - } else { - if (q->ack_rev == 0 || - _SEQ_GE(ack, q->ack_rev)) { - q->ack_rev = ack; - q->state |= ACK_REV; - } - } - if ((q->state & (ACK_FWD | ACK_REV)) == - (ACK_FWD | ACK_REV)) { - q->expire = time_uptime + V_dyn_ack_lifetime; - q->state &= ~(ACK_FWD | ACK_REV); - } - break; - - case BOTH_SYN | BOTH_FIN: /* both sides closed */ - if (V_dyn_fin_lifetime >= V_dyn_keepalive_period) - V_dyn_fin_lifetime = V_dyn_keepalive_period - 1; - q->expire = time_uptime + V_dyn_fin_lifetime; - break; - - default: -#if 0 - /* - * reset or some invalid combination, but can also - * occur if we use keep-state the wrong way. 
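The state word updated above keeps the forward direction's TCP flags in the low byte and the reverse direction's flags in the high byte, which is why BOTH_SYN only tests true once a SYN has been seen both ways. A compact illustration:

#include <stdio.h>
#include <stdint.h>

#define TH_SYN		0x02
#define BOTH_SYN	(TH_SYN | (TH_SYN << 8))

int
main(void)
{
	uint16_t state = 0;

	state |= TH_SYN;	/* SYN seen in the forward direction */
	printf("established? %d\n", (state & BOTH_SYN) == BOTH_SYN);	/* 0 */
	state |= TH_SYN << 8;	/* SYN seen in the reverse direction */
	printf("established? %d\n", (state & BOTH_SYN) == BOTH_SYN);	/* 1 */
	return (0);
}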
- */ - if ( (q->state & ((TH_RST << 8)|TH_RST)) == 0) - printf("invalid state: 0x%x\n", q->state); -#endif - if (V_dyn_rst_lifetime >= V_dyn_keepalive_period) - V_dyn_rst_lifetime = V_dyn_keepalive_period - 1; - q->expire = time_uptime + V_dyn_rst_lifetime; - break; - } - } else if (pkt->proto == IPPROTO_UDP) { - q->expire = time_uptime + V_dyn_udp_lifetime; - } else { - /* other protocols */ - q->expire = time_uptime + V_dyn_short_lifetime; - } -done: - if (match_direction != NULL) - *match_direction = dir; - return (q); -} - -ipfw_dyn_rule * -ipfw_lookup_dyn_rule(struct ipfw_flow_id *pkt, int *match_direction, - struct tcphdr *tcp) -{ - ipfw_dyn_rule *q; - - IPFW_DYN_LOCK(); - q = lookup_dyn_rule_locked(pkt, match_direction, tcp); - if (q == NULL) - IPFW_DYN_UNLOCK(); - /* NB: return table locked when q is not NULL */ - return q; -} - -static void -realloc_dynamic_table(void) -{ - IPFW_DYN_LOCK_ASSERT(); - - /* - * Try reallocation, make sure we have a power of 2 and do - * not allow more than 64k entries. In case of overflow, - * default to 1024. - */ - - if (V_dyn_buckets > 65536) - V_dyn_buckets = 1024; - if ((V_dyn_buckets & (V_dyn_buckets-1)) != 0) { /* not a power of 2 */ - V_dyn_buckets = V_curr_dyn_buckets; /* reset */ - return; - } - V_curr_dyn_buckets = V_dyn_buckets; - if (V_ipfw_dyn_v != NULL) - free(V_ipfw_dyn_v, M_IPFW); - for (;;) { - V_ipfw_dyn_v = malloc(V_curr_dyn_buckets * sizeof(ipfw_dyn_rule *), - M_IPFW, M_NOWAIT | M_ZERO); - if (V_ipfw_dyn_v != NULL || V_curr_dyn_buckets <= 2) - break; - V_curr_dyn_buckets /= 2; - } -} - -/** - * Install state of type 'type' for a dynamic session. - * The hash table contains two type of rules: - * - regular rules (O_KEEP_STATE) - * - rules for sessions with limited number of sess per user - * (O_LIMIT). When they are created, the parent is - * increased by 1, and decreased on delete. In this case, - * the third parameter is the parent rule and not the chain. - * - "parent" rules for the above (O_LIMIT_PARENT). - */ -static ipfw_dyn_rule * -add_dyn_rule(struct ipfw_flow_id *id, u_int8_t dyn_type, struct ip_fw *rule) -{ - ipfw_dyn_rule *r; - int i; - - IPFW_DYN_LOCK_ASSERT(); - - if (V_ipfw_dyn_v == NULL || - (V_dyn_count == 0 && V_dyn_buckets != V_curr_dyn_buckets)) { - realloc_dynamic_table(); - if (V_ipfw_dyn_v == NULL) - return NULL; /* failed ! 
*/ - } - i = hash_packet(id); - - r = uma_zalloc(ipfw_dyn_rule_zone, M_NOWAIT | M_ZERO); - if (r == NULL) { - printf ("ipfw: sorry cannot allocate state\n"); - return NULL; - } - - /* increase refcount on parent, and set pointer */ - if (dyn_type == O_LIMIT) { - ipfw_dyn_rule *parent = (ipfw_dyn_rule *)rule; - if ( parent->dyn_type != O_LIMIT_PARENT) - panic("invalid parent"); - parent->count++; - r->parent = parent; - rule = parent->rule; - } - - r->id = *id; - r->expire = time_uptime + V_dyn_syn_lifetime; - r->rule = rule; - r->dyn_type = dyn_type; - r->pcnt = r->bcnt = 0; - r->count = 0; - - r->bucket = i; - r->next = V_ipfw_dyn_v[i]; - V_ipfw_dyn_v[i] = r; - V_dyn_count++; - DEB({ - struct in_addr da; -#ifdef INET6 - char src[INET6_ADDRSTRLEN]; - char dst[INET6_ADDRSTRLEN]; -#else - char src[INET_ADDRSTRLEN]; - char dst[INET_ADDRSTRLEN]; -#endif - -#ifdef INET6 - if (IS_IP6_FLOW_ID(&(r->id))) { - ip6_sprintf(src, &r->id.src_ip6); - ip6_sprintf(dst, &r->id.dst_ip6); - } else -#endif - { - da.s_addr = htonl(r->id.src_ip); - inet_ntop(AF_INET, &da, src, sizeof(src)); - da.s_addr = htonl(r->id.dst_ip); - inet_ntop(AF_INET, &da, dst, sizeof(dst)); - } - printf("ipfw: add dyn entry ty %d %s %d -> %s %d, total %d\n", - dyn_type, src, r->id.src_port, dst, r->id.dst_port, - V_dyn_count); - }) - return r; -} - -/** - * lookup dynamic parent rule using pkt and rule as search keys. - * If the lookup fails, then install one. - */ -static ipfw_dyn_rule * -lookup_dyn_parent(struct ipfw_flow_id *pkt, struct ip_fw *rule) -{ - ipfw_dyn_rule *q; - int i; - - IPFW_DYN_LOCK_ASSERT(); - - if (V_ipfw_dyn_v) { - int is_v6 = IS_IP6_FLOW_ID(pkt); - i = hash_packet( pkt ); - for (q = V_ipfw_dyn_v[i] ; q != NULL ; q=q->next) - if (q->dyn_type == O_LIMIT_PARENT && - rule== q->rule && - pkt->proto == q->id.proto && - pkt->src_port == q->id.src_port && - pkt->dst_port == q->id.dst_port && - ( - (is_v6 && - IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6), - &(q->id.src_ip6)) && - IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6), - &(q->id.dst_ip6))) || - (!is_v6 && - pkt->src_ip == q->id.src_ip && - pkt->dst_ip == q->id.dst_ip) - ) - ) { - q->expire = time_uptime + V_dyn_short_lifetime; - DEB(printf("ipfw: lookup_dyn_parent found 0x%p\n",q);) - return q; - } - } - return add_dyn_rule(pkt, O_LIMIT_PARENT, rule); -} - -/** - * Install dynamic state for rule type cmd->o.opcode - * - * Returns 1 (failure) if state is not installed because of errors or because - * session limitations are enforced. 
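For O_LIMIT rules, the code below zeroes a flow id and copies in only the fields selected by limit_mask, so all sessions sharing those fields are counted under one parent entry. Roughly, in a simplified sketch (the flag names here are illustrative, not the kernel's DYN_* values):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Illustrative mask bits (not the kernel's DYN_* values). */
#define LIM_SRC_ADDR	0x01
#define LIM_DST_ADDR	0x02
#define LIM_SRC_PORT	0x04
#define LIM_DST_PORT	0x08

struct flow {
	uint32_t src_ip, dst_ip;
	uint16_t sport, dport;
};

/* Only the masked fields go into the parent key, so e.g. "limit src-addr N"
 * groups every session from one source address under a single parent. */
static struct flow
limit_key(const struct flow *pkt, unsigned int mask)
{
	struct flow key;

	memset(&key, 0, sizeof(key));
	if (mask & LIM_SRC_ADDR)
		key.src_ip = pkt->src_ip;
	if (mask & LIM_DST_ADDR)
		key.dst_ip = pkt->dst_ip;
	if (mask & LIM_SRC_PORT)
		key.sport = pkt->sport;
	if (mask & LIM_DST_PORT)
		key.dport = pkt->dport;
	return (key);
}

int
main(void)
{
	struct flow pkt = { 0x0a000001, 0x0a000002, 40000, 22 };
	struct flow key = limit_key(&pkt, LIM_SRC_ADDR);

	printf("key %08x %08x %u %u\n", (unsigned)key.src_ip,
	    (unsigned)key.dst_ip, (unsigned)key.sport, (unsigned)key.dport);
	return (0);
}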
- */ -int -ipfw_install_state(struct ip_fw *rule, ipfw_insn_limit *cmd, - struct ip_fw_args *args, uint32_t tablearg) -{ - static int last_log; - ipfw_dyn_rule *q; - struct in_addr da; -#ifdef INET6 - char src[INET6_ADDRSTRLEN + 2], dst[INET6_ADDRSTRLEN + 2]; -#else - char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN]; -#endif - - src[0] = '\0'; - dst[0] = '\0'; - - IPFW_DYN_LOCK(); - - DEB( -#ifdef INET6 - if (IS_IP6_FLOW_ID(&(args->f_id))) { - ip6_sprintf(src, &args->f_id.src_ip6); - ip6_sprintf(dst, &args->f_id.dst_ip6); - } else -#endif - { - da.s_addr = htonl(args->f_id.src_ip); - inet_ntop(AF_INET, &da, src, sizeof(src)); - da.s_addr = htonl(args->f_id.dst_ip); - inet_ntop(AF_INET, &da, dst, sizeof(dst)); - } - printf("ipfw: %s: type %d %s %u -> %s %u\n", - __func__, cmd->o.opcode, src, args->f_id.src_port, - dst, args->f_id.dst_port); - src[0] = '\0'; - dst[0] = '\0'; - ) - - q = lookup_dyn_rule_locked(&args->f_id, NULL, NULL); - - if (q != NULL) { /* should never occur */ - DEB( - if (last_log != time_uptime) { - last_log = time_uptime; - printf("ipfw: %s: entry already present, done\n", - __func__); - }) - IPFW_DYN_UNLOCK(); - return (0); - } - - if (V_dyn_count >= V_dyn_max) - /* Run out of slots, try to remove any expired rule. */ - remove_dyn_rule(NULL, (ipfw_dyn_rule *)1); - - if (V_dyn_count >= V_dyn_max) { - if (last_log != time_uptime) { - last_log = time_uptime; - printf("ipfw: %s: Too many dynamic rules\n", __func__); - } - IPFW_DYN_UNLOCK(); - return (1); /* cannot install, notify caller */ - } - - switch (cmd->o.opcode) { - case O_KEEP_STATE: /* bidir rule */ - add_dyn_rule(&args->f_id, O_KEEP_STATE, rule); - break; - - case O_LIMIT: { /* limit number of sessions */ - struct ipfw_flow_id id; - ipfw_dyn_rule *parent; - uint32_t conn_limit; - uint16_t limit_mask = cmd->limit_mask; - - conn_limit = (cmd->conn_limit == IP_FW_TABLEARG) ? - tablearg : cmd->conn_limit; - - DEB( - if (cmd->conn_limit == IP_FW_TABLEARG) - printf("ipfw: %s: O_LIMIT rule, conn_limit: %u " - "(tablearg)\n", __func__, conn_limit); - else - printf("ipfw: %s: O_LIMIT rule, conn_limit: %u\n", - __func__, conn_limit); - ) - - id.dst_ip = id.src_ip = id.dst_port = id.src_port = 0; - id.proto = args->f_id.proto; - id.addr_type = args->f_id.addr_type; - id.fib = M_GETFIB(args->m); - - if (IS_IP6_FLOW_ID (&(args->f_id))) { - if (limit_mask & DYN_SRC_ADDR) - id.src_ip6 = args->f_id.src_ip6; - if (limit_mask & DYN_DST_ADDR) - id.dst_ip6 = args->f_id.dst_ip6; - } else { - if (limit_mask & DYN_SRC_ADDR) - id.src_ip = args->f_id.src_ip; - if (limit_mask & DYN_DST_ADDR) - id.dst_ip = args->f_id.dst_ip; - } - if (limit_mask & DYN_SRC_PORT) - id.src_port = args->f_id.src_port; - if (limit_mask & DYN_DST_PORT) - id.dst_port = args->f_id.dst_port; - if ((parent = lookup_dyn_parent(&id, rule)) == NULL) { - printf("ipfw: %s: add parent failed\n", __func__); - IPFW_DYN_UNLOCK(); - return (1); - } - - if (parent->count >= conn_limit) { - /* See if we can remove some expired rule. */ - remove_dyn_rule(rule, parent); - if (parent->count >= conn_limit) { - if (V_fw_verbose && last_log != time_uptime) { - last_log = time_uptime; -#ifdef INET6 - /* - * XXX IPv6 flows are not - * supported yet. 
- */ - if (IS_IP6_FLOW_ID(&(args->f_id))) { - char ip6buf[INET6_ADDRSTRLEN]; - snprintf(src, sizeof(src), - "[%s]", ip6_sprintf(ip6buf, - &args->f_id.src_ip6)); - snprintf(dst, sizeof(dst), - "[%s]", ip6_sprintf(ip6buf, - &args->f_id.dst_ip6)); - } else -#endif - { - da.s_addr = - htonl(args->f_id.src_ip); - inet_ntop(AF_INET, &da, src, - sizeof(src)); - da.s_addr = - htonl(args->f_id.dst_ip); - inet_ntop(AF_INET, &da, dst, - sizeof(dst)); - } - log(LOG_SECURITY | LOG_DEBUG, - "ipfw: %d %s %s:%u -> %s:%u, %s\n", - parent->rule->rulenum, - "drop session", - src, (args->f_id.src_port), - dst, (args->f_id.dst_port), - "too many entries"); - } - IPFW_DYN_UNLOCK(); - return (1); - } - } - add_dyn_rule(&args->f_id, O_LIMIT, (struct ip_fw *)parent); - break; - } - default: - printf("ipfw: %s: unknown dynamic rule type %u\n", - __func__, cmd->o.opcode); - IPFW_DYN_UNLOCK(); - return (1); - } - - /* XXX just set lifetime */ - lookup_dyn_rule_locked(&args->f_id, NULL, NULL); - - IPFW_DYN_UNLOCK(); - return (0); -} - -/* - * Generate a TCP packet, containing either a RST or a keepalive. - * When flags & TH_RST, we are sending a RST packet, because of a - * "reset" action matched the packet. - * Otherwise we are sending a keepalive, and flags & TH_ - * The 'replyto' mbuf is the mbuf being replied to, if any, and is required - * so that MAC can label the reply appropriately. - */ -struct mbuf * -ipfw_send_pkt(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t seq, - u_int32_t ack, int flags) -{ - struct mbuf *m = NULL; /* stupid compiler */ - int len, dir; - struct ip *h = NULL; /* stupid compiler */ -#ifdef INET6 - struct ip6_hdr *h6 = NULL; -#endif - struct tcphdr *th = NULL; - - MGETHDR(m, M_DONTWAIT, MT_DATA); - if (m == NULL) - return (NULL); - - M_SETFIB(m, id->fib); -#ifdef MAC - if (replyto != NULL) - mac_netinet_firewall_reply(replyto, m); - else - mac_netinet_firewall_send(m); -#else - (void)replyto; /* don't warn about unused arg */ -#endif - - switch (id->addr_type) { - case 4: - len = sizeof(struct ip) + sizeof(struct tcphdr); - break; -#ifdef INET6 - case 6: - len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); - break; -#endif - default: - /* XXX: log me?!? 
*/ - FREE_PKT(m); - return (NULL); - } - dir = ((flags & (TH_SYN | TH_RST)) == TH_SYN); - - m->m_data += max_linkhdr; - m->m_flags |= M_SKIP_FIREWALL; - m->m_pkthdr.len = m->m_len = len; - m->m_pkthdr.rcvif = NULL; - bzero(m->m_data, len); - - switch (id->addr_type) { - case 4: - h = mtod(m, struct ip *); - - /* prepare for checksum */ - h->ip_p = IPPROTO_TCP; - h->ip_len = htons(sizeof(struct tcphdr)); - if (dir) { - h->ip_src.s_addr = htonl(id->src_ip); - h->ip_dst.s_addr = htonl(id->dst_ip); - } else { - h->ip_src.s_addr = htonl(id->dst_ip); - h->ip_dst.s_addr = htonl(id->src_ip); - } - - th = (struct tcphdr *)(h + 1); - break; -#ifdef INET6 - case 6: - h6 = mtod(m, struct ip6_hdr *); - - /* prepare for checksum */ - h6->ip6_nxt = IPPROTO_TCP; - h6->ip6_plen = htons(sizeof(struct tcphdr)); - if (dir) { - h6->ip6_src = id->src_ip6; - h6->ip6_dst = id->dst_ip6; - } else { - h6->ip6_src = id->dst_ip6; - h6->ip6_dst = id->src_ip6; - } - - th = (struct tcphdr *)(h6 + 1); - break; -#endif - } - - if (dir) { - th->th_sport = htons(id->src_port); - th->th_dport = htons(id->dst_port); - } else { - th->th_sport = htons(id->dst_port); - th->th_dport = htons(id->src_port); - } - th->th_off = sizeof(struct tcphdr) >> 2; - - if (flags & TH_RST) { - if (flags & TH_ACK) { - th->th_seq = htonl(ack); - th->th_flags = TH_RST; - } else { - if (flags & TH_SYN) - seq++; - th->th_ack = htonl(seq); - th->th_flags = TH_RST | TH_ACK; - } - } else { - /* - * Keepalive - use caller provided sequence numbers - */ - th->th_seq = htonl(seq); - th->th_ack = htonl(ack); - th->th_flags = TH_ACK; - } - - switch (id->addr_type) { - case 4: - th->th_sum = in_cksum(m, len); - - /* finish the ip header */ - h->ip_v = 4; - h->ip_hl = sizeof(*h) >> 2; - h->ip_tos = IPTOS_LOWDELAY; - h->ip_off = 0; - /* ip_len must be in host format for ip_output */ - h->ip_len = len; - h->ip_ttl = V_ip_defttl; - h->ip_sum = 0; - break; -#ifdef INET6 - case 6: - th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(*h6), - sizeof(struct tcphdr)); - - /* finish the ip6 header */ - h6->ip6_vfc |= IPV6_VERSION; - h6->ip6_hlim = IPV6_DEFHLIM; - break; -#endif - } - - return (m); -} - -/* - * This procedure is only used to handle keepalives. It is invoked - * every dyn_keepalive_period - */ -static void -ipfw_tick(void * vnetx) -{ - struct mbuf *m0, *m, *mnext, **mtailp; -#ifdef INET6 - struct mbuf *m6, **m6_tailp; -#endif - int i; - ipfw_dyn_rule *q; -#ifdef VIMAGE - struct vnet *vp = vnetx; -#endif - - CURVNET_SET(vp); - if (V_dyn_keepalive == 0 || V_ipfw_dyn_v == NULL || V_dyn_count == 0) - goto done; - - /* - * We make a chain of packets to go out here -- not deferring - * until after we drop the IPFW dynamic rule lock would result - * in a lock order reversal with the normal packet input -> ipfw - * call stack. - */ - m0 = NULL; - mtailp = &m0; -#ifdef INET6 - m6 = NULL; - m6_tailp = &m6; -#endif - IPFW_DYN_LOCK(); - for (i = 0 ; i < V_curr_dyn_buckets ; i++) { - for (q = V_ipfw_dyn_v[i] ; q ; q = q->next ) { - if (q->dyn_type == O_LIMIT_PARENT) - continue; - if (q->id.proto != IPPROTO_TCP) - continue; - if ( (q->state & BOTH_SYN) != BOTH_SYN) - continue; - if (TIME_LEQ(time_uptime + V_dyn_keepalive_interval, - q->expire)) - continue; /* too early */ - if (TIME_LEQ(q->expire, time_uptime)) - continue; /* too late, rule expired */ - - m = (q->state & ACK_REV) ? NULL : - ipfw_send_pkt(NULL, &(q->id), q->ack_rev - 1, - q->ack_fwd, TH_SYN); - mnext = (q->state & ACK_FWD) ? 
NULL : - ipfw_send_pkt(NULL, &(q->id), q->ack_fwd - 1, - q->ack_rev, 0); - - switch (q->id.addr_type) { - case 4: - if (m != NULL) { - *mtailp = m; - mtailp = &(*mtailp)->m_nextpkt; - } - if (mnext != NULL) { - *mtailp = mnext; - mtailp = &(*mtailp)->m_nextpkt; - } - break; -#ifdef INET6 - case 6: - if (m != NULL) { - *m6_tailp = m; - m6_tailp = &(*m6_tailp)->m_nextpkt; - } - if (mnext != NULL) { - *m6_tailp = mnext; - m6_tailp = &(*m6_tailp)->m_nextpkt; - } - break; -#endif - } - } - } - IPFW_DYN_UNLOCK(); - for (m = m0; m != NULL; m = mnext) { - mnext = m->m_nextpkt; - m->m_nextpkt = NULL; - ip_output(m, NULL, NULL, 0, NULL, NULL); - } -#ifdef INET6 - for (m = m6; m != NULL; m = mnext) { - mnext = m->m_nextpkt; - m->m_nextpkt = NULL; - ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL); - } -#endif -done: - callout_reset_on(&V_ipfw_timeout, V_dyn_keepalive_period * hz, - ipfw_tick, vnetx, 0); - CURVNET_RESTORE(); -} - -void -ipfw_dyn_attach(void) -{ - ipfw_dyn_rule_zone = uma_zcreate("IPFW dynamic rule", - sizeof(ipfw_dyn_rule), NULL, NULL, NULL, NULL, - UMA_ALIGN_PTR, 0); - - IPFW_DYN_LOCK_INIT(); -} - -void -ipfw_dyn_detach(void) -{ - uma_zdestroy(ipfw_dyn_rule_zone); - IPFW_DYN_LOCK_DESTROY(); -} - -void -ipfw_dyn_init(void) -{ - V_ipfw_dyn_v = NULL; - V_dyn_buckets = 256; /* must be power of 2 */ - V_curr_dyn_buckets = 256; /* must be power of 2 */ - - V_dyn_ack_lifetime = 300; - V_dyn_syn_lifetime = 20; - V_dyn_fin_lifetime = 1; - V_dyn_rst_lifetime = 1; - V_dyn_udp_lifetime = 10; - V_dyn_short_lifetime = 5; - - V_dyn_keepalive_interval = 20; - V_dyn_keepalive_period = 5; - V_dyn_keepalive = 1; /* do send keepalives */ - - V_dyn_max = 4096; /* max # of dynamic rules */ - callout_init(&V_ipfw_timeout, CALLOUT_MPSAFE); - callout_reset_on(&V_ipfw_timeout, hz, ipfw_tick, curvnet, 0); -} - -void -ipfw_dyn_uninit(int pass) -{ - if (pass == 0) - callout_drain(&V_ipfw_timeout); - else { - if (V_ipfw_dyn_v != NULL) - free(V_ipfw_dyn_v, M_IPFW); - } -} - -int -ipfw_dyn_len(void) -{ - return (V_ipfw_dyn_v == NULL) ? 0 : - (V_dyn_count * sizeof(ipfw_dyn_rule)); -} - -void -ipfw_get_dynamic(char **pbp, const char *ep) -{ - ipfw_dyn_rule *p, *last = NULL; - char *bp; - int i; - - if (V_ipfw_dyn_v == NULL) - return; - bp = *pbp; - - IPFW_DYN_LOCK(); - for (i = 0 ; i < V_curr_dyn_buckets; i++) - for (p = V_ipfw_dyn_v[i] ; p != NULL; p = p->next) { - if (bp + sizeof *p <= ep) { - ipfw_dyn_rule *dst = - (ipfw_dyn_rule *)bp; - bcopy(p, dst, sizeof *p); - bcopy(&(p->rule->rulenum), &(dst->rule), - sizeof(p->rule->rulenum)); - /* - * store set number into high word of - * dst->rule pointer. - */ - bcopy(&(p->rule->set), - (char *)&dst->rule + - sizeof(p->rule->rulenum), - sizeof(p->rule->set)); - /* - * store a non-null value in "next". - * The userland code will interpret a - * NULL here as a marker - * for the last dynamic rule. - */ - bcopy(&dst, &dst->next, sizeof(dst)); - last = dst; - dst->expire = - TIME_LEQ(dst->expire, time_uptime) ? 
- 0 : dst->expire - time_uptime ; - bp += sizeof(ipfw_dyn_rule); - } - } - IPFW_DYN_UNLOCK(); - if (last != NULL) /* mark last dynamic rule */ - bzero(&last->next, sizeof(last)); - *pbp = bp; -} -/* end of file */ diff --git a/sys/netinet/ipfw/ip_fw_log.c b/sys/netinet/ipfw/ip_fw_log.c deleted file mode 100644 index c0f8fcd..0000000 --- a/sys/netinet/ipfw/ip_fw_log.c +++ /dev/null @@ -1,552 +0,0 @@ -/*- - * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - -/* - * Logging support for ipfw - */ - -#include "opt_ipfw.h" -#include "opt_inet.h" -#ifndef INET -#error IPFIREWALL requires INET. -#endif /* INET */ -#include "opt_inet6.h" - -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/mbuf.h> -#include <sys/kernel.h> -#include <sys/socket.h> -#include <sys/sysctl.h> -#include <sys/syslog.h> -#include <sys/lock.h> -#include <sys/rwlock.h> -#include <net/ethernet.h> /* for ETHERTYPE_IP */ -#include <net/if.h> -#include <net/if_clone.h> -#include <net/vnet.h> -#include <net/if_types.h> /* for IFT_PFLOG */ -#include <net/bpf.h> /* for BPF */ - -#include <netinet/in.h> -#include <netinet/ip.h> -#include <netinet/ip_icmp.h> -#include <netinet/ip_var.h> -#include <netinet/ip_fw.h> -#include <netinet/ipfw/ip_fw_private.h> -#include <netinet/tcp_var.h> -#include <netinet/udp.h> - -#include <netinet/ip6.h> -#include <netinet/icmp6.h> -#ifdef INET6 -#include <netinet6/in6_var.h> /* ip6_sprintf() */ -#endif - -#ifdef MAC -#include <security/mac/mac_framework.h> -#endif - -/* - * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T - * Other macros just cast void * into the appropriate type - */ -#define L3HDR(T, ip) ((T *)((u_int32_t *)(ip) + (ip)->ip_hl)) -#define TCP(p) ((struct tcphdr *)(p)) -#define SCTP(p) ((struct sctphdr *)(p)) -#define UDP(p) ((struct udphdr *)(p)) -#define ICMP(p) ((struct icmphdr *)(p)) -#define ICMP6(p) ((struct icmp6_hdr *)(p)) - -#define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? 
sizeof(buf) - len : 0 -#define SNP(buf) buf, sizeof(buf) - -#ifdef WITHOUT_BPF -void -ipfw_log_bpf(int onoff) -{ -} -#else /* !WITHOUT_BPF */ -static struct ifnet *log_if; /* hook to attach to bpf */ -static struct rwlock log_if_lock; -#define LOGIF_LOCK_INIT(x) rw_init(&log_if_lock, "ipfw log_if lock") -#define LOGIF_LOCK_DESTROY(x) rw_destroy(&log_if_lock) -#define LOGIF_RLOCK(x) rw_rlock(&log_if_lock) -#define LOGIF_RUNLOCK(x) rw_runlock(&log_if_lock) -#define LOGIF_WLOCK(x) rw_wlock(&log_if_lock) -#define LOGIF_WUNLOCK(x) rw_wunlock(&log_if_lock) - -#define IPFWNAME "ipfw" - -/* we use this dummy function for all ifnet callbacks */ -static int -log_dummy(struct ifnet *ifp, u_long cmd, caddr_t addr) -{ - return EINVAL; -} - -static int -ipfw_log_output(struct ifnet *ifp, struct mbuf *m, - struct sockaddr *dst, struct route *ro) -{ - if (m != NULL) - FREE_PKT(m); - return EINVAL; -} - -static void -ipfw_log_start(struct ifnet* ifp) -{ - panic("ipfw_log_start() must not be called"); -} - -static const u_char ipfwbroadcastaddr[6] = - { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; - -static int -ipfw_log_clone_match(struct if_clone *ifc, const char *name) -{ - - return (strncmp(name, IPFWNAME, sizeof(IPFWNAME) - 1) == 0); -} - -static int -ipfw_log_clone_create(struct if_clone *ifc, char *name, size_t len, - caddr_t params) -{ - int error; - int unit; - struct ifnet *ifp; - - error = ifc_name2unit(name, &unit); - if (error) - return (error); - - error = ifc_alloc_unit(ifc, &unit); - if (error) - return (error); - - ifp = if_alloc(IFT_PFLOG); - if (ifp == NULL) { - ifc_free_unit(ifc, unit); - return (ENOSPC); - } - ifp->if_dname = IPFWNAME; - ifp->if_dunit = unit; - snprintf(ifp->if_xname, IFNAMSIZ, "%s%d", IPFWNAME, unit); - strlcpy(name, ifp->if_xname, len); - ifp->if_mtu = 65536; - ifp->if_flags = IFF_UP | IFF_SIMPLEX | IFF_MULTICAST; - ifp->if_init = (void *)log_dummy; - ifp->if_ioctl = log_dummy; - ifp->if_start = ipfw_log_start; - ifp->if_output = ipfw_log_output; - ifp->if_addrlen = 6; - ifp->if_hdrlen = 14; - ifp->if_broadcastaddr = ipfwbroadcastaddr; - ifp->if_baudrate = IF_Mbps(10); - - LOGIF_WLOCK(); - if (log_if == NULL) - log_if = ifp; - else { - LOGIF_WUNLOCK(); - if_free(ifp); - ifc_free_unit(ifc, unit); - return (EEXIST); - } - LOGIF_WUNLOCK(); - if_attach(ifp); - bpfattach(ifp, DLT_EN10MB, 14); - - return (0); -} - -static int -ipfw_log_clone_destroy(struct if_clone *ifc, struct ifnet *ifp) -{ - int unit; - - if (ifp == NULL) - return (0); - - LOGIF_WLOCK(); - if (log_if != NULL && ifp == log_if) - log_if = NULL; - else { - LOGIF_WUNLOCK(); - return (EINVAL); - } - LOGIF_WUNLOCK(); - - unit = ifp->if_dunit; - bpfdetach(ifp); - if_detach(ifp); - if_free(ifp); - ifc_free_unit(ifc, unit); - - return (0); -} - -static struct if_clone ipfw_log_cloner = IFC_CLONE_INITIALIZER( - IPFWNAME, NULL, IF_MAXUNIT, - NULL, ipfw_log_clone_match, ipfw_log_clone_create, ipfw_log_clone_destroy); - -void -ipfw_log_bpf(int onoff) -{ - - if (onoff) { - LOGIF_LOCK_INIT(); - if_clone_attach(&ipfw_log_cloner); - } else { - if_clone_detach(&ipfw_log_cloner); - LOGIF_LOCK_DESTROY(); - } -} -#endif /* !WITHOUT_BPF */ - -/* - * We enter here when we have a rule with O_LOG. - * XXX this function alone takes about 2Kbytes of code! 
- */ -void -ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args, - struct mbuf *m, struct ifnet *oif, u_short offset, uint32_t tablearg, - struct ip *ip) -{ - char *action; - int limit_reached = 0; - char action2[92], proto[128], fragment[32]; - - if (V_fw_verbose == 0) { -#ifndef WITHOUT_BPF - LOGIF_RLOCK(); - if (log_if == NULL || log_if->if_bpf == NULL) { - LOGIF_RUNLOCK(); - return; - } - - if (args->eh) /* layer2, use orig hdr */ - BPF_MTAP2(log_if, args->eh, ETHER_HDR_LEN, m); - else - /* Add fake header. Later we will store - * more info in the header. - */ - BPF_MTAP2(log_if, "DDDDDDSSSSSS\x08\x00", ETHER_HDR_LEN, m); - LOGIF_RUNLOCK(); -#endif /* !WITHOUT_BPF */ - return; - } - /* the old 'log' function */ - fragment[0] = '\0'; - proto[0] = '\0'; - - if (f == NULL) { /* bogus pkt */ - if (V_verbose_limit != 0 && V_norule_counter >= V_verbose_limit) - return; - V_norule_counter++; - if (V_norule_counter == V_verbose_limit) - limit_reached = V_verbose_limit; - action = "Refuse"; - } else { /* O_LOG is the first action, find the real one */ - ipfw_insn *cmd = ACTION_PTR(f); - ipfw_insn_log *l = (ipfw_insn_log *)cmd; - - if (l->max_log != 0 && l->log_left == 0) - return; - l->log_left--; - if (l->log_left == 0) - limit_reached = l->max_log; - cmd += F_LEN(cmd); /* point to first action */ - if (cmd->opcode == O_ALTQ) { - ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd; - - snprintf(SNPARGS(action2, 0), "Altq %d", - altq->qid); - cmd += F_LEN(cmd); - } - if (cmd->opcode == O_PROB) - cmd += F_LEN(cmd); - - if (cmd->opcode == O_TAG) - cmd += F_LEN(cmd); - - action = action2; - switch (cmd->opcode) { - case O_DENY: - action = "Deny"; - break; - - case O_REJECT: - if (cmd->arg1==ICMP_REJECT_RST) - action = "Reset"; - else if (cmd->arg1==ICMP_UNREACH_HOST) - action = "Reject"; - else - snprintf(SNPARGS(action2, 0), "Unreach %d", - cmd->arg1); - break; - - case O_UNREACH6: - if (cmd->arg1==ICMP6_UNREACH_RST) - action = "Reset"; - else - snprintf(SNPARGS(action2, 0), "Unreach %d", - cmd->arg1); - break; - - case O_ACCEPT: - action = "Accept"; - break; - case O_COUNT: - action = "Count"; - break; - case O_DIVERT: - snprintf(SNPARGS(action2, 0), "Divert %d", - cmd->arg1); - break; - case O_TEE: - snprintf(SNPARGS(action2, 0), "Tee %d", - cmd->arg1); - break; - case O_SETFIB: - snprintf(SNPARGS(action2, 0), "SetFib %d", - cmd->arg1); - break; - case O_SKIPTO: - snprintf(SNPARGS(action2, 0), "SkipTo %d", - cmd->arg1); - break; - case O_PIPE: - snprintf(SNPARGS(action2, 0), "Pipe %d", - cmd->arg1); - break; - case O_QUEUE: - snprintf(SNPARGS(action2, 0), "Queue %d", - cmd->arg1); - break; - case O_FORWARD_IP: { - ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd; - int len; - struct in_addr dummyaddr; - if (sa->sa.sin_addr.s_addr == INADDR_ANY) - dummyaddr.s_addr = htonl(tablearg); - else - dummyaddr.s_addr = sa->sa.sin_addr.s_addr; - - len = snprintf(SNPARGS(action2, 0), "Forward to %s", - inet_ntoa(dummyaddr)); - - if (sa->sa.sin_port) - snprintf(SNPARGS(action2, len), ":%d", - sa->sa.sin_port); - } - break; -#ifdef INET6 - case O_FORWARD_IP6: { - char buf[INET6_ADDRSTRLEN]; - ipfw_insn_sa6 *sa = (ipfw_insn_sa6 *)cmd; - int len; - - len = snprintf(SNPARGS(action2, 0), "Forward to [%s]", - ip6_sprintf(buf, &sa->sa.sin6_addr)); - - if (sa->sa.sin6_port) - snprintf(SNPARGS(action2, len), ":%u", - sa->sa.sin6_port); - } - break; -#endif - case O_NETGRAPH: - snprintf(SNPARGS(action2, 0), "Netgraph %d", - cmd->arg1); - break; - case O_NGTEE: - snprintf(SNPARGS(action2, 0), "Ngtee %d", - cmd->arg1); - 
break; - case O_NAT: - action = "Nat"; - break; - case O_REASS: - action = "Reass"; - break; - case O_CALLRETURN: - if (cmd->len & F_NOT) - action = "Return"; - else - snprintf(SNPARGS(action2, 0), "Call %d", - cmd->arg1); - break; - default: - action = "UNKNOWN"; - break; - } - } - - if (hlen == 0) { /* non-ip */ - snprintf(SNPARGS(proto, 0), "MAC"); - - } else { - int len; -#ifdef INET6 - char src[INET6_ADDRSTRLEN + 2], dst[INET6_ADDRSTRLEN + 2]; -#else - char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN]; -#endif - struct icmphdr *icmp; - struct tcphdr *tcp; - struct udphdr *udp; -#ifdef INET6 - struct ip6_hdr *ip6 = NULL; - struct icmp6_hdr *icmp6; - u_short ip6f_mf; -#endif - src[0] = '\0'; - dst[0] = '\0'; -#ifdef INET6 - ip6f_mf = offset & IP6F_MORE_FRAG; - offset &= IP6F_OFF_MASK; - - if (IS_IP6_FLOW_ID(&(args->f_id))) { - char ip6buf[INET6_ADDRSTRLEN]; - snprintf(src, sizeof(src), "[%s]", - ip6_sprintf(ip6buf, &args->f_id.src_ip6)); - snprintf(dst, sizeof(dst), "[%s]", - ip6_sprintf(ip6buf, &args->f_id.dst_ip6)); - - ip6 = (struct ip6_hdr *)ip; - tcp = (struct tcphdr *)(((char *)ip) + hlen); - udp = (struct udphdr *)(((char *)ip) + hlen); - } else -#endif - { - tcp = L3HDR(struct tcphdr, ip); - udp = L3HDR(struct udphdr, ip); - - inet_ntop(AF_INET, &ip->ip_src, src, sizeof(src)); - inet_ntop(AF_INET, &ip->ip_dst, dst, sizeof(dst)); - } - - switch (args->f_id.proto) { - case IPPROTO_TCP: - len = snprintf(SNPARGS(proto, 0), "TCP %s", src); - if (offset == 0) - snprintf(SNPARGS(proto, len), ":%d %s:%d", - ntohs(tcp->th_sport), - dst, - ntohs(tcp->th_dport)); - else - snprintf(SNPARGS(proto, len), " %s", dst); - break; - - case IPPROTO_UDP: - len = snprintf(SNPARGS(proto, 0), "UDP %s", src); - if (offset == 0) - snprintf(SNPARGS(proto, len), ":%d %s:%d", - ntohs(udp->uh_sport), - dst, - ntohs(udp->uh_dport)); - else - snprintf(SNPARGS(proto, len), " %s", dst); - break; - - case IPPROTO_ICMP: - icmp = L3HDR(struct icmphdr, ip); - if (offset == 0) - len = snprintf(SNPARGS(proto, 0), - "ICMP:%u.%u ", - icmp->icmp_type, icmp->icmp_code); - else - len = snprintf(SNPARGS(proto, 0), "ICMP "); - len += snprintf(SNPARGS(proto, len), "%s", src); - snprintf(SNPARGS(proto, len), " %s", dst); - break; -#ifdef INET6 - case IPPROTO_ICMPV6: - icmp6 = (struct icmp6_hdr *)(((char *)ip) + hlen); - if (offset == 0) - len = snprintf(SNPARGS(proto, 0), - "ICMPv6:%u.%u ", - icmp6->icmp6_type, icmp6->icmp6_code); - else - len = snprintf(SNPARGS(proto, 0), "ICMPv6 "); - len += snprintf(SNPARGS(proto, len), "%s", src); - snprintf(SNPARGS(proto, len), " %s", dst); - break; -#endif - default: - len = snprintf(SNPARGS(proto, 0), "P:%d %s", - args->f_id.proto, src); - snprintf(SNPARGS(proto, len), " %s", dst); - break; - } - -#ifdef INET6 - if (IS_IP6_FLOW_ID(&(args->f_id))) { - if (offset & (IP6F_OFF_MASK | IP6F_MORE_FRAG)) - snprintf(SNPARGS(fragment, 0), - " (frag %08x:%d@%d%s)", - args->f_id.extra, - ntohs(ip6->ip6_plen) - hlen, - ntohs(offset) << 3, ip6f_mf ? "+" : ""); - } else -#endif - { - int ipoff, iplen; - ipoff = ntohs(ip->ip_off); - iplen = ntohs(ip->ip_len); - if (ipoff & (IP_MF | IP_OFFMASK)) - snprintf(SNPARGS(fragment, 0), - " (frag %d:%d@%d%s)", - ntohs(ip->ip_id), iplen - (ip->ip_hl << 2), - offset << 3, - (ipoff & IP_MF) ? "+" : ""); - } - } -#ifdef __FreeBSD__ - if (oif || m->m_pkthdr.rcvif) - log(LOG_SECURITY | LOG_INFO, - "ipfw: %d %s %s %s via %s%s\n", - f ? f->rulenum : -1, - action, proto, oif ? "out" : "in", - oif ? 
oif->if_xname : m->m_pkthdr.rcvif->if_xname, - fragment); - else -#endif - log(LOG_SECURITY | LOG_INFO, - "ipfw: %d %s %s [no if info]%s\n", - f ? f->rulenum : -1, - action, proto, fragment); - if (limit_reached) - log(LOG_SECURITY | LOG_NOTICE, - "ipfw: limit %d reached on entry %d\n", - limit_reached, f ? f->rulenum : -1); -} -/* end of file */ diff --git a/sys/netinet/ipfw/ip_fw_nat.c b/sys/netinet/ipfw/ip_fw_nat.c deleted file mode 100644 index dbeb254..0000000 --- a/sys/netinet/ipfw/ip_fw_nat.c +++ /dev/null @@ -1,661 +0,0 @@ -/*- - * Copyright (c) 2008 Paolo Pisati - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/eventhandler.h> -#include <sys/malloc.h> -#include <sys/kernel.h> -#include <sys/lock.h> -#include <sys/module.h> -#include <sys/rwlock.h> - -#define IPFW_INTERNAL /* Access to protected data structures in ip_fw.h. */ - -#include <netinet/libalias/alias.h> -#include <netinet/libalias/alias_local.h> - -#include <net/if.h> -#include <netinet/in.h> -#include <netinet/ip.h> -#include <netinet/ip_var.h> -#include <netinet/ip_fw.h> -#include <netinet/ipfw/ip_fw_private.h> -#include <netinet/tcp.h> -#include <netinet/udp.h> - -#include <machine/in_cksum.h> /* XXX for in_cksum */ - -static VNET_DEFINE(eventhandler_tag, ifaddr_event_tag); -#define V_ifaddr_event_tag VNET(ifaddr_event_tag) - -static void -ifaddr_change(void *arg __unused, struct ifnet *ifp) -{ - struct cfg_nat *ptr; - struct ifaddr *ifa; - struct ip_fw_chain *chain; - - chain = &V_layer3_chain; - IPFW_WLOCK(chain); - /* Check every nat entry... */ - LIST_FOREACH(ptr, &chain->nat, _next) { - /* ...using nic 'ifp->if_xname' as dynamic alias address. 
*/ - if (strncmp(ptr->if_name, ifp->if_xname, IF_NAMESIZE) != 0) - continue; - if_addr_rlock(ifp); - TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { - if (ifa->ifa_addr == NULL) - continue; - if (ifa->ifa_addr->sa_family != AF_INET) - continue; - ptr->ip = ((struct sockaddr_in *) - (ifa->ifa_addr))->sin_addr; - LibAliasSetAddress(ptr->lib, ptr->ip); - } - if_addr_runlock(ifp); - } - IPFW_WUNLOCK(chain); -} - -/* - * delete the pointers for nat entry ix, or all of them if ix < 0 - */ -static void -flush_nat_ptrs(struct ip_fw_chain *chain, const int ix) -{ - int i; - ipfw_insn_nat *cmd; - - IPFW_WLOCK_ASSERT(chain); - for (i = 0; i < chain->n_rules; i++) { - cmd = (ipfw_insn_nat *)ACTION_PTR(chain->map[i]); - /* XXX skip log and the like ? */ - if (cmd->o.opcode == O_NAT && cmd->nat != NULL && - (ix < 0 || cmd->nat->id == ix)) - cmd->nat = NULL; - } -} - -static void -del_redir_spool_cfg(struct cfg_nat *n, struct redir_chain *head) -{ - struct cfg_redir *r, *tmp_r; - struct cfg_spool *s, *tmp_s; - int i, num; - - LIST_FOREACH_SAFE(r, head, _next, tmp_r) { - num = 1; /* Number of alias_link to delete. */ - switch (r->mode) { - case REDIR_PORT: - num = r->pport_cnt; - /* FALLTHROUGH */ - case REDIR_ADDR: - case REDIR_PROTO: - /* Delete all libalias redirect entry. */ - for (i = 0; i < num; i++) - LibAliasRedirectDelete(n->lib, r->alink[i]); - /* Del spool cfg if any. */ - LIST_FOREACH_SAFE(s, &r->spool_chain, _next, tmp_s) { - LIST_REMOVE(s, _next); - free(s, M_IPFW); - } - free(r->alink, M_IPFW); - LIST_REMOVE(r, _next); - free(r, M_IPFW); - break; - default: - printf("unknown redirect mode: %u\n", r->mode); - /* XXX - panic?!?!? */ - break; - } - } -} - -static void -add_redir_spool_cfg(char *buf, struct cfg_nat *ptr) -{ - struct cfg_redir *r, *ser_r; - struct cfg_spool *s, *ser_s; - int cnt, off, i; - - for (cnt = 0, off = 0; cnt < ptr->redir_cnt; cnt++) { - ser_r = (struct cfg_redir *)&buf[off]; - r = malloc(SOF_REDIR, M_IPFW, M_WAITOK | M_ZERO); - memcpy(r, ser_r, SOF_REDIR); - LIST_INIT(&r->spool_chain); - off += SOF_REDIR; - r->alink = malloc(sizeof(struct alias_link *) * r->pport_cnt, - M_IPFW, M_WAITOK | M_ZERO); - switch (r->mode) { - case REDIR_ADDR: - r->alink[0] = LibAliasRedirectAddr(ptr->lib, r->laddr, - r->paddr); - break; - case REDIR_PORT: - for (i = 0 ; i < r->pport_cnt; i++) { - /* If remotePort is all ports, set it to 0. */ - u_short remotePortCopy = r->rport + i; - if (r->rport_cnt == 1 && r->rport == 0) - remotePortCopy = 0; - r->alink[i] = LibAliasRedirectPort(ptr->lib, - r->laddr, htons(r->lport + i), r->raddr, - htons(remotePortCopy), r->paddr, - htons(r->pport + i), r->proto); - if (r->alink[i] == NULL) { - r->alink[0] = NULL; - break; - } - } - break; - case REDIR_PROTO: - r->alink[0] = LibAliasRedirectProto(ptr->lib ,r->laddr, - r->raddr, r->paddr, r->proto); - break; - default: - printf("unknown redirect mode: %u\n", r->mode); - break; - } - /* XXX perhaps return an error instead of panic ? */ - if (r->alink[0] == NULL) - panic("LibAliasRedirect* returned NULL"); - /* LSNAT handling. */ - for (i = 0; i < r->spool_cnt; i++) { - ser_s = (struct cfg_spool *)&buf[off]; - s = malloc(SOF_REDIR, M_IPFW, M_WAITOK | M_ZERO); - memcpy(s, ser_s, SOF_SPOOL); - LibAliasAddServer(ptr->lib, r->alink[0], - s->addr, htons(s->port)); - off += SOF_SPOOL; - /* Hook spool entry. */ - LIST_INSERT_HEAD(&r->spool_chain, s, _next); - } - /* And finally hook this redir entry. 
*/ - LIST_INSERT_HEAD(&ptr->redir_chain, r, _next); - } -} - -static int -ipfw_nat(struct ip_fw_args *args, struct cfg_nat *t, struct mbuf *m) -{ - struct mbuf *mcl; - struct ip *ip; - /* XXX - libalias duct tape */ - int ldt, retval, found; - struct ip_fw_chain *chain; - char *c; - - ldt = 0; - retval = 0; - mcl = m_megapullup(m, m->m_pkthdr.len); - if (mcl == NULL) { - args->m = NULL; - return (IP_FW_DENY); - } - ip = mtod(mcl, struct ip *); - - /* - * XXX - Libalias checksum offload 'duct tape': - * - * locally generated packets have only pseudo-header checksum - * calculated and libalias will break it[1], so mark them for - * later fix. Moreover there are cases when libalias modifies - * tcp packet data[2], mark them for later fix too. - * - * [1] libalias was never meant to run in kernel, so it does - * not have any knowledge about checksum offloading, and - * expects a packet with a full internet checksum. - * Unfortunately, packets generated locally will have just the - * pseudo header calculated, and when libalias tries to adjust - * the checksum it will actually compute a wrong value. - * - * [2] when libalias modifies tcp's data content, full TCP - * checksum has to be recomputed: the problem is that - * libalias does not have any idea about checksum offloading. - * To work around this, we do not do checksumming in LibAlias, - * but only mark the packets in th_x2 field. If we receive a - * marked packet, we calculate correct checksum for it - * aware of offloading. Why such a terrible hack instead of - * recalculating checksum for each packet? - * Because the previous checksum was not checked! - * Recalculating checksums for EVERY packet will hide ALL - * transmission errors. Yes, marked packets still suffer from - * this problem. But, sigh, natd(8) has this problem, too. - * - * TODO: -make libalias mbuf aware (so - * it can handle delayed checksum and tso) - */ - - if (mcl->m_pkthdr.rcvif == NULL && - mcl->m_pkthdr.csum_flags & CSUM_DELAY_DATA) - ldt = 1; - - c = mtod(mcl, char *); - - /* Check if this is 'global' instance */ - if (t == NULL) { - if (args->oif == NULL) { - /* Wrong direction, skip processing */ - args->m = mcl; - return (IP_FW_NAT); - } - - found = 0; - chain = &V_layer3_chain; - IPFW_RLOCK(chain); - /* Check every nat entry... */ - LIST_FOREACH(t, &chain->nat, _next) { - if ((t->mode & PKT_ALIAS_SKIP_GLOBAL) != 0) - continue; - retval = LibAliasOutTry(t->lib, c, - mcl->m_len + M_TRAILINGSPACE(mcl), 0); - if (retval == PKT_ALIAS_OK) { - /* Nat instance recognises state */ - found = 1; - break; - } - } - IPFW_RUNLOCK(chain); - if (found != 1) { - /* No instance found, return ignore */ - args->m = mcl; - return (IP_FW_NAT); - } - } else { - if (args->oif == NULL) - retval = LibAliasIn(t->lib, c, - mcl->m_len + M_TRAILINGSPACE(mcl)); - else - retval = LibAliasOut(t->lib, c, - mcl->m_len + M_TRAILINGSPACE(mcl)); - } - - /* - * We drop packet when: - * 1. libalias returns PKT_ALIAS_ERROR; - * 2. For incoming packets: - * a) for unresolved fragments; - * b) libalias returns PKT_ALIAS_IGNORED and - * PKT_ALIAS_DENY_INCOMING flag is set. - */ - if (retval == PKT_ALIAS_ERROR || - (args->oif == NULL && (retval == PKT_ALIAS_UNRESOLVED_FRAGMENT || - (retval == PKT_ALIAS_IGNORED && - (t->mode & PKT_ALIAS_DENY_INCOMING) != 0)))) { - /* XXX - should i add some logging? 
*/ - m_free(mcl); - args->m = NULL; - return (IP_FW_DENY); - } - - if (retval == PKT_ALIAS_RESPOND) - mcl->m_flags |= M_SKIP_FIREWALL; - mcl->m_pkthdr.len = mcl->m_len = ntohs(ip->ip_len); - - /* - * XXX - libalias checksum offload - * 'duct tape' (see above) - */ - - if ((ip->ip_off & htons(IP_OFFMASK)) == 0 && - ip->ip_p == IPPROTO_TCP) { - struct tcphdr *th; - - th = (struct tcphdr *)(ip + 1); - if (th->th_x2) - ldt = 1; - } - - if (ldt) { - struct tcphdr *th; - struct udphdr *uh; - u_short cksum; - - ip->ip_len = ntohs(ip->ip_len); - cksum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, - htons(ip->ip_p + ip->ip_len - (ip->ip_hl << 2))); - - switch (ip->ip_p) { - case IPPROTO_TCP: - th = (struct tcphdr *)(ip + 1); - /* - * Maybe it was set in - * libalias... - */ - th->th_x2 = 0; - th->th_sum = cksum; - mcl->m_pkthdr.csum_data = - offsetof(struct tcphdr, th_sum); - break; - case IPPROTO_UDP: - uh = (struct udphdr *)(ip + 1); - uh->uh_sum = cksum; - mcl->m_pkthdr.csum_data = - offsetof(struct udphdr, uh_sum); - break; - } - /* No hw checksum offloading: do it ourselves */ - if ((mcl->m_pkthdr.csum_flags & CSUM_DELAY_DATA) == 0) { - in_delayed_cksum(mcl); - mcl->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; - } - ip->ip_len = htons(ip->ip_len); - } - args->m = mcl; - return (IP_FW_NAT); -} - -static struct cfg_nat * -lookup_nat(struct nat_list *l, int nat_id) -{ - struct cfg_nat *res; - - LIST_FOREACH(res, l, _next) { - if (res->id == nat_id) - break; - } - return res; -} - -static int -ipfw_nat_cfg(struct sockopt *sopt) -{ - struct cfg_nat *cfg, *ptr; - char *buf; - struct ip_fw_chain *chain = &V_layer3_chain; - size_t len; - int gencnt, error = 0; - - len = sopt->sopt_valsize; - buf = malloc(len, M_TEMP, M_WAITOK | M_ZERO); - if ((error = sooptcopyin(sopt, buf, len, sizeof(struct cfg_nat))) != 0) - goto out; - - cfg = (struct cfg_nat *)buf; - if (cfg->id < 0) { - error = EINVAL; - goto out; - } - - /* - * Find/create nat rule. - */ - IPFW_WLOCK(chain); - gencnt = chain->gencnt; - ptr = lookup_nat(&chain->nat, cfg->id); - if (ptr == NULL) { - IPFW_WUNLOCK(chain); - /* New rule: allocate and init new instance. */ - ptr = malloc(sizeof(struct cfg_nat), M_IPFW, M_WAITOK | M_ZERO); - ptr->lib = LibAliasInit(NULL); - LIST_INIT(&ptr->redir_chain); - } else { - /* Entry already present: temporarily unhook it. */ - LIST_REMOVE(ptr, _next); - flush_nat_ptrs(chain, cfg->id); - IPFW_WUNLOCK(chain); - } - - /* - * Basic nat configuration. - */ - ptr->id = cfg->id; - /* - * XXX - what if this rule doesn't nat any ip and just - * redirect? - * do we set aliasaddress to 0.0.0.0? - */ - ptr->ip = cfg->ip; - ptr->redir_cnt = cfg->redir_cnt; - ptr->mode = cfg->mode; - LibAliasSetMode(ptr->lib, cfg->mode, cfg->mode); - LibAliasSetAddress(ptr->lib, ptr->ip); - memcpy(ptr->if_name, cfg->if_name, IF_NAMESIZE); - - /* - * Redir and LSNAT configuration. - */ - /* Delete old cfgs. */ - del_redir_spool_cfg(ptr, &ptr->redir_chain); - /* Add new entries. 
*/ - add_redir_spool_cfg(&buf[(sizeof(struct cfg_nat))], ptr); - - IPFW_WLOCK(chain); - /* Extra check to avoid race with another ipfw_nat_cfg() */ - if (gencnt != chain->gencnt && - ((cfg = lookup_nat(&chain->nat, ptr->id)) != NULL)) - LIST_REMOVE(cfg, _next); - LIST_INSERT_HEAD(&chain->nat, ptr, _next); - chain->gencnt++; - IPFW_WUNLOCK(chain); - -out: - free(buf, M_TEMP); - return (error); -} - -static int -ipfw_nat_del(struct sockopt *sopt) -{ - struct cfg_nat *ptr; - struct ip_fw_chain *chain = &V_layer3_chain; - int i; - - sooptcopyin(sopt, &i, sizeof i, sizeof i); - /* XXX validate i */ - IPFW_WLOCK(chain); - ptr = lookup_nat(&chain->nat, i); - if (ptr == NULL) { - IPFW_WUNLOCK(chain); - return (EINVAL); - } - LIST_REMOVE(ptr, _next); - flush_nat_ptrs(chain, i); - IPFW_WUNLOCK(chain); - del_redir_spool_cfg(ptr, &ptr->redir_chain); - LibAliasUninit(ptr->lib); - free(ptr, M_IPFW); - return (0); -} - -static int -ipfw_nat_get_cfg(struct sockopt *sopt) -{ - struct ip_fw_chain *chain = &V_layer3_chain; - struct cfg_nat *n; - struct cfg_redir *r; - struct cfg_spool *s; - char *data; - int gencnt, nat_cnt, len, error; - - nat_cnt = 0; - len = sizeof(nat_cnt); - - IPFW_RLOCK(chain); -retry: - gencnt = chain->gencnt; - /* Estimate memory amount */ - LIST_FOREACH(n, &chain->nat, _next) { - nat_cnt++; - len += sizeof(struct cfg_nat); - LIST_FOREACH(r, &n->redir_chain, _next) { - len += sizeof(struct cfg_redir); - LIST_FOREACH(s, &r->spool_chain, _next) - len += sizeof(struct cfg_spool); - } - } - IPFW_RUNLOCK(chain); - - data = malloc(len, M_TEMP, M_WAITOK | M_ZERO); - bcopy(&nat_cnt, data, sizeof(nat_cnt)); - - nat_cnt = 0; - len = sizeof(nat_cnt); - - IPFW_RLOCK(chain); - if (gencnt != chain->gencnt) { - free(data, M_TEMP); - goto retry; - } - /* Serialize all the data. 
*/ - LIST_FOREACH(n, &chain->nat, _next) { - bcopy(n, &data[len], sizeof(struct cfg_nat)); - len += sizeof(struct cfg_nat); - LIST_FOREACH(r, &n->redir_chain, _next) { - bcopy(r, &data[len], sizeof(struct cfg_redir)); - len += sizeof(struct cfg_redir); - LIST_FOREACH(s, &r->spool_chain, _next) { - bcopy(s, &data[len], sizeof(struct cfg_spool)); - len += sizeof(struct cfg_spool); - } - } - } - IPFW_RUNLOCK(chain); - - error = sooptcopyout(sopt, data, len); - free(data, M_TEMP); - - return (error); -} - -static int -ipfw_nat_get_log(struct sockopt *sopt) -{ - uint8_t *data; - struct cfg_nat *ptr; - int i, size; - struct ip_fw_chain *chain; - - chain = &V_layer3_chain; - - IPFW_RLOCK(chain); - /* one pass to count, one to copy the data */ - i = 0; - LIST_FOREACH(ptr, &chain->nat, _next) { - if (ptr->lib->logDesc == NULL) - continue; - i++; - } - size = i * (LIBALIAS_BUF_SIZE + sizeof(int)); - data = malloc(size, M_IPFW, M_NOWAIT | M_ZERO); - if (data == NULL) { - IPFW_RUNLOCK(chain); - return (ENOSPC); - } - i = 0; - LIST_FOREACH(ptr, &chain->nat, _next) { - if (ptr->lib->logDesc == NULL) - continue; - bcopy(&ptr->id, &data[i], sizeof(int)); - i += sizeof(int); - bcopy(ptr->lib->logDesc, &data[i], LIBALIAS_BUF_SIZE); - i += LIBALIAS_BUF_SIZE; - } - IPFW_RUNLOCK(chain); - sooptcopyout(sopt, data, size); - free(data, M_IPFW); - return(0); -} - -static void -ipfw_nat_init(void) -{ - - IPFW_WLOCK(&V_layer3_chain); - /* init ipfw hooks */ - ipfw_nat_ptr = ipfw_nat; - lookup_nat_ptr = lookup_nat; - ipfw_nat_cfg_ptr = ipfw_nat_cfg; - ipfw_nat_del_ptr = ipfw_nat_del; - ipfw_nat_get_cfg_ptr = ipfw_nat_get_cfg; - ipfw_nat_get_log_ptr = ipfw_nat_get_log; - IPFW_WUNLOCK(&V_layer3_chain); - V_ifaddr_event_tag = EVENTHANDLER_REGISTER( - ifaddr_event, ifaddr_change, - NULL, EVENTHANDLER_PRI_ANY); -} - -static void -ipfw_nat_destroy(void) -{ - struct cfg_nat *ptr, *ptr_temp; - struct ip_fw_chain *chain; - - chain = &V_layer3_chain; - IPFW_WLOCK(chain); - LIST_FOREACH_SAFE(ptr, &chain->nat, _next, ptr_temp) { - LIST_REMOVE(ptr, _next); - del_redir_spool_cfg(ptr, &ptr->redir_chain); - LibAliasUninit(ptr->lib); - free(ptr, M_IPFW); - } - EVENTHANDLER_DEREGISTER(ifaddr_event, V_ifaddr_event_tag); - flush_nat_ptrs(chain, -1 /* flush all */); - /* deregister ipfw_nat */ - ipfw_nat_ptr = NULL; - lookup_nat_ptr = NULL; - ipfw_nat_cfg_ptr = NULL; - ipfw_nat_del_ptr = NULL; - ipfw_nat_get_cfg_ptr = NULL; - ipfw_nat_get_log_ptr = NULL; - IPFW_WUNLOCK(chain); -} - -static int -ipfw_nat_modevent(module_t mod, int type, void *unused) -{ - int err = 0; - - switch (type) { - case MOD_LOAD: - ipfw_nat_init(); - break; - - case MOD_UNLOAD: - ipfw_nat_destroy(); - break; - - default: - return EOPNOTSUPP; - break; - } - return err; -} - -static moduledata_t ipfw_nat_mod = { - "ipfw_nat", - ipfw_nat_modevent, - 0 -}; - -DECLARE_MODULE(ipfw_nat, ipfw_nat_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); -MODULE_DEPEND(ipfw_nat, libalias, 1, 1, 1); -MODULE_DEPEND(ipfw_nat, ipfw, 2, 2, 2); -MODULE_VERSION(ipfw_nat, 1); -/* end of file */ diff --git a/sys/netinet/ipfw/ip_fw_pfil.c b/sys/netinet/ipfw/ip_fw_pfil.c deleted file mode 100644 index 8d429e7..0000000 --- a/sys/netinet/ipfw/ip_fw_pfil.c +++ /dev/null @@ -1,588 +0,0 @@ -/*- - * Copyright (c) 2004 Andre Oppermann, Internet Business Solutions AG - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. 
Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - -#include "opt_ipfw.h" -#include "opt_inet.h" -#include "opt_inet6.h" -#ifndef INET -#error IPFIREWALL requires INET. -#endif /* INET */ - -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/malloc.h> -#include <sys/mbuf.h> -#include <sys/module.h> -#include <sys/kernel.h> -#include <sys/lock.h> -#include <sys/rwlock.h> -#include <sys/socket.h> -#include <sys/sysctl.h> - -#include <net/if.h> -#include <net/route.h> -#include <net/ethernet.h> -#include <net/pfil.h> -#include <net/vnet.h> - -#include <netinet/in.h> -#include <netinet/in_systm.h> -#include <netinet/ip.h> -#include <netinet/ip_var.h> -#include <netinet/ip_fw.h> -#ifdef INET6 -#include <netinet/ip6.h> -#include <netinet6/ip6_var.h> -#endif -#include <netinet/ipfw/ip_fw_private.h> -#include <netgraph/ng_ipfw.h> - -#include <machine/in_cksum.h> - -static VNET_DEFINE(int, fw_enable) = 1; -#define V_fw_enable VNET(fw_enable) - -#ifdef INET6 -static VNET_DEFINE(int, fw6_enable) = 1; -#define V_fw6_enable VNET(fw6_enable) -#endif - -static VNET_DEFINE(int, fwlink_enable) = 0; -#define V_fwlink_enable VNET(fwlink_enable) - -int ipfw_chg_hook(SYSCTL_HANDLER_ARGS); - -/* Forward declarations. */ -static int ipfw_divert(struct mbuf **, int, struct ipfw_rule_ref *, int); -static int ipfw_check_packet(void *, struct mbuf **, struct ifnet *, int, - struct inpcb *); -static int ipfw_check_frame(void *, struct mbuf **, struct ifnet *, int, - struct inpcb *); - -#ifdef SYSCTL_NODE - -SYSBEGIN(f1) - -SYSCTL_DECL(_net_inet_ip_fw); -SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, enable, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_enable), 0, - ipfw_chg_hook, "I", "Enable ipfw"); -#ifdef INET6 -SYSCTL_DECL(_net_inet6_ip6_fw); -SYSCTL_VNET_PROC(_net_inet6_ip6_fw, OID_AUTO, enable, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw6_enable), 0, - ipfw_chg_hook, "I", "Enable ipfw+6"); -#endif /* INET6 */ - -SYSCTL_DECL(_net_link_ether); -SYSCTL_VNET_PROC(_net_link_ether, OID_AUTO, ipfw, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fwlink_enable), 0, - ipfw_chg_hook, "I", "Pass ether pkts through firewall"); - -SYSEND - -#endif /* SYSCTL_NODE */ - -/* - * The pfilter hook to pass packets to ipfw_chk and then to - * dummynet, divert, netgraph or other modules. - * The packet may be consumed. 
- */ -static int -ipfw_check_packet(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir, - struct inpcb *inp) -{ - struct ip_fw_args args; - struct m_tag *tag; - int ipfw; - int ret; - - /* all the processing now uses ip_len in net format */ - if (mtod(*m0, struct ip *)->ip_v == 4) - SET_NET_IPLEN(mtod(*m0, struct ip *)); - - /* convert dir to IPFW values */ - dir = (dir == PFIL_IN) ? DIR_IN : DIR_OUT; - bzero(&args, sizeof(args)); - -again: - /* - * extract and remove the tag if present. If we are left - * with onepass, optimize the outgoing path. - */ - tag = m_tag_locate(*m0, MTAG_IPFW_RULE, 0, NULL); - if (tag != NULL) { - args.rule = *((struct ipfw_rule_ref *)(tag+1)); - m_tag_delete(*m0, tag); - if (args.rule.info & IPFW_ONEPASS) { - if (mtod(*m0, struct ip *)->ip_v == 4) - SET_HOST_IPLEN(mtod(*m0, struct ip *)); - return (0); - } - } - - args.m = *m0; - args.oif = dir == DIR_OUT ? ifp : NULL; - args.inp = inp; - - ipfw = ipfw_chk(&args); - *m0 = args.m; - - KASSERT(*m0 != NULL || ipfw == IP_FW_DENY, ("%s: m0 is NULL", - __func__)); - - /* breaking out of the switch means drop */ - ret = 0; /* default return value for pass */ - switch (ipfw) { - case IP_FW_PASS: - /* next_hop may be set by ipfw_chk */ - if (args.next_hop == NULL && args.next_hop6 == NULL) - break; /* pass */ -#if !defined(IPFIREWALL_FORWARD) || (!defined(INET6) && !defined(INET)) - ret = EACCES; -#else - { - struct m_tag *fwd_tag; - size_t len; - - KASSERT(args.next_hop == NULL || args.next_hop6 == NULL, - ("%s: both next_hop=%p and next_hop6=%p not NULL", __func__, - args.next_hop, args.next_hop6)); -#ifdef INET6 - if (args.next_hop6 != NULL) - len = sizeof(struct sockaddr_in6); -#endif -#ifdef INET - if (args.next_hop != NULL) - len = sizeof(struct sockaddr_in); -#endif - - /* Incoming packets should not be tagged so we do not - * m_tag_find. Outgoing packets may be tagged, so we - * reuse the tag if present. - */ - fwd_tag = (dir == DIR_IN) ? NULL : - m_tag_find(*m0, PACKET_TAG_IPFORWARD, NULL); - if (fwd_tag != NULL) { - m_tag_unlink(*m0, fwd_tag); - } else { - fwd_tag = m_tag_get(PACKET_TAG_IPFORWARD, len, - M_NOWAIT); - if (fwd_tag == NULL) { - ret = EACCES; - break; /* i.e. drop */ - } - } -#ifdef INET6 - if (args.next_hop6 != NULL) { - bcopy(args.next_hop6, (fwd_tag+1), len); - if (in6_localip(&args.next_hop6->sin6_addr)) - (*m0)->m_flags |= M_FASTFWD_OURS; - } -#endif -#ifdef INET - if (args.next_hop != NULL) { - bcopy(args.next_hop, (fwd_tag+1), len); - if (in_localip(args.next_hop->sin_addr)) - (*m0)->m_flags |= M_FASTFWD_OURS; - } -#endif - m_tag_prepend(*m0, fwd_tag); - } -#endif /* IPFIREWALL_FORWARD */ - break; - - case IP_FW_DENY: - ret = EACCES; - break; /* i.e. drop */ - - case IP_FW_DUMMYNET: - ret = EACCES; - if (ip_dn_io_ptr == NULL) - break; /* i.e. drop */ - if (mtod(*m0, struct ip *)->ip_v == 4) - ret = ip_dn_io_ptr(m0, dir, &args); - else if (mtod(*m0, struct ip *)->ip_v == 6) - ret = ip_dn_io_ptr(m0, dir | PROTO_IPV6, &args); - else - break; /* drop it */ - /* - * XXX should read the return value. - * dummynet normally eats the packet and sets *m0=NULL - * unless the packet can be sent immediately. In this - * case args is updated and we should re-run the - * check without clearing args. - */ - if (*m0 != NULL) - goto again; - break; - - case IP_FW_TEE: - case IP_FW_DIVERT: - if (ip_divert_ptr == NULL) { - ret = EACCES; - break; /* i.e. drop */ - } - ret = ipfw_divert(m0, dir, &args.rule, - (ipfw == IP_FW_TEE) ? 1 : 0); - /* continue processing for the original packet (tee). 
*/ - if (*m0) - goto again; - break; - - case IP_FW_NGTEE: - case IP_FW_NETGRAPH: - if (ng_ipfw_input_p == NULL) { - ret = EACCES; - break; /* i.e. drop */ - } - ret = ng_ipfw_input_p(m0, dir, &args, - (ipfw == IP_FW_NGTEE) ? 1 : 0); - if (ipfw == IP_FW_NGTEE) /* ignore errors for NGTEE */ - goto again; /* continue with packet */ - break; - - case IP_FW_NAT: - /* honor one-pass in case of successful nat */ - if (V_fw_one_pass) - break; /* ret is already 0 */ - goto again; - - case IP_FW_REASS: - goto again; /* continue with packet */ - - default: - KASSERT(0, ("%s: unknown retval", __func__)); - } - - if (ret != 0) { - if (*m0) - FREE_PKT(*m0); - *m0 = NULL; - } - if (*m0 && mtod(*m0, struct ip *)->ip_v == 4) - SET_HOST_IPLEN(mtod(*m0, struct ip *)); - return ret; -} - -/* - * ipfw processing for ethernet packets (in and out). - * Inteface is NULL from ether_demux, and ifp from - * ether_output_frame. - */ -static int -ipfw_check_frame(void *arg, struct mbuf **m0, struct ifnet *dst, int dir, - struct inpcb *inp) -{ - struct ether_header *eh; - struct ether_header save_eh; - struct mbuf *m; - int i, ret; - struct ip_fw_args args; - struct m_tag *mtag; - - /* fetch start point from rule, if any */ - mtag = m_tag_locate(*m0, MTAG_IPFW_RULE, 0, NULL); - if (mtag == NULL) { - args.rule.slot = 0; - } else { - /* dummynet packet, already partially processed */ - struct ipfw_rule_ref *r; - - /* XXX can we free it after use ? */ - mtag->m_tag_id = PACKET_TAG_NONE; - r = (struct ipfw_rule_ref *)(mtag + 1); - if (r->info & IPFW_ONEPASS) - return (0); - args.rule = *r; - } - - /* I need some amt of data to be contiguous */ - m = *m0; - i = min(m->m_pkthdr.len, max_protohdr); - if (m->m_len < i) { - m = m_pullup(m, i); - if (m == NULL) { - *m0 = m; - return (0); - } - } - eh = mtod(m, struct ether_header *); - save_eh = *eh; /* save copy for restore below */ - m_adj(m, ETHER_HDR_LEN); /* strip ethernet header */ - - args.m = m; /* the packet we are looking at */ - args.oif = dst; /* destination, if any */ - args.next_hop = NULL; /* we do not support forward yet */ - args.next_hop6 = NULL; /* we do not support forward yet */ - args.eh = &save_eh; /* MAC header for bridged/MAC packets */ - args.inp = NULL; /* used by ipfw uid/gid/jail rules */ - i = ipfw_chk(&args); - m = args.m; - if (m != NULL) { - /* - * Restore Ethernet header, as needed, in case the - * mbuf chain was replaced by ipfw. - */ - M_PREPEND(m, ETHER_HDR_LEN, M_DONTWAIT); - if (m == NULL) { - *m0 = NULL; - return (0); - } - if (eh != mtod(m, struct ether_header *)) - bcopy(&save_eh, mtod(m, struct ether_header *), - ETHER_HDR_LEN); - } - *m0 = m; - - ret = 0; - /* Check result of ipfw_chk() */ - switch (i) { - case IP_FW_PASS: - break; - - case IP_FW_DENY: - ret = EACCES; - break; /* i.e. drop */ - - case IP_FW_DUMMYNET: - ret = EACCES; - int dir; - - if (ip_dn_io_ptr == NULL) - break; /* i.e. drop */ - - *m0 = NULL; - dir = PROTO_LAYER2 | (dst ? DIR_OUT : DIR_IN); - ip_dn_io_ptr(&m, dir, &args); - return 0; - - default: - KASSERT(0, ("%s: unknown retval", __func__)); - } - - if (ret != 0) { - if (*m0) - FREE_PKT(*m0); - *m0 = NULL; - } - - return ret; -} - -/* do the divert, return 1 on error 0 on success */ -static int -ipfw_divert(struct mbuf **m0, int incoming, struct ipfw_rule_ref *rule, - int tee) -{ - /* - * ipfw_chk() has already tagged the packet with the divert tag. - * If tee is set, copy packet and return original. - * If not tee, consume packet and send it to divert socket. 
- */ - struct mbuf *clone; - struct ip *ip = mtod(*m0, struct ip *); - struct m_tag *tag; - - /* Cloning needed for tee? */ - if (tee == 0) { - clone = *m0; /* use the original mbuf */ - *m0 = NULL; - } else { - clone = m_dup(*m0, M_DONTWAIT); - /* If we cannot duplicate the mbuf, we sacrifice the divert - * chain and continue with the tee-ed packet. - */ - if (clone == NULL) - return 1; - } - - /* - * Divert listeners can normally handle non-fragmented packets, - * but we can only reass in the non-tee case. - * This means that listeners on a tee rule may get fragments, - * and have to live with that. - * Note that we now have the 'reass' ipfw option so if we care - * we can do it before a 'tee'. - */ - if (!tee) switch (ip->ip_v) { - case IPVERSION: - if (ntohs(ip->ip_off) & (IP_MF | IP_OFFMASK)) { - int hlen; - struct mbuf *reass; - - SET_HOST_IPLEN(ip); /* ip_reass wants host order */ - reass = ip_reass(clone); /* Reassemble packet. */ - if (reass == NULL) - return 0; /* not an error */ - /* if reass = NULL then it was consumed by ip_reass */ - /* - * IP header checksum fixup after reassembly and leave header - * in network byte order. - */ - ip = mtod(reass, struct ip *); - hlen = ip->ip_hl << 2; - SET_NET_IPLEN(ip); - ip->ip_sum = 0; - if (hlen == sizeof(struct ip)) - ip->ip_sum = in_cksum_hdr(ip); - else - ip->ip_sum = in_cksum(reass, hlen); - clone = reass; - } - break; -#ifdef INET6 - case IPV6_VERSION >> 4: - { - struct ip6_hdr *const ip6 = mtod(clone, struct ip6_hdr *); - - if (ip6->ip6_nxt == IPPROTO_FRAGMENT) { - int nxt, off; - - off = sizeof(struct ip6_hdr); - nxt = frag6_input(&clone, &off, 0); - if (nxt == IPPROTO_DONE) - return (0); - } - break; - } -#endif - } - - /* attach a tag to the packet with the reinject info */ - tag = m_tag_alloc(MTAG_IPFW_RULE, 0, - sizeof(struct ipfw_rule_ref), M_NOWAIT); - if (tag == NULL) { - FREE_PKT(clone); - return 1; - } - *((struct ipfw_rule_ref *)(tag+1)) = *rule; - m_tag_prepend(clone, tag); - - /* Do the dirty job... */ - ip_divert_ptr(clone, incoming); - return 0; -} - -/* - * attach or detach hooks for a given protocol family - */ -static int -ipfw_hook(int onoff, int pf) -{ - struct pfil_head *pfh; - void *hook_func; - - pfh = pfil_head_get(PFIL_TYPE_AF, pf); - if (pfh == NULL) - return ENOENT; - - hook_func = (pf == AF_LINK) ? ipfw_check_frame : ipfw_check_packet; - - (void) (onoff ? 
pfil_add_hook : pfil_remove_hook) - (hook_func, NULL, PFIL_IN | PFIL_OUT | PFIL_WAITOK, pfh); - - return 0; -} - -int -ipfw_attach_hooks(int arg) -{ - int error = 0; - - if (arg == 0) /* detach */ - ipfw_hook(0, AF_INET); - else if (V_fw_enable && ipfw_hook(1, AF_INET) != 0) { - error = ENOENT; /* see ip_fw_pfil.c::ipfw_hook() */ - printf("ipfw_hook() error\n"); - } -#ifdef INET6 - if (arg == 0) /* detach */ - ipfw_hook(0, AF_INET6); - else if (V_fw6_enable && ipfw_hook(1, AF_INET6) != 0) { - error = ENOENT; - printf("ipfw6_hook() error\n"); - } -#endif - if (arg == 0) /* detach */ - ipfw_hook(0, AF_LINK); - else if (V_fwlink_enable && ipfw_hook(1, AF_LINK) != 0) { - error = ENOENT; - printf("ipfw_link_hook() error\n"); - } - return error; -} - -int -ipfw_chg_hook(SYSCTL_HANDLER_ARGS) -{ - int *enable; - int newval; - int error; - int af; - - if (arg1 == &VNET_NAME(fw_enable)) { - enable = &V_fw_enable; - af = AF_INET; - } -#ifdef INET6 - else if (arg1 == &VNET_NAME(fw6_enable)) { - enable = &V_fw6_enable; - af = AF_INET6; - } -#endif - else if (arg1 == &VNET_NAME(fwlink_enable)) { - enable = &V_fwlink_enable; - af = AF_LINK; - } - else - return (EINVAL); - - newval = *enable; - - /* Handle sysctl change */ - error = sysctl_handle_int(oidp, &newval, 0, req); - - if (error) - return (error); - - /* Formalize new value */ - newval = (newval) ? 1 : 0; - - if (*enable == newval) - return (0); - - error = ipfw_hook(newval, af); - if (error) - return (error); - *enable = newval; - - return (0); -} -/* end of file */ diff --git a/sys/netinet/ipfw/ip_fw_private.h b/sys/netinet/ipfw/ip_fw_private.h deleted file mode 100644 index fb13a72..0000000 --- a/sys/netinet/ipfw/ip_fw_private.h +++ /dev/null @@ -1,309 +0,0 @@ -/*- - * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - */ - -#ifndef _IPFW2_PRIVATE_H -#define _IPFW2_PRIVATE_H - -/* - * Internal constants and data structures used by ipfw components - * and not meant to be exported outside the kernel. - */ - -#ifdef _KERNEL - -/* - * For platforms that do not have SYSCTL support, we wrap the - * SYSCTL_* into a function (one per file) to collect the values - * into an array at module initialization. 
The wrapping macros, - * SYSBEGIN() and SYSEND, are empty in the default case. - */ -#ifndef SYSBEGIN -#define SYSBEGIN(x) -#endif -#ifndef SYSEND -#define SYSEND -#endif - -/* Return values from ipfw_chk() */ -enum { - IP_FW_PASS = 0, - IP_FW_DENY, - IP_FW_DIVERT, - IP_FW_TEE, - IP_FW_DUMMYNET, - IP_FW_NETGRAPH, - IP_FW_NGTEE, - IP_FW_NAT, - IP_FW_REASS, -}; - -/* - * Structure for collecting parameters to dummynet for ip6_output forwarding - */ -struct _ip6dn_args { - struct ip6_pktopts *opt_or; - struct route_in6 ro_or; - int flags_or; - struct ip6_moptions *im6o_or; - struct ifnet *origifp_or; - struct ifnet *ifp_or; - struct sockaddr_in6 dst_or; - u_long mtu_or; - struct route_in6 ro_pmtu_or; -}; - - -/* - * Arguments for calling ipfw_chk() and dummynet_io(). We put them - * all into a structure because this way it is easier and more - * efficient to pass variables around and extend the interface. - */ -struct ip_fw_args { - struct mbuf *m; /* the mbuf chain */ - struct ifnet *oif; /* output interface */ - struct sockaddr_in *next_hop; /* forward address */ - struct sockaddr_in6 *next_hop6; /* ipv6 forward address */ - - /* - * On return, it points to the matching rule. - * On entry, rule.slot > 0 means the info is valid and - * contains the starting rule for an ipfw search. - * If chain_id == chain->id && slot >0 then jump to that slot. - * Otherwise, we locate the first rule >= rulenum:rule_id - */ - struct ipfw_rule_ref rule; /* match/restart info */ - - struct ether_header *eh; /* for bridged packets */ - - struct ipfw_flow_id f_id; /* grabbed from IP header */ - //uint32_t cookie; /* a cookie depending on rule action */ - struct inpcb *inp; - - struct _ip6dn_args dummypar; /* dummynet->ip6_output */ - struct sockaddr_in hopstore; /* store here if cannot use a pointer */ -}; - -MALLOC_DECLARE(M_IPFW); - -/* - * Hooks sometime need to know the direction of the packet - * (divert, dummynet, netgraph, ...) - * We use a generic definition here, with bit0-1 indicating the - * direction, bit 2 indicating layer2 or 3, bit 3-4 indicating the - * specific protocol - * indicating the protocol (if necessary) - */ -enum { - DIR_MASK = 0x3, - DIR_OUT = 0, - DIR_IN = 1, - DIR_FWD = 2, - DIR_DROP = 3, - PROTO_LAYER2 = 0x4, /* set for layer 2 */ - /* PROTO_DEFAULT = 0, */ - PROTO_IPV4 = 0x08, - PROTO_IPV6 = 0x10, - PROTO_IFB = 0x0c, /* layer2 + ifbridge */ - /* PROTO_OLDBDG = 0x14, unused, old bridge */ -}; - -/* wrapper for freeing a packet, in case we need to do more work */ -#ifndef FREE_PKT -#if defined(__linux__) || defined(_WIN32) -#define FREE_PKT(m) netisr_dispatch(-1, m) -#else -#define FREE_PKT(m) m_freem(m) -#endif -#endif /* !FREE_PKT */ - -/* - * Function definitions. - */ - -/* attach (arg = 1) or detach (arg = 0) hooks */ -int ipfw_attach_hooks(int); -#ifdef NOTYET -void ipfw_nat_destroy(void); -#endif - -/* In ip_fw_log.c */ -struct ip; -void ipfw_log_bpf(int); -void ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args, - struct mbuf *m, struct ifnet *oif, u_short offset, uint32_t tablearg, - struct ip *ip); -VNET_DECLARE(u_int64_t, norule_counter); -#define V_norule_counter VNET(norule_counter) -VNET_DECLARE(int, verbose_limit); -#define V_verbose_limit VNET(verbose_limit) - -/* In ip_fw_dynamic.c */ - -enum { /* result for matching dynamic rules */ - MATCH_REVERSE = 0, - MATCH_FORWARD, - MATCH_NONE, - MATCH_UNKNOWN, -}; - -/* - * The lock for dynamic rules is only used once outside the file, - * and only to release the result of lookup_dyn_rule(). 
- * Eventually we may implement it with a callback on the function. - */ -void ipfw_dyn_unlock(void); - -struct tcphdr; -struct mbuf *ipfw_send_pkt(struct mbuf *, struct ipfw_flow_id *, - u_int32_t, u_int32_t, int); -int ipfw_install_state(struct ip_fw *rule, ipfw_insn_limit *cmd, - struct ip_fw_args *args, uint32_t tablearg); -ipfw_dyn_rule *ipfw_lookup_dyn_rule(struct ipfw_flow_id *pkt, - int *match_direction, struct tcphdr *tcp); -void ipfw_remove_dyn_children(struct ip_fw *rule); -void ipfw_get_dynamic(char **bp, const char *ep); - -void ipfw_dyn_attach(void); /* uma_zcreate .... */ -void ipfw_dyn_detach(void); /* uma_zdestroy ... */ -void ipfw_dyn_init(void); /* per-vnet initialization */ -void ipfw_dyn_uninit(int); /* per-vnet deinitialization */ -int ipfw_dyn_len(void); - -/* common variables */ -VNET_DECLARE(int, fw_one_pass); -#define V_fw_one_pass VNET(fw_one_pass) - -VNET_DECLARE(int, fw_verbose); -#define V_fw_verbose VNET(fw_verbose) - -VNET_DECLARE(struct ip_fw_chain, layer3_chain); -#define V_layer3_chain VNET(layer3_chain) - -VNET_DECLARE(u_int32_t, set_disable); -#define V_set_disable VNET(set_disable) - -VNET_DECLARE(int, autoinc_step); -#define V_autoinc_step VNET(autoinc_step) - -VNET_DECLARE(unsigned int, fw_tables_max); -#define V_fw_tables_max VNET(fw_tables_max) - -struct ip_fw_chain { - struct ip_fw *rules; /* list of rules */ - struct ip_fw *reap; /* list of rules to reap */ - struct ip_fw *default_rule; - int n_rules; /* number of static rules */ - int static_len; /* total len of static rules */ - struct ip_fw **map; /* array of rule ptrs to ease lookup */ - LIST_HEAD(nat_list, cfg_nat) nat; /* list of nat entries */ - struct radix_node_head **tables; /* IPv4 tables */ - struct radix_node_head **xtables; /* extended tables */ - uint8_t *tabletype; /* Array of table types */ -#if defined( __linux__ ) || defined( _WIN32 ) - spinlock_t rwmtx; - spinlock_t uh_lock; -#else - struct rwlock rwmtx; - struct rwlock uh_lock; /* lock for upper half */ -#endif - uint32_t id; /* ruleset id */ - uint32_t gencnt; /* generation count */ -}; - -struct sockopt; /* used by tcp_var.h */ - -/* - * The lock is heavily used by ip_fw2.c (the main file) and ip_fw_nat.c - * so the variable and the macros must be here. 
- */ - -#define IPFW_LOCK_INIT(_chain) do { \ - rw_init(&(_chain)->rwmtx, "IPFW static rules"); \ - rw_init(&(_chain)->uh_lock, "IPFW UH lock"); \ - } while (0) - -#define IPFW_LOCK_DESTROY(_chain) do { \ - rw_destroy(&(_chain)->rwmtx); \ - rw_destroy(&(_chain)->uh_lock); \ - } while (0) - -#define IPFW_WLOCK_ASSERT(_chain) rw_assert(&(_chain)->rwmtx, RA_WLOCKED) - -#define IPFW_RLOCK(p) rw_rlock(&(p)->rwmtx) -#define IPFW_RUNLOCK(p) rw_runlock(&(p)->rwmtx) -#define IPFW_WLOCK(p) rw_wlock(&(p)->rwmtx) -#define IPFW_WUNLOCK(p) rw_wunlock(&(p)->rwmtx) - -#define IPFW_UH_RLOCK(p) rw_rlock(&(p)->uh_lock) -#define IPFW_UH_RUNLOCK(p) rw_runlock(&(p)->uh_lock) -#define IPFW_UH_WLOCK(p) rw_wlock(&(p)->uh_lock) -#define IPFW_UH_WUNLOCK(p) rw_wunlock(&(p)->uh_lock) - -/* In ip_fw_sockopt.c */ -int ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id); -int ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule); -int ipfw_ctl(struct sockopt *sopt); -int ipfw_chk(struct ip_fw_args *args); -void ipfw_reap_rules(struct ip_fw *head); - -/* In ip_fw_table.c */ -struct radix_node; -int ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, - uint32_t *val); -int ipfw_lookup_table_extended(struct ip_fw_chain *ch, uint16_t tbl, void *paddr, - uint32_t *val, int type); -int ipfw_init_tables(struct ip_fw_chain *ch); -void ipfw_destroy_tables(struct ip_fw_chain *ch); -int ipfw_flush_table(struct ip_fw_chain *ch, uint16_t tbl); -int ipfw_add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, void *paddr, - uint8_t plen, uint8_t mlen, uint8_t type, uint32_t value); -int ipfw_del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, void *paddr, - uint8_t plen, uint8_t mlen, uint8_t type); -int ipfw_count_table(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt); -int ipfw_dump_table_entry(struct radix_node *rn, void *arg); -int ipfw_dump_table(struct ip_fw_chain *ch, ipfw_table *tbl); -int ipfw_count_xtable(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt); -int ipfw_dump_xtable(struct ip_fw_chain *ch, ipfw_xtable *tbl); -int ipfw_resize_tables(struct ip_fw_chain *ch, unsigned int ntables); - -/* In ip_fw_nat.c -- XXX to be moved to ip_var.h */ - -extern struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int); - -typedef int ipfw_nat_t(struct ip_fw_args *, struct cfg_nat *, struct mbuf *); -typedef int ipfw_nat_cfg_t(struct sockopt *); - -extern ipfw_nat_t *ipfw_nat_ptr; -#define IPFW_NAT_LOADED (ipfw_nat_ptr != NULL) - -extern ipfw_nat_cfg_t *ipfw_nat_cfg_ptr; -extern ipfw_nat_cfg_t *ipfw_nat_del_ptr; -extern ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr; -extern ipfw_nat_cfg_t *ipfw_nat_get_log_ptr; - -#endif /* _KERNEL */ -#endif /* _IPFW2_PRIVATE_H */ diff --git a/sys/netinet/ipfw/ip_fw_sockopt.c b/sys/netinet/ipfw/ip_fw_sockopt.c deleted file mode 100644 index 2a5f4e7..0000000 --- a/sys/netinet/ipfw/ip_fw_sockopt.c +++ /dev/null @@ -1,1448 +0,0 @@ -/*- - * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa - * - * Supported by: Valeria Paoli - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - -/* - * Sockopt support for ipfw. The routines here implement - * the upper half of the ipfw code. - */ - -#include "opt_ipfw.h" -#include "opt_inet.h" -#ifndef INET -#error IPFIREWALL requires INET. -#endif /* INET */ -#include "opt_inet6.h" - -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/malloc.h> -#include <sys/mbuf.h> /* struct m_tag used by nested headers */ -#include <sys/kernel.h> -#include <sys/lock.h> -#include <sys/priv.h> -#include <sys/proc.h> -#include <sys/rwlock.h> -#include <sys/socket.h> -#include <sys/socketvar.h> -#include <sys/sysctl.h> -#include <sys/syslog.h> -#include <net/if.h> -#include <net/route.h> -#include <net/vnet.h> - -#include <netinet/in.h> -#include <netinet/ip_var.h> /* hooks */ -#include <netinet/ip_fw.h> -#include <netinet/ipfw/ip_fw_private.h> - -#ifdef MAC -#include <security/mac/mac_framework.h> -#endif - -MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's"); - -/* - * static variables followed by global ones (none in this file) - */ - -/* - * Find the smallest rule >= key, id. - * We could use bsearch but it is so simple that we code it directly - */ -int -ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id) -{ - int i, lo, hi; - struct ip_fw *r; - - for (lo = 0, hi = chain->n_rules - 1; lo < hi;) { - i = (lo + hi) / 2; - r = chain->map[i]; - if (r->rulenum < key) - lo = i + 1; /* continue from the next one */ - else if (r->rulenum > key) - hi = i; /* this might be good */ - else if (r->id < id) - lo = i + 1; /* continue from the next one */ - else /* r->id >= id */ - hi = i; /* this might be good */ - }; - return hi; -} - -/* - * allocate a new map, returns the chain locked. extra is the number - * of entries to add or delete. - */ -static struct ip_fw ** -get_map(struct ip_fw_chain *chain, int extra, int locked) -{ - - for (;;) { - struct ip_fw **map; - int i; - - i = chain->n_rules + extra; - map = malloc(i * sizeof(struct ip_fw *), M_IPFW, - locked ? M_NOWAIT : M_WAITOK); - if (map == NULL) { - printf("%s: cannot allocate map\n", __FUNCTION__); - return NULL; - } - if (!locked) - IPFW_UH_WLOCK(chain); - if (i >= chain->n_rules + extra) /* good */ - return map; - /* otherwise we lost the race, free and retry */ - if (!locked) - IPFW_UH_WUNLOCK(chain); - free(map, M_IPFW); - } -} - -/* - * swap the maps. It is supposed to be called with IPFW_UH_WLOCK - */ -static struct ip_fw ** -swap_map(struct ip_fw_chain *chain, struct ip_fw **new_map, int new_len) -{ - struct ip_fw **old_map; - - IPFW_WLOCK(chain); - chain->id++; - chain->n_rules = new_len; - old_map = chain->map; - chain->map = new_map; - IPFW_WUNLOCK(chain); - return old_map; -} - -/* - * Add a new rule to the list. 
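ipfw_find_rule() above is a lower-bound binary search over the map array, which is kept sorted by (rulenum, id): it returns the index of the first rule whose pair is greater than or equal to the requested one, which is also the insertion point used by ipfw_add_rule(). A self-contained model of the same search; the array contents are invented for the demonstration:

#include <stdio.h>
#include <stdint.h>

struct rule { uint32_t rulenum; uint32_t id; };

/* Lower bound over rules sorted by (rulenum, id). */
static int
find_rule(const struct rule *r, int n, uint32_t key, uint32_t id)
{
    int lo = 0, hi = n - 1;

    while (lo < hi) {
        int i = (lo + hi) / 2;

        if (r[i].rulenum < key ||
            (r[i].rulenum == key && r[i].id < id))
            lo = i + 1;     /* r[i] is too small, skip it */
        else
            hi = i;         /* r[i] might be the answer */
    }
    return (hi);
}

int
main(void)
{
    struct rule rules[] = {
        { 100, 1 }, { 100, 7 }, { 200, 2 }, { 65535, 0 },
    };
    int n = sizeof(rules) / sizeof(rules[0]);

    printf("first rule >= 100/0 at index %d\n", find_rule(rules, n, 100, 0));
    printf("first rule >= 150/0 at index %d\n", find_rule(rules, n, 150, 0));
    return (0);
}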
Copy the rule into a malloc'ed area, then - * possibly create a rule number and add the rule to the list. - * Update the rule_number in the input struct so the caller knows it as well. - * XXX DO NOT USE FOR THE DEFAULT RULE. - * Must be called without IPFW_UH held - */ -int -ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule) -{ - struct ip_fw *rule; - int i, l, insert_before; - struct ip_fw **map; /* the new array of pointers */ - - if (chain->rules == NULL || input_rule->rulenum > IPFW_DEFAULT_RULE-1) - return (EINVAL); - - l = RULESIZE(input_rule); - rule = malloc(l, M_IPFW, M_WAITOK | M_ZERO); - /* get_map returns with IPFW_UH_WLOCK if successful */ - map = get_map(chain, 1, 0 /* not locked */); - if (map == NULL) { - free(rule, M_IPFW); - return ENOSPC; - } - - bcopy(input_rule, rule, l); - /* clear fields not settable from userland */ - rule->x_next = NULL; - rule->next_rule = NULL; - rule->pcnt = 0; - rule->bcnt = 0; - rule->timestamp = 0; - - if (V_autoinc_step < 1) - V_autoinc_step = 1; - else if (V_autoinc_step > 1000) - V_autoinc_step = 1000; - /* find the insertion point, we will insert before */ - insert_before = rule->rulenum ? rule->rulenum + 1 : IPFW_DEFAULT_RULE; - i = ipfw_find_rule(chain, insert_before, 0); - /* duplicate first part */ - if (i > 0) - bcopy(chain->map, map, i * sizeof(struct ip_fw *)); - map[i] = rule; - /* duplicate remaining part, we always have the default rule */ - bcopy(chain->map + i, map + i + 1, - sizeof(struct ip_fw *) *(chain->n_rules - i)); - if (rule->rulenum == 0) { - /* write back the number */ - rule->rulenum = i > 0 ? map[i-1]->rulenum : 0; - if (rule->rulenum < IPFW_DEFAULT_RULE - V_autoinc_step) - rule->rulenum += V_autoinc_step; - input_rule->rulenum = rule->rulenum; - } - - rule->id = chain->id + 1; - map = swap_map(chain, map, chain->n_rules + 1); - chain->static_len += l; - IPFW_UH_WUNLOCK(chain); - if (map) - free(map, M_IPFW); - return (0); -} - -/* - * Reclaim storage associated with a list of rules. This is - * typically the list created using remove_rule. - * A NULL pointer on input is handled correctly. - */ -void -ipfw_reap_rules(struct ip_fw *head) -{ - struct ip_fw *rule; - - while ((rule = head) != NULL) { - head = head->x_next; - free(rule, M_IPFW); - } -} - -/* - * Used by del_entry() to check if a rule should be kept. - * Returns 1 if the rule must be kept, 0 otherwise. - * - * Called with cmd = {0,1,5}. - * cmd == 0 matches on rule numbers, excludes rules in RESVD_SET if n == 0 ; - * cmd == 1 matches on set numbers only, rule numbers are ignored; - * cmd == 5 matches on rule and set numbers. - * - * n == 0 is a wildcard for rule numbers, there is no wildcard for sets. 
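When a rule is submitted with rulenum == 0, ipfw_add_rule() numbers it automatically: the rule is placed just before the default rule and receives the preceding rule's number plus V_autoinc_step (clamped to the range 1..1000), provided the result stays below IPFW_DEFAULT_RULE. The numbering step in isolation, with the surrounding locking and map manipulation omitted:

#include <stdio.h>
#include <stdint.h>

#define IPFW_DEFAULT_RULE 65535

/* Number an auto-numbered rule, given the number of the rule before it. */
static uint16_t
auto_number(uint16_t prev_rulenum, int autoinc_step)
{
    uint16_t rulenum = prev_rulenum;

    if (autoinc_step < 1)
        autoinc_step = 1;
    else if (autoinc_step > 1000)
        autoinc_step = 1000;
    if (rulenum < IPFW_DEFAULT_RULE - autoinc_step)
        rulenum += autoinc_step;
    return (rulenum);
}

int
main(void)
{
    /* Typical case: previous rule is 500, step is 100 -> new rule 600. */
    printf("%d\n", auto_number(500, 100));
    /* Near the top of the numbering space the previous number is kept. */
    printf("%d\n", auto_number(65500, 100));
    return (0);
}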
- * - * Rules to keep are - * (default || reserved || !match_set || !match_number) - * where - * default ::= (rule->rulenum == IPFW_DEFAULT_RULE) - * // the default rule is always protected - * - * reserved ::= (cmd == 0 && n == 0 && rule->set == RESVD_SET) - * // RESVD_SET is protected only if cmd == 0 and n == 0 ("ipfw flush") - * - * match_set ::= (cmd == 0 || rule->set == set) - * // set number is ignored for cmd == 0 - * - * match_number ::= (cmd == 1 || n == 0 || n == rule->rulenum) - * // number is ignored for cmd == 1 or n == 0 - * - */ -static int -keep_rule(struct ip_fw *rule, uint8_t cmd, uint8_t set, uint32_t n) -{ - return - (rule->rulenum == IPFW_DEFAULT_RULE) || - (cmd == 0 && n == 0 && rule->set == RESVD_SET) || - !(cmd == 0 || rule->set == set) || - !(cmd == 1 || n == 0 || n == rule->rulenum); -} - -/** - * Remove all rules with given number, or do set manipulation. - * Assumes chain != NULL && *chain != NULL. - * - * The argument is an uint32_t. The low 16 bit are the rule or set number; - * the next 8 bits are the new set; the top 8 bits indicate the command: - * - * 0 delete rules numbered "rulenum" - * 1 delete rules in set "rulenum" - * 2 move rules "rulenum" to set "new_set" - * 3 move rules from set "rulenum" to set "new_set" - * 4 swap sets "rulenum" and "new_set" - * 5 delete rules "rulenum" and set "new_set" - */ -static int -del_entry(struct ip_fw_chain *chain, uint32_t arg) -{ - struct ip_fw *rule; - uint32_t num; /* rule number or old_set */ - uint8_t cmd, new_set; - int start, end, i, ofs, n; - struct ip_fw **map = NULL; - int error = 0; - - num = arg & 0xffff; - cmd = (arg >> 24) & 0xff; - new_set = (arg >> 16) & 0xff; - - if (cmd > 5 || new_set > RESVD_SET) - return EINVAL; - if (cmd == 0 || cmd == 2 || cmd == 5) { - if (num >= IPFW_DEFAULT_RULE) - return EINVAL; - } else { - if (num > RESVD_SET) /* old_set */ - return EINVAL; - } - - IPFW_UH_WLOCK(chain); /* arbitrate writers */ - chain->reap = NULL; /* prepare for deletions */ - - switch (cmd) { - case 0: /* delete rules "num" (num == 0 matches all) */ - case 1: /* delete all rules in set N */ - case 5: /* delete rules with number N and set "new_set". */ - - /* - * Locate first rule to delete (start), the rule after - * the last one to delete (end), and count how many - * rules to delete (n). Always use keep_rule() to - * determine which rules to keep. - */ - n = 0; - if (cmd == 1) { - /* look for a specific set including RESVD_SET. - * Must scan the entire range, ignore num. - */ - new_set = num; - for (start = -1, end = i = 0; i < chain->n_rules; i++) { - if (keep_rule(chain->map[i], cmd, new_set, 0)) - continue; - if (start < 0) - start = i; - end = i; - n++; - } - end++; /* first non-matching */ - } else { - /* Optimized search on rule numbers */ - start = ipfw_find_rule(chain, num, 0); - for (end = start; end < chain->n_rules; end++) { - rule = chain->map[end]; - if (num > 0 && rule->rulenum != num) - break; - if (!keep_rule(rule, cmd, new_set, num)) - n++; - } - } - - if (n == 0) { - /* A flush request (arg == 0 or cmd == 1) on empty - * ruleset returns with no error. On the contrary, - * if there is no match on a specific request, - * we return EINVAL. - */ - if (arg != 0 && cmd != 1) - error = EINVAL; - break; - } - - /* We have something to delete. Allocate the new map */ - map = get_map(chain, -n, 1 /* locked */); - if (map == NULL) { - error = EINVAL; - break; - } - - /* 1. bcopy the initial part of the map */ - if (start > 0) - bcopy(chain->map, map, start * sizeof(struct ip_fw *)); - /* 2. 
copy active rules between start and end */ - for (i = ofs = start; i < end; i++) { - rule = chain->map[i]; - if (keep_rule(rule, cmd, new_set, num)) - map[ofs++] = rule; - } - /* 3. copy the final part of the map */ - bcopy(chain->map + end, map + ofs, - (chain->n_rules - end) * sizeof(struct ip_fw *)); - /* 4. swap the maps (under BH_LOCK) */ - map = swap_map(chain, map, chain->n_rules - n); - /* 5. now remove the rules deleted from the old map */ - for (i = start; i < end; i++) { - int l; - rule = map[i]; - if (keep_rule(rule, cmd, new_set, num)) - continue; - l = RULESIZE(rule); - chain->static_len -= l; - ipfw_remove_dyn_children(rule); - rule->x_next = chain->reap; - chain->reap = rule; - } - break; - - /* - * In the next 3 cases the loop stops at (n_rules - 1) - * because the default rule is never eligible.. - */ - - case 2: /* move rules with given RULE number to new set */ - for (i = 0; i < chain->n_rules - 1; i++) { - rule = chain->map[i]; - if (rule->rulenum == num) - rule->set = new_set; - } - break; - - case 3: /* move rules with given SET number to new set */ - for (i = 0; i < chain->n_rules - 1; i++) { - rule = chain->map[i]; - if (rule->set == num) - rule->set = new_set; - } - break; - - case 4: /* swap two sets */ - for (i = 0; i < chain->n_rules - 1; i++) { - rule = chain->map[i]; - if (rule->set == num) - rule->set = new_set; - else if (rule->set == new_set) - rule->set = num; - } - break; - } - - rule = chain->reap; - chain->reap = NULL; - IPFW_UH_WUNLOCK(chain); - ipfw_reap_rules(rule); - if (map) - free(map, M_IPFW); - return error; -} - -/* - * Clear counters for a specific rule. - * Normally run under IPFW_UH_RLOCK, but these are idempotent ops - * so we only care that rules do not disappear. - */ -static void -clear_counters(struct ip_fw *rule, int log_only) -{ - ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule); - - if (log_only == 0) { - rule->bcnt = rule->pcnt = 0; - rule->timestamp = 0; - } - if (l->o.opcode == O_LOG) - l->log_left = l->max_log; -} - -/** - * Reset some or all counters on firewall rules. - * The argument `arg' is an u_int32_t. The low 16 bit are the rule number, - * the next 8 bits are the set number, the top 8 bits are the command: - * 0 work with rules from all set's; - * 1 work with rules only from specified set. - * Specified rule number is zero if we want to clear all entries. - * log_only is 1 if we only want to reset logs, zero otherwise. - */ -static int -zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int log_only) -{ - struct ip_fw *rule; - char *msg; - int i; - - uint16_t rulenum = arg & 0xffff; - uint8_t set = (arg >> 16) & 0xff; - uint8_t cmd = (arg >> 24) & 0xff; - - if (cmd > 1) - return (EINVAL); - if (cmd == 1 && set > RESVD_SET) - return (EINVAL); - - IPFW_UH_RLOCK(chain); - if (rulenum == 0) { - V_norule_counter = 0; - for (i = 0; i < chain->n_rules; i++) { - rule = chain->map[i]; - /* Skip rules not in our set. */ - if (cmd == 1 && rule->set != set) - continue; - clear_counters(rule, log_only); - } - msg = log_only ? "All logging counts reset" : - "Accounting cleared"; - } else { - int cleared = 0; - for (i = 0; i < chain->n_rules; i++) { - rule = chain->map[i]; - if (rule->rulenum == rulenum) { - if (cmd == 0 || rule->set == set) - clear_counters(rule, log_only); - cleared = 1; - } - if (rule->rulenum > rulenum) - break; - } - if (!cleared) { /* we did not find any matching rules */ - IPFW_UH_RUNLOCK(chain); - return (EINVAL); - } - msg = log_only ? 
"logging count reset" : "cleared"; - } - IPFW_UH_RUNLOCK(chain); - - if (V_fw_verbose) { - int lev = LOG_SECURITY | LOG_NOTICE; - - if (rulenum) - log(lev, "ipfw: Entry %d %s.\n", rulenum, msg); - else - log(lev, "ipfw: %s.\n", msg); - } - return (0); -} - -/* - * Check validity of the structure before insert. - * Rules are simple, so this mostly need to check rule sizes. - */ -static int -check_ipfw_struct(struct ip_fw *rule, int size) -{ - int l, cmdlen = 0; - int have_action=0; - ipfw_insn *cmd; - - if (size < sizeof(*rule)) { - printf("ipfw: rule too short\n"); - return (EINVAL); - } - /* first, check for valid size */ - l = RULESIZE(rule); - if (l != size) { - printf("ipfw: size mismatch (have %d want %d)\n", size, l); - return (EINVAL); - } - if (rule->act_ofs >= rule->cmd_len) { - printf("ipfw: bogus action offset (%u > %u)\n", - rule->act_ofs, rule->cmd_len - 1); - return (EINVAL); - } - /* - * Now go for the individual checks. Very simple ones, basically only - * instruction sizes. - */ - for (l = rule->cmd_len, cmd = rule->cmd ; - l > 0 ; l -= cmdlen, cmd += cmdlen) { - cmdlen = F_LEN(cmd); - if (cmdlen > l) { - printf("ipfw: opcode %d size truncated\n", - cmd->opcode); - return EINVAL; - } - switch (cmd->opcode) { - case O_PROBE_STATE: - case O_KEEP_STATE: - case O_PROTO: - case O_IP_SRC_ME: - case O_IP_DST_ME: - case O_LAYER2: - case O_IN: - case O_FRAG: - case O_DIVERTED: - case O_IPOPT: - case O_IPTOS: - case O_IPPRECEDENCE: - case O_IPVER: - case O_SOCKARG: - case O_TCPFLAGS: - case O_TCPOPTS: - case O_ESTAB: - case O_VERREVPATH: - case O_VERSRCREACH: - case O_ANTISPOOF: - case O_IPSEC: -#ifdef INET6 - case O_IP6_SRC_ME: - case O_IP6_DST_ME: - case O_EXT_HDR: - case O_IP6: -#endif - case O_IP4: - case O_TAG: - if (cmdlen != F_INSN_SIZE(ipfw_insn)) - goto bad_size; - break; - - case O_FIB: - if (cmdlen != F_INSN_SIZE(ipfw_insn)) - goto bad_size; - if (cmd->arg1 >= rt_numfibs) { - printf("ipfw: invalid fib number %d\n", - cmd->arg1); - return EINVAL; - } - break; - - case O_SETFIB: - if (cmdlen != F_INSN_SIZE(ipfw_insn)) - goto bad_size; - if ((cmd->arg1 != IP_FW_TABLEARG) && - (cmd->arg1 >= rt_numfibs)) { - printf("ipfw: invalid fib number %d\n", - cmd->arg1); - return EINVAL; - } - goto check_action; - - case O_UID: - case O_GID: - case O_JAIL: - case O_IP_SRC: - case O_IP_DST: - case O_TCPSEQ: - case O_TCPACK: - case O_PROB: - case O_ICMPTYPE: - if (cmdlen != F_INSN_SIZE(ipfw_insn_u32)) - goto bad_size; - break; - - case O_LIMIT: - if (cmdlen != F_INSN_SIZE(ipfw_insn_limit)) - goto bad_size; - break; - - case O_LOG: - if (cmdlen != F_INSN_SIZE(ipfw_insn_log)) - goto bad_size; - - ((ipfw_insn_log *)cmd)->log_left = - ((ipfw_insn_log *)cmd)->max_log; - - break; - - case O_IP_SRC_MASK: - case O_IP_DST_MASK: - /* only odd command lengths */ - if ( !(cmdlen & 1) || cmdlen > 31) - goto bad_size; - break; - - case O_IP_SRC_SET: - case O_IP_DST_SET: - if (cmd->arg1 == 0 || cmd->arg1 > 256) { - printf("ipfw: invalid set size %d\n", - cmd->arg1); - return EINVAL; - } - if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + - (cmd->arg1+31)/32 ) - goto bad_size; - break; - - case O_IP_SRC_LOOKUP: - case O_IP_DST_LOOKUP: - if (cmd->arg1 >= IPFW_TABLES_MAX) { - printf("ipfw: invalid table number %d\n", - cmd->arg1); - return (EINVAL); - } - if (cmdlen != F_INSN_SIZE(ipfw_insn) && - cmdlen != F_INSN_SIZE(ipfw_insn_u32) + 1 && - cmdlen != F_INSN_SIZE(ipfw_insn_u32)) - goto bad_size; - break; - case O_MACADDR2: - if (cmdlen != F_INSN_SIZE(ipfw_insn_mac)) - goto bad_size; - break; - - case O_NOP: - 
case O_IPID: - case O_IPTTL: - case O_IPLEN: - case O_TCPDATALEN: - case O_TCPWIN: - case O_TAGGED: - if (cmdlen < 1 || cmdlen > 31) - goto bad_size; - break; - - case O_MAC_TYPE: - case O_IP_SRCPORT: - case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */ - if (cmdlen < 2 || cmdlen > 31) - goto bad_size; - break; - - case O_RECV: - case O_XMIT: - case O_VIA: - if (cmdlen != F_INSN_SIZE(ipfw_insn_if)) - goto bad_size; - break; - - case O_ALTQ: - if (cmdlen != F_INSN_SIZE(ipfw_insn_altq)) - goto bad_size; - break; - - case O_PIPE: - case O_QUEUE: - if (cmdlen != F_INSN_SIZE(ipfw_insn)) - goto bad_size; - goto check_action; - - case O_FORWARD_IP: -#ifdef IPFIREWALL_FORWARD - if (cmdlen != F_INSN_SIZE(ipfw_insn_sa)) - goto bad_size; - goto check_action; -#else - return EINVAL; -#endif - -#ifdef INET6 - case O_FORWARD_IP6: -#ifdef IPFIREWALL_FORWARD - if (cmdlen != F_INSN_SIZE(ipfw_insn_sa6)) - goto bad_size; - goto check_action; -#else - return (EINVAL); -#endif -#endif /* INET6 */ - - case O_DIVERT: - case O_TEE: - if (ip_divert_ptr == NULL) - return EINVAL; - else - goto check_size; - case O_NETGRAPH: - case O_NGTEE: - if (ng_ipfw_input_p == NULL) - return EINVAL; - else - goto check_size; - case O_NAT: - if (!IPFW_NAT_LOADED) - return EINVAL; - if (cmdlen != F_INSN_SIZE(ipfw_insn_nat)) - goto bad_size; - goto check_action; - case O_FORWARD_MAC: /* XXX not implemented yet */ - case O_CHECK_STATE: - case O_COUNT: - case O_ACCEPT: - case O_DENY: - case O_REJECT: -#ifdef INET6 - case O_UNREACH6: -#endif - case O_SKIPTO: - case O_REASS: - case O_CALLRETURN: -check_size: - if (cmdlen != F_INSN_SIZE(ipfw_insn)) - goto bad_size; -check_action: - if (have_action) { - printf("ipfw: opcode %d, multiple actions" - " not allowed\n", - cmd->opcode); - return EINVAL; - } - have_action = 1; - if (l != cmdlen) { - printf("ipfw: opcode %d, action must be" - " last opcode\n", - cmd->opcode); - return EINVAL; - } - break; -#ifdef INET6 - case O_IP6_SRC: - case O_IP6_DST: - if (cmdlen != F_INSN_SIZE(struct in6_addr) + - F_INSN_SIZE(ipfw_insn)) - goto bad_size; - break; - - case O_FLOW6ID: - if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + - ((ipfw_insn_u32 *)cmd)->o.arg1) - goto bad_size; - break; - - case O_IP6_SRC_MASK: - case O_IP6_DST_MASK: - if ( !(cmdlen & 1) || cmdlen > 127) - goto bad_size; - break; - case O_ICMP6TYPE: - if( cmdlen != F_INSN_SIZE( ipfw_insn_icmp6 ) ) - goto bad_size; - break; -#endif - - default: - switch (cmd->opcode) { -#ifndef INET6 - case O_IP6_SRC_ME: - case O_IP6_DST_ME: - case O_EXT_HDR: - case O_IP6: - case O_UNREACH6: - case O_IP6_SRC: - case O_IP6_DST: - case O_FLOW6ID: - case O_IP6_SRC_MASK: - case O_IP6_DST_MASK: - case O_ICMP6TYPE: - printf("ipfw: no IPv6 support in kernel\n"); - return EPROTONOSUPPORT; -#endif - default: - printf("ipfw: opcode %d, unknown opcode\n", - cmd->opcode); - return EINVAL; - } - } - } - if (have_action == 0) { - printf("ipfw: missing action\n"); - return EINVAL; - } - return 0; - -bad_size: - printf("ipfw: opcode %d size %d wrong\n", - cmd->opcode, cmdlen); - return EINVAL; -} - - -/* - * Translation of requests for compatibility with FreeBSD 7.2/8. - * a static variable tells us if we have an old client from userland, - * and if necessary we translate requests and responses between the - * two formats. 
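check_ipfw_struct() walks the rule body as a stream of variable-length microinstructions: F_LEN(cmd) yields the current opcode's length in 32-bit words, the cursor advances by that amount, and exactly one action opcode may appear, as the very last instruction. A reduced model of that walk; the word encoding below is an invented stand-in, not the real ipfw_insn layout:

#include <stdio.h>
#include <stdint.h>

/* Toy encoding: low byte of the first word is the instruction length in
 * 32-bit words, the next byte flags an action opcode. */
#define INSN(len, action)   (((action) << 8) | (len))
#define F_LEN(w)            ((int)((w) & 0xff))
#define F_IS_ACTION(w)      (((w) >> 8) & 1)

static int
check_rule(const uint32_t *cmd, int cmd_len)
{
    int l, cmdlen, have_action = 0;

    for (l = cmd_len; l > 0; l -= cmdlen, cmd += cmdlen) {
        cmdlen = F_LEN(*cmd);
        if (cmdlen < 1 || cmdlen > l) {
            printf("truncated opcode\n");
            return (-1);
        }
        if (F_IS_ACTION(*cmd)) {
            if (have_action) {
                printf("multiple actions not allowed\n");
                return (-1);
            }
            have_action = 1;
            if (l != cmdlen) {
                printf("action must be the last opcode\n");
                return (-1);
            }
        }
    }
    if (!have_action) {
        printf("missing action\n");
        return (-1);
    }
    return (0);
}

int
main(void)
{
    /* match (1 word), match (3 words), action (1 word): valid */
    uint32_t ok[] = { INSN(1, 0), INSN(3, 0), 0, 0, INSN(1, 1) };
    /* action followed by a match: rejected */
    uint32_t bad[] = { INSN(1, 1), INSN(2, 0), 0 };

    printf("ok rule  -> %d\n", check_rule(ok, 5));
    printf("bad rule -> %d\n", check_rule(bad, 3));
    return (0);
}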
- */ -static int is7 = 0; - -struct ip_fw7 { - struct ip_fw7 *next; /* linked list of rules */ - struct ip_fw7 *next_rule; /* ptr to next [skipto] rule */ - /* 'next_rule' is used to pass up 'set_disable' status */ - - uint16_t act_ofs; /* offset of action in 32-bit units */ - uint16_t cmd_len; /* # of 32-bit words in cmd */ - uint16_t rulenum; /* rule number */ - uint8_t set; /* rule set (0..31) */ - // #define RESVD_SET 31 /* set for default and persistent rules */ - uint8_t _pad; /* padding */ - // uint32_t id; /* rule id, only in v.8 */ - /* These fields are present in all rules. */ - uint64_t pcnt; /* Packet counter */ - uint64_t bcnt; /* Byte counter */ - uint32_t timestamp; /* tv_sec of last match */ - - ipfw_insn cmd[1]; /* storage for commands */ -}; - - int convert_rule_to_7(struct ip_fw *rule); -int convert_rule_to_8(struct ip_fw *rule); - -#ifndef RULESIZE7 -#define RULESIZE7(rule) (sizeof(struct ip_fw7) + \ - ((struct ip_fw7 *)(rule))->cmd_len * 4 - 4) -#endif - - -/* - * Copy the static and dynamic rules to the supplied buffer - * and return the amount of space actually used. - * Must be run under IPFW_UH_RLOCK - */ -static size_t -ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space) -{ - char *bp = buf; - char *ep = bp + space; - struct ip_fw *rule, *dst; - int l, i; - time_t boot_seconds; - - boot_seconds = boottime.tv_sec; - for (i = 0; i < chain->n_rules; i++) { - rule = chain->map[i]; - - if (is7) { - /* Convert rule to FreeBSd 7.2 format */ - l = RULESIZE7(rule); - if (bp + l + sizeof(uint32_t) <= ep) { - int error; - bcopy(rule, bp, l + sizeof(uint32_t)); - error = convert_rule_to_7((struct ip_fw *) bp); - if (error) - return 0; /*XXX correct? */ - /* - * XXX HACK. Store the disable mask in the "next" - * pointer in a wild attempt to keep the ABI the same. - * Why do we do this on EVERY rule? - */ - bcopy(&V_set_disable, - &(((struct ip_fw7 *)bp)->next_rule), - sizeof(V_set_disable)); - if (((struct ip_fw7 *)bp)->timestamp) - ((struct ip_fw7 *)bp)->timestamp += boot_seconds; - bp += l; - } - continue; /* go to next rule */ - } - - /* normal mode, don't touch rules */ - l = RULESIZE(rule); - if (bp + l > ep) { /* should not happen */ - printf("overflow dumping static rules\n"); - break; - } - dst = (struct ip_fw *)bp; - bcopy(rule, dst, l); - /* - * XXX HACK. Store the disable mask in the "next" - * pointer in a wild attempt to keep the ABI the same. - * Why do we do this on EVERY rule? - */ - bcopy(&V_set_disable, &dst->next_rule, sizeof(V_set_disable)); - if (dst->timestamp) - dst->timestamp += boot_seconds; - bp += l; - } - ipfw_get_dynamic(&bp, ep); /* protected by the dynamic lock */ - return (bp - (char *)buf); -} - - -#define IP_FW3_OPLENGTH(x) ((x)->sopt_valsize - sizeof(ip_fw3_opheader)) -/** - * {set|get}sockopt parser. - */ -int -ipfw_ctl(struct sockopt *sopt) -{ -#define RULE_MAXSIZE (256*sizeof(u_int32_t)) - int error; - size_t size, len, valsize; - struct ip_fw *buf, *rule; - struct ip_fw_chain *chain; - u_int32_t rulenum[2]; - uint32_t opt; - char xbuf[128]; - ip_fw3_opheader *op3 = NULL; - - error = priv_check(sopt->sopt_td, PRIV_NETINET_IPFW); - if (error) - return (error); - - /* - * Disallow modifications in really-really secure mode, but still allow - * the logging counters to be reset. 
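The version detection in IP_FW_ADD further below is purely size based: struct ip_fw7 has no rule id field, so for the same cmd_len the 7.2 and 8.0 layouts differ in length, and a request whose size equals RULESIZE7() is assumed to come from a 7.2-era binary. The arithmetic can be checked in isolation; the struct stand-ins here keep only the fields relevant to the size difference, so the absolute numbers are illustrative rather than the kernel's:

#include <stdio.h>
#include <stdint.h>

/* Reduced stand-ins for the two rule headers; real padding and the
 * leading list pointers are omitted, so only the relative difference
 * (the extra id word) is meaningful. */
struct ip_fw7_hdr {                 /* FreeBSD 7.2 layout, no rule id */
    uint16_t act_ofs, cmd_len, rulenum;
    uint8_t set, _pad;
    uint64_t pcnt, bcnt;
    uint32_t timestamp;
    uint32_t cmd[1];
};
struct ip_fw8_hdr {                 /* version 8 layout, adds the id field */
    uint16_t act_ofs, cmd_len, rulenum;
    uint8_t set, _pad;
    uint32_t id;
    uint64_t pcnt, bcnt;
    uint32_t timestamp;
    uint32_t cmd[1];
};

/* Same shape as the kernel's RULESIZE7()/RULESIZE(): header size plus
 * cmd_len 32-bit words, minus the word already counted in cmd[1]. */
#define RULESIZE7(len)  (sizeof(struct ip_fw7_hdr) + (len) * 4 - 4)
#define RULESIZE8(len)  (sizeof(struct ip_fw8_hdr) + (len) * 4 - 4)

int
main(void)
{
    int cmd_len = 6;    /* example rule body: six 32-bit words */

    printf("7.2 size: %zu bytes, 8.0 size: %zu bytes\n",
        RULESIZE7(cmd_len), RULESIZE8(cmd_len));
    return (0);
}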
- */ - if (sopt->sopt_name == IP_FW_ADD || - (sopt->sopt_dir == SOPT_SET && sopt->sopt_name != IP_FW_RESETLOG)) { - error = securelevel_ge(sopt->sopt_td->td_ucred, 3); - if (error) - return (error); - } - - chain = &V_layer3_chain; - error = 0; - - /* Save original valsize before it is altered via sooptcopyin() */ - valsize = sopt->sopt_valsize; - if ((opt = sopt->sopt_name) == IP_FW3) { - /* - * Copy not less than sizeof(ip_fw3_opheader). - * We hope any IP_FW3 command will fit into 128-byte buffer. - */ - if ((error = sooptcopyin(sopt, xbuf, sizeof(xbuf), - sizeof(ip_fw3_opheader))) != 0) - return (error); - op3 = (ip_fw3_opheader *)xbuf; - opt = op3->opcode; - } - - switch (opt) { - case IP_FW_GET: - /* - * pass up a copy of the current rules. Static rules - * come first (the last of which has number IPFW_DEFAULT_RULE), - * followed by a possibly empty list of dynamic rule. - * The last dynamic rule has NULL in the "next" field. - * - * Note that the calculated size is used to bound the - * amount of data returned to the user. The rule set may - * change between calculating the size and returning the - * data in which case we'll just return what fits. - */ - for (;;) { - int len = 0, want; - - size = chain->static_len; - size += ipfw_dyn_len(); - if (size >= sopt->sopt_valsize) - break; - buf = malloc(size, M_TEMP, M_WAITOK); - IPFW_UH_RLOCK(chain); - /* check again how much space we need */ - want = chain->static_len + ipfw_dyn_len(); - if (size >= want) - len = ipfw_getrules(chain, buf, size); - IPFW_UH_RUNLOCK(chain); - if (size >= want) - error = sooptcopyout(sopt, buf, len); - free(buf, M_TEMP); - if (size >= want) - break; - } - break; - - case IP_FW_FLUSH: - /* locking is done within del_entry() */ - error = del_entry(chain, 0); /* special case, rule=0, cmd=0 means all */ - break; - - case IP_FW_ADD: - rule = malloc(RULE_MAXSIZE, M_TEMP, M_WAITOK); - error = sooptcopyin(sopt, rule, RULE_MAXSIZE, - sizeof(struct ip_fw7) ); - - /* - * If the size of commands equals RULESIZE7 then we assume - * a FreeBSD7.2 binary is talking to us (set is7=1). - * is7 is persistent so the next 'ipfw list' command - * will use this format. - * NOTE: If wrong version is guessed (this can happen if - * the first ipfw command is 'ipfw [pipe] list') - * the ipfw binary may crash or loop infinitly... - */ - if (sopt->sopt_valsize == RULESIZE7(rule)) { - is7 = 1; - error = convert_rule_to_8(rule); - if (error) - return error; - if (error == 0) - error = check_ipfw_struct(rule, RULESIZE(rule)); - } else { - is7 = 0; - if (error == 0) - error = check_ipfw_struct(rule, sopt->sopt_valsize); - } - if (error == 0) { - /* locking is done within ipfw_add_rule() */ - error = ipfw_add_rule(chain, rule); - size = RULESIZE(rule); - if (!error && sopt->sopt_dir == SOPT_GET) { - if (is7) { - error = convert_rule_to_7(rule); - size = RULESIZE7(rule); - if (error) - return error; - } - error = sooptcopyout(sopt, rule, size); - } - } - free(rule, M_TEMP); - break; - - case IP_FW_DEL: - /* - * IP_FW_DEL is used for deleting single rules or sets, - * and (ab)used to atomically manipulate sets. Argument size - * is used to distinguish between the two: - * sizeof(u_int32_t) - * delete single rule or set of rules, - * or reassign rules (or sets) to a different set. - * 2*sizeof(u_int32_t) - * atomic disable/enable sets. - * first u_int32_t contains sets to be disabled, - * second u_int32_t contains sets to be enabled. 
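Seen from userland, the same IP_FW_DEL option therefore carries either a single u_int32_t (the del_entry() argument: rule or set number in the low 16 bits, new set in the next 8 bits, command in the top 8) or a pair of u_int32_t holding the sets to disable and the sets to enable. A minimal sketch of both calls, assuming the FreeBSD socket-option constants from <netinet/in.h> and a raw socket as ipfw(8) uses; error handling is trimmed and root privileges are required:

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    /* The ipfw control plane is reached through a raw IP socket. */
    int s = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
    if (s < 0) {
        perror("socket");
        return (1);
    }

    /* One u_int32_t: delete rule number 100 (command 0 in the top byte). */
    uint32_t one = 100;
    if (setsockopt(s, IPPROTO_IP, IP_FW_DEL, &one, sizeof(one)) < 0)
        perror("IP_FW_DEL (single rule)");

    /* Two u_int32_t: atomically disable set 1 and enable set 2. */
    uint32_t two[2] = { 1 << 1 /* disable mask */, 1 << 2 /* enable mask */ };
    if (setsockopt(s, IPPROTO_IP, IP_FW_DEL, two, sizeof(two)) < 0)
        perror("IP_FW_DEL (set enable/disable)");

    return (0);
}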
- */ - error = sooptcopyin(sopt, rulenum, - 2*sizeof(u_int32_t), sizeof(u_int32_t)); - if (error) - break; - size = sopt->sopt_valsize; - if (size == sizeof(u_int32_t) && rulenum[0] != 0) { - /* delete or reassign, locking done in del_entry() */ - error = del_entry(chain, rulenum[0]); - } else if (size == 2*sizeof(u_int32_t)) { /* set enable/disable */ - IPFW_UH_WLOCK(chain); - V_set_disable = - (V_set_disable | rulenum[0]) & ~rulenum[1] & - ~(1<<RESVD_SET); /* set RESVD_SET always enabled */ - IPFW_UH_WUNLOCK(chain); - } else - error = EINVAL; - break; - - case IP_FW_ZERO: - case IP_FW_RESETLOG: /* argument is an u_int_32, the rule number */ - rulenum[0] = 0; - if (sopt->sopt_val != 0) { - error = sooptcopyin(sopt, rulenum, - sizeof(u_int32_t), sizeof(u_int32_t)); - if (error) - break; - } - error = zero_entry(chain, rulenum[0], - sopt->sopt_name == IP_FW_RESETLOG); - break; - - /*--- TABLE manipulations are protected by the IPFW_LOCK ---*/ - case IP_FW_TABLE_ADD: - { - ipfw_table_entry ent; - - error = sooptcopyin(sopt, &ent, - sizeof(ent), sizeof(ent)); - if (error) - break; - error = ipfw_add_table_entry(chain, ent.tbl, - &ent.addr, sizeof(ent.addr), ent.masklen, - IPFW_TABLE_CIDR, ent.value); - } - break; - - case IP_FW_TABLE_DEL: - { - ipfw_table_entry ent; - - error = sooptcopyin(sopt, &ent, - sizeof(ent), sizeof(ent)); - if (error) - break; - error = ipfw_del_table_entry(chain, ent.tbl, - &ent.addr, sizeof(ent.addr), ent.masklen, IPFW_TABLE_CIDR); - } - break; - - case IP_FW_TABLE_XADD: /* IP_FW3 */ - case IP_FW_TABLE_XDEL: /* IP_FW3 */ - { - ipfw_table_xentry *xent = (ipfw_table_xentry *)(op3 + 1); - - /* Check minimum header size */ - if (IP_FW3_OPLENGTH(sopt) < offsetof(ipfw_table_xentry, k)) { - error = EINVAL; - break; - } - - /* Check if len field is valid */ - if (xent->len > sizeof(ipfw_table_xentry)) { - error = EINVAL; - break; - } - - len = xent->len - offsetof(ipfw_table_xentry, k); - - error = (opt == IP_FW_TABLE_XADD) ? 
- ipfw_add_table_entry(chain, xent->tbl, &xent->k, - len, xent->masklen, xent->type, xent->value) : - ipfw_del_table_entry(chain, xent->tbl, &xent->k, - len, xent->masklen, xent->type); - } - break; - - case IP_FW_TABLE_FLUSH: - { - u_int16_t tbl; - - error = sooptcopyin(sopt, &tbl, - sizeof(tbl), sizeof(tbl)); - if (error) - break; - error = ipfw_flush_table(chain, tbl); - } - break; - - case IP_FW_TABLE_GETSIZE: - { - u_int32_t tbl, cnt; - - if ((error = sooptcopyin(sopt, &tbl, sizeof(tbl), - sizeof(tbl)))) - break; - IPFW_RLOCK(chain); - error = ipfw_count_table(chain, tbl, &cnt); - IPFW_RUNLOCK(chain); - if (error) - break; - error = sooptcopyout(sopt, &cnt, sizeof(cnt)); - } - break; - - case IP_FW_TABLE_LIST: - { - ipfw_table *tbl; - - if (sopt->sopt_valsize < sizeof(*tbl)) { - error = EINVAL; - break; - } - size = sopt->sopt_valsize; - tbl = malloc(size, M_TEMP, M_WAITOK); - error = sooptcopyin(sopt, tbl, size, sizeof(*tbl)); - if (error) { - free(tbl, M_TEMP); - break; - } - tbl->size = (size - sizeof(*tbl)) / - sizeof(ipfw_table_entry); - IPFW_RLOCK(chain); - error = ipfw_dump_table(chain, tbl); - IPFW_RUNLOCK(chain); - if (error) { - free(tbl, M_TEMP); - break; - } - error = sooptcopyout(sopt, tbl, size); - free(tbl, M_TEMP); - } - break; - - case IP_FW_TABLE_XGETSIZE: /* IP_FW3 */ - { - uint32_t *tbl; - - if (IP_FW3_OPLENGTH(sopt) < sizeof(uint32_t)) { - error = EINVAL; - break; - } - - tbl = (uint32_t *)(op3 + 1); - - IPFW_RLOCK(chain); - error = ipfw_count_xtable(chain, *tbl, tbl); - IPFW_RUNLOCK(chain); - if (error) - break; - error = sooptcopyout(sopt, op3, sopt->sopt_valsize); - } - break; - - case IP_FW_TABLE_XLIST: /* IP_FW3 */ - { - ipfw_xtable *tbl; - - if ((size = valsize) < sizeof(ipfw_xtable)) { - error = EINVAL; - break; - } - - tbl = malloc(size, M_TEMP, M_ZERO | M_WAITOK); - memcpy(tbl, op3, sizeof(ipfw_xtable)); - - /* Get maximum number of entries we can store */ - tbl->size = (size - sizeof(ipfw_xtable)) / - sizeof(ipfw_table_xentry); - IPFW_RLOCK(chain); - error = ipfw_dump_xtable(chain, tbl); - IPFW_RUNLOCK(chain); - if (error) { - free(tbl, M_TEMP); - break; - } - - /* Revert size field back to bytes */ - tbl->size = tbl->size * sizeof(ipfw_table_xentry) + - sizeof(ipfw_table); - /* - * Since we call sooptcopyin() with small buffer, sopt_valsize is - * decreased to reflect supplied buffer size. 
Set it back to original value - */ - sopt->sopt_valsize = valsize; - error = sooptcopyout(sopt, tbl, size); - free(tbl, M_TEMP); - } - break; - - /*--- NAT operations are protected by the IPFW_LOCK ---*/ - case IP_FW_NAT_CFG: - if (IPFW_NAT_LOADED) - error = ipfw_nat_cfg_ptr(sopt); - else { - printf("IP_FW_NAT_CFG: %s\n", - "ipfw_nat not present, please load it"); - error = EINVAL; - } - break; - - case IP_FW_NAT_DEL: - if (IPFW_NAT_LOADED) - error = ipfw_nat_del_ptr(sopt); - else { - printf("IP_FW_NAT_DEL: %s\n", - "ipfw_nat not present, please load it"); - error = EINVAL; - } - break; - - case IP_FW_NAT_GET_CONFIG: - if (IPFW_NAT_LOADED) - error = ipfw_nat_get_cfg_ptr(sopt); - else { - printf("IP_FW_NAT_GET_CFG: %s\n", - "ipfw_nat not present, please load it"); - error = EINVAL; - } - break; - - case IP_FW_NAT_GET_LOG: - if (IPFW_NAT_LOADED) - error = ipfw_nat_get_log_ptr(sopt); - else { - printf("IP_FW_NAT_GET_LOG: %s\n", - "ipfw_nat not present, please load it"); - error = EINVAL; - } - break; - - default: - printf("ipfw: ipfw_ctl invalid option %d\n", sopt->sopt_name); - error = EINVAL; - } - - return (error); -#undef RULE_MAXSIZE -} - - -#define RULE_MAXSIZE (256*sizeof(u_int32_t)) - -/* Functions to convert rules 7.2 <==> 8.0 */ -int -convert_rule_to_7(struct ip_fw *rule) -{ - /* Used to modify original rule */ - struct ip_fw7 *rule7 = (struct ip_fw7 *)rule; - /* copy of original rule, version 8 */ - struct ip_fw *tmp; - - /* Used to copy commands */ - ipfw_insn *ccmd, *dst; - int ll = 0, ccmdlen = 0; - - tmp = malloc(RULE_MAXSIZE, M_TEMP, M_NOWAIT | M_ZERO); - if (tmp == NULL) { - return 1; //XXX error - } - bcopy(rule, tmp, RULE_MAXSIZE); - - /* Copy fields */ - rule7->_pad = tmp->_pad; - rule7->set = tmp->set; - rule7->rulenum = tmp->rulenum; - rule7->cmd_len = tmp->cmd_len; - rule7->act_ofs = tmp->act_ofs; - rule7->next_rule = (struct ip_fw7 *)tmp->next_rule; - rule7->next = (struct ip_fw7 *)tmp->x_next; - rule7->cmd_len = tmp->cmd_len; - rule7->pcnt = tmp->pcnt; - rule7->bcnt = tmp->bcnt; - rule7->timestamp = tmp->timestamp; - - /* Copy commands */ - for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule7->cmd ; - ll > 0 ; ll -= ccmdlen, ccmd += ccmdlen, dst += ccmdlen) { - ccmdlen = F_LEN(ccmd); - - bcopy(ccmd, dst, F_LEN(ccmd)*sizeof(uint32_t)); - - if (dst->opcode > O_NAT) - /* O_REASS doesn't exists in 7.2 version, so - * decrement opcode if it is after O_REASS - */ - dst->opcode--; - - if (ccmdlen > ll) { - printf("ipfw: opcode %d size truncated\n", - ccmd->opcode); - return EINVAL; - } - } - free(tmp, M_TEMP); - - return 0; -} - -int -convert_rule_to_8(struct ip_fw *rule) -{ - /* Used to modify original rule */ - struct ip_fw7 *rule7 = (struct ip_fw7 *) rule; - - /* Used to copy commands */ - ipfw_insn *ccmd, *dst; - int ll = 0, ccmdlen = 0; - - /* Copy of original rule */ - struct ip_fw7 *tmp = malloc(RULE_MAXSIZE, M_TEMP, M_NOWAIT | M_ZERO); - if (tmp == NULL) { - return 1; //XXX error - } - - bcopy(rule7, tmp, RULE_MAXSIZE); - - for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule->cmd ; - ll > 0 ; ll -= ccmdlen, ccmd += ccmdlen, dst += ccmdlen) { - ccmdlen = F_LEN(ccmd); - - bcopy(ccmd, dst, F_LEN(ccmd)*sizeof(uint32_t)); - - if (dst->opcode > O_NAT) - /* O_REASS doesn't exists in 7.2 version, so - * increment opcode if it is after O_REASS - */ - dst->opcode++; - - if (ccmdlen > ll) { - printf("ipfw: opcode %d size truncated\n", - ccmd->opcode); - return EINVAL; - } - } - - rule->_pad = tmp->_pad; - rule->set = tmp->set; - rule->rulenum = tmp->rulenum; - rule->cmd_len = 
tmp->cmd_len; - rule->act_ofs = tmp->act_ofs; - rule->next_rule = (struct ip_fw *)tmp->next_rule; - rule->x_next = (struct ip_fw *)tmp->next; - rule->cmd_len = tmp->cmd_len; - rule->id = 0; /* XXX see if is ok = 0 */ - rule->pcnt = tmp->pcnt; - rule->bcnt = tmp->bcnt; - rule->timestamp = tmp->timestamp; - - free (tmp, M_TEMP); - return 0; -} - -/* end of file */ diff --git a/sys/netinet/ipfw/ip_fw_table.c b/sys/netinet/ipfw/ip_fw_table.c deleted file mode 100644 index 68a6220..0000000 --- a/sys/netinet/ipfw/ip_fw_table.c +++ /dev/null @@ -1,761 +0,0 @@ -/*- - * Copyright (c) 2004 Ruslan Ermilov and Vsevolod Lobko. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); - -/* - * Lookup table support for ipfw - * - * Lookup tables are implemented (at the moment) using the radix - * tree used for routing tables. Tables store key-value entries, where - * keys are network prefixes (addr/masklen), and values are integers. - * As a degenerate case we can interpret keys as 32-bit integers - * (with a /32 mask). - * - * The table is protected by the IPFW lock even for manipulation coming - * from userland, because operations are typically fast. - */ - -#include "opt_ipfw.h" -#include "opt_inet.h" -#ifndef INET -#error IPFIREWALL requires INET. 
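Table keys are stored as masked prefixes: for an IPv4 entry the kernel derives the netmask from the prefix length and stores addr & mask, so a radix lookup matches any address inside the prefix. The mask expression used by the IPv4 branch of ipfw_add_table_entry() further below can be exercised on its own:

#include <stdio.h>
#include <stdint.h>
#include <netinet/in.h>
#include <arpa/inet.h>

/* Network-byte-order mask for a prefix length, the same expression
 * ipfw_add_table_entry() uses for IPv4 entries. */
static uint32_t
prefix_mask(int mlen)
{
    return (htonl(mlen ? ~((1u << (32 - mlen)) - 1) : 0));
}

int
main(void)
{
    struct in_addr addr, mask, net;
    char buf[INET_ADDRSTRLEN];

    inet_pton(AF_INET, "192.0.2.200", &addr);
    mask.s_addr = prefix_mask(24);
    net.s_addr = addr.s_addr & mask.s_addr;   /* key as stored in the table */

    printf("prefix %s/24\n", inet_ntop(AF_INET, &net, buf, sizeof(buf)));
    return (0);
}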
-#endif /* INET */ -#include "opt_inet6.h" - -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/malloc.h> -#include <sys/kernel.h> -#include <sys/lock.h> -#include <sys/rwlock.h> -#include <sys/socket.h> -#include <net/if.h> /* ip_fw.h requires IFNAMSIZ */ -#include <net/radix.h> -#include <net/route.h> -#include <net/vnet.h> - -#include <netinet/in.h> -#include <netinet/ip_var.h> /* struct ipfw_rule_ref */ -#include <netinet/ip_fw.h> -#include <sys/queue.h> /* LIST_HEAD */ -#include <netinet/ipfw/ip_fw_private.h> - -#ifdef MAC -#include <security/mac/mac_framework.h> -#endif - -static MALLOC_DEFINE(M_IPFW_TBL, "ipfw_tbl", "IpFw tables"); - -struct table_entry { - struct radix_node rn[2]; - struct sockaddr_in addr, mask; - u_int32_t value; -}; - -struct xaddr_iface { - uint8_t if_len; /* length of this struct */ - uint8_t pad[7]; /* Align name */ - char ifname[IF_NAMESIZE]; /* Interface name */ -}; - -struct table_xentry { - struct radix_node rn[2]; - union { -#ifdef INET6 - struct sockaddr_in6 addr6; -#endif - struct xaddr_iface iface; - } a; - union { -#ifdef INET6 - struct sockaddr_in6 mask6; -#endif - struct xaddr_iface ifmask; - } m; - u_int32_t value; -}; - -/* - * The radix code expects addr and mask to be array of bytes, - * with the first byte being the length of the array. rn_inithead - * is called with the offset in bits of the lookup key within the - * array. If we use a sockaddr_in as the underlying type, - * sin_len is conveniently located at offset 0, sin_addr is at - * offset 4 and normally aligned. - * But for portability, let's avoid assumption and make the code explicit - */ -#define KEY_LEN(v) *((uint8_t *)&(v)) -#define KEY_OFS (8*offsetof(struct sockaddr_in, sin_addr)) -/* - * Do not require radix to compare more than actual IPv4/IPv6 address - */ -#define KEY_LEN_INET (offsetof(struct sockaddr_in, sin_addr) + sizeof(in_addr_t)) -#define KEY_LEN_INET6 (offsetof(struct sockaddr_in6, sin6_addr) + sizeof(struct in6_addr)) -#define KEY_LEN_IFACE (offsetof(struct xaddr_iface, ifname)) - -#define OFF_LEN_INET (8 * offsetof(struct sockaddr_in, sin_addr)) -#define OFF_LEN_INET6 (8 * offsetof(struct sockaddr_in6, sin6_addr)) -#define OFF_LEN_IFACE (8 * offsetof(struct xaddr_iface, ifname)) - - -static inline void -ipv6_writemask(struct in6_addr *addr6, uint8_t mask) -{ - uint32_t *cp; - - for (cp = (uint32_t *)addr6; mask >= 32; mask -= 32) - *cp++ = 0xFFFFFFFF; - *cp = htonl(mask ? ~((1 << (32 - mask)) - 1) : 0); -} - -int -ipfw_add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, void *paddr, - uint8_t plen, uint8_t mlen, uint8_t type, uint32_t value) -{ - struct radix_node_head *rnh, **rnh_ptr; - struct table_entry *ent; - struct table_xentry *xent; - struct radix_node *rn; - in_addr_t addr; - int offset; - void *ent_ptr; - struct sockaddr *addr_ptr, *mask_ptr; - char c; - - if (tbl >= V_fw_tables_max) - return (EINVAL); - - switch (type) { - case IPFW_TABLE_CIDR: - if (plen == sizeof(in_addr_t)) { -#ifdef INET - /* IPv4 case */ - if (mlen > 32) - return (EINVAL); - ent = malloc(sizeof(*ent), M_IPFW_TBL, M_WAITOK | M_ZERO); - ent->value = value; - /* Set 'total' structure length */ - KEY_LEN(ent->addr) = KEY_LEN_INET; - KEY_LEN(ent->mask) = KEY_LEN_INET; - /* Set offset of IPv4 address in bits */ - offset = OFF_LEN_INET; - ent->mask.sin_addr.s_addr = htonl(mlen ? 
~((1 << (32 - mlen)) - 1) : 0); - addr = *((in_addr_t *)paddr); - ent->addr.sin_addr.s_addr = addr & ent->mask.sin_addr.s_addr; - /* Set pointers */ - rnh_ptr = &ch->tables[tbl]; - ent_ptr = ent; - addr_ptr = (struct sockaddr *)&ent->addr; - mask_ptr = (struct sockaddr *)&ent->mask; -#endif -#ifdef INET6 - } else if (plen == sizeof(struct in6_addr)) { - /* IPv6 case */ - if (mlen > 128) - return (EINVAL); - xent = malloc(sizeof(*xent), M_IPFW_TBL, M_WAITOK | M_ZERO); - xent->value = value; - /* Set 'total' structure length */ - KEY_LEN(xent->a.addr6) = KEY_LEN_INET6; - KEY_LEN(xent->m.mask6) = KEY_LEN_INET6; - /* Set offset of IPv6 address in bits */ - offset = OFF_LEN_INET6; - ipv6_writemask(&xent->m.mask6.sin6_addr, mlen); - memcpy(&xent->a.addr6.sin6_addr, paddr, sizeof(struct in6_addr)); - APPLY_MASK(&xent->a.addr6.sin6_addr, &xent->m.mask6.sin6_addr); - /* Set pointers */ - rnh_ptr = &ch->xtables[tbl]; - ent_ptr = xent; - addr_ptr = (struct sockaddr *)&xent->a.addr6; - mask_ptr = (struct sockaddr *)&xent->m.mask6; -#endif - } else { - /* Unknown CIDR type */ - return (EINVAL); - } - break; - - case IPFW_TABLE_INTERFACE: - /* Check if string is terminated */ - c = ((char *)paddr)[IF_NAMESIZE - 1]; - ((char *)paddr)[IF_NAMESIZE - 1] = '\0'; - if (((mlen = strlen((char *)paddr)) == IF_NAMESIZE - 1) && (c != '\0')) - return (EINVAL); - - /* Include last \0 into comparison */ - mlen++; - - xent = malloc(sizeof(*xent), M_IPFW_TBL, M_WAITOK | M_ZERO); - xent->value = value; - /* Set 'total' structure length */ - KEY_LEN(xent->a.iface) = KEY_LEN_IFACE + mlen; - KEY_LEN(xent->m.ifmask) = KEY_LEN_IFACE + mlen; - /* Set offset of interface name in bits */ - offset = OFF_LEN_IFACE; - memcpy(xent->a.iface.ifname, paddr, mlen); - /* Assume direct match */ - /* TODO: Add interface pattern matching */ -#if 0 - memset(xent->m.ifmask.ifname, 0xFF, IF_NAMESIZE); - mask_ptr = (struct sockaddr *)&xent->m.ifmask; -#endif - /* Set pointers */ - rnh_ptr = &ch->xtables[tbl]; - ent_ptr = xent; - addr_ptr = (struct sockaddr *)&xent->a.iface; - mask_ptr = NULL; - break; - - default: - return (EINVAL); - } - - IPFW_WLOCK(ch); - - /* Check if tabletype is valid */ - if ((ch->tabletype[tbl] != 0) && (ch->tabletype[tbl] != type)) { - IPFW_WUNLOCK(ch); - free(ent_ptr, M_IPFW_TBL); - return (EINVAL); - } - - /* Check if radix tree exists */ - if ((rnh = *rnh_ptr) == NULL) { - IPFW_WUNLOCK(ch); - /* Create radix for a new table */ - if (!rn_inithead((void **)&rnh, offset)) { - free(ent_ptr, M_IPFW_TBL); - return (ENOMEM); - } - - IPFW_WLOCK(ch); - if (*rnh_ptr != NULL) { - /* Tree is already attached by other thread */ - rn_detachhead((void **)&rnh); - rnh = *rnh_ptr; - /* Check table type another time */ - if (ch->tabletype[tbl] != type) { - IPFW_WUNLOCK(ch); - free(ent_ptr, M_IPFW_TBL); - return (EINVAL); - } - } else { - *rnh_ptr = rnh; - /* - * Set table type. 
It can be set already - * (if we have IPv6-only table) but setting - * it another time does not hurt - */ - ch->tabletype[tbl] = type; - } - } - - rn = rnh->rnh_addaddr(addr_ptr, mask_ptr, rnh, ent_ptr); - IPFW_WUNLOCK(ch); - - if (rn == NULL) { - free(ent_ptr, M_IPFW_TBL); - return (EEXIST); - } - return (0); -} - -int -ipfw_del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, void *paddr, - uint8_t plen, uint8_t mlen, uint8_t type) -{ - struct radix_node_head *rnh, **rnh_ptr; - struct table_entry *ent; - in_addr_t addr; - struct sockaddr_in sa, mask; - struct sockaddr *sa_ptr, *mask_ptr; - char c; - - if (tbl >= V_fw_tables_max) - return (EINVAL); - - switch (type) { - case IPFW_TABLE_CIDR: - if (plen == sizeof(in_addr_t)) { - /* Set 'total' structure length */ - KEY_LEN(sa) = KEY_LEN_INET; - KEY_LEN(mask) = KEY_LEN_INET; - mask.sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0); - addr = *((in_addr_t *)paddr); - sa.sin_addr.s_addr = addr & mask.sin_addr.s_addr; - rnh_ptr = &ch->tables[tbl]; - sa_ptr = (struct sockaddr *)&sa; - mask_ptr = (struct sockaddr *)&mask; -#ifdef INET6 - } else if (plen == sizeof(struct in6_addr)) { - /* IPv6 case */ - if (mlen > 128) - return (EINVAL); - struct sockaddr_in6 sa6, mask6; - memset(&sa6, 0, sizeof(struct sockaddr_in6)); - memset(&mask6, 0, sizeof(struct sockaddr_in6)); - /* Set 'total' structure length */ - KEY_LEN(sa6) = KEY_LEN_INET6; - KEY_LEN(mask6) = KEY_LEN_INET6; - ipv6_writemask(&mask6.sin6_addr, mlen); - memcpy(&sa6.sin6_addr, paddr, sizeof(struct in6_addr)); - APPLY_MASK(&sa6.sin6_addr, &mask6.sin6_addr); - rnh_ptr = &ch->xtables[tbl]; - sa_ptr = (struct sockaddr *)&sa6; - mask_ptr = (struct sockaddr *)&mask6; -#endif - } else { - /* Unknown CIDR type */ - return (EINVAL); - } - break; - - case IPFW_TABLE_INTERFACE: - /* Check if string is terminated */ - c = ((char *)paddr)[IF_NAMESIZE - 1]; - ((char *)paddr)[IF_NAMESIZE - 1] = '\0'; - if (((mlen = strlen((char *)paddr)) == IF_NAMESIZE - 1) && (c != '\0')) - return (EINVAL); - - struct xaddr_iface ifname, ifmask; - memset(&ifname, 0, sizeof(ifname)); - - /* Include last \0 into comparison */ - mlen++; - - /* Set 'total' structure length */ - KEY_LEN(ifname) = KEY_LEN_IFACE + mlen; - KEY_LEN(ifmask) = KEY_LEN_IFACE + mlen; - /* Assume direct match */ - /* FIXME: Add interface pattern matching */ -#if 0 - memset(ifmask.ifname, 0xFF, IF_NAMESIZE); - mask_ptr = (struct sockaddr *)&ifmask; -#endif - mask_ptr = NULL; - memcpy(ifname.ifname, paddr, mlen); - /* Set pointers */ - rnh_ptr = &ch->xtables[tbl]; - sa_ptr = (struct sockaddr *)&ifname; - - break; - - default: - return (EINVAL); - } - - IPFW_WLOCK(ch); - if ((rnh = *rnh_ptr) == NULL) { - IPFW_WUNLOCK(ch); - return (ESRCH); - } - - if (ch->tabletype[tbl] != type) { - IPFW_WUNLOCK(ch); - return (EINVAL); - } - - ent = (struct table_entry *)rnh->rnh_deladdr(sa_ptr, mask_ptr, rnh); - IPFW_WUNLOCK(ch); - - if (ent == NULL) - return (ESRCH); - - free(ent, M_IPFW_TBL); - return (0); -} - -static int -flush_table_entry(struct radix_node *rn, void *arg) -{ - struct radix_node_head * const rnh = arg; - struct table_entry *ent; - - ent = (struct table_entry *) - rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, rnh); - if (ent != NULL) - free(ent, M_IPFW_TBL); - return (0); -} - -int -ipfw_flush_table(struct ip_fw_chain *ch, uint16_t tbl) -{ - struct radix_node_head *rnh, *xrnh; - - if (tbl >= V_fw_tables_max) - return (EINVAL); - - /* - * We free both (IPv4 and extended) radix trees and - * clear table type here to permit table to be reused - 
* for different type without module reload - */ - - IPFW_WLOCK(ch); - /* Set IPv4 table pointer to zero */ - if ((rnh = ch->tables[tbl]) != NULL) - ch->tables[tbl] = NULL; - /* Set extended table pointer to zero */ - if ((xrnh = ch->xtables[tbl]) != NULL) - ch->xtables[tbl] = NULL; - /* Zero table type */ - ch->tabletype[tbl] = 0; - IPFW_WUNLOCK(ch); - - if (rnh != NULL) { - rnh->rnh_walktree(rnh, flush_table_entry, rnh); - rn_detachhead((void **)&rnh); - } - - if (xrnh != NULL) { - xrnh->rnh_walktree(xrnh, flush_table_entry, xrnh); - rn_detachhead((void **)&xrnh); - } - - return (0); -} - -void -ipfw_destroy_tables(struct ip_fw_chain *ch) -{ - uint16_t tbl; - - /* Flush all tables */ - for (tbl = 0; tbl < V_fw_tables_max; tbl++) - ipfw_flush_table(ch, tbl); - - /* Free pointers itself */ - free(ch->tables, M_IPFW); - free(ch->xtables, M_IPFW); - free(ch->tabletype, M_IPFW); -} - -int -ipfw_init_tables(struct ip_fw_chain *ch) -{ - /* Allocate pointers */ - ch->tables = malloc(V_fw_tables_max * sizeof(void *), M_IPFW, M_WAITOK | M_ZERO); - ch->xtables = malloc(V_fw_tables_max * sizeof(void *), M_IPFW, M_WAITOK | M_ZERO); - ch->tabletype = malloc(V_fw_tables_max * sizeof(uint8_t), M_IPFW, M_WAITOK | M_ZERO); - return (0); -} - -int -ipfw_resize_tables(struct ip_fw_chain *ch, unsigned int ntables) -{ - struct radix_node_head **tables, **xtables, *rnh; - struct radix_node_head **tables_old, **xtables_old; - uint8_t *tabletype, *tabletype_old; - unsigned int ntables_old, tbl; - - /* Check new value for validity */ - if (ntables > IPFW_TABLES_MAX) - ntables = IPFW_TABLES_MAX; - - /* Allocate new pointers */ - tables = malloc(ntables * sizeof(void *), M_IPFW, M_WAITOK | M_ZERO); - xtables = malloc(ntables * sizeof(void *), M_IPFW, M_WAITOK | M_ZERO); - tabletype = malloc(ntables * sizeof(uint8_t), M_IPFW, M_WAITOK | M_ZERO); - - IPFW_WLOCK(ch); - - tbl = (ntables >= V_fw_tables_max) ? 
V_fw_tables_max : ntables; - - /* Copy old table pointers */ - memcpy(tables, ch->tables, sizeof(void *) * tbl); - memcpy(xtables, ch->xtables, sizeof(void *) * tbl); - memcpy(tabletype, ch->tabletype, sizeof(uint8_t) * tbl); - - /* Change pointers and number of tables */ - tables_old = ch->tables; - xtables_old = ch->xtables; - tabletype_old = ch->tabletype; - ch->tables = tables; - ch->xtables = xtables; - ch->tabletype = tabletype; - - ntables_old = V_fw_tables_max; - V_fw_tables_max = ntables; - - IPFW_WUNLOCK(ch); - - /* Check if we need to destroy radix trees */ - if (ntables < ntables_old) { - for (tbl = ntables; tbl < ntables_old; tbl++) { - if ((rnh = tables_old[tbl]) != NULL) { - rnh->rnh_walktree(rnh, flush_table_entry, rnh); - rn_detachhead((void **)&rnh); - } - - if ((rnh = xtables_old[tbl]) != NULL) { - rnh->rnh_walktree(rnh, flush_table_entry, rnh); - rn_detachhead((void **)&rnh); - } - } - } - - /* Free old pointers */ - free(tables_old, M_IPFW); - free(xtables_old, M_IPFW); - free(tabletype_old, M_IPFW); - - return (0); -} - -int -ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, - uint32_t *val) -{ - struct radix_node_head *rnh; - struct table_entry *ent; - struct sockaddr_in sa; - - if (tbl >= V_fw_tables_max) - return (0); - if ((rnh = ch->tables[tbl]) == NULL) - return (0); - KEY_LEN(sa) = KEY_LEN_INET; - sa.sin_addr.s_addr = addr; - ent = (struct table_entry *)(rnh->rnh_lookup(&sa, NULL, rnh)); - if (ent != NULL) { - *val = ent->value; - return (1); - } - return (0); -} - -int -ipfw_lookup_table_extended(struct ip_fw_chain *ch, uint16_t tbl, void *paddr, - uint32_t *val, int type) -{ - struct radix_node_head *rnh; - struct table_xentry *xent; - struct sockaddr_in6 sa6; - struct xaddr_iface iface; - - if (tbl >= V_fw_tables_max) - return (0); - if ((rnh = ch->xtables[tbl]) == NULL) - return (0); - - switch (type) { - case IPFW_TABLE_CIDR: - KEY_LEN(sa6) = KEY_LEN_INET6; - memcpy(&sa6.sin6_addr, paddr, sizeof(struct in6_addr)); - xent = (struct table_xentry *)(rnh->rnh_lookup(&sa6, NULL, rnh)); - break; - - case IPFW_TABLE_INTERFACE: - KEY_LEN(iface) = KEY_LEN_IFACE + - strlcpy(iface.ifname, (char *)paddr, IF_NAMESIZE) + 1; - /* Assume direct match */ - /* FIXME: Add interface pattern matching */ - xent = (struct table_xentry *)(rnh->rnh_lookup(&iface, NULL, rnh)); - break; - - default: - return (0); - } - - if (xent != NULL) { - *val = xent->value; - return (1); - } - return (0); -} - -static int -count_table_entry(struct radix_node *rn, void *arg) -{ - u_int32_t * const cnt = arg; - - (*cnt)++; - return (0); -} - -int -ipfw_count_table(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt) -{ - struct radix_node_head *rnh; - - if (tbl >= V_fw_tables_max) - return (EINVAL); - *cnt = 0; - if ((rnh = ch->tables[tbl]) == NULL) - return (0); - rnh->rnh_walktree(rnh, count_table_entry, cnt); - return (0); -} - -static int -dump_table_entry(struct radix_node *rn, void *arg) -{ - struct table_entry * const n = (struct table_entry *)rn; - ipfw_table * const tbl = arg; - ipfw_table_entry *ent; - - if (tbl->cnt == tbl->size) - return (1); - ent = &tbl->ent[tbl->cnt]; - ent->tbl = tbl->tbl; - if (in_nullhost(n->mask.sin_addr)) - ent->masklen = 0; - else - ent->masklen = 33 - ffs(ntohl(n->mask.sin_addr.s_addr)); - ent->addr = n->addr.sin_addr.s_addr; - ent->value = n->value; - tbl->cnt++; - return (0); -} - -int -ipfw_dump_table(struct ip_fw_chain *ch, ipfw_table *tbl) -{ - struct radix_node_head *rnh; - - if (tbl->tbl >= V_fw_tables_max) - return (EINVAL); - 
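dump_table_entry() above recovers the prefix length from the stored mask with 33 - ffs(ntohl(mask)): a contiguous /m mask has its lowest set bit at 1-based position 33 - m, so ffs() returns 33 - m and the expression yields m, with an all-zero mask special-cased to length 0. A quick standalone check of that identity:

#include <stdio.h>
#include <stdint.h>
#include <strings.h>        /* ffs() */
#include <arpa/inet.h>

static uint32_t
prefix_mask(int mlen)       /* network-order mask, as stored in the table */
{
    return (htonl(mlen ? ~((1u << (32 - mlen)) - 1) : 0));
}

int
main(void)
{
    int mlen;

    for (mlen = 1; mlen <= 32; mlen++) {
        uint32_t mask = prefix_mask(mlen);
        int recovered = 33 - ffs(ntohl(mask));

        if (recovered != mlen)
            printf("mismatch at /%d: got %d\n", mlen, recovered);
    }
    printf("round-trip ok for /1 .. /32\n");
    return (0);
}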
tbl->cnt = 0; - if ((rnh = ch->tables[tbl->tbl]) == NULL) - return (0); - rnh->rnh_walktree(rnh, dump_table_entry, tbl); - return (0); -} - -static int -count_table_xentry(struct radix_node *rn, void *arg) -{ - uint32_t * const cnt = arg; - - (*cnt) += sizeof(ipfw_table_xentry); - return (0); -} - -int -ipfw_count_xtable(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt) -{ - struct radix_node_head *rnh; - - if (tbl >= V_fw_tables_max) - return (EINVAL); - *cnt = 0; - if ((rnh = ch->tables[tbl]) != NULL) - rnh->rnh_walktree(rnh, count_table_xentry, cnt); - if ((rnh = ch->xtables[tbl]) != NULL) - rnh->rnh_walktree(rnh, count_table_xentry, cnt); - /* Return zero if table is empty */ - if (*cnt > 0) - (*cnt) += sizeof(ipfw_xtable); - return (0); -} - - -static int -dump_table_xentry_base(struct radix_node *rn, void *arg) -{ - struct table_entry * const n = (struct table_entry *)rn; - ipfw_xtable * const tbl = arg; - ipfw_table_xentry *xent; - - /* Out of memory, returning */ - if (tbl->cnt == tbl->size) - return (1); - xent = &tbl->xent[tbl->cnt]; - xent->len = sizeof(ipfw_table_xentry); - xent->tbl = tbl->tbl; - if (in_nullhost(n->mask.sin_addr)) - xent->masklen = 0; - else - xent->masklen = 33 - ffs(ntohl(n->mask.sin_addr.s_addr)); - /* Save IPv4 address as deprecated IPv6 compatible */ - xent->k.addr6.s6_addr32[3] = n->addr.sin_addr.s_addr; - xent->value = n->value; - tbl->cnt++; - return (0); -} - -static int -dump_table_xentry_extended(struct radix_node *rn, void *arg) -{ - struct table_xentry * const n = (struct table_xentry *)rn; - ipfw_xtable * const tbl = arg; - ipfw_table_xentry *xent; -#ifdef INET6 - int i; - uint32_t *v; -#endif - /* Out of memory, returning */ - if (tbl->cnt == tbl->size) - return (1); - xent = &tbl->xent[tbl->cnt]; - xent->len = sizeof(ipfw_table_xentry); - xent->tbl = tbl->tbl; - - switch (tbl->type) { -#ifdef INET6 - case IPFW_TABLE_CIDR: - /* Count IPv6 mask */ - v = (uint32_t *)&n->m.mask6.sin6_addr; - for (i = 0; i < sizeof(struct in6_addr) / 4; i++, v++) - xent->masklen += bitcount32(*v); - memcpy(&xent->k, &n->a.addr6.sin6_addr, sizeof(struct in6_addr)); - break; -#endif - case IPFW_TABLE_INTERFACE: - /* Assume exact mask */ - xent->masklen = 8 * IF_NAMESIZE; - memcpy(&xent->k, &n->a.iface.ifname, IF_NAMESIZE); - break; - - default: - /* unknown, skip entry */ - return (0); - } - - xent->value = n->value; - tbl->cnt++; - return (0); -} - -int -ipfw_dump_xtable(struct ip_fw_chain *ch, ipfw_xtable *tbl) -{ - struct radix_node_head *rnh; - - if (tbl->tbl >= V_fw_tables_max) - return (EINVAL); - tbl->cnt = 0; - tbl->type = ch->tabletype[tbl->tbl]; - if ((rnh = ch->tables[tbl->tbl]) != NULL) - rnh->rnh_walktree(rnh, dump_table_xentry_base, tbl); - if ((rnh = ch->xtables[tbl->tbl]) != NULL) - rnh->rnh_walktree(rnh, dump_table_xentry_extended, tbl); - return (0); -} - -/* end of file */ diff --git a/sys/netinet/ipfw/test/Makefile b/sys/netinet/ipfw/test/Makefile deleted file mode 100644 index c556a4b..0000000 --- a/sys/netinet/ipfw/test/Makefile +++ /dev/null @@ -1,51 +0,0 @@ -# -# $FreeBSD$ -# -# Makefile for building userland tests -# this is written in a form compatible with gmake - -SCHED_SRCS = test_dn_sched.c -SCHED_SRCS += dn_sched_fifo.c -SCHED_SRCS += dn_sched_prio.c -SCHED_SRCS += dn_sched_qfq.c -SCHED_SRCS += dn_sched_rr.c -SCHED_SRCS += dn_sched_wf2q.c -SCHED_SRCS += dn_heap.c -SCHED_SRCS += main.c - -SCHED_OBJS=$(SCHED_SRCS:.c=.o) - -HEAP_SRCS = dn_heap.c test_dn_heap.c -HEAP_OBJS=$(HEAP_SRCS:.c=.o) - -VPATH= .:.. - -CFLAGS = -I.. -I. 
-Wall -Werror -O3 -DIPFW -TARGETS= test_sched # no test_heap by default - -all: $(TARGETS) - -test_heap : $(HEAP_OBJS) - $(CC) -o $@ $(HEAP_OBJS) - -test_sched : $(SCHED_OBJS) - $(CC) -o $@ $(SCHED_OBJS) - -$(SCHED_OBJS): dn_test.h -main.o: mylist.h - -clean: - - rm *.o $(TARGETS) *.core - -ALLSRCS = $(SCHED_SRCS) dn_test.h mylist.h \ - dn_sched.h dn_heap.h ip_dn_private.h Makefile -TMPBASE = /tmp/testXYZ -TMPDIR = $(TMPBASE)/test - -tgz: - -rm -rf $(TMPDIR) - mkdir -p $(TMPDIR) - -cp -p $(ALLSRCS) $(TMPDIR) - -(cd ..; cp -p $(ALLSRCS) $(TMPDIR)) - ls -la $(TMPDIR) - (cd $(TMPBASE); tar cvzf /tmp/test.tgz test) diff --git a/sys/netinet/ipfw/test/dn_test.h b/sys/netinet/ipfw/test/dn_test.h deleted file mode 100644 index 4e079bc..0000000 --- a/sys/netinet/ipfw/test/dn_test.h +++ /dev/null @@ -1,175 +0,0 @@ -/* - * $FreeBSD$ - * - * userspace compatibility code for dummynet schedulers - */ - -#ifndef _DN_TEST_H -#define _DN_TEST_H - -#ifdef __cplusplus -extern "C" { -#endif - -#include <inttypes.h> -#include <stdio.h> -#include <stdlib.h> -#include <strings.h> /* bzero, ffs, ... */ -#include <string.h> /* strcmp */ -#include <errno.h> -#include <sys/queue.h> -#include <sys/time.h> - -extern int debug; -#define ND(fmt, args...) do {} while (0) -#define D1(fmt, args...) do {} while (0) -#define D(fmt, args...) fprintf(stderr, "%-8s " fmt "\n", \ - __FUNCTION__, ## args) -#define DX(lev, fmt, args...) do { \ - if (debug > lev) D(fmt, ## args); } while (0) - - -#ifndef offsetof -#define offsetof(t,m) (int)((&((t *)0L)->m)) -#endif - -#include <mylist.h> - -/* prevent include of other system headers */ -#define _NETINET_IP_VAR_H_ /* ip_fw_args */ -#define _IPFW2_H -#define _SYS_MBUF_H_ - -enum { - DN_QUEUE, -}; - -enum { - DN_SCHED_FIFO, - DN_SCHED_WF2QP, -}; - -struct dn_id { - int type, subtype, len, id; -}; - -struct dn_fs { - int par[4]; /* flowset parameters */ - - /* simulation entries. - * 'index' is not strictly necessary - * y is used for the inverse mapping , - */ - int index; - int y; /* inverse mapping */ - int base_y; /* inverse mapping */ - int next_y; /* inverse mapping */ - int n_flows; - int first_flow; - int next_flow; /* first_flow + n_flows */ - /* - * when generating, let 'cur' go from 0 to n_flows-1, - * then point to flow first_flow + cur - */ - int cur; -}; - -struct dn_sch { -}; - -struct dn_flow { - struct dn_id oid; - int length; - int len_bytes; - int drops; - uint64_t tot_bytes; - uint32_t flow_id; - struct list_head h; /* used by the generator */ -}; - -struct dn_link { -}; - -struct ip_fw_args { -}; - -struct mbuf { - struct { - int len; - } m_pkthdr; - struct mbuf *m_nextpkt; - int flow_id; /* for testing, index of a flow */ - //int flowset_id; /* for testing, index of a flowset */ - void *cfg; /* config args */ -}; - -#define MALLOC_DECLARE(x) -#define KASSERT(x, y) do { if (!(x)) printf y ; exit(0); } while (0) -struct ipfw_flow_id { -}; - -typedef void * module_t; - -struct _md_t { - const char *name; - int (*f)(module_t, int, void *); - void *p; -}; - -typedef struct _md_t moduledata_t; - -#define DECLARE_MODULE(name, b, c, d) \ - moduledata_t *_g_##name = & b -#define MODULE_DEPEND(a, b, c, d, e) - -#ifdef IPFW -#include <dn_heap.h> -#include <ip_dn_private.h> -#include <dn_sched.h> -#else -struct dn_queue { - struct dn_fsk *fs; /* parent flowset. */ - struct dn_sch_inst *_si; /* parent sched instance. 
*/ -}; -struct dn_schk { -}; -struct dn_fsk { - struct dn_fs fs; - struct dn_schk *sched; -}; -struct dn_sch_inst { - struct dn_schk *sched; -}; -struct dn_alg { - int type; - const char *name; - void *enqueue, *dequeue; - int q_datalen, si_datalen, schk_datalen; - int (*config)(struct dn_schk *); - int (*new_sched)(struct dn_sch_inst *); - int (*new_fsk)(struct dn_fsk *); - int (*new_queue)(struct dn_queue *q); -}; - -#endif - -#ifndef __FreeBSD__ -int fls(int); -#endif - -static inline void -mq_append(struct mq *q, struct mbuf *m) -{ - if (q->head == NULL) - q->head = m; - else - q->tail->m_nextpkt = m; - q->tail = m; - m->m_nextpkt = NULL; -} - -#ifdef __cplusplus -} -#endif - -#endif /* _DN_TEST_H */ diff --git a/sys/netinet/ipfw/test/main.c b/sys/netinet/ipfw/test/main.c deleted file mode 100644 index be9fdf5..0000000 --- a/sys/netinet/ipfw/test/main.c +++ /dev/null @@ -1,636 +0,0 @@ -/* - * $FreeBSD$ - * - * Testing program for schedulers - * - * The framework include a simple controller which, at each - * iteration, decides whether we can enqueue and/or dequeue. - * Then the mainloop runs the required number of tests, - * keeping track of statistics. - */ - -#include "dn_test.h" - -struct q_list { - struct list_head h; -}; - -struct cfg_s { - int ac; - char * const *av; - - const char *name; - int loops; - struct timeval time; - - /* running counters */ - uint32_t _enqueue; - uint32_t drop; - uint32_t pending; - uint32_t dequeue; - - /* generator parameters */ - int th_min, th_max; - int maxburst; - int lmin, lmax; /* packet len */ - int flows; /* number of flows */ - int flowsets; /* number of flowsets */ - int wsum; /* sum of weights of all flows */ - int max_y; /* max random number in the generation */ - int cur_y, cur_fs; /* used in generation, between 0 and max_y - 1 */ - const char *fs_config; /* flowset config */ - int can_dequeue; - int burst; /* count of packets sent in a burst */ - struct mbuf *tosend; /* packet to send -- also flag to enqueue */ - - struct mbuf *freelist; - - struct mbuf *head, *tail; /* a simple tailq */ - - /* scheduler hooks */ - int (*enq)(struct dn_sch_inst *, struct dn_queue *, - struct mbuf *); - struct mbuf * (*deq)(struct dn_sch_inst *); - /* size of the three fields including sched-specific areas */ - int schk_len; - int q_len; /* size of a queue including sched-fields */ - int si_len; /* size of a sch_inst including sched-fields */ - char *q; /* array of flow queues */ - /* use a char* because size is variable */ - struct dn_fsk *fs; /* array of flowsets */ - struct dn_sch_inst *si; - struct dn_schk *sched; - - /* generator state */ - int state; /* 0 = going up, 1: going down */ - - /* - * We keep lists for each backlog level, and always serve - * the one with shortest backlog. llmask contains a bitmap - * of lists, and ll are the heads of the lists. The last - * entry (BACKLOG) contains all entries considered 'full' - * XXX to optimize things, entry i could contain queues with - * 2^{i-1}+1 .. 2^i entries. - */ -#define BACKLOG 30 - uint32_t llmask; - struct list_head ll[BACKLOG + 10]; -}; - -/* FI2Q and Q2FI converts from flow_id to dn_queue and back. - * We cannot easily use pointer arithmetic because it is variable size. - */ -#define FI2Q(c, i) ((struct dn_queue *)((c)->q + (c)->q_len * (i))) -#define Q2FI(c, q) (((char *)(q) - (c)->q)/(c)->q_len) - -int debug = 0; - -struct dn_parms dn_cfg; - -static void controller(struct cfg_s *c); - -/* release a packet: put the mbuf in the freelist, and the queue in - * the bucket. 
- */ -int -drop(struct cfg_s *c, struct mbuf *m) -{ - struct dn_queue *q; - int i; - - c->drop++; - q = FI2Q(c, m->flow_id); - i = q->ni.length; // XXX or ffs... - - ND("q %p id %d current length %d", q, m->flow_id, i); - if (i < BACKLOG) { - struct list_head *h = &q->ni.h; - c->llmask &= ~(1<<(i+1)); - c->llmask |= (1<<(i)); - list_del(h); - list_add_tail(h, &c->ll[i]); - } - m->m_nextpkt = c->freelist; - c->freelist = m; - return 0; -} - -/* dequeue returns NON-NULL when a packet is dropped */ -static int -enqueue(struct cfg_s *c, void *_m) -{ - struct mbuf *m = _m; - if (c->enq) - return c->enq(c->si, FI2Q(c, m->flow_id), m); - if (c->head == NULL) - c->head = m; - else - c->tail->m_nextpkt = m; - c->tail = m; - return 0; /* default - success */ -} - -/* dequeue returns NON-NULL when a packet is available */ -static void * -dequeue(struct cfg_s *c) -{ - struct mbuf *m; - if (c->deq) - return c->deq(c->si); - if ((m = c->head)) { - m = c->head; - c->head = m->m_nextpkt; - m->m_nextpkt = NULL; - } - return m; -} - -static int -mainloop(struct cfg_s *c) -{ - int i; - struct mbuf *m; - - for (i=0; i < c->loops; i++) { - /* implement histeresis */ - controller(c); - DX(3, "loop %d enq %d send %p rx %d", - i, c->_enqueue, c->tosend, c->can_dequeue); - if ( (m = c->tosend) ) { - c->_enqueue++; - if (enqueue(c, m)) { - drop(c, m); - ND("loop %d enqueue fail", i ); - } else { - ND("enqueue ok"); - c->pending++; - } - } - if (c->can_dequeue) { - c->dequeue++; - if ((m = dequeue(c))) { - c->pending--; - drop(c, m); - c->drop--; /* compensate */ - } - } - } - DX(1, "mainloop ends %d", i); - return 0; -} - -int -dump(struct cfg_s *c) -{ - int i; - struct dn_queue *q; - - for (i=0; i < c->flows; i++) { - q = FI2Q(c, i); - DX(1, "queue %4d tot %10lld", i, q->ni.tot_bytes); - } - DX(1, "done %d loops\n", c->loops); - return 0; -} - -/* interpret a number in human form */ -static long -getnum(const char *s, char **next, const char *key) -{ - char *end = NULL; - long l; - - if (next) /* default */ - *next = NULL; - if (s && *s) { - DX(3, "token is <%s> %s", s, key ? key : "-"); - l = strtol(s, &end, 0); - } else { - DX(3, "empty string"); - l = -1; - } - if (l < 0) { - DX(2, "invalid %s for %s", s ? s : "NULL", (key ? key : "") ); - return 0; // invalid - } - if (!end || !*end) - return l; - if (*end == 'n') - l = -l; /* multiply by n */ - else if (*end == 'K') - l = l*1000; - else if (*end == 'M') - l = l*1000000; - else if (*end == 'k') - l = l*1024; - else if (*end == 'm') - l = l*1024*1024; - else if (*end == 'w') - ; - else {/* not recognized */ - D("suffix %s for %s, next %p", end, key, next); - end--; - } - end++; - DX(3, "suffix now %s for %s, next %p", end, key, next); - if (next && *end) { - DX(3, "setting next to %s for %s", end, key); - *next = end; - } - return l; -} - -/* - * flowsets are a comma-separated list of - * weight:maxlen:flows - * indicating how many flows are hooked to that fs. - * Both weight and range can be min-max-steps. - * In a first pass we just count the number of flowsets and flows, - * in a second pass we complete the setup. - */ -static void -parse_flowsets(struct cfg_s *c, const char *fs, int pass) -{ - char *s, *cur, *next; - int n_flows = 0, n_fs = 0, wsum = 0; - int i, j; - struct dn_fs *prev = NULL; - - DX(3, "--- pass %d flows %d flowsets %d", pass, c->flows, c->flowsets); - if (pass == 0) - c->fs_config = fs; - s = c->fs_config ? 
strdup(c->fs_config) : NULL; - if (s == NULL) { - if (pass == 0) - D("no fsconfig"); - return; - } - for (next = s; (cur = strsep(&next, ","));) { - char *p = NULL; - int w, w_h, w_steps, wi; - int len, len_h, l_steps, li; - int flows; - - w = getnum(strsep(&cur, ":"), &p, "weight"); - if (w <= 0) - w = 1; - w_h = p ? getnum(p+1, &p, "weight_max") : w; - w_steps = p ? getnum(p+1, &p, "w_steps") : (w_h == w ?1:2); - len = getnum(strsep(&cur, ":"), &p, "len"); - if (len <= 0) - len = 1000; - len_h = p ? getnum(p+1, &p, "len_max") : len; - l_steps = p ? getnum(p+1, &p, "l_steps") : (len_h == len ? 1 : 2); - flows = getnum(strsep(&cur, ":"), NULL, "flows"); - if (flows == 0) - flows = 1; - DX(4, "weight %d..%d (%d) len %d..%d (%d) flows %d", - w, w_h, w_steps, len, len_h, l_steps, flows); - if (w == 0 || w_h < w || len == 0 || len_h < len || - flows == 0) { - DX(4,"wrong parameters %s", fs); - return; - } - n_flows += flows * w_steps * l_steps; - for (i = 0; i < w_steps; i++) { - wi = w + ((w_h - w)* i)/(w_steps == 1 ? 1 : (w_steps-1)); - for (j = 0; j < l_steps; j++, n_fs++) { - struct dn_fs *fs = &c->fs[n_fs].fs; // tentative - int x; - - li = len + ((len_h - len)* j)/(l_steps == 1 ? 1 : (l_steps-1)); - x = (wi*2048)/li; - DX(3, "----- fs %4d weight %4d lmax %4d X %4d flows %d", - n_fs, wi, li, x, flows); - if (pass == 0) - continue; - if (c->fs == NULL || c->flowsets <= n_fs) { - D("error in number of flowsets"); - return; - } - wsum += wi * flows; - fs->par[0] = wi; - fs->par[1] = li; - fs->index = n_fs; - fs->n_flows = flows; - fs->cur = fs->first_flow = prev==NULL ? 0 : prev->next_flow; - fs->next_flow = fs->first_flow + fs->n_flows; - fs->y = x * flows; - fs->base_y = (prev == NULL) ? 0 : prev->next_y; - fs->next_y = fs->base_y + fs->y; - prev = fs; - } - } - } - c->max_y = prev ? prev->base_y + prev->y : 0; - c->flows = n_flows; - c->flowsets = n_fs; - c->wsum = wsum; - if (pass == 0) - return; - - /* now link all flows to their parent flowsets */ - DX(1,"%d flows on %d flowsets max_y %d", c->flows, c->flowsets, c->max_y); - for (i=0; i < c->flowsets; i++) { - struct dn_fs *fs = &c->fs[i].fs; - DX(1, "fs %3d w %5d l %4d flow %5d .. %5d y %6d .. %6d", - i, fs->par[0], fs->par[1], - fs->first_flow, fs->next_flow, - fs->base_y, fs->next_y); - for (j = fs->first_flow; j < fs->next_flow; j++) { - struct dn_queue *q = FI2Q(c, j); - q->fs = &c->fs[i]; - } - } -} - -static int -init(struct cfg_s *c) -{ - int i; - int ac = c->ac; - char * const *av = c->av; - - c->si_len = sizeof(struct dn_sch_inst); - c->q_len = sizeof(struct dn_queue); - moduledata_t *mod = NULL; - struct dn_alg *p = NULL; - - c->th_min = 0; - c->th_max = -20;/* 20 packets per flow */ - c->lmin = c->lmax = 1280; /* packet len */ - c->flows = 1; - c->flowsets = 1; - c->name = "null"; - ac--; av++; - while (ac > 1) { - if (!strcmp(*av, "-n")) { - c->loops = getnum(av[1], NULL, av[0]); - } else if (!strcmp(*av, "-d")) { - debug = atoi(av[1]); - } else if (!strcmp(*av, "-alg")) { - extern moduledata_t *_g_dn_fifo; - extern moduledata_t *_g_dn_wf2qp; - extern moduledata_t *_g_dn_rr; - extern moduledata_t *_g_dn_qfq; -#ifdef WITH_KPS - extern moduledata_t *_g_dn_kps; -#endif - if (!strcmp(av[1], "rr")) - mod = _g_dn_rr; - else if (!strcmp(av[1], "wf2qp")) - mod = _g_dn_wf2qp; - else if (!strcmp(av[1], "fifo")) - mod = _g_dn_fifo; - else if (!strcmp(av[1], "qfq")) - mod = _g_dn_qfq; -#ifdef WITH_KPS - else if (!strcmp(av[1], "kps")) - mod = _g_dn_kps; -#endif - else - mod = NULL; - c->name = mod ? 
mod->name : "NULL"; - DX(3, "using scheduler %s", c->name); - } else if (!strcmp(*av, "-len")) { - c->lmin = getnum(av[1], NULL, av[0]); - c->lmax = c->lmin; - DX(3, "setting max to %d", c->th_max); - } else if (!strcmp(*av, "-burst")) { - c->maxburst = getnum(av[1], NULL, av[0]); - DX(3, "setting max to %d", c->th_max); - } else if (!strcmp(*av, "-qmax")) { - c->th_max = getnum(av[1], NULL, av[0]); - DX(3, "setting max to %d", c->th_max); - } else if (!strcmp(*av, "-qmin")) { - c->th_min = getnum(av[1], NULL, av[0]); - DX(3, "setting min to %d", c->th_min); - } else if (!strcmp(*av, "-flows")) { - c->flows = getnum(av[1], NULL, av[0]); - DX(3, "setting flows to %d", c->flows); - } else if (!strcmp(*av, "-flowsets")) { - parse_flowsets(c, av[1], 0); - DX(3, "setting flowsets to %d", c->flowsets); - } else { - D("option %s not recognised, ignore", *av); - } - ac -= 2; av += 2; - } - if (c->maxburst <= 0) - c->maxburst = 1; - if (c->loops <= 0) - c->loops = 1; - if (c->flows <= 0) - c->flows = 1; - if (c->flowsets <= 0) - c->flowsets = 1; - if (c->lmin <= 0) - c->lmin = 1; - if (c->lmax <= 0) - c->lmax = 1; - /* multiply by N */ - if (c->th_min < 0) - c->th_min = c->flows * -c->th_min; - if (c->th_max < 0) - c->th_max = c->flows * -c->th_max; - if (c->th_max <= c->th_min) - c->th_max = c->th_min + 1; - if (mod) { - p = mod->p; - DX(3, "using module %s f %p p %p", mod->name, mod->f, mod->p); - DX(3, "modname %s ty %d", p->name, p->type); - c->enq = p->enqueue; - c->deq = p->dequeue; - c->si_len += p->si_datalen; - c->q_len += p->q_datalen; - c->schk_len += p->schk_datalen; - } - /* allocate queues, flowsets and one scheduler */ - c->q = calloc(c->flows, c->q_len); - c->fs = calloc(c->flowsets, sizeof(struct dn_fsk)); - c->si = calloc(1, c->si_len); - c->sched = calloc(c->flows, c->schk_len); - if (c->q == NULL || c->fs == NULL) { - D("error allocating memory for flows"); - exit(1); - } - c->si->sched = c->sched; - if (p) { - if (p->config) - p->config(c->sched); - if (p->new_sched) - p->new_sched(c->si); - } - /* parse_flowsets links queues to their flowsets */ - parse_flowsets(c, av[1], 1); - /* complete the work calling new_fsk */ - for (i = 0; i < c->flowsets; i++) { - if (c->fs[i].fs.par[1] == 0) - c->fs[i].fs.par[1] = 1000; /* default pkt len */ - c->fs[i].sched = c->sched; - if (p && p->new_fsk) - p->new_fsk(&c->fs[i]); - } - - /* initialize the lists for the generator, and put - * all flows in the list for backlog = 0 - */ - for (i=0; i <= BACKLOG+5; i++) - INIT_LIST_HEAD(&c->ll[i]); - - for (i = 0; i < c->flows; i++) { - struct dn_queue *q = FI2Q(c, i); - if (q->fs == NULL) - q->fs = &c->fs[0]; /* XXX */ - q->_si = c->si; - if (p && p->new_queue) - p->new_queue(q); - INIT_LIST_HEAD(&q->ni.h); - list_add_tail(&q->ni.h, &c->ll[0]); - } - c->llmask = 1; - return 0; -} - - -int -main(int ac, char *av[]) -{ - struct cfg_s c; - struct timeval end; - double ll; - int i; - char msg[40]; - - bzero(&c, sizeof(c)); - c.ac = ac; - c.av = av; - init(&c); - gettimeofday(&c.time, NULL); - mainloop(&c); - gettimeofday(&end, NULL); - end.tv_sec -= c.time.tv_sec; - end.tv_usec -= c.time.tv_usec; - if (end.tv_usec < 0) { - end.tv_usec += 1000000; - end.tv_sec--; - } - c.time = end; - ll = end.tv_sec*1000000 + end.tv_usec; - ll *= 1000; /* convert to nanoseconds */ - ll /= c._enqueue; - sprintf(msg, "1::%d", c.flows); - D("%-8s n %d %d time %d.%06d %8.3f qlen %d %d flows %s drops %d", - c.name, c._enqueue, c.loops, - (int)c.time.tv_sec, (int)c.time.tv_usec, ll, - c.th_min, c.th_max, - c.fs_config ? 
c.fs_config : msg, c.drop); - dump(&c); - DX(1, "done ac %d av %p", ac, av); - for (i=0; i < ac; i++) - DX(1, "arg %d %s", i, av[i]); - return 0; -} - -/* - * The controller decides whether in this iteration we should send - * (the packet is in c->tosend) and/or receive (flag c->can_dequeue) - */ -static void -controller(struct cfg_s *c) -{ - struct mbuf *m; - struct dn_fs *fs; - int flow_id; - - /* histeresis between max and min */ - if (c->state == 0 && c->pending >= c->th_max) - c->state = 1; - else if (c->state == 1 && c->pending <= c->th_min) - c->state = 0; - ND(1, "state %d pending %2d", c->state, c->pending); - c->can_dequeue = c->state; - c->tosend = NULL; - if (c->state) - return; - - if (1) { - int i; - struct dn_queue *q; - struct list_head *h; - - i = ffs(c->llmask) - 1; - if (i < 0) { - DX(2, "no candidate"); - c->can_dequeue = 1; - return; - } - h = &c->ll[i]; - ND(1, "backlog %d p %p prev %p next %p", i, h, h->prev, h->next); - q = list_first_entry(h, struct dn_queue, ni.h); - list_del(&q->ni.h); - flow_id = Q2FI(c, q); - DX(2, "extracted flow %p %d backlog %d", q, flow_id, i); - if (list_empty(h)) { - ND(2, "backlog %d empty", i); - c->llmask &= ~(1<<i); - } - ND(1, "before %d p %p prev %p next %p", i+1, h+1, h[1].prev, h[1].next); - list_add_tail(&q->ni.h, h+1); - ND(1, " after %d p %p prev %p next %p", i+1, h+1, h[1].prev, h[1].next); - if (i < BACKLOG) { - ND(2, "backlog %d full", i+1); - c->llmask |= 1<<(1+i); - } - fs = &q->fs->fs; - c->cur_fs = q->fs - c->fs; - fs->cur = flow_id; - } else { - /* XXX this does not work ? */ - /* now decide whom to send the packet, and the length */ - /* lookup in the flow table */ - if (c->cur_y >= c->max_y) { /* handle wraparound */ - c->cur_y = 0; - c->cur_fs = 0; - } - fs = &c->fs[c->cur_fs].fs; - flow_id = fs->cur++; - if (fs->cur >= fs->next_flow) - fs->cur = fs->first_flow; - c->cur_y++; - if (c->cur_y >= fs->next_y) - c->cur_fs++; - } - - /* construct a packet */ - if (c->freelist) { - m = c->tosend = c->freelist; - c->freelist = c->freelist->m_nextpkt; - } else { - m = c->tosend = calloc(1, sizeof(struct mbuf)); - } - if (m == NULL) - return; - - m->cfg = c; - m->m_nextpkt = NULL; - m->m_pkthdr.len = fs->par[1]; // XXX maxlen - m->flow_id = flow_id; - - ND(2,"y %6d flow %5d fs %3d weight %4d len %4d", - c->cur_y, m->flow_id, c->cur_fs, - fs->par[0], m->m_pkthdr.len); - -} - -/* -Packet allocation: -to achieve a distribution that matches weights, for each X=w/lmax class -we should generate a number of packets proportional to Y = X times the number -of flows in the class. -So we construct an array with the cumulative distribution of Y's, -and use it to identify the flow via inverse mapping (if the Y's are -not too many we can use an array for the lookup). In practice, -each flow will have X entries [virtually] pointing to it. 
- -*/ diff --git a/sys/netinet/ipfw/test/mylist.h b/sys/netinet/ipfw/test/mylist.h deleted file mode 100644 index 6247f32..0000000 --- a/sys/netinet/ipfw/test/mylist.h +++ /dev/null @@ -1,49 +0,0 @@ -/* - * $FreeBSD$ - * - * linux-like bidirectional lists - */ - -#ifndef _MYLIST_H -#define _MYLIST_H -struct list_head { - struct list_head *prev, *next; -}; - -#define INIT_LIST_HEAD(l) do { (l)->prev = (l)->next = (l); } while (0) -#define list_empty(l) ( (l)->next == l ) -static inline void -__list_add(struct list_head *o, struct list_head *prev, - struct list_head *next) -{ - next->prev = o; - o->next = next; - o->prev = prev; - prev->next = o; -} - -static inline void -list_add_tail(struct list_head *o, struct list_head *head) -{ - __list_add(o, head->prev, head); -} - -#define list_first_entry(pL, ty, member) \ - (ty *)((char *)((pL)->next) - offsetof(ty, member)) - -static inline void -__list_del(struct list_head *prev, struct list_head *next) -{ - next->prev = prev; - prev->next = next; -} - -static inline void -list_del(struct list_head *entry) -{ - ND("called on %p", entry); - __list_del(entry->prev, entry->next); - entry->next = entry->prev = NULL; -} - -#endif /* _MYLIST_H */ diff --git a/sys/netinet/ipfw/test/test_dn_heap.c b/sys/netinet/ipfw/test/test_dn_heap.c deleted file mode 100644 index d460cf2..0000000 --- a/sys/netinet/ipfw/test/test_dn_heap.c +++ /dev/null @@ -1,162 +0,0 @@ -/*- - * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa - * All rights reserved - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -/* - * Userland code for testing binary heaps and hash tables - * - * $FreeBSD$ - */ - -#include <sys/cdefs.h> -#include <sys/param.h> - -#include <stdio.h> -#include <strings.h> -#include <stdlib.h> - -#include "dn_heap.h" -#define log(x, arg...) fprintf(stderr, ## arg) -#define panic(x...) fprintf(stderr, ## x), exit(1) - -#include <string.h> - -struct x { - struct x *ht_link; - char buf[0]; -}; - -uint32_t hf(uintptr_t key, int flags, void *arg) -{ - return (flags & DNHT_KEY_IS_OBJ) ? - ((struct x *)key)->buf[0] : *(char *)key; -} - -int matchf(void *obj, uintptr_t key, int flags, void *arg) -{ - char *s = (flags & DNHT_KEY_IS_OBJ) ? 
- ((struct x *)key)->buf : (char *)key; - return (strcmp(((struct x *)obj)->buf, s) == 0); -} - -void *newfn(uintptr_t key, int flags, void *arg) -{ - char *s = (char *)key; - struct x *p = malloc(sizeof(*p) + 1 + strlen(s)); - if (p) - strcpy(p->buf, s); - return p; -} - -char *strings[] = { - "undici", "unico", "doppio", "devoto", - "uno", "due", "tre", "quattro", "cinque", "sei", - "uno", "due", "tre", "quattro", "cinque", "sei", - NULL, -}; - -int doprint(void *_x, void *arg) -{ - struct x *x = _x; - printf("found element <%s>\n", x->buf); - return (int)arg; -} - -static void -test_hash() -{ - char **p; - struct dn_ht *h; - uintptr_t x = 0; - uintptr_t x1 = 0; - - /* first, find and allocate */ - h = dn_ht_init(NULL, 10, 0, hf, matchf, newfn); - - for (p = strings; *p; p++) { - dn_ht_find(h, (uintptr_t)*p, DNHT_INSERT, NULL); - } - dn_ht_scan(h, doprint, 0); - printf("/* second -- find without allocate */\n"); - h = dn_ht_init(NULL, 10, 0, hf, matchf, NULL); - for (p = strings; *p; p++) { - void **y = newfn((uintptr_t)*p, 0, NULL); - if (x == 0) - x = (uintptr_t)y; - else { - if (x1 == 0) - x1 = (uintptr_t)*p; - } - dn_ht_find(h, (uintptr_t)y, DNHT_INSERT | DNHT_KEY_IS_OBJ, NULL); - } - dn_ht_scan(h, doprint, 0); - printf("remove %p gives %p\n", (void *)x, - dn_ht_find(h, x, DNHT_KEY_IS_OBJ | DNHT_REMOVE, NULL)); - printf("remove %p gives %p\n", (void *)x, - dn_ht_find(h, x, DNHT_KEY_IS_OBJ | DNHT_REMOVE, NULL)); - printf("remove %p gives %p\n", (void *)x, - dn_ht_find(h, x1, DNHT_REMOVE, NULL)); - printf("remove %p gives %p\n", (void *)x, - dn_ht_find(h, x1, DNHT_REMOVE, NULL)); - dn_ht_scan(h, doprint, 0); -} - -int -main(int argc, char *argv[]) -{ - struct dn_heap h; - int i, n, n2, n3; - - test_hash(); - return 0; - - /* n = elements, n2 = cycles */ - n = (argc > 1) ? atoi(argv[1]) : 0; - if (n <= 0 || n > 1000000) - n = 100; - n2 = (argc > 2) ? atoi(argv[2]) : 0; - if (n2 <= 0) - n = 1000000; - n3 = (argc > 3) ? atoi(argv[3]) : 0; - bzero(&h, sizeof(h)); - heap_init(&h, n, -1); - while (n2-- > 0) { - uint64_t prevk = 0; - for (i=0; i < n; i++) - heap_insert(&h, n3 ? n-i: random(), (void *)(100+i)); - - for (i=0; h.elements > 0; i++) { - uint64_t k = h.p[0].key; - if (k < prevk) - panic("wrong sequence\n"); - prevk = k; - if (0) - printf("%d key %llu, val %p\n", - i, h.p[0].key, h.p[0].object); - heap_extract(&h, NULL); - } - } - return 0; -} diff --git a/sys/netinet/ipfw/test/test_dn_sched.c b/sys/netinet/ipfw/test/test_dn_sched.c deleted file mode 100644 index ee46c95..0000000 --- a/sys/netinet/ipfw/test/test_dn_sched.c +++ /dev/null @@ -1,89 +0,0 @@ -/* - * $FreeBSD$ - * - * library functions for userland testing of dummynet schedulers - */ - -#include "dn_test.h" - -void -m_freem(struct mbuf *m) -{ - printf("free %p\n", m); -} - -int -dn_sched_modevent(module_t mod, int cmd, void *arg) -{ - return 0; -} - -void -dn_free_pkts(struct mbuf *m) -{ - struct mbuf *x; - while ( (x = m) ) { - m = m->m_nextpkt; - m_freem(x); - } -} - -int -dn_delete_queue(void *_q, void *do_free) -{ - struct dn_queue *q = _q; - if (q->mq.head) - dn_free_pkts(q->mq.head); - free(q); - return 0; -} - -/* - * This is a simplified function for testing purposes, which does - * not implement statistics or random loss. - * Enqueue a packet in q, subject to space and queue management policy - * (whose parameters are in q->fs). - * Update stats for the queue and the scheduler. - * Return 0 on success, 1 on drop. The packet is consumed anyways. 
- */
-int
-dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop)
-{
-	if (drop)
-		goto drop;
-	if (q->ni.length >= 200)
-		goto drop;
-	mq_append(&q->mq, m);
-	q->ni.length++;
-	q->ni.tot_bytes += m->m_pkthdr.len;
-	return 0;
-
-drop:
-	q->ni.drops++;
-	return 1;
-}
-
-int
-ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg)
-{
-	if (*v < lo) {
-		*v = dflt;
-	} else if (*v > hi) {
-		*v = hi;
-	}
-	return *v;
-}
-
-#ifndef __FreeBSD__
-int
-fls(int mask)
-{
-	int bit;
-
-	if (mask == 0)
-		return (0);
-	for (bit = 1; mask != 1; bit++)
-		mask = (unsigned int)mask >> 1;
-	return (bit);
-}
-#endif
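The dn_enqueue() helper above appends packets with mq_append(), which is the same head/tail idiom used by the default enqueue()/dequeue() path in main.c: a singly linked FIFO threaded through m_nextpkt, so both append and removal are O(1). A minimal standalone sketch of that pattern; struct pkt, pktq_append and pktq_pop are illustrative names, not part of the tree.

#include <stdio.h>
#include <stdlib.h>

/* stand-in for struct mbuf: just the m_nextpkt-style link and an id */
struct pkt {
	struct pkt *next;
	int id;
};

struct pktq {
	struct pkt *head, *tail;
};

/* O(1) append at the tail, as mq_append() does */
static void
pktq_append(struct pktq *q, struct pkt *p)
{
	if (q->head == NULL)
		q->head = p;
	else
		q->tail->next = p;
	q->tail = p;
	p->next = NULL;
}

/*
 * O(1) removal from the head, as the default dequeue() path does.
 * Note that tail goes stale once the queue drains; the next append
 * overwrites it, exactly like the original code.
 */
static struct pkt *
pktq_pop(struct pktq *q)
{
	struct pkt *p = q->head;

	if (p != NULL) {
		q->head = p->next;
		p->next = NULL;
	}
	return p;
}

int
main(void)
{
	struct pktq q = { NULL, NULL };
	struct pkt *p;

	for (int i = 0; i < 3; i++) {
		p = calloc(1, sizeof(*p));
		if (p == NULL)
			return 1;
		p->id = i;
		pktq_append(&q, p);
	}
	while ((p = pktq_pop(&q)) != NULL) {
		printf("pkt %d\n", p->id);
		free(p);
	}
	return 0;
}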
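The FI2Q()/Q2FI() macros in main.c index an array whose element size is only known at run time: each struct dn_queue is followed by the scheduler's q_datalen private bytes, so ordinary array indexing cannot be used and the code multiplies by the run-time record length and casts. A self-contained sketch of the same layout under illustrative names; struct rec and the 24-byte private area are made up for the example, while the real harness takes the extra size from the scheduler module.

#include <stdio.h>
#include <stdlib.h>

/* fixed header, analogous to struct dn_queue; private bytes follow it */
struct rec {
	int id;
	int length;
};

int
main(void)
{
	int nrec = 4;
	size_t priv = 24;	/* per-scheduler extra bytes, chosen at run time */
	size_t rec_len = sizeof(struct rec) + priv;
	char *base = calloc(nrec, rec_len);	/* one flat allocation, as c->q is */

	if (base == NULL)
		return 1;

	/* forward mapping: the i-th record lives at base + i * rec_len (cf. FI2Q) */
	for (int i = 0; i < nrec; i++) {
		struct rec *r = (struct rec *)(base + i * rec_len);
		r->id = i;
	}

	/* inverse mapping: pointer back to index (cf. Q2FI) */
	struct rec *r2 = (struct rec *)(base + 2 * rec_len);
	long idx = ((char *)r2 - base) / (long)rec_len;
	printf("record at %p has index %ld, id %d\n", (void *)r2, idx, r2->id);

	free(base);
	return 0;
}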
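mylist.h above provides Linux-style intrusive, circular doubly-linked lists: the linkage is embedded in the object and list_first_entry() recovers the container with offsetof() arithmetic, which is how main.c keeps one list of queues per backlog level. A usage sketch follows; the primitives are repeated here, lightly condensed and without the ND() trace in list_del(), so it compiles on its own, and struct demo_queue is illustrative.

#include <stdio.h>
#include <stddef.h>	/* offsetof */

struct list_head { struct list_head *prev, *next; };

#define INIT_LIST_HEAD(l)	do { (l)->prev = (l)->next = (l); } while (0)
#define list_empty(l)		((l)->next == (l))
#define list_first_entry(pL, ty, member) \
	(ty *)((char *)((pL)->next) - offsetof(ty, member))

static inline void
__list_add(struct list_head *o, struct list_head *prev, struct list_head *next)
{
	next->prev = o;
	o->next = next;
	o->prev = prev;
	prev->next = o;
}

static inline void
list_add_tail(struct list_head *o, struct list_head *head)
{
	__list_add(o, head->prev, head);
}

static inline void
list_del(struct list_head *entry)
{
	entry->prev->next = entry->next;
	entry->next->prev = entry->prev;
	entry->next = entry->prev = NULL;
}

/* illustrative object with an embedded linkage, like dn_queue's ni.h */
struct demo_queue {
	int flow_id;
	struct list_head h;
};

int
main(void)
{
	struct list_head bucket;	/* one backlog bucket, cf. c->ll[] */
	struct demo_queue q1 = { .flow_id = 1 }, q2 = { .flow_id = 2 };

	INIT_LIST_HEAD(&bucket);
	list_add_tail(&q1.h, &bucket);
	list_add_tail(&q2.h, &bucket);

	/* serve the bucket in FIFO order, as controller() does */
	while (!list_empty(&bucket)) {
		struct demo_queue *q =
		    list_first_entry(&bucket, struct demo_queue, h);
		list_del(&q->h);
		printf("flow %d\n", q->flow_id);
	}
	return 0;
}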
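The comment closing main.c describes how the generator matches the configured weights: each flowset contributes a number of slots proportional to (w/lmax) times its flow count to a cumulative range [base_y, next_y), and drawing a uniform number in [0, max_y) inverse-maps to a flowset, then round-robins over its flows. A standalone sketch of that idea under an assumed two-flowset configuration; the 2048 scaling mirrors x = (wi*2048)/li in parse_flowsets(), while demo_fs and its field names are illustrative.

#include <stdio.h>
#include <stdlib.h>

/* illustrative flowset descriptor; mirrors the fields parse_flowsets() fills */
struct demo_fs {
	int weight;		/* par[0] */
	int lmax;		/* par[1] */
	int n_flows;
	int first_flow;
	int base_y, next_y;	/* slice of the cumulative distribution */
	int cur;		/* round-robin pointer within the flowset */
};

int
main(void)
{
	struct demo_fs fs[2] = {
		{ .weight = 1, .lmax = 1000, .n_flows = 2 },
		{ .weight = 3, .lmax = 1000, .n_flows = 1 },
	};
	int nfs = 2, max_y = 0, first = 0;
	int hits[3] = { 0 };

	/* build the cumulative distribution: each flowset owns x * n_flows slots */
	for (int i = 0; i < nfs; i++) {
		int x = fs[i].weight * 2048 / fs[i].lmax;	/* cf. (wi*2048)/li */

		fs[i].first_flow = first;
		fs[i].cur = first;
		fs[i].base_y = max_y;
		fs[i].next_y = max_y + x * fs[i].n_flows;
		max_y = fs[i].next_y;
		first += fs[i].n_flows;
	}

	/* generate packets; the per-flow counts should come out roughly 1:1:3 */
	for (int n = 0; n < 100000; n++) {
		int y = random() % max_y;
		int i;

		/* inverse mapping: find the flowset whose slice contains y */
		for (i = 0; i < nfs - 1 && y >= fs[i].next_y; i++)
			;
		hits[fs[i].cur]++;
		if (++fs[i].cur >= fs[i].first_flow + fs[i].n_flows)
			fs[i].cur = fs[i].first_flow;	/* round-robin within the flowset */
	}
	for (int f = 0; f < first; f++)
		printf("flow %d: %d packets\n", f, hits[f]);
	return 0;
}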