summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorglebius <glebius@FreeBSD.org>2014-03-04 15:14:47 +0000
committerglebius <glebius@FreeBSD.org>2014-03-04 15:14:47 +0000
commited41469327bef074f42d569aee03907897699292 (patch)
tree7aa5af9b3e24c43d49fb01dcca5c75d54bab6174
parent1b9278cc989bd23a4e620f1b58d624819ca8e1c0 (diff)
downloadFreeBSD-src-ed41469327bef074f42d569aee03907897699292.zip
FreeBSD-src-ed41469327bef074f42d569aee03907897699292.tar.gz
Merge r261582, r261601, r261610, r261613, r261627, r261640, r261641, r261823,
r261825, r261859, r261875, r261883, r261911, r262027, r262028, r262029, r262030, r262162 from head. Large flowtable revamp. See commit messages for merged revisions for details. Sponsored by: Netflix
-rw-r--r--sys/conf/options1
-rw-r--r--sys/net/flowtable.c1971
-rw-r--r--sys/net/flowtable.h119
-rw-r--r--sys/net/route.c13
-rw-r--r--sys/netinet/ip_input.c29
-rw-r--r--sys/netinet/ip_output.c16
-rw-r--r--sys/netinet6/in6_proto.c14
-rw-r--r--sys/netinet6/ip6_input.c24
-rw-r--r--sys/netinet6/ip6_output.c15
-rw-r--r--usr.bin/netstat/Makefile3
-rw-r--r--usr.bin/netstat/flowtable.c84
-rw-r--r--usr.bin/netstat/main.c5
-rw-r--r--usr.bin/netstat/netstat.h1
13 files changed, 807 insertions, 1488 deletions
diff --git a/sys/conf/options b/sys/conf/options
index 642064d..8a288fe 100644
--- a/sys/conf/options
+++ b/sys/conf/options
@@ -438,6 +438,7 @@ TCP_SIGNATURE opt_inet.h
VLAN_ARRAY opt_vlan.h
XBONEHACK
FLOWTABLE opt_route.h
+FLOWTABLE_HASH_ALL opt_route.h
#
# SCTP
diff --git a/sys/net/flowtable.c b/sys/net/flowtable.c
index 32b953c..873ec36 100644
--- a/sys/net/flowtable.c
+++ b/sys/net/flowtable.c
@@ -1,31 +1,30 @@
-/**************************************************************************
-
-Copyright (c) 2008-2010, BitGravity Inc.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- 1. Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
-
- 2. Neither the name of the BitGravity Corporation nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-
-***************************************************************************/
+/*-
+ * Copyright (c) 2014 Gleb Smirnoff <glebius@FreeBSD.org>
+ * Copyright (c) 2008-2010, BitGravity Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Neither the name of the BitGravity Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
#include "opt_route.h"
#include "opt_mpath.h"
@@ -36,29 +35,32 @@ POSSIBILITY OF SUCH DAMAGE.
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
-#include <sys/param.h>
+#include <sys/param.h>
#include <sys/types.h>
#include <sys/bitstring.h>
#include <sys/condvar.h>
#include <sys/callout.h>
#include <sys/hash.h>
-#include <sys/kernel.h>
+#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
+#include <sys/pcpu.h>
#include <sys/proc.h>
+#include <sys/queue.h>
#include <sys/sbuf.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>
+#include <vm/uma.h>
#include <net/if.h>
#include <net/if_llatbl.h>
#include <net/if_var.h>
-#include <net/route.h>
+#include <net/route.h>
#include <net/flowtable.h>
#include <net/vnet.h>
@@ -70,156 +72,79 @@ __FBSDID("$FreeBSD$");
#ifdef INET6
#include <netinet/ip6.h>
#endif
+#ifdef FLOWTABLE_HASH_ALL
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <netinet/sctp.h>
+#endif
#include <ddb/ddb.h>
-struct ipv4_tuple {
- uint16_t ip_sport; /* source port */
- uint16_t ip_dport; /* destination port */
- in_addr_t ip_saddr; /* source address */
- in_addr_t ip_daddr; /* destination address */
-};
-
-union ipv4_flow {
- struct ipv4_tuple ipf_ipt;
- uint32_t ipf_key[3];
-};
+#ifdef FLOWTABLE_HASH_ALL
+#define KEY_PORTS (sizeof(uint16_t) * 2)
+#define KEY_ADDRS 2
+#else
+#define KEY_PORTS 0
+#define KEY_ADDRS 1
+#endif
-struct ipv6_tuple {
- uint16_t ip_sport; /* source port */
- uint16_t ip_dport; /* destination port */
- struct in6_addr ip_saddr; /* source address */
- struct in6_addr ip_daddr; /* destination address */
-};
+#ifdef INET6
+#define KEY_ADDR_LEN sizeof(struct in6_addr)
+#else
+#define KEY_ADDR_LEN sizeof(struct in_addr)
+#endif
-union ipv6_flow {
- struct ipv6_tuple ipf_ipt;
- uint32_t ipf_key[9];
-};
+#define KEYLEN ((KEY_ADDR_LEN * KEY_ADDRS + KEY_PORTS) / sizeof(uint32_t))
struct flentry {
- volatile uint32_t f_fhash; /* hash flowing forward */
- uint16_t f_flags; /* flow flags */
- uint8_t f_pad;
- uint8_t f_proto; /* protocol */
- uint32_t f_fibnum; /* fib index */
+ uint32_t f_hash; /* hash flowing forward */
+ uint32_t f_key[KEYLEN]; /* address(es and ports) */
uint32_t f_uptime; /* uptime at last access */
- struct flentry *f_next; /* pointer to collision entry */
- volatile struct rtentry *f_rt; /* rtentry for flow */
- volatile struct llentry *f_lle; /* llentry for flow */
-};
-
-struct flentry_v4 {
- struct flentry fl_entry;
- union ipv4_flow fl_flow;
-};
-
-struct flentry_v6 {
- struct flentry fl_entry;
- union ipv6_flow fl_flow;
-};
-
-#define fl_fhash fl_entry.fl_fhash
-#define fl_flags fl_entry.fl_flags
-#define fl_proto fl_entry.fl_proto
-#define fl_uptime fl_entry.fl_uptime
-#define fl_rt fl_entry.fl_rt
-#define fl_lle fl_entry.fl_lle
-
-#define SECS_PER_HOUR 3600
-#define SECS_PER_DAY (24*SECS_PER_HOUR)
-
-#define SYN_IDLE 300
-#define UDP_IDLE 300
-#define FIN_WAIT_IDLE 600
-#define TCP_IDLE SECS_PER_DAY
-
-
-typedef void fl_lock_t(struct flowtable *, uint32_t);
-typedef void fl_rtalloc_t(struct route *, uint32_t, u_int);
-
-union flentryp {
- struct flentry **global;
- struct flentry **pcpu[MAXCPU];
+ uint16_t f_fibnum; /* fib index */
+#ifdef FLOWTABLE_HASH_ALL
+ uint8_t f_proto; /* protocol */
+ uint8_t f_flags; /* stale? */
+#define FL_STALE 1
+#endif
+ SLIST_ENTRY(flentry) f_next; /* pointer to collision entry */
+ struct rtentry *f_rt; /* rtentry for flow */
+ struct llentry *f_lle; /* llentry for flow */
};
+#undef KEYLEN
-struct flowtable_stats {
- uint64_t ft_collisions;
- uint64_t ft_allocated;
- uint64_t ft_misses;
- uint64_t ft_max_depth;
- uint64_t ft_free_checks;
- uint64_t ft_frees;
- uint64_t ft_hits;
- uint64_t ft_lookups;
-} __aligned(CACHE_LINE_SIZE);
+SLIST_HEAD(flist, flentry);
+/* Make sure we can use pcpu_zone_ptr for struct flist. */
+CTASSERT(sizeof(struct flist) == sizeof(void *));
struct flowtable {
- struct flowtable_stats ft_stats[MAXCPU];
+ counter_u64_t *ft_stat;
int ft_size;
- int ft_lock_count;
- uint32_t ft_flags;
- char *ft_name;
- fl_lock_t *ft_lock;
- fl_lock_t *ft_unlock;
- fl_rtalloc_t *ft_rtalloc;
/*
- * XXX need to pad out
- */
- struct mtx *ft_locks;
- union flentryp ft_table;
- bitstr_t *ft_masks[MAXCPU];
+ * ft_table is a malloc(9)ed array of pointers. Pointers point to
+ * memory from UMA_ZONE_PCPU zone.
+ * ft_masks is per-cpu pointer itself. Each instance points
+ * to a malloc(9)ed bitset, that is private to corresponding CPU.
+ */
+ struct flist **ft_table;
+ bitstr_t **ft_masks;
bitstr_t *ft_tmpmask;
- struct flowtable *ft_next;
+};
- uint32_t ft_count __aligned(CACHE_LINE_SIZE);
- uint32_t ft_udp_idle __aligned(CACHE_LINE_SIZE);
- uint32_t ft_fin_wait_idle;
- uint32_t ft_syn_idle;
- uint32_t ft_tcp_idle;
- boolean_t ft_full;
-} __aligned(CACHE_LINE_SIZE);
+#define FLOWSTAT_ADD(ft, name, v) \
+ counter_u64_add((ft)->ft_stat[offsetof(struct flowtable_stat, name) / sizeof(uint64_t)], (v))
+#define FLOWSTAT_INC(ft, name) FLOWSTAT_ADD(ft, name, 1)
static struct proc *flowcleanerproc;
-static VNET_DEFINE(struct flowtable *, flow_list_head);
-static VNET_DEFINE(uint32_t, flow_hashjitter);
-static VNET_DEFINE(uma_zone_t, flow_ipv4_zone);
-static VNET_DEFINE(uma_zone_t, flow_ipv6_zone);
-
-#define V_flow_list_head VNET(flow_list_head)
-#define V_flow_hashjitter VNET(flow_hashjitter)
-#define V_flow_ipv4_zone VNET(flow_ipv4_zone)
-#define V_flow_ipv6_zone VNET(flow_ipv6_zone)
-
+static uint32_t flow_hashjitter;
static struct cv flowclean_f_cv;
static struct cv flowclean_c_cv;
static struct mtx flowclean_lock;
static uint32_t flowclean_cycles;
-static uint32_t flowclean_freq;
-
-#ifdef FLOWTABLE_DEBUG
-#define FLDPRINTF(ft, flags, fmt, ...) \
-do { \
- if ((ft)->ft_flags & (flags)) \
- printf((fmt), __VA_ARGS__); \
-} while (0); \
-
-#else
-#define FLDPRINTF(ft, flags, fmt, ...)
-
-#endif
-
/*
* TODO:
- * - Make flowtable stats per-cpu, aggregated at sysctl call time,
- * to avoid extra cache evictions caused by incrementing a shared
- * counter
- * - add sysctls to resize && flush flow tables
+ * - add sysctls to resize && flush flow tables
* - Add per flowtable sysctls for statistics and configuring timeouts
* - add saturation counter to rtentry to support per-packet load-balancing
* add flag to indicate round-robin flow, add list lookup from head
@@ -230,396 +155,117 @@ do { \
* - support explicit connection state (currently only ad-hoc for DSR)
* - idetach() cleanup for options VIMAGE builds.
*/
-VNET_DEFINE(int, flowtable_enable) = 1;
-static VNET_DEFINE(int, flowtable_debug);
-static VNET_DEFINE(int, flowtable_syn_expire) = SYN_IDLE;
-static VNET_DEFINE(int, flowtable_udp_expire) = UDP_IDLE;
-static VNET_DEFINE(int, flowtable_fin_wait_expire) = FIN_WAIT_IDLE;
-static VNET_DEFINE(int, flowtable_tcp_expire) = TCP_IDLE;
-static VNET_DEFINE(int, flowtable_nmbflows);
-static VNET_DEFINE(int, flowtable_ready) = 0;
-
-#define V_flowtable_enable VNET(flowtable_enable)
-#define V_flowtable_debug VNET(flowtable_debug)
-#define V_flowtable_syn_expire VNET(flowtable_syn_expire)
-#define V_flowtable_udp_expire VNET(flowtable_udp_expire)
-#define V_flowtable_fin_wait_expire VNET(flowtable_fin_wait_expire)
-#define V_flowtable_tcp_expire VNET(flowtable_tcp_expire)
-#define V_flowtable_nmbflows VNET(flowtable_nmbflows)
-#define V_flowtable_ready VNET(flowtable_ready)
-
-static SYSCTL_NODE(_net_inet, OID_AUTO, flowtable, CTLFLAG_RD, NULL,
- "flowtable");
-SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, debug, CTLFLAG_RW,
- &VNET_NAME(flowtable_debug), 0, "print debug info.");
-SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, enable, CTLFLAG_RW,
- &VNET_NAME(flowtable_enable), 0, "enable flowtable caching.");
-
-/*
- * XXX This does not end up updating timeouts at runtime
- * and only reflects the value for the last table added :-/
- */
-SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, syn_expire, CTLFLAG_RW,
- &VNET_NAME(flowtable_syn_expire), 0,
- "seconds after which to remove syn allocated flow.");
-SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, udp_expire, CTLFLAG_RW,
- &VNET_NAME(flowtable_udp_expire), 0,
- "seconds after which to remove flow allocated to UDP.");
-SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, fin_wait_expire, CTLFLAG_RW,
- &VNET_NAME(flowtable_fin_wait_expire), 0,
- "seconds after which to remove a flow in FIN_WAIT.");
-SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, tcp_expire, CTLFLAG_RW,
- &VNET_NAME(flowtable_tcp_expire), 0,
- "seconds after which to remove flow allocated to a TCP connection.");
-
-
-/*
- * Maximum number of flows that can be allocated of a given type.
- *
- * The table is allocated at boot time (for the pure caching case
- * there is no reason why this could not be changed at runtime)
- * and thus (currently) needs to be set with a tunable.
- */
-static int
-sysctl_nmbflows(SYSCTL_HANDLER_ARGS)
-{
- int error, newnmbflows;
-
- newnmbflows = V_flowtable_nmbflows;
- error = sysctl_handle_int(oidp, &newnmbflows, 0, req);
- if (error == 0 && req->newptr) {
- if (newnmbflows > V_flowtable_nmbflows) {
- V_flowtable_nmbflows = newnmbflows;
- uma_zone_set_max(V_flow_ipv4_zone,
- V_flowtable_nmbflows);
- uma_zone_set_max(V_flow_ipv6_zone,
- V_flowtable_nmbflows);
- } else
- error = EINVAL;
- }
- return (error);
-}
-SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, nmbflows,
- CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_nmbflows, "IU",
- "Maximum number of flows allowed");
-
-
-
-#define FS_PRINT(sb, field) sbuf_printf((sb), "\t%s: %jd\n", #field, fs->ft_##field)
-
-static void
-fs_print(struct sbuf *sb, struct flowtable_stats *fs)
-{
-
- FS_PRINT(sb, collisions);
- FS_PRINT(sb, allocated);
- FS_PRINT(sb, misses);
- FS_PRINT(sb, max_depth);
- FS_PRINT(sb, free_checks);
- FS_PRINT(sb, frees);
- FS_PRINT(sb, hits);
- FS_PRINT(sb, lookups);
-}
-
-static void
-flowtable_show_stats(struct sbuf *sb, struct flowtable *ft)
-{
- int i;
- struct flowtable_stats fs, *pfs;
-
- if (ft->ft_flags & FL_PCPU) {
- bzero(&fs, sizeof(fs));
- pfs = &fs;
- CPU_FOREACH(i) {
- pfs->ft_collisions += ft->ft_stats[i].ft_collisions;
- pfs->ft_allocated += ft->ft_stats[i].ft_allocated;
- pfs->ft_misses += ft->ft_stats[i].ft_misses;
- pfs->ft_free_checks += ft->ft_stats[i].ft_free_checks;
- pfs->ft_frees += ft->ft_stats[i].ft_frees;
- pfs->ft_hits += ft->ft_stats[i].ft_hits;
- pfs->ft_lookups += ft->ft_stats[i].ft_lookups;
- if (ft->ft_stats[i].ft_max_depth > pfs->ft_max_depth)
- pfs->ft_max_depth = ft->ft_stats[i].ft_max_depth;
- }
- } else {
- pfs = &ft->ft_stats[0];
- }
- fs_print(sb, pfs);
-}
-
-static int
-sysctl_flowtable_stats(SYSCTL_HANDLER_ARGS)
-{
- struct flowtable *ft;
- struct sbuf *sb;
- int error;
-
- sb = sbuf_new(NULL, NULL, 64*1024, SBUF_FIXEDLEN);
-
- ft = V_flow_list_head;
- while (ft != NULL) {
- sbuf_printf(sb, "\ntable name: %s\n", ft->ft_name);
- flowtable_show_stats(sb, ft);
- ft = ft->ft_next;
- }
- sbuf_finish(sb);
- error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
- sbuf_delete(sb);
-
- return (error);
-}
-SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, stats, CTLTYPE_STRING|CTLFLAG_RD,
- NULL, 0, sysctl_flowtable_stats, "A", "flowtable statistics");
-
-
-#ifndef RADIX_MPATH
-static void
-rtalloc_ign_wrapper(struct route *ro, uint32_t hash, u_int fibnum)
-{
-
- rtalloc_ign_fib(ro, 0, fibnum);
-}
+#ifdef INET
+static VNET_DEFINE(struct flowtable, ip4_ft);
+#define V_ip4_ft VNET(ip4_ft)
+#endif
+#ifdef INET6
+static VNET_DEFINE(struct flowtable, ip6_ft);
+#define V_ip6_ft VNET(ip6_ft)
#endif
-static void
-flowtable_global_lock(struct flowtable *table, uint32_t hash)
-{
- int lock_index = (hash)&(table->ft_lock_count - 1);
-
- mtx_lock(&table->ft_locks[lock_index]);
-}
-
-static void
-flowtable_global_unlock(struct flowtable *table, uint32_t hash)
-{
- int lock_index = (hash)&(table->ft_lock_count - 1);
-
- mtx_unlock(&table->ft_locks[lock_index]);
-}
-
-static void
-flowtable_pcpu_lock(struct flowtable *table, uint32_t hash)
-{
-
- critical_enter();
-}
-
-static void
-flowtable_pcpu_unlock(struct flowtable *table, uint32_t hash)
-{
-
- critical_exit();
-}
-
-#define FL_ENTRY_INDEX(table, hash)((hash) % (table)->ft_size)
-#define FL_ENTRY(table, hash) *flowtable_entry((table), (hash))
-#define FL_ENTRY_LOCK(table, hash) (table)->ft_lock((table), (hash))
-#define FL_ENTRY_UNLOCK(table, hash) (table)->ft_unlock((table), (hash))
-
-#define FL_STALE (1<<8)
-#define FL_OVERWRITE (1<<10)
-
-void
-flow_invalidate(struct flentry *fle)
-{
-
- fle->f_flags |= FL_STALE;
-}
-
-static __inline int
-proto_to_flags(uint8_t proto)
-{
- int flag;
+static uma_zone_t flow_zone;
- switch (proto) {
- case IPPROTO_TCP:
- flag = FL_TCP;
- break;
- case IPPROTO_SCTP:
- flag = FL_SCTP;
- break;
- case IPPROTO_UDP:
- flag = FL_UDP;
- break;
- default:
- flag = 0;
- break;
- }
+static VNET_DEFINE(int, flowtable_enable) = 1;
+#define V_flowtable_enable VNET(flowtable_enable)
- return (flag);
-}
+static SYSCTL_NODE(_net, OID_AUTO, flowtable, CTLFLAG_RD, NULL,
+ "flowtable");
+SYSCTL_VNET_INT(_net_flowtable, OID_AUTO, enable, CTLFLAG_RW,
+ &VNET_NAME(flowtable_enable), 0, "enable flowtable caching.");
+SYSCTL_UMA_MAX(_net_flowtable, OID_AUTO, maxflows, CTLFLAG_RW,
+ &flow_zone, "Maximum number of flows allowed");
-static __inline int
-flags_to_proto(int flags)
-{
- int proto, protoflags;
+static MALLOC_DEFINE(M_FTABLE, "flowtable", "flowtable hashes and bitstrings");
- protoflags = flags & (FL_TCP|FL_SCTP|FL_UDP);
- switch (protoflags) {
- case FL_TCP:
- proto = IPPROTO_TCP;
- break;
- case FL_SCTP:
- proto = IPPROTO_SCTP;
- break;
- case FL_UDP:
- proto = IPPROTO_UDP;
- break;
- default:
- proto = 0;
- break;
- }
- return (proto);
-}
+static struct flentry *
+flowtable_lookup_common(struct flowtable *, uint32_t *, int, uint32_t);
#ifdef INET
-#ifdef FLOWTABLE_DEBUG
-static void
-ipv4_flow_print_tuple(int flags, int proto, struct sockaddr_in *ssin,
- struct sockaddr_in *dsin)
-{
- char saddr[4*sizeof "123"], daddr[4*sizeof "123"];
-
- if (flags & FL_HASH_ALL) {
- inet_ntoa_r(ssin->sin_addr, saddr);
- inet_ntoa_r(dsin->sin_addr, daddr);
- printf("proto=%d %s:%d->%s:%d\n",
- proto, saddr, ntohs(ssin->sin_port), daddr,
- ntohs(dsin->sin_port));
- } else {
- inet_ntoa_r(*(struct in_addr *) &dsin->sin_addr, daddr);
- printf("proto=%d %s\n", proto, daddr);
- }
-
-}
-#endif
-
-static int
-ipv4_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
- struct sockaddr_in *ssin, struct sockaddr_in *dsin, uint16_t *flags)
+static struct flentry *
+flowtable_lookup_ipv4(struct mbuf *m, struct route *ro)
{
+ struct flentry *fle;
+ struct sockaddr_in *sin;
struct ip *ip;
- uint8_t proto;
+ uint32_t fibnum;
+#ifdef FLOWTABLE_HASH_ALL
+ uint32_t key[3];
int iphlen;
- struct tcphdr *th;
- struct udphdr *uh;
- struct sctphdr *sh;
uint16_t sport, dport;
+ uint8_t proto;
+#endif
- proto = sport = dport = 0;
ip = mtod(m, struct ip *);
- dsin->sin_family = AF_INET;
- dsin->sin_len = sizeof(*dsin);
- dsin->sin_addr = ip->ip_dst;
- ssin->sin_family = AF_INET;
- ssin->sin_len = sizeof(*ssin);
- ssin->sin_addr = ip->ip_src;
- proto = ip->ip_p;
- if ((*flags & FL_HASH_ALL) == 0) {
- FLDPRINTF(ft, FL_DEBUG_ALL, "skip port check flags=0x%x ",
- *flags);
- goto skipports;
- }
+ if (ip->ip_src.s_addr == ip->ip_dst.s_addr ||
+ (ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
+ (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
+ return (NULL);
- iphlen = ip->ip_hl << 2; /* XXX options? */
+ fibnum = M_GETFIB(m);
+
+#ifdef FLOWTABLE_HASH_ALL
+ iphlen = ip->ip_hl << 2;
+ proto = ip->ip_p;
switch (proto) {
- case IPPROTO_TCP:
- th = (struct tcphdr *)((caddr_t)ip + iphlen);
+ case IPPROTO_TCP: {
+ struct tcphdr *th;
+
+ th = (struct tcphdr *)((char *)ip + iphlen);
sport = th->th_sport;
dport = th->th_dport;
- if ((*flags & FL_HASH_ALL) &&
- (th->th_flags & (TH_RST|TH_FIN)))
- *flags |= FL_STALE;
- break;
- case IPPROTO_UDP:
- uh = (struct udphdr *)((caddr_t)ip + iphlen);
+ if (th->th_flags & (TH_RST|TH_FIN))
+ fibnum |= (FL_STALE << 24);
+ break;
+ }
+ case IPPROTO_UDP: {
+ struct udphdr *uh;
+
+ uh = (struct udphdr *)((char *)ip + iphlen);
sport = uh->uh_sport;
dport = uh->uh_dport;
- break;
- case IPPROTO_SCTP:
- sh = (struct sctphdr *)((caddr_t)ip + iphlen);
+ break;
+ }
+ case IPPROTO_SCTP: {
+ struct sctphdr *sh;
+
+ sh = (struct sctphdr *)((char *)ip + iphlen);
sport = sh->src_port;
dport = sh->dest_port;
- break;
+ /* XXXGL: handle stale? */
+ break;
+ }
default:
- FLDPRINTF(ft, FL_DEBUG_ALL, "proto=0x%x not supported\n", proto);
- return (ENOTSUP);
- /* no port - hence not a protocol we care about */
+ sport = dport = 0;
break;
-
}
-skipports:
- *flags |= proto_to_flags(proto);
- ssin->sin_port = sport;
- dsin->sin_port = dport;
- return (0);
-}
+ key[0] = ip->ip_dst.s_addr;
+ key[1] = ip->ip_src.s_addr;
+ key[2] = ((uint32_t)dport << 16) | sport;
+ fibnum |= proto << 16;
-static uint32_t
-ipv4_flow_lookup_hash_internal(
- struct sockaddr_in *ssin, struct sockaddr_in *dsin,
- uint32_t *key, uint16_t flags)
-{
- uint16_t sport, dport;
- uint8_t proto;
- int offset = 0;
-
- if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
- return (0);
- proto = flags_to_proto(flags);
- sport = dport = key[2] = key[1] = key[0] = 0;
- if ((ssin != NULL) && (flags & FL_HASH_ALL)) {
- key[1] = ssin->sin_addr.s_addr;
- sport = ssin->sin_port;
- }
- if (dsin != NULL) {
- key[2] = dsin->sin_addr.s_addr;
- dport = dsin->sin_port;
- }
- if (flags & FL_HASH_ALL) {
- ((uint16_t *)key)[0] = sport;
- ((uint16_t *)key)[1] = dport;
- } else
- offset = V_flow_hashjitter + proto;
+ fle = flowtable_lookup_common(&V_ip4_ft, key, 3 * sizeof(uint32_t),
+ fibnum);
- return (jenkins_hash32(key, 3, offset));
-}
+#else /* !FLOWTABLE_HASH_ALL */
-static struct flentry *
-flowtable_lookup_mbuf4(struct flowtable *ft, struct mbuf *m)
-{
- struct sockaddr_storage ssa, dsa;
- uint16_t flags;
- struct sockaddr_in *dsin, *ssin;
-
- dsin = (struct sockaddr_in *)&dsa;
- ssin = (struct sockaddr_in *)&ssa;
- bzero(dsin, sizeof(*dsin));
- bzero(ssin, sizeof(*ssin));
- flags = ft->ft_flags;
- if (ipv4_mbuf_demarshal(ft, m, ssin, dsin, &flags) != 0)
- return (NULL);
+ fle = flowtable_lookup_common(&V_ip4_ft, (uint32_t *)&ip->ip_dst,
+ sizeof(struct in_addr), fibnum);
- return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags));
-}
+#endif /* FLOWTABLE_HASH_ALL */
-void
-flow_to_route(struct flentry *fle, struct route *ro)
-{
- uint32_t *hashkey = NULL;
- struct sockaddr_in *sin;
+ if (fle == NULL)
+ return (NULL);
sin = (struct sockaddr_in *)&ro->ro_dst;
sin->sin_family = AF_INET;
sin->sin_len = sizeof(*sin);
- hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
- sin->sin_addr.s_addr = hashkey[2];
- ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
- ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle);
- ro->ro_flags |= RT_NORTREF;
+ sin->sin_addr = ip->ip_dst;
+
+ return (fle);
}
#endif /* INET */
@@ -633,9 +279,8 @@ flow_to_route(struct flentry *fle, struct route *ro)
#define PULLUP_TO(_len, p, T) \
do { \
int x = (_len) + sizeof(T); \
- if ((m)->m_len < x) { \
- goto receive_failed; \
- } \
+ if ((m)->m_len < x) \
+ return (NULL); \
p = (mtod(m, char *) + (_len)); \
} while (0)
@@ -643,26 +288,35 @@ do { \
#define SCTP(p) ((struct sctphdr *)(p))
#define UDP(p) ((struct udphdr *)(p))
-static int
-ipv6_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
- struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6, uint16_t *flags)
+static struct flentry *
+flowtable_lookup_ipv6(struct mbuf *m, struct route *ro)
{
+ struct flentry *fle;
+ struct sockaddr_in6 *sin6;
struct ip6_hdr *ip6;
- uint8_t proto;
+ uint32_t fibnum;
+#ifdef FLOWTABLE_HASH_ALL
+ uint32_t key[9];
+ void *ulp;
int hlen;
- uint16_t src_port, dst_port;
+ uint16_t sport, dport;
u_short offset;
- void *ulp;
+ uint8_t proto;
+#else
+ uint32_t key[4];
+#endif
- offset = hlen = src_port = dst_port = 0;
- ulp = NULL;
ip6 = mtod(m, struct ip6_hdr *);
- hlen = sizeof(struct ip6_hdr);
- proto = ip6->ip6_nxt;
+ if (in6_localaddr(&ip6->ip6_dst))
+ return (NULL);
- if ((*flags & FL_HASH_ALL) == 0)
- goto skipports;
+ fibnum = M_GETFIB(m);
+#ifdef FLOWTABLE_HASH_ALL
+ hlen = sizeof(struct ip6_hdr);
+ proto = ip6->ip6_nxt;
+ offset = sport = dport = 0;
+ ulp = NULL;
while (ulp == NULL) {
switch (proto) {
case IPPROTO_ICMPV6:
@@ -675,21 +329,21 @@ ipv6_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
break;
case IPPROTO_TCP:
PULLUP_TO(hlen, ulp, struct tcphdr);
- dst_port = TCP(ulp)->th_dport;
- src_port = TCP(ulp)->th_sport;
- if ((*flags & FL_HASH_ALL) &&
- (TCP(ulp)->th_flags & (TH_RST|TH_FIN)))
- *flags |= FL_STALE;
+ dport = TCP(ulp)->th_dport;
+ sport = TCP(ulp)->th_sport;
+ if (TCP(ulp)->th_flags & (TH_RST|TH_FIN))
+ fibnum |= (FL_STALE << 24);
break;
case IPPROTO_SCTP:
PULLUP_TO(hlen, ulp, struct sctphdr);
- src_port = SCTP(ulp)->src_port;
- dst_port = SCTP(ulp)->dest_port;
+ sport = SCTP(ulp)->src_port;
+ dport = SCTP(ulp)->dest_port;
+ /* XXXGL: handle stale? */
break;
case IPPROTO_UDP:
PULLUP_TO(hlen, ulp, struct udphdr);
- dst_port = UDP(ulp)->uh_dport;
- src_port = UDP(ulp)->uh_sport;
+ dport = UDP(ulp)->uh_dport;
+ sport = UDP(ulp)->uh_sport;
break;
case IPPROTO_HOPOPTS: /* RFC 2460 */
PULLUP_TO(hlen, ulp, struct ip6_hbh);
@@ -698,7 +352,7 @@ ipv6_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
ulp = NULL;
break;
case IPPROTO_ROUTING: /* RFC 2460 */
- PULLUP_TO(hlen, ulp, struct ip6_rthdr);
+ PULLUP_TO(hlen, ulp, struct ip6_rthdr);
hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3;
proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt;
ulp = NULL;
@@ -729,689 +383,395 @@ ipv6_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
}
}
- if (src_port == 0) {
- receive_failed:
- return (ENOTSUP);
- }
-
-skipports:
- dsin6->sin6_family = AF_INET6;
- dsin6->sin6_len = sizeof(*dsin6);
- dsin6->sin6_port = dst_port;
- memcpy(&dsin6->sin6_addr, &ip6->ip6_dst, sizeof(struct in6_addr));
+ bcopy(&ip6->ip6_dst, &key[0], sizeof(struct in6_addr));
+ bcopy(&ip6->ip6_src, &key[4], sizeof(struct in6_addr));
+ key[8] = ((uint32_t)dport << 16) | sport;
+ fibnum |= proto << 16;
- ssin6->sin6_family = AF_INET6;
- ssin6->sin6_len = sizeof(*ssin6);
- ssin6->sin6_port = src_port;
- memcpy(&ssin6->sin6_addr, &ip6->ip6_src, sizeof(struct in6_addr));
- *flags |= proto_to_flags(proto);
+ fle = flowtable_lookup_common(&V_ip6_ft, key, 9 * sizeof(uint32_t),
+ fibnum);
+#else /* !FLOWTABLE_HASH_ALL */
+ bcopy(&ip6->ip6_dst, &key[0], sizeof(struct in6_addr));
+ fle = flowtable_lookup_common(&V_ip6_ft, key, sizeof(struct in6_addr),
+ fibnum);
+#endif /* FLOWTABLE_HASH_ALL */
- return (0);
-}
-
-#define zero_key(key) \
-do { \
- key[0] = 0; \
- key[1] = 0; \
- key[2] = 0; \
- key[3] = 0; \
- key[4] = 0; \
- key[5] = 0; \
- key[6] = 0; \
- key[7] = 0; \
- key[8] = 0; \
-} while (0)
-
-static uint32_t
-ipv6_flow_lookup_hash_internal(
- struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6,
- uint32_t *key, uint16_t flags)
-{
- uint16_t sport, dport;
- uint8_t proto;
- int offset = 0;
-
- if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
- return (0);
-
- proto = flags_to_proto(flags);
- zero_key(key);
- sport = dport = 0;
- if (dsin6 != NULL) {
- memcpy(&key[1], &dsin6->sin6_addr, sizeof(struct in6_addr));
- dport = dsin6->sin6_port;
- }
- if ((ssin6 != NULL) && (flags & FL_HASH_ALL)) {
- memcpy(&key[5], &ssin6->sin6_addr, sizeof(struct in6_addr));
- sport = ssin6->sin6_port;
- }
- if (flags & FL_HASH_ALL) {
- ((uint16_t *)key)[0] = sport;
- ((uint16_t *)key)[1] = dport;
- } else
- offset = V_flow_hashjitter + proto;
-
- return (jenkins_hash32(key, 9, offset));
-}
-
-static struct flentry *
-flowtable_lookup_mbuf6(struct flowtable *ft, struct mbuf *m)
-{
- struct sockaddr_storage ssa, dsa;
- struct sockaddr_in6 *dsin6, *ssin6;
- uint16_t flags;
-
- dsin6 = (struct sockaddr_in6 *)&dsa;
- ssin6 = (struct sockaddr_in6 *)&ssa;
- bzero(dsin6, sizeof(*dsin6));
- bzero(ssin6, sizeof(*ssin6));
- flags = ft->ft_flags;
-
- if (ipv6_mbuf_demarshal(ft, m, ssin6, dsin6, &flags) != 0)
+ if (fle == NULL)
return (NULL);
- return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags));
-}
-
-void
-flow_to_route_in6(struct flentry *fle, struct route_in6 *ro)
-{
- uint32_t *hashkey = NULL;
- struct sockaddr_in6 *sin6;
-
sin6 = (struct sockaddr_in6 *)&ro->ro_dst;
-
sin6->sin6_family = AF_INET6;
sin6->sin6_len = sizeof(*sin6);
- hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
- memcpy(&sin6->sin6_addr, &hashkey[5], sizeof (struct in6_addr));
- ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
- ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle);
- ro->ro_flags |= RT_NORTREF;
+ bcopy(&ip6->ip6_dst, &sin6->sin6_addr, sizeof(struct in6_addr));
+
+ return (fle);
}
#endif /* INET6 */
static bitstr_t *
flowtable_mask(struct flowtable *ft)
{
- bitstr_t *mask;
- if (ft->ft_flags & FL_PCPU)
- mask = ft->ft_masks[curcpu];
- else
- mask = ft->ft_masks[0];
+ /*
+ * flowtable_free_stale() calls w/o critical section, but
+ * with sched_bind(). Since pointer is stable throughout
+ * ft lifetime, it is safe, otherwise...
+ *
+ * CRITICAL_ASSERT(curthread);
+ */
- return (mask);
+ return (*(bitstr_t **)zpcpu_get(ft->ft_masks));
}
-static struct flentry **
-flowtable_entry(struct flowtable *ft, uint32_t hash)
+static struct flist *
+flowtable_list(struct flowtable *ft, uint32_t hash)
{
- struct flentry **fle;
- int index = (hash % ft->ft_size);
- if (ft->ft_flags & FL_PCPU) {
- KASSERT(&ft->ft_table.pcpu[curcpu][0] != NULL, ("pcpu not set"));
- fle = &ft->ft_table.pcpu[curcpu][index];
- } else {
- KASSERT(&ft->ft_table.global[0] != NULL, ("global not set"));
- fle = &ft->ft_table.global[index];
- }
-
- return (fle);
+ CRITICAL_ASSERT(curthread);
+ return (zpcpu_get(ft->ft_table[hash % ft->ft_size]));
}
static int
-flow_stale(struct flowtable *ft, struct flentry *fle)
+flow_stale(struct flowtable *ft, struct flentry *fle, int maxidle)
{
- time_t idle_time;
-
- if ((fle->f_fhash == 0)
- || ((fle->f_rt->rt_flags & RTF_HOST) &&
- ((fle->f_rt->rt_flags & (RTF_UP))
- != (RTF_UP)))
- || (fle->f_rt->rt_ifp == NULL)
- || !RT_LINK_IS_UP(fle->f_rt->rt_ifp))
- return (1);
- idle_time = time_uptime - fle->f_uptime;
-
- if ((fle->f_flags & FL_STALE) ||
- ((fle->f_flags & (TH_SYN|TH_ACK|TH_FIN)) == 0
- && (idle_time > ft->ft_udp_idle)) ||
- ((fle->f_flags & TH_FIN)
- && (idle_time > ft->ft_fin_wait_idle)) ||
- ((fle->f_flags & (TH_SYN|TH_ACK)) == TH_SYN
- && (idle_time > ft->ft_syn_idle)) ||
- ((fle->f_flags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)
- && (idle_time > ft->ft_tcp_idle)) ||
- ((fle->f_rt->rt_flags & RTF_UP) == 0 ||
- (fle->f_rt->rt_ifp == NULL)))
+ if (((fle->f_rt->rt_flags & RTF_HOST) &&
+ ((fle->f_rt->rt_flags & (RTF_UP)) != (RTF_UP))) ||
+ (fle->f_rt->rt_ifp == NULL) ||
+ !RT_LINK_IS_UP(fle->f_rt->rt_ifp) ||
+ (fle->f_lle->la_flags & LLE_VALID) == 0)
return (1);
- return (0);
-}
+ if (time_uptime - fle->f_uptime > maxidle)
+ return (1);
-static void
-flowtable_set_hashkey(struct flentry *fle, uint32_t *key)
-{
- uint32_t *hashkey;
- int i, nwords;
+#ifdef FLOWTABLE_HASH_ALL
+ if (fle->f_flags & FL_STALE)
+ return (1);
+#endif
- if (fle->f_flags & FL_IPV6) {
- nwords = 9;
- hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
- } else {
- nwords = 3;
- hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
- }
-
- for (i = 0; i < nwords; i++)
- hashkey[i] = key[i];
+ return (0);
}
-static struct flentry *
-flow_alloc(struct flowtable *ft)
+static int
+flow_full(void)
{
- struct flentry *newfle;
- uma_zone_t zone;
+ int count, max;
- newfle = NULL;
- zone = (ft->ft_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
+ count = uma_zone_get_cur(flow_zone);
+ max = uma_zone_get_max(flow_zone);
- newfle = uma_zalloc(zone, M_NOWAIT | M_ZERO);
- if (newfle != NULL)
- atomic_add_int(&ft->ft_count, 1);
- return (newfle);
+ return (count > (max - (max >> 3)));
}
-static void
-flow_free(struct flentry *fle, struct flowtable *ft)
+/*
+ * Decide whether a cached flow entry corresponds to the given lookup key.
+ *
+ * fle    - candidate entry taken from a hash bucket
+ * key    - lookup key, keylen bytes long
+ * fibnum - FIB number; with FLOWTABLE_HASH_ALL bits 16..23 additionally
+ *          carry the protocol number
+ *
+ * Returns 1 when the key, FIB (and protocol) are equal and the cached
+ * route and link-layer entry are still usable, 0 otherwise.
+ * Must be called inside a critical section.
+ */
+static int
+flow_matches(struct flentry *fle, uint32_t *key, int keylen, uint32_t fibnum)
{
- uma_zone_t zone;
+#ifdef FLOWTABLE_HASH_ALL
+ uint8_t proto;
- zone = (ft->ft_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
- atomic_add_int(&ft->ft_count, -1);
- uma_zfree(zone, fle);
-}
+ proto = (fibnum >> 16) & 0xff;
+ fibnum &= 0xffff;
+#endif
-static int
-flow_full(struct flowtable *ft)
-{
- boolean_t full;
- uint32_t count;
-
- full = ft->ft_full;
- count = ft->ft_count;
-
- if (full && (count < (V_flowtable_nmbflows - (V_flowtable_nmbflows >> 3))))
- ft->ft_full = FALSE;
- else if (!full && (count > (V_flowtable_nmbflows - (V_flowtable_nmbflows >> 5))))
- ft->ft_full = TRUE;
-
- if (full && !ft->ft_full) {
- flowclean_freq = 4*hz;
- if ((ft->ft_flags & FL_HASH_ALL) == 0)
- ft->ft_udp_idle = ft->ft_fin_wait_idle =
- ft->ft_syn_idle = ft->ft_tcp_idle = 5;
- cv_broadcast(&flowclean_c_cv);
- } else if (!full && ft->ft_full) {
- flowclean_freq = 20*hz;
- if ((ft->ft_flags & FL_HASH_ALL) == 0)
- ft->ft_udp_idle = ft->ft_fin_wait_idle =
- ft->ft_syn_idle = ft->ft_tcp_idle = 30;
- }
+ CRITICAL_ASSERT(curthread);
- return (ft->ft_full);
+ /*
+ * Microoptimization for IPv4: a single-word key that compares
+ * equal lets us skip bcmp(). The comparison must be for
+ * equality, otherwise every entry in the bucket would match.
+ */
+ if (((keylen == sizeof(uint32_t) && (fle->f_key[0] == key[0])) ||
+ (bcmp(fle->f_key, key, keylen) == 0)) &&
+ fibnum == fle->f_fibnum &&
+#ifdef FLOWTABLE_HASH_ALL
+ proto == fle->f_proto &&
+#endif
+ (fle->f_rt->rt_flags & RTF_UP) &&
+ fle->f_rt->rt_ifp != NULL &&
+ (fle->f_lle->la_flags & LLE_VALID))
+ return (1);
+
+ return (0);
}
-static int
+static struct flentry *
flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key,
- uint32_t fibnum, struct route *ro, uint16_t flags)
+ int keylen, uint32_t fibnum0)
{
- struct flentry *fle, *fletail, *newfle, **flep;
- struct flowtable_stats *fs = &ft->ft_stats[curcpu];
- int depth;
+#ifdef INET6
+ struct route_in6 sro6;
+#endif
+#ifdef INET
+ struct route sro;
+#endif
+ struct route *ro = NULL;
+ struct rtentry *rt;
+ struct lltable *lt = NULL;
+ struct llentry *lle;
+ struct sockaddr_storage *l3addr;
+ struct ifnet *ifp;
+ struct flist *flist;
+ struct flentry *fle, *iter;
bitstr_t *mask;
+ uint16_t fibnum = fibnum0;
+#ifdef FLOWTABLE_HASH_ALL
uint8_t proto;
- newfle = flow_alloc(ft);
- if (newfle == NULL)
- return (ENOMEM);
-
- newfle->f_flags |= (flags & FL_IPV6);
- proto = flags_to_proto(flags);
-
- FL_ENTRY_LOCK(ft, hash);
- mask = flowtable_mask(ft);
- flep = flowtable_entry(ft, hash);
- fletail = fle = *flep;
+ proto = (fibnum0 >> 16) & 0xff;
+ fibnum = fibnum0 & 0xffff;
+#endif
- if (fle == NULL) {
- bit_set(mask, FL_ENTRY_INDEX(ft, hash));
- *flep = fle = newfle;
- goto skip;
- }
-
- depth = 0;
- fs->ft_collisions++;
/*
- * find end of list and make sure that we were not
- * preempted by another thread handling this flow
+ * This bit of code ends up locking the
+ * same route 3 times (just like ip_output + ether_output)
+ * - at lookup
+ * - in rt_check when called by arpresolve
+ * - dropping the refcount for the rtentry
+ *
+ * This could be consolidated to one if we wrote a variant
+ * of arpresolve with an rt_check variant that expected to
+ * receive the route locked
*/
- while (fle != NULL) {
- if (fle->f_fhash == hash && !flow_stale(ft, fle)) {
- /*
- * there was either a hash collision
- * or we lost a race to insert
- */
- FL_ENTRY_UNLOCK(ft, hash);
- flow_free(newfle, ft);
-
- if (flags & FL_OVERWRITE)
- goto skip;
- return (EEXIST);
- }
- /*
- * re-visit this double condition XXX
- */
- if (fletail->f_next != NULL)
- fletail = fle->f_next;
-
- depth++;
- fle = fle->f_next;
- }
-
- if (depth > fs->ft_max_depth)
- fs->ft_max_depth = depth;
- fletail->f_next = newfle;
- fle = newfle;
-skip:
- flowtable_set_hashkey(fle, key);
-
- fle->f_proto = proto;
- fle->f_rt = ro->ro_rt;
- fle->f_lle = ro->ro_lle;
- fle->f_fhash = hash;
- fle->f_fibnum = fibnum;
- fle->f_uptime = time_uptime;
- FL_ENTRY_UNLOCK(ft, hash);
- return (0);
-}
-
-int
-kern_flowtable_insert(struct flowtable *ft,
- struct sockaddr_storage *ssa, struct sockaddr_storage *dsa,
- struct route *ro, uint32_t fibnum, int flags)
-{
- uint32_t key[9], hash;
+#ifdef INET
+ if (ft == &V_ip4_ft) {
+ struct sockaddr_in *sin;
- flags = (ft->ft_flags | flags | FL_OVERWRITE);
- hash = 0;
+ ro = &sro;
+ bzero(&sro.ro_dst, sizeof(sro.ro_dst));
-#ifdef INET
- if (ssa->ss_family == AF_INET)
- hash = ipv4_flow_lookup_hash_internal((struct sockaddr_in *)ssa,
- (struct sockaddr_in *)dsa, key, flags);
+ sin = (struct sockaddr_in *)&sro.ro_dst;
+ sin->sin_family = AF_INET;
+ sin->sin_len = sizeof(*sin);
+ sin->sin_addr.s_addr = key[0];
+ }
#endif
#ifdef INET6
- if (ssa->ss_family == AF_INET6)
- hash = ipv6_flow_lookup_hash_internal((struct sockaddr_in6 *)ssa,
- (struct sockaddr_in6 *)dsa, key, flags);
-#endif
- if (ro->ro_rt == NULL || ro->ro_lle == NULL)
- return (EINVAL);
-
- FLDPRINTF(ft, FL_DEBUG,
- "kern_flowtable_insert: key=%x:%x:%x hash=%x fibnum=%d flags=%x\n",
- key[0], key[1], key[2], hash, fibnum, flags);
- return (flowtable_insert(ft, hash, key, fibnum, ro, flags));
-}
+ if (ft == &V_ip6_ft) {
+ struct sockaddr_in6 *sin6;
-static int
-flowtable_key_equal(struct flentry *fle, uint32_t *key)
-{
- uint32_t *hashkey;
- int i, nwords;
+ ro = (struct route *)&sro6;
+ sin6 = &sro6.ro_dst;
- if (fle->f_flags & FL_IPV6) {
- nwords = 9;
- hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
- } else {
- nwords = 3;
- hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
+ bzero(sin6, sizeof(*sin6));
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_len = sizeof(*sin6);
+ bcopy(key, &sin6->sin6_addr, sizeof(struct in6_addr));
}
+#endif
- for (i = 0; i < nwords; i++)
- if (hashkey[i] != key[i])
- return (0);
+ ro->ro_rt = NULL;
+#ifdef RADIX_MPATH
+ rtalloc_mpath_fib(ro, hash, fibnum);
+#else
+ rtalloc_ign_fib(ro, 0, fibnum);
+#endif
+ if (ro->ro_rt == NULL)
+ return (NULL);
- return (1);
-}
+ rt = ro->ro_rt;
+ ifp = rt->rt_ifp;
-struct flentry *
-flowtable_lookup_mbuf(struct flowtable *ft, struct mbuf *m, int af)
-{
- struct flentry *fle = NULL;
+ if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) {
+ RTFREE(rt);
+ return (NULL);
+ }
#ifdef INET
- if (af == AF_INET)
- fle = flowtable_lookup_mbuf4(ft, m);
+ if (ft == &V_ip4_ft)
+ lt = LLTABLE(ifp);
#endif
#ifdef INET6
- if (af == AF_INET6)
- fle = flowtable_lookup_mbuf6(ft, m);
-#endif
- if (fle != NULL && m != NULL && (m->m_flags & M_FLOWID) == 0) {
- m->m_flags |= M_FLOWID;
- m->m_pkthdr.flowid = fle->f_fhash;
- }
- return (fle);
-}
-
-struct flentry *
-flowtable_lookup(struct flowtable *ft, struct sockaddr_storage *ssa,
- struct sockaddr_storage *dsa, uint32_t fibnum, int flags)
-{
- uint32_t key[9], hash;
- struct flentry *fle;
- struct flowtable_stats *fs = &ft->ft_stats[curcpu];
- uint8_t proto = 0;
- int error = 0;
- struct rtentry *rt;
- struct llentry *lle;
- struct route sro, *ro;
- struct route_in6 sro6;
+ if (ft == &V_ip6_ft)
+ lt = LLTABLE6(ifp);
+#endif
- sro.ro_rt = sro6.ro_rt = NULL;
- sro.ro_lle = sro6.ro_lle = NULL;
- ro = NULL;
- hash = 0;
- flags |= ft->ft_flags;
- proto = flags_to_proto(flags);
-#ifdef INET
- if (ssa->ss_family == AF_INET) {
- struct sockaddr_in *ssin, *dsin;
+ if (rt->rt_flags & RTF_GATEWAY)
+ l3addr = (struct sockaddr_storage *)rt->rt_gateway;
+ else
+ l3addr = (struct sockaddr_storage *)&ro->ro_dst;
+ lle = llentry_alloc(ifp, lt, l3addr);
- ro = &sro;
- memcpy(&ro->ro_dst, dsa, sizeof(struct sockaddr_in));
- /*
- * The harvested source and destination addresses
- * may contain port information if the packet is
- * from a transport protocol (e.g. TCP/UDP). The
- * port field must be cleared before performing
- * a route lookup.
- */
- ((struct sockaddr_in *)&ro->ro_dst)->sin_port = 0;
- dsin = (struct sockaddr_in *)dsa;
- ssin = (struct sockaddr_in *)ssa;
- if ((dsin->sin_addr.s_addr == ssin->sin_addr.s_addr) ||
- (ntohl(dsin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
- (ntohl(ssin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
- return (NULL);
-
- hash = ipv4_flow_lookup_hash_internal(ssin, dsin, key, flags);
+ if (lle == NULL) {
+ RTFREE(rt);
+ return (NULL);
}
-#endif
-#ifdef INET6
- if (ssa->ss_family == AF_INET6) {
- struct sockaddr_in6 *ssin6, *dsin6;
- ro = (struct route *)&sro6;
- memcpy(&sro6.ro_dst, dsa,
- sizeof(struct sockaddr_in6));
- ((struct sockaddr_in6 *)&ro->ro_dst)->sin6_port = 0;
- dsin6 = (struct sockaddr_in6 *)dsa;
- ssin6 = (struct sockaddr_in6 *)ssa;
-
- flags |= FL_IPV6;
- hash = ipv6_flow_lookup_hash_internal(ssin6, dsin6, key, flags);
- }
-#endif
- /*
- * Ports are zero and this isn't a transmit cache
- * - thus not a protocol for which we need to keep
- * state
- * FL_HASH_ALL => key[0] != 0 for TCP || UDP || SCTP
- */
- if (hash == 0 || (key[0] == 0 && (ft->ft_flags & FL_HASH_ALL)))
+ /* Don't insert the entry if the ARP hasn't yet finished resolving. */
+ if ((lle->la_flags & LLE_VALID) == 0) {
+ RTFREE(rt);
+ LLE_FREE(lle);
+ FLOWSTAT_INC(ft, ft_fail_lle_invalid);
return (NULL);
+ }
- fs->ft_lookups++;
- FL_ENTRY_LOCK(ft, hash);
- if ((fle = FL_ENTRY(ft, hash)) == NULL) {
- FL_ENTRY_UNLOCK(ft, hash);
- goto uncached;
+ fle = uma_zalloc(flow_zone, M_NOWAIT | M_ZERO);
+ if (fle == NULL) {
+ RTFREE(rt);
+ LLE_FREE(lle);
+ return (NULL);
}
-keycheck:
- rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
- lle = __DEVOLATILE(struct llentry *, fle->f_lle);
- if ((rt != NULL)
- && lle != NULL
- && fle->f_fhash == hash
- && flowtable_key_equal(fle, key)
- && (proto == fle->f_proto)
- && (fibnum == fle->f_fibnum)
- && (rt->rt_flags & RTF_UP)
- && (rt->rt_ifp != NULL)
- && (lle->la_flags & LLE_VALID)) {
- fs->ft_hits++;
- fle->f_uptime = time_uptime;
- fle->f_flags |= flags;
- FL_ENTRY_UNLOCK(ft, hash);
- return (fle);
- } else if (fle->f_next != NULL) {
- fle = fle->f_next;
- goto keycheck;
+
+ fle->f_hash = hash;
+ bcopy(key, &fle->f_key, keylen);
+ fle->f_rt = rt;
+ fle->f_lle = lle;
+ fle->f_fibnum = fibnum;
+ fle->f_uptime = time_uptime;
+#ifdef FLOWTABLE_HASH_ALL
+ fle->f_proto = proto;
+ fle->f_flags = fibnum0 >> 24;
+#endif
+
+ critical_enter();
+ mask = flowtable_mask(ft);
+ flist = flowtable_list(ft, hash);
+
+ if (SLIST_EMPTY(flist)) {
+ bit_set(mask, (hash % ft->ft_size));
+ SLIST_INSERT_HEAD(flist, fle, f_next);
+ goto skip;
}
- FL_ENTRY_UNLOCK(ft, hash);
-uncached:
- if (flags & FL_NOAUTO || flow_full(ft))
- return (NULL);
- fs->ft_misses++;
/*
- * This bit of code ends up locking the
- * same route 3 times (just like ip_output + ether_output)
- * - at lookup
- * - in rt_check when called by arpresolve
- * - dropping the refcount for the rtentry
- *
- * This could be consolidated to one if we wrote a variant
- * of arpresolve with an rt_check variant that expected to
- * receive the route locked
+ * find end of list and make sure that we were not
+ * preempted by another thread handling this flow
*/
-
-#ifdef INVARIANTS
- if ((ro->ro_dst.sa_family != AF_INET) &&
- (ro->ro_dst.sa_family != AF_INET6))
- panic("sa_family == %d\n", ro->ro_dst.sa_family);
+ SLIST_FOREACH(iter, flist, f_next) {
+ KASSERT(iter->f_hash % ft->ft_size == hash % ft->ft_size,
+ ("%s: wrong hash", __func__));
+ /*
+ * Pass the original 32-bit fibnum0: flow_matches() decodes
+ * the protocol from its upper bits, which the 16-bit local
+ * fibnum no longer carries.
+ */
+ if (flow_matches(iter, key, keylen, fibnum0)) {
+ /*
+ * We probably migrated to another CPU after the
+ * lookup in flowtable_lookup_common() failed, and
+ * this CPU already has an entry for the flow.
+ */
+ iter->f_uptime = time_uptime;
+#ifdef FLOWTABLE_HASH_ALL
+ iter->f_flags |= fibnum0 >> 24;
#endif
-
- ft->ft_rtalloc(ro, hash, fibnum);
- if (ro->ro_rt == NULL)
- error = ENETUNREACH;
- else {
- struct llentry *lle = NULL;
- struct sockaddr_storage *l3addr;
- struct rtentry *rt = ro->ro_rt;
- struct ifnet *ifp = rt->rt_ifp;
-
- if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) {
- RTFREE(rt);
- ro->ro_rt = NULL;
- return (NULL);
+ critical_exit();
+ FLOWSTAT_INC(ft, ft_collisions);
+ /*
+ * The new entry is discarded, so drop the route and
+ * link-layer references it was holding.
+ */
+ RTFREE(rt);
+ LLE_FREE(lle);
+ uma_zfree(flow_zone, fle);
+ return (iter);
}
-#ifdef INET6
- if (ssa->ss_family == AF_INET6) {
- struct sockaddr_in6 *dsin6;
-
- dsin6 = (struct sockaddr_in6 *)dsa;
- if (in6_localaddr(&dsin6->sin6_addr)) {
- RTFREE(rt);
- ro->ro_rt = NULL;
- return (NULL);
- }
+ }
- if (rt->rt_flags & RTF_GATEWAY)
- l3addr = (struct sockaddr_storage *)rt->rt_gateway;
-
- else
- l3addr = (struct sockaddr_storage *)&ro->ro_dst;
- lle = llentry_alloc(ifp, LLTABLE6(ifp), l3addr);
- }
-#endif
+ SLIST_INSERT_HEAD(flist, fle, f_next);
+skip:
+ critical_exit();
+ FLOWSTAT_INC(ft, ft_inserts);
+
+ return (fle);
+}
+
+int
+flowtable_lookup(sa_family_t sa, struct mbuf *m, struct route *ro)
+{
+ struct flentry *fle;
+
+ if (V_flowtable_enable == 0)
+ return (ENXIO);
+
+ switch (sa) {
#ifdef INET
- if (ssa->ss_family == AF_INET) {
- if (rt->rt_flags & RTF_GATEWAY)
- l3addr = (struct sockaddr_storage *)rt->rt_gateway;
- else
- l3addr = (struct sockaddr_storage *)&ro->ro_dst;
- lle = llentry_alloc(ifp, LLTABLE(ifp), l3addr);
- }
-
+ case AF_INET:
+ fle = flowtable_lookup_ipv4(m, ro);
+ break;
+#endif
+#ifdef INET6
+ case AF_INET6:
+ fle = flowtable_lookup_ipv6(m, ro);
+ break;
#endif
- ro->ro_lle = lle;
+ default:
+ panic("%s: sa %d", __func__, sa);
+ }
- if (lle == NULL) {
- RTFREE(rt);
- ro->ro_rt = NULL;
- return (NULL);
- }
- error = flowtable_insert(ft, hash, key, fibnum, ro, flags);
+ if (fle == NULL)
+ return (EHOSTUNREACH);
- if (error) {
- RTFREE(rt);
- LLE_FREE(lle);
- ro->ro_rt = NULL;
- ro->ro_lle = NULL;
- }
- }
+ if (!(m->m_flags & M_FLOWID)) {
+ m->m_flags |= M_FLOWID;
+ m->m_pkthdr.flowid = fle->f_hash;
+ }
+
+ ro->ro_rt = fle->f_rt;
+ ro->ro_lle = fle->f_lle;
+ ro->ro_flags |= RT_NORTREF;
- return ((error) ? NULL : fle);
+ return (0);
}
-/*
- * used by the bit_alloc macro
- */
-#define calloc(count, size) malloc((count)*(size), M_DEVBUF, M_WAITOK|M_ZERO)
-
-struct flowtable *
-flowtable_alloc(char *name, int nentry, int flags)
+static struct flentry *
+flowtable_lookup_common(struct flowtable *ft, uint32_t *key, int keylen,
+ uint32_t fibnum)
{
- struct flowtable *ft, *fttail;
- int i;
-
- if (V_flow_hashjitter == 0)
- V_flow_hashjitter = arc4random();
+ struct flist *flist;
+ struct flentry *fle;
+ uint32_t hash;
- KASSERT(nentry > 0, ("nentry must be > 0, is %d\n", nentry));
+ FLOWSTAT_INC(ft, ft_lookups);
- ft = malloc(sizeof(struct flowtable),
- M_RTABLE, M_WAITOK | M_ZERO);
+ hash = jenkins_hash32(key, keylen / sizeof(uint32_t), flow_hashjitter);
- ft->ft_name = name;
- ft->ft_flags = flags;
- ft->ft_size = nentry;
-#ifdef RADIX_MPATH
- ft->ft_rtalloc = rtalloc_mpath_fib;
-#else
- ft->ft_rtalloc = rtalloc_ign_wrapper;
+ critical_enter();
+ flist = flowtable_list(ft, hash);
+ SLIST_FOREACH(fle, flist, f_next) {
+ KASSERT(fle->f_hash % ft->ft_size == hash % ft->ft_size,
+ ("%s: wrong hash", __func__));
+ if (flow_matches(fle, key, keylen, fibnum)) {
+ fle->f_uptime = time_uptime;
+#ifdef FLOWTABLE_HASH_ALL
+ fle->f_flags |= fibnum >> 24;
#endif
- if (flags & FL_PCPU) {
- ft->ft_lock = flowtable_pcpu_lock;
- ft->ft_unlock = flowtable_pcpu_unlock;
-
- for (i = 0; i <= mp_maxid; i++) {
- ft->ft_table.pcpu[i] =
- malloc(nentry*sizeof(struct flentry *),
- M_RTABLE, M_WAITOK | M_ZERO);
- ft->ft_masks[i] = bit_alloc(nentry);
+ critical_exit();
+ FLOWSTAT_INC(ft, ft_hits);
+ return (fle);
}
- } else {
- ft->ft_lock_count = 2*(powerof2(mp_maxid + 1) ? (mp_maxid + 1):
- (fls(mp_maxid + 1) << 1));
-
- ft->ft_lock = flowtable_global_lock;
- ft->ft_unlock = flowtable_global_unlock;
- ft->ft_table.global =
- malloc(nentry*sizeof(struct flentry *),
- M_RTABLE, M_WAITOK | M_ZERO);
- ft->ft_locks = malloc(ft->ft_lock_count*sizeof(struct mtx),
- M_RTABLE, M_WAITOK | M_ZERO);
- for (i = 0; i < ft->ft_lock_count; i++)
- mtx_init(&ft->ft_locks[i], "flow", NULL, MTX_DEF|MTX_DUPOK);
-
- ft->ft_masks[0] = bit_alloc(nentry);
}
- ft->ft_tmpmask = bit_alloc(nentry);
+ critical_exit();
- /*
- * In the local transmit case the table truly is
- * just a cache - so everything is eligible for
- * replacement after 5s of non-use
- */
- if (flags & FL_HASH_ALL) {
- ft->ft_udp_idle = V_flowtable_udp_expire;
- ft->ft_syn_idle = V_flowtable_syn_expire;
- ft->ft_fin_wait_idle = V_flowtable_fin_wait_expire;
- ft->ft_tcp_idle = V_flowtable_fin_wait_expire;
- } else {
- ft->ft_udp_idle = ft->ft_fin_wait_idle =
- ft->ft_syn_idle = ft->ft_tcp_idle = 30;
-
- }
+ FLOWSTAT_INC(ft, ft_misses);
- /*
- * hook in to the cleaner list
- */
- if (V_flow_list_head == NULL)
- V_flow_list_head = ft;
- else {
- fttail = V_flow_list_head;
- while (fttail->ft_next != NULL)
- fttail = fttail->ft_next;
- fttail->ft_next = ft;
- }
-
- return (ft);
+ return (flowtable_insert(ft, hash, key, keylen, fibnum));
}
/*
- * The rest of the code is devoted to garbage collection of expired entries.
- * It is a new additon made necessary by the switch to dynamically allocating
- * flow tables.
- *
+ * used by the bit_alloc macro
*/
+#define calloc(count, size) malloc((count)*(size), M_FTABLE, M_WAITOK | M_ZERO)
static void
-fle_free(struct flentry *fle, struct flowtable *ft)
+flowtable_alloc(struct flowtable *ft)
{
- struct rtentry *rt;
- struct llentry *lle;
+ int i;
- rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
- lle = __DEVOLATILE(struct llentry *, fle->f_lle);
- if (rt != NULL)
- RTFREE(rt);
- if (lle != NULL)
- LLE_FREE(lle);
- flow_free(fle, ft);
+ /*
+ * Allocate the storage of a flowtable whose ft_size is already set:
+ * one per-CPU list head for every hash bucket, one occupancy bitmask
+ * per CPU, and a scratch mask used by the cleaner.
+ *
+ * ft_table is an array of pointers to per-CPU list heads.
+ */
+ ft->ft_table = malloc(ft->ft_size * sizeof(*ft->ft_table),
+ M_FTABLE, M_WAITOK);
+ for (i = 0; i < ft->ft_size; i++)
+ ft->ft_table[i] = uma_zalloc(pcpu_zone_ptr, M_WAITOK | M_ZERO);
+
+ /*
+ * Walk the CPUs the same way the consumers do (CPU_FOREACH), so
+ * that every CPU that will later use its mask has one even when
+ * CPU ids are sparse. Zero the slots of absent CPUs.
+ */
+ ft->ft_masks = uma_zalloc(pcpu_zone_ptr, M_WAITOK | M_ZERO);
+ CPU_FOREACH(i) {
+ bitstr_t **b;
+
+ b = zpcpu_get_cpu(ft->ft_masks, i);
+ *b = bit_alloc(ft->ft_size);
+ }
+ ft->ft_tmpmask = bit_alloc(ft->ft_size);
}
+#undef calloc
static void
-flowtable_free_stale(struct flowtable *ft, struct rtentry *rt)
+flowtable_free_stale(struct flowtable *ft, struct rtentry *rt, int maxidle)
{
- int curbit = 0, count, tmpsize;
- struct flentry *fle, **flehead, *fleprev;
- struct flentry *flefreehead, *flefreetail, *fletmp;
+ struct flist *flist, freelist;
+ struct flentry *fle, *fle1, *fleprev;
bitstr_t *mask, *tmpmask;
- struct flowtable_stats *fs = &ft->ft_stats[curcpu];
+ int curbit, tmpsize;
- flefreehead = flefreetail = NULL;
+ SLIST_INIT(&freelist);
mask = flowtable_mask(ft);
tmpmask = ft->ft_tmpmask;
tmpsize = ft->ft_size;
memcpy(tmpmask, mask, ft->ft_size/8);
+ curbit = 0;
/*
* XXX Note to self, bit_ffs operates at the byte level
* and thus adds gratuitous overhead
@@ -1425,131 +785,96 @@ flowtable_free_stale(struct flowtable *ft, struct rtentry *rt)
break;
}
- FL_ENTRY_LOCK(ft, curbit);
- flehead = flowtable_entry(ft, curbit);
- fle = fleprev = *flehead;
+ FLOWSTAT_INC(ft, ft_free_checks);
- fs->ft_free_checks++;
+ critical_enter();
+ flist = flowtable_list(ft, curbit);
#ifdef DIAGNOSTIC
- if (fle == NULL && curbit > 0) {
+ if (SLIST_EMPTY(flist) && curbit > 0) {
log(LOG_ALERT,
"warning bit=%d set, but no fle found\n",
curbit);
}
-#endif
- while (fle != NULL) {
- if (rt != NULL) {
- if (__DEVOLATILE(struct rtentry *, fle->f_rt) != rt) {
- fleprev = fle;
- fle = fle->f_next;
- continue;
- }
- } else if (!flow_stale(ft, fle)) {
+#endif
+ SLIST_FOREACH_SAFE(fle, flist, f_next, fle1) {
+ if (rt != NULL && fle->f_rt != rt) {
fleprev = fle;
- fle = fle->f_next;
continue;
}
- /*
- * delete head of the list
- */
- if (fleprev == *flehead) {
- fletmp = fleprev;
- if (fle == fleprev) {
- fleprev = *flehead = fle->f_next;
- } else
- fleprev = *flehead = fle;
- fle = fle->f_next;
- } else {
- /*
- * don't advance fleprev
- */
- fletmp = fle;
- fleprev->f_next = fle->f_next;
- fle = fleprev->f_next;
+ if (!flow_stale(ft, fle, maxidle)) {
+ fleprev = fle;
+ continue;
}
- if (flefreehead == NULL)
- flefreehead = flefreetail = fletmp;
- else {
- flefreetail->f_next = fletmp;
- flefreetail = fletmp;
- }
- fletmp->f_next = NULL;
+ if (fle == SLIST_FIRST(flist))
+ SLIST_REMOVE_HEAD(flist, f_next);
+ else
+ SLIST_REMOVE_AFTER(fleprev, f_next);
+ SLIST_INSERT_HEAD(&freelist, fle, f_next);
}
- if (*flehead == NULL)
+ if (SLIST_EMPTY(flist))
bit_clear(mask, curbit);
- FL_ENTRY_UNLOCK(ft, curbit);
+ critical_exit();
+
bit_clear(tmpmask, curbit);
tmpmask += (curbit / 8);
tmpsize -= (curbit / 8) * 8;
bit_ffs(tmpmask, tmpsize, &curbit);
}
- count = 0;
- while ((fle = flefreehead) != NULL) {
- flefreehead = fle->f_next;
- count++;
- fs->ft_frees++;
- fle_free(fle, ft);
+
+ SLIST_FOREACH_SAFE(fle, &freelist, f_next, fle1) {
+ FLOWSTAT_INC(ft, ft_frees);
+ if (fle->f_rt != NULL)
+ RTFREE(fle->f_rt);
+ if (fle->f_lle != NULL)
+ LLE_FREE(fle->f_lle);
+ uma_zfree(flow_zone, fle);
}
- if (V_flowtable_debug && count)
- log(LOG_DEBUG, "freed %d flow entries\n", count);
}
-void
-flowtable_route_flush(struct flowtable *ft, struct rtentry *rt)
+static void
+flowtable_clean_vnet(struct flowtable *ft, struct rtentry *rt, int maxidle)
{
int i;
- if (ft->ft_flags & FL_PCPU) {
- CPU_FOREACH(i) {
- if (smp_started == 1) {
- thread_lock(curthread);
- sched_bind(curthread, i);
- thread_unlock(curthread);
- }
+ CPU_FOREACH(i) {
+ if (smp_started == 1) {
+ thread_lock(curthread);
+ sched_bind(curthread, i);
+ thread_unlock(curthread);
+ }
- flowtable_free_stale(ft, rt);
+ flowtable_free_stale(ft, rt, maxidle);
- if (smp_started == 1) {
- thread_lock(curthread);
- sched_unbind(curthread);
- thread_unlock(curthread);
- }
+ if (smp_started == 1) {
+ thread_lock(curthread);
+ sched_unbind(curthread);
+ thread_unlock(curthread);
}
- } else {
- flowtable_free_stale(ft, rt);
}
}
-static void
-flowtable_clean_vnet(void)
+void
+flowtable_route_flush(sa_family_t sa, struct rtentry *rt)
{
struct flowtable *ft;
- int i;
- ft = V_flow_list_head;
- while (ft != NULL) {
- if (ft->ft_flags & FL_PCPU) {
- CPU_FOREACH(i) {
- if (smp_started == 1) {
- thread_lock(curthread);
- sched_bind(curthread, i);
- thread_unlock(curthread);
- }
-
- flowtable_free_stale(ft, NULL);
-
- if (smp_started == 1) {
- thread_lock(curthread);
- sched_unbind(curthread);
- thread_unlock(curthread);
- }
- }
- } else {
- flowtable_free_stale(ft, NULL);
- }
- ft = ft->ft_next;
+ switch (sa) {
+#ifdef INET
+ case AF_INET:
+ ft = &V_ip4_ft;
+ break;
+#endif
+#ifdef INET6
+ case AF_INET6:
+ ft = &V_ip6_ft;
+ break;
+#endif
+ default:
+ panic("%s: sa %d", __func__, sa);
}
+
+ flowtable_clean_vnet(ft, rt, 0);
}
static void
@@ -1562,18 +887,33 @@ flowtable_cleaner(void)
log(LOG_INFO, "flowtable cleaner started\n");
td = curthread;
while (1) {
+ uint32_t flowclean_freq, maxidle;
+
+ /*
+ * The maximum idle time, as well as frequency are arbitrary.
+ */
+ if (flow_full())
+ maxidle = 5;
+ else
+ maxidle = 30;
+
VNET_LIST_RLOCK();
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter);
- flowtable_clean_vnet();
+#ifdef INET
+ flowtable_clean_vnet(&V_ip4_ft, NULL, maxidle);
+#endif
+#ifdef INET6
+ flowtable_clean_vnet(&V_ip6_ft, NULL, maxidle);
+#endif
CURVNET_RESTORE();
}
VNET_LIST_RUNLOCK();
- /*
- * The 10 second interval between cleaning checks
- * is arbitrary
- */
+ if (flow_full())
+ flowclean_freq = 4*hz;
+ else
+ flowclean_freq = 20*hz;
mtx_lock(&flowclean_lock);
thread_lock(td);
sched_prio(td, PPAUSE);
@@ -1606,91 +946,106 @@ static struct kproc_desc flow_kp = {
};
SYSINIT(flowcleaner, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &flow_kp);
-static void
-flowtable_init_vnet(const void *unused __unused)
+static int
+flowtable_get_size(char *name)
{
+ int size;
+
+ if (TUNABLE_INT_FETCH(name, &size)) {
+ if (size < 256)
+ size = 256;
+ if (!powerof2(size)) {
+ printf("%s must be power of 2\n", name);
+ size = 2048;
+ }
+ } else {
+ /*
+ * round up to the next power of 2
+ */
+ size = 1 << fls((1024 + maxusers * 64) - 1);
+ }
- V_flowtable_nmbflows = 1024 + maxusers * 64 * mp_ncpus;
- V_flow_ipv4_zone = uma_zcreate("ip4flow", sizeof(struct flentry_v4),
- NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);
- V_flow_ipv6_zone = uma_zcreate("ip6flow", sizeof(struct flentry_v6),
- NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);
- uma_zone_set_max(V_flow_ipv4_zone, V_flowtable_nmbflows);
- uma_zone_set_max(V_flow_ipv6_zone, V_flowtable_nmbflows);
- V_flowtable_ready = 1;
+ return (size);
}
-VNET_SYSINIT(flowtable_init_vnet, SI_SUB_SMP, SI_ORDER_ANY,
- flowtable_init_vnet, NULL);
static void
flowtable_init(const void *unused __unused)
{
+ flow_hashjitter = arc4random();
+
+ flow_zone = uma_zcreate("flows", sizeof(struct flentry),
+ NULL, NULL, NULL, NULL, (64-1), UMA_ZONE_MAXBUCKET);
+ uma_zone_set_max(flow_zone, 1024 + maxusers * 64 * mp_ncpus);
+
cv_init(&flowclean_c_cv, "c_flowcleanwait");
cv_init(&flowclean_f_cv, "f_flowcleanwait");
mtx_init(&flowclean_lock, "flowclean lock", NULL, MTX_DEF);
EVENTHANDLER_REGISTER(ifnet_departure_event, flowtable_flush, NULL,
EVENTHANDLER_PRI_ANY);
- flowclean_freq = 20*hz;
}
-SYSINIT(flowtable_init, SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST,
+SYSINIT(flowtable_init, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST,
flowtable_init, NULL);
+#ifdef INET
+static SYSCTL_NODE(_net_flowtable, OID_AUTO, ip4, CTLFLAG_RD, NULL,
+ "Flowtable for IPv4");
+
+static VNET_PCPUSTAT_DEFINE(struct flowtable_stat, ip4_ftstat);
+VNET_PCPUSTAT_SYSINIT(ip4_ftstat);
+VNET_PCPUSTAT_SYSUNINIT(ip4_ftstat);
+SYSCTL_VNET_PCPUSTAT(_net_flowtable_ip4, OID_AUTO, stat, struct flowtable_stat,
+ ip4_ftstat, "Flowtable statistics for IPv4 "
+ "(struct flowtable_stat, net/flowtable.h)");
-#ifdef VIMAGE
static void
-flowtable_uninit(const void *unused __unused)
+flowtable_init_vnet_v4(const void *unused __unused)
{
- V_flowtable_ready = 0;
- uma_zdestroy(V_flow_ipv4_zone);
- uma_zdestroy(V_flow_ipv6_zone);
+ V_ip4_ft.ft_size = flowtable_get_size("net.flowtable.ip4.size");
+ V_ip4_ft.ft_stat = VNET(ip4_ftstat);
+ flowtable_alloc(&V_ip4_ft);
}
+VNET_SYSINIT(ft_vnet_v4, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
+ flowtable_init_vnet_v4, NULL);
+#endif /* INET */
-VNET_SYSUNINIT(flowtable_uninit, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY,
- flowtable_uninit, NULL);
-#endif
+#ifdef INET6
+static SYSCTL_NODE(_net_flowtable, OID_AUTO, ip6, CTLFLAG_RD, NULL,
+ "Flowtable for IPv6");
-#ifdef DDB
-static uint32_t *
-flowtable_get_hashkey(struct flentry *fle)
-{
- uint32_t *hashkey;
+static VNET_PCPUSTAT_DEFINE(struct flowtable_stat, ip6_ftstat);
+VNET_PCPUSTAT_SYSINIT(ip6_ftstat);
+VNET_PCPUSTAT_SYSUNINIT(ip6_ftstat);
+SYSCTL_VNET_PCPUSTAT(_net_flowtable_ip6, OID_AUTO, stat, struct flowtable_stat,
+ ip6_ftstat, "Flowtable statistics for IPv6 "
+ "(struct flowtable_stat, net/flowtable.h)");
- if (fle->f_flags & FL_IPV6)
- hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
- else
- hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
+static void
+flowtable_init_vnet_v6(const void *unused __unused)
+{
- return (hashkey);
+ V_ip6_ft.ft_size = flowtable_get_size("net.flowtable.ip6.size");
+ V_ip6_ft.ft_stat = VNET(ip6_ftstat);
+ flowtable_alloc(&V_ip6_ft);
}
+VNET_SYSINIT(flowtable_init_vnet_v6, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
+ flowtable_init_vnet_v6, NULL);
+#endif /* INET6 */
+#ifdef DDB
static bitstr_t *
flowtable_mask_pcpu(struct flowtable *ft, int cpuid)
{
- bitstr_t *mask;
- if (ft->ft_flags & FL_PCPU)
- mask = ft->ft_masks[cpuid];
- else
- mask = ft->ft_masks[0];
-
- return (mask);
+ /*
+ * Return the occupancy bitmask of the given CPU.
+ * ft_masks is a per-CPU slot holding a bitstr_t pointer: locate
+ * the slot of cpuid first and dereference it afterwards, as
+ * flowtable_mask() does for the current CPU.
+ */
+ return (*(bitstr_t **)zpcpu_get_cpu(ft->ft_masks, cpuid));
}
-static struct flentry **
-flowtable_entry_pcpu(struct flowtable *ft, uint32_t hash, int cpuid)
+static struct flist *
+flowtable_list_pcpu(struct flowtable *ft, uint32_t hash, int cpuid)
{
- struct flentry **fle;
- int index = (hash % ft->ft_size);
- if (ft->ft_flags & FL_PCPU) {
- fle = &ft->ft_table.pcpu[cpuid][index];
- } else {
- fle = &ft->ft_table.global[index];
- }
-
- return (fle);
+ /*
+ * Return the list head of the bucket for hash on the given CPU.
+ * Each ft_table slot stores the per-CPU base pointer of its bucket,
+ * so the slot's value (not its address) is what gets offset,
+ * matching flowtable_list().
+ */
+ return (zpcpu_get_cpu(ft->ft_table[hash % ft->ft_size], cpuid));
}
static void
@@ -1698,40 +1053,58 @@ flow_show(struct flowtable *ft, struct flentry *fle)
{
int idle_time;
int rt_valid, ifp_valid;
- uint16_t sport, dport;
- uint32_t *hashkey;
- char saddr[4*sizeof "123"], daddr[4*sizeof "123"];
volatile struct rtentry *rt;
struct ifnet *ifp = NULL;
+ uint32_t *hashkey = fle->f_key;
idle_time = (int)(time_uptime - fle->f_uptime);
rt = fle->f_rt;
rt_valid = rt != NULL;
- if (rt_valid)
+ if (rt_valid)
ifp = rt->rt_ifp;
ifp_valid = ifp != NULL;
- hashkey = flowtable_get_hashkey(fle);
- if (fle->f_flags & FL_IPV6)
- goto skipaddr;
-
- inet_ntoa_r(*(struct in_addr *) &hashkey[2], daddr);
- if (ft->ft_flags & FL_HASH_ALL) {
- inet_ntoa_r(*(struct in_addr *) &hashkey[1], saddr);
- sport = ntohs(((uint16_t *)hashkey)[0]);
- dport = ntohs(((uint16_t *)hashkey)[1]);
- db_printf("%s:%d->%s:%d",
- saddr, sport, daddr,
- dport);
- } else
+
+#ifdef INET
+ if (ft == &V_ip4_ft) {
+ char daddr[4*sizeof "123"];
+#ifdef FLOWTABLE_HASH_ALL
+ char saddr[4*sizeof "123"];
+ uint16_t sport, dport;
+#endif
+
+ inet_ntoa_r(*(struct in_addr *) &hashkey[0], daddr);
+#ifdef FLOWTABLE_HASH_ALL
+ inet_ntoa_r(*(struct in_addr *) &hashkey[1], saddr);
+ dport = ntohs((uint16_t)(hashkey[2] >> 16));
+ sport = ntohs((uint16_t)(hashkey[2] & 0xffff));
+ db_printf("%s:%d->%s:%d", saddr, sport, daddr, dport);
+#else
db_printf("%s ", daddr);
-
-skipaddr:
+#endif
+ }
+#endif /* INET */
+#ifdef INET6
+ if (ft == &V_ip6_ft) {
+#ifdef FLOWTABLE_HASH_ALL
+ db_printf("\n\tkey=%08x:%08x:%08x%08x:%08x:%08x%08x:%08x:%08x",
+ hashkey[0], hashkey[1], hashkey[2],
+ hashkey[3], hashkey[4], hashkey[5],
+ hashkey[6], hashkey[7], hashkey[8]);
+#else
+ db_printf("\n\tkey=%08x:%08x:%08x ",
+ hashkey[0], hashkey[1], hashkey[2]);
+#endif
+ }
+#endif /* INET6 */
+
+ db_printf("hash=%08x idle_time=%03d"
+ "\n\tfibnum=%02d rt=%p",
+ fle->f_hash, idle_time, fle->f_fibnum, fle->f_rt);
+
+#ifdef FLOWTABLE_HASH_ALL
if (fle->f_flags & FL_STALE)
db_printf(" FL_STALE ");
- if (fle->f_flags & FL_TCP)
- db_printf(" FL_TCP ");
- if (fle->f_flags & FL_UDP)
- db_printf(" FL_UDP ");
+#endif
if (rt_valid) {
if (rt->rt_flags & RTF_UP)
db_printf(" RTF_UP ");
@@ -1740,21 +1113,10 @@ skipaddr:
if (ifp->if_flags & IFF_LOOPBACK)
db_printf(" IFF_LOOPBACK ");
if (ifp->if_flags & IFF_UP)
- db_printf(" IFF_UP ");
+ db_printf(" IFF_UP ");
if (ifp->if_flags & IFF_POINTOPOINT)
- db_printf(" IFF_POINTOPOINT ");
+ db_printf(" IFF_POINTOPOINT ");
}
- if (fle->f_flags & FL_IPV6)
- db_printf("\n\tkey=%08x:%08x:%08x%08x:%08x:%08x%08x:%08x:%08x",
- hashkey[0], hashkey[1], hashkey[2],
- hashkey[3], hashkey[4], hashkey[5],
- hashkey[6], hashkey[7], hashkey[8]);
- else
- db_printf("\n\tkey=%08x:%08x:%08x ",
- hashkey[0], hashkey[1], hashkey[2]);
- db_printf("hash=%08x idle_time=%03d"
- "\n\tfibnum=%02d rt=%p",
- fle->f_fhash, idle_time, fle->f_fibnum, fle->f_rt);
db_printf("\n");
}
@@ -1762,7 +1124,6 @@ static void
flowtable_show(struct flowtable *ft, int cpuid)
{
int curbit = 0;
- struct flentry *fle, **flehead;
bitstr_t *mask, *tmpmask;
if (cpuid != -1)
@@ -1776,43 +1137,32 @@ flowtable_show(struct flowtable *ft, int cpuid)
*/
bit_ffs(tmpmask, ft->ft_size, &curbit);
while (curbit != -1) {
+ struct flist *flist;
+ struct flentry *fle;
+
if (curbit >= ft->ft_size || curbit < -1) {
db_printf("warning: bad curbit value %d \n",
curbit);
break;
}
- flehead = flowtable_entry_pcpu(ft, curbit, cpuid);
- fle = *flehead;
+ flist = flowtable_list_pcpu(ft, curbit, cpuid);
- while (fle != NULL) {
+ SLIST_FOREACH(fle, flist, f_next)
flow_show(ft, fle);
- fle = fle->f_next;
- continue;
- }
bit_clear(tmpmask, curbit);
bit_ffs(tmpmask, ft->ft_size, &curbit);
}
}
static void
-flowtable_show_vnet(void)
+flowtable_show_vnet(struct flowtable *ft)
{
- struct flowtable *ft;
+ /* Show the table ft once for every CPU id yielded by CPU_FOREACH(). */
int i;
- ft = V_flow_list_head;
- while (ft != NULL) {
- printf("name: %s\n", ft->ft_name);
- if (ft->ft_flags & FL_PCPU) {
- CPU_FOREACH(i) {
- flowtable_show(ft, i);
- }
- } else {
- flowtable_show(ft, -1);
- }
- ft = ft->ft_next;
- }
+ CPU_FOREACH(i)
+ flowtable_show(ft, i); /* i is passed as flowtable_show()'s cpuid argument */
}
DB_SHOW_COMMAND(flowtables, db_show_flowtables)
@@ -1824,7 +1174,14 @@ DB_SHOW_COMMAND(flowtables, db_show_flowtables)
#ifdef VIMAGE
db_printf("vnet %p\n", vnet_iter);
#endif
- flowtable_show_vnet();
+#ifdef INET
+ printf("IPv4:\n");
+ flowtable_show_vnet(&V_ip4_ft);
+#endif
+#ifdef INET6
+ printf("IPv6:\n");
+ flowtable_show_vnet(&V_ip6_ft);
+#endif
CURVNET_RESTORE();
}
}
diff --git a/sys/net/flowtable.h b/sys/net/flowtable.h
index d810fa3..5a1d927 100644
--- a/sys/net/flowtable.h
+++ b/sys/net/flowtable.h
@@ -1,83 +1,56 @@
-/**************************************************************************
-
-Copyright (c) 2008-2010, BitGravity Inc.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- 1. Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
-
- 2. Neither the name of the BitGravity Corporation nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-
-$FreeBSD$
-
-***************************************************************************/
+/*-
+ * Copyright (c) 2014 Gleb Smirnoff <glebius@FreeBSD.org>
+ * Copyright (c) 2008-2010, BitGravity Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Neither the name of the BitGravity Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ *
+ */
#ifndef _NET_FLOWTABLE_H_
#define _NET_FLOWTABLE_H_
-#ifdef _KERNEL
-
-#define FL_HASH_ALL (1<<0) /* hash 4-tuple + protocol */
-#define FL_PCPU (1<<1) /* pcpu cache */
-#define FL_NOAUTO (1<<2) /* don't automatically add flentry on miss */
-#define FL_IPV6 (1<<9)
-
-#define FL_TCP (1<<11)
-#define FL_SCTP (1<<12)
-#define FL_UDP (1<<13)
-#define FL_DEBUG (1<<14)
-#define FL_DEBUG_ALL (1<<15)
-
-struct flowtable;
-struct flentry;
-struct route;
-struct route_in6;
+struct flowtable_stat { /* netstat reads this via net.flowtable.ip4.stat / ip6.stat */
+ uint64_t ft_collisions; /* netstat label: "collision" */
+ uint64_t ft_misses; /* netstat label: "miss" */
+ uint64_t ft_free_checks; /* netstat label: "free check" */
+ uint64_t ft_frees; /* netstat label: "free" */
+ uint64_t ft_hits; /* netstat label: "hit" */
+ uint64_t ft_lookups; /* netstat label: "lookup" */
+ uint64_t ft_fail_lle_invalid; /* netstat: lookups with not resolved Layer 2 address */
+ uint64_t ft_inserts; /* netstat label: "insert" */
+};
-VNET_DECLARE(struct flowtable *, ip_ft);
-#define V_ip_ft VNET(ip_ft)
-
-VNET_DECLARE(struct flowtable *, ip6_ft);
-#define V_ip6_ft VNET(ip6_ft)
-
-struct flowtable *flowtable_alloc(char *name, int nentry, int flags);
+#ifdef _KERNEL
/*
- * Given a flow table, look up the L3 and L2 information and
- * return it in the route.
- *
+ * Given a flow table, look up the L3 and L2 information
+ * and return it in the route.
*/
-struct flentry *flowtable_lookup_mbuf(struct flowtable *ft, struct mbuf *m, int af);
-
-struct flentry *flowtable_lookup(struct flowtable *ft, struct sockaddr_storage *ssa,
- struct sockaddr_storage *dsa, uint32_t fibnum, int flags);
-
-int kern_flowtable_insert(struct flowtable *ft, struct sockaddr_storage *ssa,
- struct sockaddr_storage *dsa, struct route *ro, uint32_t fibnum, int flags);
-
-void flow_invalidate(struct flentry *fl);
-void flowtable_route_flush(struct flowtable *ft, struct rtentry *rt);
-
-void flow_to_route(struct flentry *fl, struct route *ro);
-
-void flow_to_route_in6(struct flentry *fl, struct route_in6 *ro);
-
+int flowtable_lookup(sa_family_t, struct mbuf *, struct route *);
+void flowtable_route_flush(sa_family_t, struct rtentry *);
#endif /* _KERNEL */
-#endif
+#endif /* !_NET_FLOWTABLE_H_ */
diff --git a/sys/net/route.c b/sys/net/route.c
index 20fe181..bb99496 100644
--- a/sys/net/route.c
+++ b/sys/net/route.c
@@ -1298,18 +1298,7 @@ rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt,
}
#ifdef FLOWTABLE
else if (rt0 != NULL) {
- switch (dst->sa_family) {
-#ifdef INET6
- case AF_INET6:
- flowtable_route_flush(V_ip6_ft, rt0);
- break;
-#endif
-#ifdef INET
- case AF_INET:
- flowtable_route_flush(V_ip_ft, rt0);
- break;
-#endif
- }
+ flowtable_route_flush(dst->sa_family, rt0);
RTFREE(rt0);
}
#endif
diff --git a/sys/netinet/ip_input.c b/sys/netinet/ip_input.c
index c265d02..cde30ee 100644
--- a/sys/netinet/ip_input.c
+++ b/sys/netinet/ip_input.c
@@ -62,7 +62,6 @@ __FBSDID("$FreeBSD$");
#include <net/route.h>
#include <net/netisr.h>
#include <net/vnet.h>
-#include <net/flowtable.h>
#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
@@ -198,16 +197,6 @@ SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW,
"IP stealth mode, no TTL decrementation on forwarding");
#endif
-#ifdef FLOWTABLE
-static VNET_DEFINE(int, ip_output_flowtable_size) = 2048;
-VNET_DEFINE(struct flowtable *, ip_ft);
-#define V_ip_output_flowtable_size VNET(ip_output_flowtable_size)
-
-SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, output_flowtable_size, CTLFLAG_RDTUN,
- &VNET_NAME(ip_output_flowtable_size), 2048,
- "number of entries in the per-cpu output flow caches");
-#endif
-
static void ip_freef(struct ipqhead *, struct ipq *);
/*
@@ -309,24 +298,6 @@ ip_init(void)
printf("%s: WARNING: unable to register pfil hook, "
"error %d\n", __func__, i);
-#ifdef FLOWTABLE
- if (TUNABLE_INT_FETCH("net.inet.ip.output_flowtable_size",
- &V_ip_output_flowtable_size)) {
- if (V_ip_output_flowtable_size < 256)
- V_ip_output_flowtable_size = 256;
- if (!powerof2(V_ip_output_flowtable_size)) {
- printf("flowtable must be power of 2 size\n");
- V_ip_output_flowtable_size = 2048;
- }
- } else {
- /*
- * round up to the next power of 2
- */
- V_ip_output_flowtable_size = 1 << fls((1024 + maxusers * 64)-1);
- }
- V_ip_ft = flowtable_alloc("ipv4", V_ip_output_flowtable_size, FL_PCPU);
-#endif
-
/* Skip initialization of globals for non-default instances. */
if (!IS_DEFAULT_VNET(curvnet))
return;
diff --git a/sys/netinet/ip_output.c b/sys/netinet/ip_output.c
index 7764bc3..2d8be1b 100644
--- a/sys/netinet/ip_output.c
+++ b/sys/netinet/ip_output.c
@@ -32,6 +32,7 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
+#include "opt_inet.h"
#include "opt_ipfw.h"
#include "opt_ipsec.h"
#include "opt_kdtrace.h"
@@ -154,19 +155,8 @@ ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
}
#ifdef FLOWTABLE
- if (ro->ro_rt == NULL) {
- struct flentry *fle;
-
- /*
- * The flow table returns route entries valid for up to 30
- * seconds; we rely on the remainder of ip_output() taking no
- * longer than that long for the stability of ro_rt. The
- * flow ID assignment must have happened before this point.
- */
- fle = flowtable_lookup_mbuf(V_ip_ft, m, AF_INET);
- if (fle != NULL)
- flow_to_route(fle, ro);
- }
+ if (ro->ro_rt == NULL)
+ (void )flowtable_lookup(AF_INET, m, ro);
#endif
if (opt) {
diff --git a/sys/netinet6/in6_proto.c b/sys/netinet6/in6_proto.c
index 965de60..b0631ae 100644
--- a/sys/netinet6/in6_proto.c
+++ b/sys/netinet6/in6_proto.c
@@ -126,10 +126,6 @@ __FBSDID("$FreeBSD$");
#include <netinet6/ip6protosw.h>
-#ifdef FLOWTABLE
-#include <net/flowtable.h>
-#endif
-
/*
* TCP/IP protocol family: IP6, ICMP6, UDP, TCP.
*/
@@ -575,16 +571,6 @@ SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_STEALTH, stealth, CTLFLAG_RW,
&VNET_NAME(ip6stealth), 0, "");
#endif
-#ifdef FLOWTABLE
-VNET_DEFINE(int, ip6_output_flowtable_size) = 2048;
-VNET_DEFINE(struct flowtable *, ip6_ft);
-#define V_ip6_output_flowtable_size VNET(ip6_output_flowtable_size)
-
-SYSCTL_VNET_INT(_net_inet6_ip6, OID_AUTO, output_flowtable_size, CTLFLAG_RDTUN,
- &VNET_NAME(ip6_output_flowtable_size), 2048,
- "number of entries in the per-cpu output flow caches");
-#endif
-
/* net.inet6.icmp6 */
SYSCTL_VNET_INT(_net_inet6_icmp6, ICMPV6CTL_REDIRACCEPT, rediraccept,
CTLFLAG_RW, &VNET_NAME(icmp6_rediraccept), 0, "");
diff --git a/sys/netinet6/ip6_input.c b/sys/netinet6/ip6_input.c
index 8f70741..12249db 100644
--- a/sys/netinet6/ip6_input.c
+++ b/sys/netinet6/ip6_input.c
@@ -119,12 +119,6 @@ __FBSDID("$FreeBSD$");
#include <netinet6/ip6protosw.h>
-#ifdef FLOWTABLE
-#include <net/flowtable.h>
-VNET_DECLARE(int, ip6_output_flowtable_size);
-#define V_ip6_output_flowtable_size VNET(ip6_output_flowtable_size)
-#endif
-
extern struct domain inet6domain;
u_char ip6_protox[IPPROTO_MAX];
@@ -194,24 +188,6 @@ ip6_init(void)
nd6_init();
frag6_init();
-#ifdef FLOWTABLE
- if (TUNABLE_INT_FETCH("net.inet6.ip6.output_flowtable_size",
- &V_ip6_output_flowtable_size)) {
- if (V_ip6_output_flowtable_size < 256)
- V_ip6_output_flowtable_size = 256;
- if (!powerof2(V_ip6_output_flowtable_size)) {
- printf("flowtable must be power of 2 size\n");
- V_ip6_output_flowtable_size = 2048;
- }
- } else {
- /*
- * round up to the next power of 2
- */
- V_ip6_output_flowtable_size = 1 << fls((1024 + maxusers * 64)-1);
- }
- V_ip6_ft = flowtable_alloc("ipv6", V_ip6_output_flowtable_size, FL_IPV6|FL_PCPU);
-#endif
-
V_ip6_desync_factor = arc4random() % MAX_TEMP_DESYNC_FACTOR;
/* Skip global initialization stuff for non-default instances. */
diff --git a/sys/netinet6/ip6_output.c b/sys/netinet6/ip6_output.c
index 0d55b66..171a918 100644
--- a/sys/netinet6/ip6_output.c
+++ b/sys/netinet6/ip6_output.c
@@ -521,19 +521,8 @@ skip_ipsec2:;
ro = &opt->ip6po_route;
dst = (struct sockaddr_in6 *)&ro->ro_dst;
#ifdef FLOWTABLE
- if (ro->ro_rt == NULL) {
- struct flentry *fle;
-
- /*
- * The flow table returns route entries valid for up to 30
- * seconds; we rely on the remainder of ip_output() taking no
- * longer than that long for the stability of ro_rt. The
- * flow ID assignment must have happened before this point.
- */
- fle = flowtable_lookup_mbuf(V_ip6_ft, m, AF_INET6);
- if (fle != NULL)
- flow_to_route_in6(fle, ro);
- }
+ if (ro->ro_rt == NULL)
+ (void )flowtable_lookup(AF_INET6, m, (struct route *)ro);
#endif
again:
/*
diff --git a/usr.bin/netstat/Makefile b/usr.bin/netstat/Makefile
index 1071f0e..1644aab 100644
--- a/usr.bin/netstat/Makefile
+++ b/usr.bin/netstat/Makefile
@@ -5,7 +5,8 @@
PROG= netstat
SRCS= if.c inet.c main.c mbuf.c mroute.c netisr.c route.c \
- unix.c atalk.c mroute6.c ipsec.c bpf.c pfkey.c sctp.c
+ unix.c atalk.c mroute6.c ipsec.c bpf.c pfkey.c sctp.c \
+ flowtable.c
WARNS?= 3
CFLAGS+=-fno-strict-aliasing
diff --git a/usr.bin/netstat/flowtable.c b/usr.bin/netstat/flowtable.c
new file mode 100644
index 0000000..a3d5dd5
--- /dev/null
+++ b/usr.bin/netstat/flowtable.c
@@ -0,0 +1,84 @@
+/*-
+ * Copyright (c) 2014 Gleb Smirnoff <glebius@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include <sys/param.h>
+#include <sys/sysctl.h>
+#include <net/flowtable.h>
+#include <err.h>
+#include <stdint.h>
+#include <stdio.h>
+#include "netstat.h"
+
+/*
+ * Print one flowtable's counters; zero counters are skipped when sflag > 1.
+ */
+
+static void
+print_stats(struct flowtable_stat *stat)
+{
+ /* p()/p2() print counter f with format m; %s gets plural(f) or plurales(f). */
+#define p(f, m) if (stat->f || sflag <= 1) \
+ printf(m, (uintmax_t)stat->f, plural(stat->f))
+#define p2(f, m) if (stat->f || sflag <= 1) \
+ printf(m, (uintmax_t)stat->f, plurales(stat->f))
+ /* NOTE(review): both macros expand to a bare if; never follow a call with else. */
+ p(ft_lookups, "\t%ju lookup%s\n");
+ p(ft_hits, "\t%ju hit%s\n");
+ p2(ft_misses, "\t%ju miss%s\n");
+ p(ft_inserts, "\t%ju insert%s\n");
+ p(ft_collisions, "\t%ju collision%s\n");
+ p(ft_free_checks, "\t%ju free check%s\n");
+ p(ft_frees, "\t%ju free%s\n");
+ p(ft_fail_lle_invalid,
+ "\t%ju lookup%s with not resolved Layer 2 address\n");
+ /* The helper macros are undefined again before the function ends. */
+#undef p2
+#undef p
+}
+
+void
+flowtable_stats(void)
+{
+ struct flowtable_stat stat; /* one buffer, reused for the IPv4 and the IPv6 read */
+ size_t len = sizeof(stat); /* NOTE(review): not re-set before the 2nd sysctlbyname() */
+
+ if (!live) /* nothing is read or printed unless 'live' is set */
+ return;
+ /* Print each address family's table only if its sysctlbyname() read returns 0. */
+ if (sysctlbyname("net.flowtable.ip4.stat", &stat, &len, NULL, 0) == 0) {
+ printf("flowtable for IPv4:\n");
+ print_stats(&stat);
+ }
+
+ if (sysctlbyname("net.flowtable.ip6.stat", &stat, &len, NULL, 0) == 0) {
+ printf("flowtable for IPv6:\n");
+ print_stats(&stat);
+ }
+}
diff --git a/usr.bin/netstat/main.c b/usr.bin/netstat/main.c
index feb97dc..5c952ad 100644
--- a/usr.bin/netstat/main.c
+++ b/usr.bin/netstat/main.c
@@ -556,9 +556,10 @@ main(int argc, char *argv[])
exit(0);
}
if (rflag) {
- if (sflag)
+ if (sflag) {
rt_stats(nl[N_RTSTAT].n_value, nl[N_RTTRASH].n_value);
- else
+ flowtable_stats();
+ } else
routepr(nl[N_RTREE].n_value, fib);
exit(0);
}
diff --git a/usr.bin/netstat/netstat.h b/usr.bin/netstat/netstat.h
index 114c84c..44bce94 100644
--- a/usr.bin/netstat/netstat.h
+++ b/usr.bin/netstat/netstat.h
@@ -124,6 +124,7 @@ void intpr(int, void (*)(char *));
void pr_rthdr(int);
void pr_family(int);
void rt_stats(u_long, u_long);
+void flowtable_stats(void);
char *ipx_pnet(struct sockaddr *);
char *ipx_phost(struct sockaddr *);
char *ns_phost(struct sockaddr *);
OpenPOWER on IntegriCloud