summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--sys/conf/options1
-rw-r--r--sys/net/flowtable.c1971
-rw-r--r--sys/net/flowtable.h119
-rw-r--r--sys/net/route.c13
-rw-r--r--sys/netinet/ip_input.c29
-rw-r--r--sys/netinet/ip_output.c16
-rw-r--r--sys/netinet6/in6_proto.c14
-rw-r--r--sys/netinet6/ip6_input.c24
-rw-r--r--sys/netinet6/ip6_output.c15
-rw-r--r--usr.bin/netstat/Makefile3
-rw-r--r--usr.bin/netstat/flowtable.c84
-rw-r--r--usr.bin/netstat/main.c5
-rw-r--r--usr.bin/netstat/netstat.h1
13 files changed, 807 insertions, 1488 deletions
diff --git a/sys/conf/options b/sys/conf/options
index 642064d..8a288fe 100644
--- a/sys/conf/options
+++ b/sys/conf/options
@@ -438,6 +438,7 @@ TCP_SIGNATURE opt_inet.h
VLAN_ARRAY opt_vlan.h
XBONEHACK
FLOWTABLE opt_route.h
+FLOWTABLE_HASH_ALL opt_route.h
#
# SCTP
diff --git a/sys/net/flowtable.c b/sys/net/flowtable.c
index 32b953c..873ec36 100644
--- a/sys/net/flowtable.c
+++ b/sys/net/flowtable.c
@@ -1,31 +1,30 @@
-/**************************************************************************
-
-Copyright (c) 2008-2010, BitGravity Inc.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- 1. Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
-
- 2. Neither the name of the BitGravity Corporation nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-
-***************************************************************************/
+/*-
+ * Copyright (c) 2014 Gleb Smirnoff <glebius@FreeBSD.org>
+ * Copyright (c) 2008-2010, BitGravity Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Neither the name of the BitGravity Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
#include "opt_route.h"
#include "opt_mpath.h"
@@ -36,29 +35,32 @@ POSSIBILITY OF SUCH DAMAGE.
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
-#include <sys/param.h>
+#include <sys/param.h>
#include <sys/types.h>
#include <sys/bitstring.h>
#include <sys/condvar.h>
#include <sys/callout.h>
#include <sys/hash.h>
-#include <sys/kernel.h>
+#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
+#include <sys/pcpu.h>
#include <sys/proc.h>
+#include <sys/queue.h>
#include <sys/sbuf.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>
+#include <vm/uma.h>
#include <net/if.h>
#include <net/if_llatbl.h>
#include <net/if_var.h>
-#include <net/route.h>
+#include <net/route.h>
#include <net/flowtable.h>
#include <net/vnet.h>
@@ -70,156 +72,79 @@ __FBSDID("$FreeBSD$");
#ifdef INET6
#include <netinet/ip6.h>
#endif
+#ifdef FLOWTABLE_HASH_ALL
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <netinet/sctp.h>
+#endif
#include <ddb/ddb.h>
-struct ipv4_tuple {
- uint16_t ip_sport; /* source port */
- uint16_t ip_dport; /* destination port */
- in_addr_t ip_saddr; /* source address */
- in_addr_t ip_daddr; /* destination address */
-};
-
-union ipv4_flow {
- struct ipv4_tuple ipf_ipt;
- uint32_t ipf_key[3];
-};
+#ifdef FLOWTABLE_HASH_ALL
+#define KEY_PORTS (sizeof(uint16_t) * 2)
+#define KEY_ADDRS 2
+#else
+#define KEY_PORTS 0
+#define KEY_ADDRS 1
+#endif
-struct ipv6_tuple {
- uint16_t ip_sport; /* source port */
- uint16_t ip_dport; /* destination port */
- struct in6_addr ip_saddr; /* source address */
- struct in6_addr ip_daddr; /* destination address */
-};
+#ifdef INET6
+#define KEY_ADDR_LEN sizeof(struct in6_addr)
+#else
+#define KEY_ADDR_LEN sizeof(struct in_addr)
+#endif
-union ipv6_flow {
- struct ipv6_tuple ipf_ipt;
- uint32_t ipf_key[9];
-};
+#define KEYLEN ((KEY_ADDR_LEN * KEY_ADDRS + KEY_PORTS) / sizeof(uint32_t))
struct flentry {
- volatile uint32_t f_fhash; /* hash flowing forward */
- uint16_t f_flags; /* flow flags */
- uint8_t f_pad;
- uint8_t f_proto; /* protocol */
- uint32_t f_fibnum; /* fib index */
+ uint32_t f_hash; /* hash flowing forward */
+ uint32_t f_key[KEYLEN]; /* address(es and ports) */
uint32_t f_uptime; /* uptime at last access */
- struct flentry *f_next; /* pointer to collision entry */
- volatile struct rtentry *f_rt; /* rtentry for flow */
- volatile struct llentry *f_lle; /* llentry for flow */
-};
-
-struct flentry_v4 {
- struct flentry fl_entry;
- union ipv4_flow fl_flow;
-};
-
-struct flentry_v6 {
- struct flentry fl_entry;
- union ipv6_flow fl_flow;
-};
-
-#define fl_fhash fl_entry.fl_fhash
-#define fl_flags fl_entry.fl_flags
-#define fl_proto fl_entry.fl_proto
-#define fl_uptime fl_entry.fl_uptime
-#define fl_rt fl_entry.fl_rt
-#define fl_lle fl_entry.fl_lle
-
-#define SECS_PER_HOUR 3600
-#define SECS_PER_DAY (24*SECS_PER_HOUR)
-
-#define SYN_IDLE 300
-#define UDP_IDLE 300
-#define FIN_WAIT_IDLE 600
-#define TCP_IDLE SECS_PER_DAY
-
-
-typedef void fl_lock_t(struct flowtable *, uint32_t);
-typedef void fl_rtalloc_t(struct route *, uint32_t, u_int);
-
-union flentryp {
- struct flentry **global;
- struct flentry **pcpu[MAXCPU];
+ uint16_t f_fibnum; /* fib index */
+#ifdef FLOWTABLE_HASH_ALL
+ uint8_t f_proto; /* protocol */
+ uint8_t f_flags; /* stale? */
+#define FL_STALE 1
+#endif
+ SLIST_ENTRY(flentry) f_next; /* pointer to collision entry */
+ struct rtentry *f_rt; /* rtentry for flow */
+ struct llentry *f_lle; /* llentry for flow */
};
+#undef KEYLEN
-struct flowtable_stats {
- uint64_t ft_collisions;
- uint64_t ft_allocated;
- uint64_t ft_misses;
- uint64_t ft_max_depth;
- uint64_t ft_free_checks;
- uint64_t ft_frees;
- uint64_t ft_hits;
- uint64_t ft_lookups;
-} __aligned(CACHE_LINE_SIZE);
+SLIST_HEAD(flist, flentry);
+/* Make sure we can use pcpu_zone_ptr for struct flist. */
+CTASSERT(sizeof(struct flist) == sizeof(void *));
struct flowtable {
- struct flowtable_stats ft_stats[MAXCPU];
+ counter_u64_t *ft_stat;
int ft_size;
- int ft_lock_count;
- uint32_t ft_flags;
- char *ft_name;
- fl_lock_t *ft_lock;
- fl_lock_t *ft_unlock;
- fl_rtalloc_t *ft_rtalloc;
/*
- * XXX need to pad out
- */
- struct mtx *ft_locks;
- union flentryp ft_table;
- bitstr_t *ft_masks[MAXCPU];
+ * ft_table is a malloc(9)ed array of pointers. Pointers point to
+ * memory from UMA_ZONE_PCPU zone.
+ * ft_masks is per-cpu pointer itself. Each instance points
+ * to a malloc(9)ed bitset, that is private to corresponding CPU.
+ */
+ struct flist **ft_table;
+ bitstr_t **ft_masks;
bitstr_t *ft_tmpmask;
- struct flowtable *ft_next;
+};
- uint32_t ft_count __aligned(CACHE_LINE_SIZE);
- uint32_t ft_udp_idle __aligned(CACHE_LINE_SIZE);
- uint32_t ft_fin_wait_idle;
- uint32_t ft_syn_idle;
- uint32_t ft_tcp_idle;
- boolean_t ft_full;
-} __aligned(CACHE_LINE_SIZE);
+#define FLOWSTAT_ADD(ft, name, v) \
+ counter_u64_add((ft)->ft_stat[offsetof(struct flowtable_stat, name) / sizeof(uint64_t)], (v))
+#define FLOWSTAT_INC(ft, name) FLOWSTAT_ADD(ft, name, 1)
static struct proc *flowcleanerproc;
-static VNET_DEFINE(struct flowtable *, flow_list_head);
-static VNET_DEFINE(uint32_t, flow_hashjitter);
-static VNET_DEFINE(uma_zone_t, flow_ipv4_zone);
-static VNET_DEFINE(uma_zone_t, flow_ipv6_zone);
-
-#define V_flow_list_head VNET(flow_list_head)
-#define V_flow_hashjitter VNET(flow_hashjitter)
-#define V_flow_ipv4_zone VNET(flow_ipv4_zone)
-#define V_flow_ipv6_zone VNET(flow_ipv6_zone)
-
+static uint32_t flow_hashjitter;
static struct cv flowclean_f_cv;
static struct cv flowclean_c_cv;
static struct mtx flowclean_lock;
static uint32_t flowclean_cycles;
-static uint32_t flowclean_freq;
-
-#ifdef FLOWTABLE_DEBUG
-#define FLDPRINTF(ft, flags, fmt, ...) \
-do { \
- if ((ft)->ft_flags & (flags)) \
- printf((fmt), __VA_ARGS__); \
-} while (0); \
-
-#else
-#define FLDPRINTF(ft, flags, fmt, ...)
-
-#endif
-
/*
* TODO:
- * - Make flowtable stats per-cpu, aggregated at sysctl call time,
- * to avoid extra cache evictions caused by incrementing a shared
- * counter
- * - add sysctls to resize && flush flow tables
+ * - add sysctls to resize && flush flow tables
* - Add per flowtable sysctls for statistics and configuring timeouts
* - add saturation counter to rtentry to support per-packet load-balancing
* add flag to indicate round-robin flow, add list lookup from head
@@ -230,396 +155,117 @@ do { \
* - support explicit connection state (currently only ad-hoc for DSR)
* - idetach() cleanup for options VIMAGE builds.
*/
-VNET_DEFINE(int, flowtable_enable) = 1;
-static VNET_DEFINE(int, flowtable_debug);
-static VNET_DEFINE(int, flowtable_syn_expire) = SYN_IDLE;
-static VNET_DEFINE(int, flowtable_udp_expire) = UDP_IDLE;
-static VNET_DEFINE(int, flowtable_fin_wait_expire) = FIN_WAIT_IDLE;
-static VNET_DEFINE(int, flowtable_tcp_expire) = TCP_IDLE;
-static VNET_DEFINE(int, flowtable_nmbflows);
-static VNET_DEFINE(int, flowtable_ready) = 0;
-
-#define V_flowtable_enable VNET(flowtable_enable)
-#define V_flowtable_debug VNET(flowtable_debug)
-#define V_flowtable_syn_expire VNET(flowtable_syn_expire)
-#define V_flowtable_udp_expire VNET(flowtable_udp_expire)
-#define V_flowtable_fin_wait_expire VNET(flowtable_fin_wait_expire)
-#define V_flowtable_tcp_expire VNET(flowtable_tcp_expire)
-#define V_flowtable_nmbflows VNET(flowtable_nmbflows)
-#define V_flowtable_ready VNET(flowtable_ready)
-
-static SYSCTL_NODE(_net_inet, OID_AUTO, flowtable, CTLFLAG_RD, NULL,
- "flowtable");
-SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, debug, CTLFLAG_RW,
- &VNET_NAME(flowtable_debug), 0, "print debug info.");
-SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, enable, CTLFLAG_RW,
- &VNET_NAME(flowtable_enable), 0, "enable flowtable caching.");
-
-/*
- * XXX This does not end up updating timeouts at runtime
- * and only reflects the value for the last table added :-/
- */
-SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, syn_expire, CTLFLAG_RW,
- &VNET_NAME(flowtable_syn_expire), 0,
- "seconds after which to remove syn allocated flow.");
-SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, udp_expire, CTLFLAG_RW,
- &VNET_NAME(flowtable_udp_expire), 0,
- "seconds after which to remove flow allocated to UDP.");
-SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, fin_wait_expire, CTLFLAG_RW,
- &VNET_NAME(flowtable_fin_wait_expire), 0,
- "seconds after which to remove a flow in FIN_WAIT.");
-SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, tcp_expire, CTLFLAG_RW,
- &VNET_NAME(flowtable_tcp_expire), 0,
- "seconds after which to remove flow allocated to a TCP connection.");
-
-
-/*
- * Maximum number of flows that can be allocated of a given type.
- *
- * The table is allocated at boot time (for the pure caching case
- * there is no reason why this could not be changed at runtime)
- * and thus (currently) needs to be set with a tunable.
- */
-static int
-sysctl_nmbflows(SYSCTL_HANDLER_ARGS)
-{
- int error, newnmbflows;
-
- newnmbflows = V_flowtable_nmbflows;
- error = sysctl_handle_int(oidp, &newnmbflows, 0, req);
- if (error == 0 && req->newptr) {
- if (newnmbflows > V_flowtable_nmbflows) {
- V_flowtable_nmbflows = newnmbflows;
- uma_zone_set_max(V_flow_ipv4_zone,
- V_flowtable_nmbflows);
- uma_zone_set_max(V_flow_ipv6_zone,
- V_flowtable_nmbflows);
- } else
- error = EINVAL;
- }
- return (error);
-}
-SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, nmbflows,
- CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_nmbflows, "IU",
- "Maximum number of flows allowed");
-
-
-
-#define FS_PRINT(sb, field) sbuf_printf((sb), "\t%s: %jd\n", #field, fs->ft_##field)
-
-static void
-fs_print(struct sbuf *sb, struct flowtable_stats *fs)
-{
-
- FS_PRINT(sb, collisions);
- FS_PRINT(sb, allocated);
- FS_PRINT(sb, misses);
- FS_PRINT(sb, max_depth);
- FS_PRINT(sb, free_checks);
- FS_PRINT(sb, frees);
- FS_PRINT(sb, hits);
- FS_PRINT(sb, lookups);
-}
-
-static void
-flowtable_show_stats(struct sbuf *sb, struct flowtable *ft)
-{
- int i;
- struct flowtable_stats fs, *pfs;
-
- if (ft->ft_flags & FL_PCPU) {
- bzero(&fs, sizeof(fs));
- pfs = &fs;
- CPU_FOREACH(i) {
- pfs->ft_collisions += ft->ft_stats[i].ft_collisions;
- pfs->ft_allocated += ft->ft_stats[i].ft_allocated;
- pfs->ft_misses += ft->ft_stats[i].ft_misses;
- pfs->ft_free_checks += ft->ft_stats[i].ft_free_checks;
- pfs->ft_frees += ft->ft_stats[i].ft_frees;
- pfs->ft_hits += ft->ft_stats[i].ft_hits;
- pfs->ft_lookups += ft->ft_stats[i].ft_lookups;
- if (ft->ft_stats[i].ft_max_depth > pfs->ft_max_depth)
- pfs->ft_max_depth = ft->ft_stats[i].ft_max_depth;
- }
- } else {
- pfs = &ft->ft_stats[0];
- }
- fs_print(sb, pfs);
-}
-
-static int
-sysctl_flowtable_stats(SYSCTL_HANDLER_ARGS)
-{
- struct flowtable *ft;
- struct sbuf *sb;
- int error;
-
- sb = sbuf_new(NULL, NULL, 64*1024, SBUF_FIXEDLEN);
-
- ft = V_flow_list_head;
- while (ft != NULL) {
- sbuf_printf(sb, "\ntable name: %s\n", ft->ft_name);
- flowtable_show_stats(sb, ft);
- ft = ft->ft_next;
- }
- sbuf_finish(sb);
- error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
- sbuf_delete(sb);
-
- return (error);
-}
-SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, stats, CTLTYPE_STRING|CTLFLAG_RD,
- NULL, 0, sysctl_flowtable_stats, "A", "flowtable statistics");
-
-
-#ifndef RADIX_MPATH
-static void
-rtalloc_ign_wrapper(struct route *ro, uint32_t hash, u_int fibnum)
-{
-
- rtalloc_ign_fib(ro, 0, fibnum);
-}
+#ifdef INET
+static VNET_DEFINE(struct flowtable, ip4_ft);
+#define V_ip4_ft VNET(ip4_ft)
+#endif
+#ifdef INET6
+static VNET_DEFINE(struct flowtable, ip6_ft);
+#define V_ip6_ft VNET(ip6_ft)
#endif
-static void
-flowtable_global_lock(struct flowtable *table, uint32_t hash)
-{
- int lock_index = (hash)&(table->ft_lock_count - 1);
-
- mtx_lock(&table->ft_locks[lock_index]);
-}
-
-static void
-flowtable_global_unlock(struct flowtable *table, uint32_t hash)
-{
- int lock_index = (hash)&(table->ft_lock_count - 1);
-
- mtx_unlock(&table->ft_locks[lock_index]);
-}
-
-static void
-flowtable_pcpu_lock(struct flowtable *table, uint32_t hash)
-{
-
- critical_enter();
-}
-
-static void
-flowtable_pcpu_unlock(struct flowtable *table, uint32_t hash)
-{
-
- critical_exit();
-}
-
-#define FL_ENTRY_INDEX(table, hash)((hash) % (table)->ft_size)
-#define FL_ENTRY(table, hash) *flowtable_entry((table), (hash))
-#define FL_ENTRY_LOCK(table, hash) (table)->ft_lock((table), (hash))
-#define FL_ENTRY_UNLOCK(table, hash) (table)->ft_unlock((table), (hash))
-
-#define FL_STALE (1<<8)
-#define FL_OVERWRITE (1<<10)
-
-void
-flow_invalidate(struct flentry *fle)
-{
-
- fle->f_flags |= FL_STALE;
-}
-
-static __inline int
-proto_to_flags(uint8_t proto)
-{
- int flag;
+static uma_zone_t flow_zone;
- switch (proto) {
- case IPPROTO_TCP:
- flag = FL_TCP;
- break;
- case IPPROTO_SCTP:
- flag = FL_SCTP;
- break;
- case IPPROTO_UDP:
- flag = FL_UDP;
- break;
- default:
- flag = 0;
- break;
- }
+static VNET_DEFINE(int, flowtable_enable) = 1;
+#define V_flowtable_enable VNET(flowtable_enable)
- return (flag);
-}
+static SYSCTL_NODE(_net, OID_AUTO, flowtable, CTLFLAG_RD, NULL,
+ "flowtable");
+SYSCTL_VNET_INT(_net_flowtable, OID_AUTO, enable, CTLFLAG_RW,
+ &VNET_NAME(flowtable_enable), 0, "enable flowtable caching.");
+SYSCTL_UMA_MAX(_net_flowtable, OID_AUTO, maxflows, CTLFLAG_RW,
+ &flow_zone, "Maximum number of flows allowed");
-static __inline int
-flags_to_proto(int flags)
-{
- int proto, protoflags;
+static MALLOC_DEFINE(M_FTABLE, "flowtable", "flowtable hashes and bitstrings");
- protoflags = flags & (FL_TCP|FL_SCTP|FL_UDP);
- switch (protoflags) {
- case FL_TCP:
- proto = IPPROTO_TCP;
- break;
- case FL_SCTP:
- proto = IPPROTO_SCTP;
- break;
- case FL_UDP:
- proto = IPPROTO_UDP;
- break;
- default:
- proto = 0;
- break;
- }
- return (proto);
-}
+static struct flentry *
+flowtable_lookup_common(struct flowtable *, uint32_t *, int, uint32_t);
#ifdef INET
-#ifdef FLOWTABLE_DEBUG
-static void
-ipv4_flow_print_tuple(int flags, int proto, struct sockaddr_in *ssin,
- struct sockaddr_in *dsin)
-{
- char saddr[4*sizeof "123"], daddr[4*sizeof "123"];
-
- if (flags & FL_HASH_ALL) {
- inet_ntoa_r(ssin->sin_addr, saddr);
- inet_ntoa_r(dsin->sin_addr, daddr);
- printf("proto=%d %s:%d->%s:%d\n",
- proto, saddr, ntohs(ssin->sin_port), daddr,
- ntohs(dsin->sin_port));
- } else {
- inet_ntoa_r(*(struct in_addr *) &dsin->sin_addr, daddr);
- printf("proto=%d %s\n", proto, daddr);
- }
-
-}
-#endif
-
-static int
-ipv4_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
- struct sockaddr_in *ssin, struct sockaddr_in *dsin, uint16_t *flags)
+static struct flentry *
+flowtable_lookup_ipv4(struct mbuf *m, struct route *ro)
{
+ struct flentry *fle;
+ struct sockaddr_in *sin;
struct ip *ip;
- uint8_t proto;
+ uint32_t fibnum;
+#ifdef FLOWTABLE_HASH_ALL
+ uint32_t key[3];
int iphlen;
- struct tcphdr *th;
- struct udphdr *uh;
- struct sctphdr *sh;
uint16_t sport, dport;
+ uint8_t proto;
+#endif
- proto = sport = dport = 0;
ip = mtod(m, struct ip *);
- dsin->sin_family = AF_INET;
- dsin->sin_len = sizeof(*dsin);
- dsin->sin_addr = ip->ip_dst;
- ssin->sin_family = AF_INET;
- ssin->sin_len = sizeof(*ssin);
- ssin->sin_addr = ip->ip_src;
- proto = ip->ip_p;
- if ((*flags & FL_HASH_ALL) == 0) {
- FLDPRINTF(ft, FL_DEBUG_ALL, "skip port check flags=0x%x ",
- *flags);
- goto skipports;
- }
+ if (ip->ip_src.s_addr == ip->ip_dst.s_addr ||
+ (ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
+ (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
+ return (NULL);
- iphlen = ip->ip_hl << 2; /* XXX options? */
+ fibnum = M_GETFIB(m);
+
+#ifdef FLOWTABLE_HASH_ALL
+ iphlen = ip->ip_hl << 2;
+ proto = ip->ip_p;
switch (proto) {
- case IPPROTO_TCP:
- th = (struct tcphdr *)((caddr_t)ip + iphlen);
+ case IPPROTO_TCP: {
+ struct tcphdr *th;
+
+ th = (struct tcphdr *)((char *)ip + iphlen);
sport = th->th_sport;
dport = th->th_dport;
- if ((*flags & FL_HASH_ALL) &&
- (th->th_flags & (TH_RST|TH_FIN)))
- *flags |= FL_STALE;
- break;
- case IPPROTO_UDP:
- uh = (struct udphdr *)((caddr_t)ip + iphlen);
+ if (th->th_flags & (TH_RST|TH_FIN))
+ fibnum |= (FL_STALE << 24);
+ break;
+ }
+ case IPPROTO_UDP: {
+ struct udphdr *uh;
+
+ uh = (struct udphdr *)((char *)ip + iphlen);
sport = uh->uh_sport;
dport = uh->uh_dport;
- break;
- case IPPROTO_SCTP:
- sh = (struct sctphdr *)((caddr_t)ip + iphlen);
+ break;
+ }
+ case IPPROTO_SCTP: {
+ struct sctphdr *sh;
+
+ sh = (struct sctphdr *)((char *)ip + iphlen);
sport = sh->src_port;
dport = sh->dest_port;
- break;
+ /* XXXGL: handle stale? */
+ break;
+ }
default:
- FLDPRINTF(ft, FL_DEBUG_ALL, "proto=0x%x not supported\n", proto);
- return (ENOTSUP);
- /* no port - hence not a protocol we care about */
+ sport = dport = 0;
break;
-
}
-skipports:
- *flags |= proto_to_flags(proto);
- ssin->sin_port = sport;
- dsin->sin_port = dport;
- return (0);
-}
+ key[0] = ip->ip_dst.s_addr;
+ key[1] = ip->ip_src.s_addr;
+ key[2] = ((uint32_t)dport << 16) | sport; /* widen first: no signed-shift overflow */
+ fibnum |= proto << 16;
-static uint32_t
-ipv4_flow_lookup_hash_internal(
- struct sockaddr_in *ssin, struct sockaddr_in *dsin,
- uint32_t *key, uint16_t flags)
-{
- uint16_t sport, dport;
- uint8_t proto;
- int offset = 0;
-
- if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
- return (0);
- proto = flags_to_proto(flags);
- sport = dport = key[2] = key[1] = key[0] = 0;
- if ((ssin != NULL) && (flags & FL_HASH_ALL)) {
- key[1] = ssin->sin_addr.s_addr;
- sport = ssin->sin_port;
- }
- if (dsin != NULL) {
- key[2] = dsin->sin_addr.s_addr;
- dport = dsin->sin_port;
- }
- if (flags & FL_HASH_ALL) {
- ((uint16_t *)key)[0] = sport;
- ((uint16_t *)key)[1] = dport;
- } else
- offset = V_flow_hashjitter + proto;
+ fle = flowtable_lookup_common(&V_ip4_ft, key, 3 * sizeof(uint32_t),
+ fibnum);
- return (jenkins_hash32(key, 3, offset));
-}
+#else /* !FLOWTABLE_HASH_ALL */
-static struct flentry *
-flowtable_lookup_mbuf4(struct flowtable *ft, struct mbuf *m)
-{
- struct sockaddr_storage ssa, dsa;
- uint16_t flags;
- struct sockaddr_in *dsin, *ssin;
-
- dsin = (struct sockaddr_in *)&dsa;
- ssin = (struct sockaddr_in *)&ssa;
- bzero(dsin, sizeof(*dsin));
- bzero(ssin, sizeof(*ssin));
- flags = ft->ft_flags;
- if (ipv4_mbuf_demarshal(ft, m, ssin, dsin, &flags) != 0)
- return (NULL);
+ fle = flowtable_lookup_common(&V_ip4_ft, (uint32_t *)&ip->ip_dst,
+ sizeof(struct in_addr), fibnum);
- return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags));
-}
+#endif /* FLOWTABLE_HASH_ALL */
-void
-flow_to_route(struct flentry *fle, struct route *ro)
-{
- uint32_t *hashkey = NULL;
- struct sockaddr_in *sin;
+ if (fle == NULL)
+ return (NULL);
sin = (struct sockaddr_in *)&ro->ro_dst;
sin->sin_family = AF_INET;
sin->sin_len = sizeof(*sin);
- hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
- sin->sin_addr.s_addr = hashkey[2];
- ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
- ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle);
- ro->ro_flags |= RT_NORTREF;
+ sin->sin_addr = ip->ip_dst;
+
+ return (fle);
}
#endif /* INET */
@@ -633,9 +279,8 @@ flow_to_route(struct flentry *fle, struct route *ro)
#define PULLUP_TO(_len, p, T) \
do { \
int x = (_len) + sizeof(T); \
- if ((m)->m_len < x) { \
- goto receive_failed; \
- } \
+ if ((m)->m_len < x) \
+ return (NULL); \
p = (mtod(m, char *) + (_len)); \
} while (0)
@@ -643,26 +288,35 @@ do { \
#define SCTP(p) ((struct sctphdr *)(p))
#define UDP(p) ((struct udphdr *)(p))
-static int
-ipv6_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
- struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6, uint16_t *flags)
+static struct flentry *
+flowtable_lookup_ipv6(struct mbuf *m, struct route *ro)
{
+ struct flentry *fle;
+ struct sockaddr_in6 *sin6;
struct ip6_hdr *ip6;
- uint8_t proto;
+ uint32_t fibnum;
+#ifdef FLOWTABLE_HASH_ALL
+ uint32_t key[9];
+ void *ulp;
int hlen;
- uint16_t src_port, dst_port;
+ uint16_t sport, dport;
u_short offset;
- void *ulp;
+ uint8_t proto;
+#else
+ uint32_t key[4];
+#endif
- offset = hlen = src_port = dst_port = 0;
- ulp = NULL;
ip6 = mtod(m, struct ip6_hdr *);
- hlen = sizeof(struct ip6_hdr);
- proto = ip6->ip6_nxt;
+ if (in6_localaddr(&ip6->ip6_dst))
+ return (NULL);
- if ((*flags & FL_HASH_ALL) == 0)
- goto skipports;
+ fibnum = M_GETFIB(m);
+#ifdef FLOWTABLE_HASH_ALL
+ hlen = sizeof(struct ip6_hdr);
+ proto = ip6->ip6_nxt;
+ offset = sport = dport = 0;
+ ulp = NULL;
while (ulp == NULL) {
switch (proto) {
case IPPROTO_ICMPV6:
@@ -675,21 +329,21 @@ ipv6_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
break;
case IPPROTO_TCP:
PULLUP_TO(hlen, ulp, struct tcphdr);
- dst_port = TCP(ulp)->th_dport;
- src_port = TCP(ulp)->th_sport;
- if ((*flags & FL_HASH_ALL) &&
- (TCP(ulp)->th_flags & (TH_RST|TH_FIN)))
- *flags |= FL_STALE;
+ dport = TCP(ulp)->th_dport;
+ sport = TCP(ulp)->th_sport;
+ if (TCP(ulp)->th_flags & (TH_RST|TH_FIN))
+ fibnum |= (FL_STALE << 24);
break;
case IPPROTO_SCTP:
PULLUP_TO(hlen, ulp, struct sctphdr);
- src_port = SCTP(ulp)->src_port;
- dst_port = SCTP(ulp)->dest_port;
+ sport = SCTP(ulp)->src_port;
+ dport = SCTP(ulp)->dest_port;
+ /* XXXGL: handle stale? */
break;
case IPPROTO_UDP:
PULLUP_TO(hlen, ulp, struct udphdr);
- dst_port = UDP(ulp)->uh_dport;
- src_port = UDP(ulp)->uh_sport;
+ dport = UDP(ulp)->uh_dport;
+ sport = UDP(ulp)->uh_sport;
break;
case IPPROTO_HOPOPTS: /* RFC 2460 */
PULLUP_TO(hlen, ulp, struct ip6_hbh);
@@ -698,7 +352,7 @@ ipv6_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
ulp = NULL;
break;
case IPPROTO_ROUTING: /* RFC 2460 */
- PULLUP_TO(hlen, ulp, struct ip6_rthdr);
+ PULLUP_TO(hlen, ulp, struct ip6_rthdr);
hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3;
proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt;
ulp = NULL;
@@ -729,689 +383,395 @@ ipv6_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
}
}
- if (src_port == 0) {
- receive_failed:
- return (ENOTSUP);
- }
-
-skipports:
- dsin6->sin6_family = AF_INET6;
- dsin6->sin6_len = sizeof(*dsin6);
- dsin6->sin6_port = dst_port;
- memcpy(&dsin6->sin6_addr, &ip6->ip6_dst, sizeof(struct in6_addr));
+ bcopy(&ip6->ip6_dst, &key[0], sizeof(struct in6_addr));
+ bcopy(&ip6->ip6_src, &key[4], sizeof(struct in6_addr));
+ key[8] = ((uint32_t)dport << 16) | sport; /* widen first: no signed-shift overflow */
+ fibnum |= proto << 16;
- ssin6->sin6_family = AF_INET6;
- ssin6->sin6_len = sizeof(*ssin6);
- ssin6->sin6_port = src_port;
- memcpy(&ssin6->sin6_addr, &ip6->ip6_src, sizeof(struct in6_addr));
- *flags |= proto_to_flags(proto);
+ fle = flowtable_lookup_common(&V_ip6_ft, key, 9 * sizeof(uint32_t),
+ fibnum);
+#else /* !FLOWTABLE_HASH_ALL */
+ bcopy(&ip6->ip6_dst, &key[0], sizeof(struct in6_addr));
+ fle = flowtable_lookup_common(&V_ip6_ft, key, sizeof(struct in6_addr),
+ fibnum);
+#endif /* FLOWTABLE_HASH_ALL */
- return (0);
-}
-
-#define zero_key(key) \
-do { \
- key[0] = 0; \
- key[1] = 0; \
- key[2] = 0; \
- key[3] = 0; \
- key[4] = 0; \
- key[5] = 0; \
- key[6] = 0; \
- key[7] = 0; \
- key[8] = 0; \
-} while (0)
-
-static uint32_t
-ipv6_flow_lookup_hash_internal(
- struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6,
- uint32_t *key, uint16_t flags)
-{
- uint16_t sport, dport;
- uint8_t proto;
- int offset = 0;
-
- if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
- return (0);
-
- proto = flags_to_proto(flags);
- zero_key(key);
- sport = dport = 0;
- if (dsin6 != NULL) {
- memcpy(&key[1], &dsin6->sin6_addr, sizeof(struct in6_addr));
- dport = dsin6->sin6_port;
- }
- if ((ssin6 != NULL) && (flags & FL_HASH_ALL)) {
- memcpy(&key[5], &ssin6->sin6_addr, sizeof(struct in6_addr));
- sport = ssin6->sin6_port;
- }
- if (flags & FL_HASH_ALL) {
- ((uint16_t *)key)[0] = sport;
- ((uint16_t *)key)[1] = dport;
- } else
- offset = V_flow_hashjitter + proto;
-
- return (jenkins_hash32(key, 9, offset));
-}
-
-static struct flentry *
-flowtable_lookup_mbuf6(struct flowtable *ft, struct mbuf *m)
-{
- struct sockaddr_storage ssa, dsa;
- struct sockaddr_in6 *dsin6, *ssin6;
- uint16_t flags;
-
- dsin6 = (struct sockaddr_in6 *)&dsa;
- ssin6 = (struct sockaddr_in6 *)&ssa;
- bzero(dsin6, sizeof(*dsin6));
- bzero(ssin6, sizeof(*ssin6));
- flags = ft->ft_flags;
-
- if (ipv6_mbuf_demarshal(ft, m, ssin6, dsin6, &flags) != 0)
+ if (fle == NULL)
return (NULL);
- return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags));
-}
-
-void
-flow_to_route_in6(struct flentry *fle, struct route_in6 *ro)
-{
- uint32_t *hashkey = NULL;
- struct sockaddr_in6 *sin6;
-
sin6 = (struct sockaddr_in6 *)&ro->ro_dst;
-
sin6->sin6_family = AF_INET6;
sin6->sin6_len = sizeof(*sin6);
- hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
- memcpy(&sin6->sin6_addr, &hashkey[5], sizeof (struct in6_addr));
- ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
- ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle);
- ro->ro_flags |= RT_NORTREF;
+ bcopy(&ip6->ip6_dst, &sin6->sin6_addr, sizeof(struct in6_addr));
+
+ return (fle);
}
#endif /* INET6 */
static bitstr_t *
flowtable_mask(struct flowtable *ft)
{
- bitstr_t *mask;
- if (ft->ft_flags & FL_PCPU)
- mask = ft->ft_masks[curcpu];
- else
- mask = ft->ft_masks[0];
+ /*
+ * flowtable_free_stale() calls this without a critical section,
+ * but bound to a CPU via sched_bind(). Since the pointer is stable
+ * throughout the ft lifetime, that is safe; otherwise we would need:
+ *
+ * CRITICAL_ASSERT(curthread);
+ */
- return (mask);
+ return (*(bitstr_t **)zpcpu_get(ft->ft_masks));
}
-static struct flentry **
-flowtable_entry(struct flowtable *ft, uint32_t hash)
+static struct flist *
+flowtable_list(struct flowtable *ft, uint32_t hash)
{
- struct flentry **fle;
- int index = (hash % ft->ft_size);
- if (ft->ft_flags & FL_PCPU) {
- KASSERT(&ft->ft_table.pcpu[curcpu][0] != NULL, ("pcpu not set"));
- fle = &ft->ft_table.pcpu[curcpu][index];
- } else {
- KASSERT(&ft->ft_table.global[0] != NULL, ("global not set"));
- fle = &ft->ft_table.global[index];
- }
-
- return (fle);
+ CRITICAL_ASSERT(curthread);
+ return (zpcpu_get(ft->ft_table[hash % ft->ft_size]));
}
static int
-flow_stale(struct flowtable *ft, struct flentry *fle)
+flow_stale(struct flowtable *ft, struct flentry *fle, int maxidle)
{
- time_t idle_time;
-
- if ((fle->f_fhash == 0)
- || ((fle->f_rt->rt_flags & RTF_HOST) &&
- ((fle->f_rt->rt_flags & (RTF_UP))
- != (RTF_UP)))
- || (fle->f_rt->rt_ifp == NULL)
- || !RT_LINK_IS_UP(fle->f_rt->rt_ifp))
- return (1);
- idle_time = time_uptime - fle->f_uptime;
-
- if ((fle->f_flags & FL_STALE) ||
- ((fle->f_flags & (TH_SYN|TH_ACK|TH_FIN)) == 0
- && (idle_time > ft->ft_udp_idle)) ||
- ((fle->f_flags & TH_FIN)
- && (idle_time > ft->ft_fin_wait_idle)) ||
- ((fle->f_flags & (TH_SYN|TH_ACK)) == TH_SYN
- && (idle_time > ft->ft_syn_idle)) ||
- ((fle->f_flags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)
- && (idle_time > ft->ft_tcp_idle)) ||
- ((fle->f_rt->rt_flags & RTF_UP) == 0 ||
- (fle->f_rt->rt_ifp == NULL)))
+ if (((fle->f_rt->rt_flags & RTF_HOST) &&
+ ((fle->f_rt->rt_flags & (RTF_UP)) != (RTF_UP))) ||
+ (fle->f_rt->rt_ifp == NULL) ||
+ !RT_LINK_IS_UP(fle->f_rt->rt_ifp) ||
+ (fle->f_lle->la_flags & LLE_VALID) == 0)
return (1);
- return (0);
-}
+ if (time_uptime - fle->f_uptime > maxidle)
+ return (1);
-static void
-flowtable_set_hashkey(struct flentry *fle, uint32_t *key)
-{
- uint32_t *hashkey;
- int i, nwords;
+#ifdef FLOWTABLE_HASH_ALL
+ if (fle->f_flags & FL_STALE)
+ return (1);
+#endif
- if (fle->f_flags & FL_IPV6) {
- nwords = 9;
- hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
- } else {
- nwords = 3;
- hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
- }
-
- for (i = 0; i < nwords; i++)
- hashkey[i] = key[i];
+ return (0);
}
-static struct flentry *
-flow_alloc(struct flowtable *ft)
+static int
+flow_full(void)
{
- struct flentry *newfle;
- uma_zone_t zone;
+ int count, max;
- newfle = NULL;
- zone = (ft->ft_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
+ count = uma_zone_get_cur(flow_zone);
+ max = uma_zone_get_max(flow_zone);
- newfle = uma_zalloc(zone, M_NOWAIT | M_ZERO);
- if (newfle != NULL)
- atomic_add_int(&ft->ft_count, 1);
- return (newfle);
+ return (count > (max - (max >> 3)));
}
-static void
-flow_free(struct flentry *fle, struct flowtable *ft)
+/*
+ * Decide whether the cached flow 'fle' may be used for the given key.
+ *
+ * Returns 1 when the first 'keylen' bytes of the entry's key equal 'key',
+ * the fib number matches, the cached route is RTF_UP with an interface
+ * attached and the cached link-layer entry is LLE_VALID; returns 0
+ * otherwise.  When FLOWTABLE_HASH_ALL is defined, 'fibnum' carries the
+ * protocol in bits 16-23 and it must match the entry's protocol as well.
+ * Must be called inside a critical section.
+ */
+static int
+flow_matches(struct flentry *fle, uint32_t *key, int keylen, uint32_t fibnum)
{
- uma_zone_t zone;
+#ifdef FLOWTABLE_HASH_ALL
+ uint8_t proto;
- zone = (ft->ft_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
- atomic_add_int(&ft->ft_count, -1);
- uma_zfree(zone, fle);
-}
+ proto = (fibnum >> 16) & 0xff;
+ fibnum &= 0xffff;
+#endif
-static int
-flow_full(struct flowtable *ft)
-{
- boolean_t full;
- uint32_t count;
-
- full = ft->ft_full;
- count = ft->ft_count;
-
- if (full && (count < (V_flowtable_nmbflows - (V_flowtable_nmbflows >> 3))))
- ft->ft_full = FALSE;
- else if (!full && (count > (V_flowtable_nmbflows - (V_flowtable_nmbflows >> 5))))
- ft->ft_full = TRUE;
-
- if (full && !ft->ft_full) {
- flowclean_freq = 4*hz;
- if ((ft->ft_flags & FL_HASH_ALL) == 0)
- ft->ft_udp_idle = ft->ft_fin_wait_idle =
- ft->ft_syn_idle = ft->ft_tcp_idle = 5;
- cv_broadcast(&flowclean_c_cv);
- } else if (!full && ft->ft_full) {
- flowclean_freq = 20*hz;
- if ((ft->ft_flags & FL_HASH_ALL) == 0)
- ft->ft_udp_idle = ft->ft_fin_wait_idle =
- ft->ft_syn_idle = ft->ft_tcp_idle = 30;
- }
+ CRITICAL_ASSERT(curthread);
- return (ft->ft_full);
+ /*
+ * Microoptimization for IPv4: a single-word key that compares equal
+ * settles the match without bcmp().  The comparison must be '==';
+ * with '!=' a differing key would short-circuit the '||' to true and
+ * every IPv4 entry would match every lookup.
+ */
+ if (((keylen == sizeof(uint32_t) && (fle->f_key[0] == key[0])) ||
+ (bcmp(fle->f_key, key, keylen) == 0)) &&
+ fibnum == fle->f_fibnum &&
+#ifdef FLOWTABLE_HASH_ALL
+ proto == fle->f_proto &&
+#endif
+ (fle->f_rt->rt_flags & RTF_UP) &&
+ fle->f_rt->rt_ifp != NULL &&
+ (fle->f_lle->la_flags & LLE_VALID))
+ return (1);
+
+ return (0);
}
-static int
+/*
+ * Create a flow entry for 'key' and link it into the current CPU's list.
+ *
+ * The destination sockaddr is rebuilt from the key, the route is looked up
+ * in fib 'fibnum0 & 0xffff', and a link-layer entry is obtained for the
+ * gateway or the destination.  No entry is created (NULL is returned) when
+ * the route lookup fails, the interface is point-to-point or loopback, no
+ * link-layer entry is available, that entry is not yet LLE_VALID, or
+ * flow_zone has no memory.  If this CPU's list already holds a matching
+ * entry, the freshly built one is discarded together with its route and
+ * link-layer references, and the existing entry is returned.
+ *
+ * 'fibnum0' is the caller's packed value: under FLOWTABLE_HASH_ALL bits
+ * 16-23 hold the protocol and bits 24 and up hold flow flags.
+ */
+static struct flentry *
flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key,
- uint32_t fibnum, struct route *ro, uint16_t flags)
+ int keylen, uint32_t fibnum0)
{
- struct flentry *fle, *fletail, *newfle, **flep;
- struct flowtable_stats *fs = &ft->ft_stats[curcpu];
- int depth;
+#ifdef INET6
+ struct route_in6 sro6;
+#endif
+#ifdef INET
+ struct route sro;
+#endif
+ struct route *ro = NULL;
+ struct rtentry *rt;
+ struct lltable *lt = NULL;
+ struct llentry *lle;
+ struct sockaddr_storage *l3addr;
+ struct ifnet *ifp;
+ struct flist *flist;
+ struct flentry *fle, *iter;
bitstr_t *mask;
+ uint16_t fibnum = fibnum0;
+#ifdef FLOWTABLE_HASH_ALL
uint8_t proto;
- newfle = flow_alloc(ft);
- if (newfle == NULL)
- return (ENOMEM);
-
- newfle->f_flags |= (flags & FL_IPV6);
- proto = flags_to_proto(flags);
-
- FL_ENTRY_LOCK(ft, hash);
- mask = flowtable_mask(ft);
- flep = flowtable_entry(ft, hash);
- fletail = fle = *flep;
+ proto = (fibnum0 >> 16) & 0xff;
+ fibnum = fibnum0 & 0xffff;
+#endif
- if (fle == NULL) {
- bit_set(mask, FL_ENTRY_INDEX(ft, hash));
- *flep = fle = newfle;
- goto skip;
- }
-
- depth = 0;
- fs->ft_collisions++;
/*
- * find end of list and make sure that we were not
- * preempted by another thread handling this flow
+ * This bit of code ends up locking the
+ * same route 3 times (just like ip_output + ether_output)
+ * - at lookup
+ * - in rt_check when called by arpresolve
+ * - dropping the refcount for the rtentry
+ *
+ * This could be consolidated to one if we wrote a variant
+ * of arpresolve with an rt_check variant that expected to
+ * receive the route locked
*/
- while (fle != NULL) {
- if (fle->f_fhash == hash && !flow_stale(ft, fle)) {
- /*
- * there was either a hash collision
- * or we lost a race to insert
- */
- FL_ENTRY_UNLOCK(ft, hash);
- flow_free(newfle, ft);
-
- if (flags & FL_OVERWRITE)
- goto skip;
- return (EEXIST);
- }
- /*
- * re-visit this double condition XXX
- */
- if (fletail->f_next != NULL)
- fletail = fle->f_next;
-
- depth++;
- fle = fle->f_next;
- }
-
- if (depth > fs->ft_max_depth)
- fs->ft_max_depth = depth;
- fletail->f_next = newfle;
- fle = newfle;
-skip:
- flowtable_set_hashkey(fle, key);
-
- fle->f_proto = proto;
- fle->f_rt = ro->ro_rt;
- fle->f_lle = ro->ro_lle;
- fle->f_fhash = hash;
- fle->f_fibnum = fibnum;
- fle->f_uptime = time_uptime;
- FL_ENTRY_UNLOCK(ft, hash);
- return (0);
-}
-
-int
-kern_flowtable_insert(struct flowtable *ft,
- struct sockaddr_storage *ssa, struct sockaddr_storage *dsa,
- struct route *ro, uint32_t fibnum, int flags)
-{
- uint32_t key[9], hash;
+#ifdef INET
+ if (ft == &V_ip4_ft) {
+ struct sockaddr_in *sin;
- flags = (ft->ft_flags | flags | FL_OVERWRITE);
- hash = 0;
+ ro = &sro;
+ bzero(&sro.ro_dst, sizeof(sro.ro_dst));
-#ifdef INET
- if (ssa->ss_family == AF_INET)
- hash = ipv4_flow_lookup_hash_internal((struct sockaddr_in *)ssa,
- (struct sockaddr_in *)dsa, key, flags);
+ sin = (struct sockaddr_in *)&sro.ro_dst;
+ sin->sin_family = AF_INET;
+ sin->sin_len = sizeof(*sin);
+ sin->sin_addr.s_addr = key[0];
+ }
#endif
#ifdef INET6
- if (ssa->ss_family == AF_INET6)
- hash = ipv6_flow_lookup_hash_internal((struct sockaddr_in6 *)ssa,
- (struct sockaddr_in6 *)dsa, key, flags);
-#endif
- if (ro->ro_rt == NULL || ro->ro_lle == NULL)
- return (EINVAL);
-
- FLDPRINTF(ft, FL_DEBUG,
- "kern_flowtable_insert: key=%x:%x:%x hash=%x fibnum=%d flags=%x\n",
- key[0], key[1], key[2], hash, fibnum, flags);
- return (flowtable_insert(ft, hash, key, fibnum, ro, flags));
-}
+ if (ft == &V_ip6_ft) {
+ struct sockaddr_in6 *sin6;
-static int
-flowtable_key_equal(struct flentry *fle, uint32_t *key)
-{
- uint32_t *hashkey;
- int i, nwords;
+ ro = (struct route *)&sro6;
+ sin6 = &sro6.ro_dst;
- if (fle->f_flags & FL_IPV6) {
- nwords = 9;
- hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
- } else {
- nwords = 3;
- hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
+ bzero(sin6, sizeof(*sin6));
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_len = sizeof(*sin6);
+ bcopy(key, &sin6->sin6_addr, sizeof(struct in6_addr));
}
+#endif
- for (i = 0; i < nwords; i++)
- if (hashkey[i] != key[i])
- return (0);
+ ro->ro_rt = NULL;
+#ifdef RADIX_MPATH
+ rtalloc_mpath_fib(ro, hash, fibnum);
+#else
+ rtalloc_ign_fib(ro, 0, fibnum);
+#endif
+ if (ro->ro_rt == NULL)
+ return (NULL);
- return (1);
-}
+ rt = ro->ro_rt;
+ ifp = rt->rt_ifp;
-struct flentry *
-flowtable_lookup_mbuf(struct flowtable *ft, struct mbuf *m, int af)
-{
- struct flentry *fle = NULL;
+ if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) {
+ RTFREE(rt);
+ return (NULL);
+ }
#ifdef INET
- if (af == AF_INET)
- fle = flowtable_lookup_mbuf4(ft, m);
+ if (ft == &V_ip4_ft)
+ lt = LLTABLE(ifp);
#endif
#ifdef INET6
- if (af == AF_INET6)
- fle = flowtable_lookup_mbuf6(ft, m);
-#endif
- if (fle != NULL && m != NULL && (m->m_flags & M_FLOWID) == 0) {
- m->m_flags |= M_FLOWID;
- m->m_pkthdr.flowid = fle->f_fhash;
- }
- return (fle);
-}
-
-struct flentry *
-flowtable_lookup(struct flowtable *ft, struct sockaddr_storage *ssa,
- struct sockaddr_storage *dsa, uint32_t fibnum, int flags)
-{
- uint32_t key[9], hash;
- struct flentry *fle;
- struct flowtable_stats *fs = &ft->ft_stats[curcpu];
- uint8_t proto = 0;
- int error = 0;
- struct rtentry *rt;
- struct llentry *lle;
- struct route sro, *ro;
- struct route_in6 sro6;
+ if (ft == &V_ip6_ft)
+ lt = LLTABLE6(ifp);
+#endif
- sro.ro_rt = sro6.ro_rt = NULL;
- sro.ro_lle = sro6.ro_lle = NULL;
- ro = NULL;
- hash = 0;
- flags |= ft->ft_flags;
- proto = flags_to_proto(flags);
-#ifdef INET
- if (ssa->ss_family == AF_INET) {
- struct sockaddr_in *ssin, *dsin;
+ if (rt->rt_flags & RTF_GATEWAY)
+ l3addr = (struct sockaddr_storage *)rt->rt_gateway;
+ else
+ l3addr = (struct sockaddr_storage *)&ro->ro_dst;
+ lle = llentry_alloc(ifp, lt, l3addr);
- ro = &sro;
- memcpy(&ro->ro_dst, dsa, sizeof(struct sockaddr_in));
- /*
- * The harvested source and destination addresses
- * may contain port information if the packet is
- * from a transport protocol (e.g. TCP/UDP). The
- * port field must be cleared before performing
- * a route lookup.
- */
- ((struct sockaddr_in *)&ro->ro_dst)->sin_port = 0;
- dsin = (struct sockaddr_in *)dsa;
- ssin = (struct sockaddr_in *)ssa;
- if ((dsin->sin_addr.s_addr == ssin->sin_addr.s_addr) ||
- (ntohl(dsin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
- (ntohl(ssin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
- return (NULL);
-
- hash = ipv4_flow_lookup_hash_internal(ssin, dsin, key, flags);
+ if (lle == NULL) {
+ RTFREE(rt);
+ return (NULL);
}
-#endif
-#ifdef INET6
- if (ssa->ss_family == AF_INET6) {
- struct sockaddr_in6 *ssin6, *dsin6;
- ro = (struct route *)&sro6;
- memcpy(&sro6.ro_dst, dsa,
- sizeof(struct sockaddr_in6));
- ((struct sockaddr_in6 *)&ro->ro_dst)->sin6_port = 0;
- dsin6 = (struct sockaddr_in6 *)dsa;
- ssin6 = (struct sockaddr_in6 *)ssa;
-
- flags |= FL_IPV6;
- hash = ipv6_flow_lookup_hash_internal(ssin6, dsin6, key, flags);
- }
-#endif
- /*
- * Ports are zero and this isn't a transmit cache
- * - thus not a protocol for which we need to keep
- * state
- * FL_HASH_ALL => key[0] != 0 for TCP || UDP || SCTP
- */
- if (hash == 0 || (key[0] == 0 && (ft->ft_flags & FL_HASH_ALL)))
+ /* Don't insert the entry if the ARP hasn't yet finished resolving. */
+ if ((lle->la_flags & LLE_VALID) == 0) {
+ RTFREE(rt);
+ LLE_FREE(lle);
+ FLOWSTAT_INC(ft, ft_fail_lle_invalid);
return (NULL);
+ }
- fs->ft_lookups++;
- FL_ENTRY_LOCK(ft, hash);
- if ((fle = FL_ENTRY(ft, hash)) == NULL) {
- FL_ENTRY_UNLOCK(ft, hash);
- goto uncached;
+ fle = uma_zalloc(flow_zone, M_NOWAIT | M_ZERO);
+ if (fle == NULL) {
+ RTFREE(rt);
+ LLE_FREE(lle);
+ return (NULL);
}
-keycheck:
- rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
- lle = __DEVOLATILE(struct llentry *, fle->f_lle);
- if ((rt != NULL)
- && lle != NULL
- && fle->f_fhash == hash
- && flowtable_key_equal(fle, key)
- && (proto == fle->f_proto)
- && (fibnum == fle->f_fibnum)
- && (rt->rt_flags & RTF_UP)
- && (rt->rt_ifp != NULL)
- && (lle->la_flags & LLE_VALID)) {
- fs->ft_hits++;
- fle->f_uptime = time_uptime;
- fle->f_flags |= flags;
- FL_ENTRY_UNLOCK(ft, hash);
- return (fle);
- } else if (fle->f_next != NULL) {
- fle = fle->f_next;
- goto keycheck;
+
+ fle->f_hash = hash;
+ bcopy(key, &fle->f_key, keylen);
+ fle->f_rt = rt;
+ fle->f_lle = lle;
+ fle->f_fibnum = fibnum;
+ fle->f_uptime = time_uptime;
+#ifdef FLOWTABLE_HASH_ALL
+ fle->f_proto = proto;
+ fle->f_flags = fibnum0 >> 24;
+#endif
+
+ critical_enter();
+ mask = flowtable_mask(ft);
+ flist = flowtable_list(ft, hash);
+
+ if (SLIST_EMPTY(flist)) {
+ bit_set(mask, (hash % ft->ft_size));
+ SLIST_INSERT_HEAD(flist, fle, f_next);
+ goto skip;
}
- FL_ENTRY_UNLOCK(ft, hash);
-uncached:
- if (flags & FL_NOAUTO || flow_full(ft))
- return (NULL);
- fs->ft_misses++;
/*
- * This bit of code ends up locking the
- * same route 3 times (just like ip_output + ether_output)
- * - at lookup
- * - in rt_check when called by arpresolve
- * - dropping the refcount for the rtentry
- *
- * This could be consolidated to one if we wrote a variant
- * of arpresolve with an rt_check variant that expected to
- * receive the route locked
+ * find end of list and make sure that we were not
+ * preempted by another thread handling this flow
*/
-
-#ifdef INVARIANTS
- if ((ro->ro_dst.sa_family != AF_INET) &&
- (ro->ro_dst.sa_family != AF_INET6))
- panic("sa_family == %d\n", ro->ro_dst.sa_family);
+ SLIST_FOREACH(iter, flist, f_next) {
+ KASSERT(iter->f_hash % ft->ft_size == hash % ft->ft_size,
+ ("%s: wrong hash", __func__));
+ /*
+ * Pass the packed fibnum0: flow_matches() extracts the
+ * protocol from bits 16-23 itself, which the 16-bit
+ * 'fibnum' no longer carries.
+ */
+ if (flow_matches(iter, key, keylen, fibnum0)) {
+ /*
+ * We probably migrated to another CPU after
+ * lookup in flowtable_lookup_common() failed.
+ * It appeared that this CPU already has flow
+ * entry.
+ */
+ iter->f_uptime = time_uptime;
+#ifdef FLOWTABLE_HASH_ALL
+ iter->f_flags |= fibnum0 >> 24;
#endif
-
- ft->ft_rtalloc(ro, hash, fibnum);
- if (ro->ro_rt == NULL)
- error = ENETUNREACH;
- else {
- struct llentry *lle = NULL;
- struct sockaddr_storage *l3addr;
- struct rtentry *rt = ro->ro_rt;
- struct ifnet *ifp = rt->rt_ifp;
-
- if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) {
- RTFREE(rt);
- ro->ro_rt = NULL;
- return (NULL);
+ critical_exit();
+ FLOWSTAT_INC(ft, ft_collisions);
+ /*
+ * The unused entry still owns the references taken
+ * above; drop them as every other exit path does.
+ */
+ RTFREE(rt);
+ LLE_FREE(lle);
+ uma_zfree(flow_zone, fle);
+ return (iter);
}
-#ifdef INET6
- if (ssa->ss_family == AF_INET6) {
- struct sockaddr_in6 *dsin6;
-
- dsin6 = (struct sockaddr_in6 *)dsa;
- if (in6_localaddr(&dsin6->sin6_addr)) {
- RTFREE(rt);
- ro->ro_rt = NULL;
- return (NULL);
- }
+ }
- if (rt->rt_flags & RTF_GATEWAY)
- l3addr = (struct sockaddr_storage *)rt->rt_gateway;
-
- else
- l3addr = (struct sockaddr_storage *)&ro->ro_dst;
- lle = llentry_alloc(ifp, LLTABLE6(ifp), l3addr);
- }
-#endif
+ SLIST_INSERT_HEAD(flist, fle, f_next);
+skip:
+ critical_exit();
+ FLOWSTAT_INC(ft, ft_inserts);
+
+ return (fle);
+}
+
+int
+flowtable_lookup(sa_family_t sa, struct mbuf *m, struct route *ro)
+{
+ struct flentry *fle;
+
+ if (V_flowtable_enable == 0)
+ return (ENXIO);
+
+ switch (sa) {
#ifdef INET
- if (ssa->ss_family == AF_INET) {
- if (rt->rt_flags & RTF_GATEWAY)
- l3addr = (struct sockaddr_storage *)rt->rt_gateway;
- else
- l3addr = (struct sockaddr_storage *)&ro->ro_dst;
- lle = llentry_alloc(ifp, LLTABLE(ifp), l3addr);
- }
-
+ case AF_INET:
+ fle = flowtable_lookup_ipv4(m, ro);
+ break;
+#endif
+#ifdef INET6
+ case AF_INET6:
+ fle = flowtable_lookup_ipv6(m, ro);
+ break;
#endif
- ro->ro_lle = lle;
+ default:
+ panic("%s: sa %d", __func__, sa);
+ }
- if (lle == NULL) {
- RTFREE(rt);
- ro->ro_rt = NULL;
- return (NULL);
- }
- error = flowtable_insert(ft, hash, key, fibnum, ro, flags);
+ if (fle == NULL)
+ return (EHOSTUNREACH);
- if (error) {
- RTFREE(rt);
- LLE_FREE(lle);
- ro->ro_rt = NULL;
- ro->ro_lle = NULL;
- }
- }
+ if (!(m->m_flags & M_FLOWID)) {
+ m->m_flags |= M_FLOWID;
+ m->m_pkthdr.flowid = fle->f_hash;
+ }
+
+ ro->ro_rt = fle->f_rt;
+ ro->ro_lle = fle->f_lle;
+ ro->ro_flags |= RT_NORTREF;
- return ((error) ? NULL : fle);
+ return (0);
}
-/*
- * used by the bit_alloc macro
- */
-#define calloc(count, size) malloc((count)*(size), M_DEVBUF, M_WAITOK|M_ZERO)
-
-struct flowtable *
-flowtable_alloc(char *name, int nentry, int flags)
+static struct flentry *
+flowtable_lookup_common(struct flowtable *ft, uint32_t *key, int keylen,
+ uint32_t fibnum)
{
- struct flowtable *ft, *fttail;
- int i;
-
- if (V_flow_hashjitter == 0)
- V_flow_hashjitter = arc4random();
+ struct flist *flist;
+ struct flentry *fle;
+ uint32_t hash;
- KASSERT(nentry > 0, ("nentry must be > 0, is %d\n", nentry));
+ FLOWSTAT_INC(ft, ft_lookups);
- ft = malloc(sizeof(struct flowtable),
- M_RTABLE, M_WAITOK | M_ZERO);
+ hash = jenkins_hash32(key, keylen / sizeof(uint32_t), flow_hashjitter);
- ft->ft_name = name;
- ft->ft_flags = flags;
- ft->ft_size = nentry;
-#ifdef RADIX_MPATH
- ft->ft_rtalloc = rtalloc_mpath_fib;
-#else
- ft->ft_rtalloc = rtalloc_ign_wrapper;
+ critical_enter();
+ flist = flowtable_list(ft, hash);
+ SLIST_FOREACH(fle, flist, f_next) {
+ KASSERT(fle->f_hash % ft->ft_size == hash % ft->ft_size,
+ ("%s: wrong hash", __func__));
+ if (flow_matches(fle, key, keylen, fibnum)) {
+ fle->f_uptime = time_uptime;
+#ifdef FLOWTABLE_HASH_ALL
+ fle->f_flags |= fibnum >> 24;
#endif
- if (flags & FL_PCPU) {
- ft->ft_lock = flowtable_pcpu_lock;
- ft->ft_unlock = flowtable_pcpu_unlock;
-
- for (i = 0; i <= mp_maxid; i++) {
- ft->ft_table.pcpu[i] =
- malloc(nentry*sizeof(struct flentry *),
- M_RTABLE, M_WAITOK | M_ZERO);
- ft->ft_masks[i] = bit_alloc(nentry);
+ critical_exit();
+ FLOWSTAT_INC(ft, ft_hits);
+ return (fle);
}
- } else {
- ft->ft_lock_count = 2*(powerof2(mp_maxid + 1) ? (mp_maxid + 1):
- (fls(mp_maxid + 1) << 1));
-
- ft->ft_lock = flowtable_global_lock;
- ft->ft_unlock = flowtable_global_unlock;
- ft->ft_table.global =
- malloc(nentry*sizeof(struct flentry *),
- M_RTABLE, M_WAITOK | M_ZERO);
- ft->ft_locks = malloc(ft->ft_lock_count*sizeof(struct mtx),
- M_RTABLE, M_WAITOK | M_ZERO);
- for (i = 0; i < ft->ft_lock_count; i++)
- mtx_init(&ft->ft_locks[i], "flow", NULL, MTX_DEF|MTX_DUPOK);
-
- ft->ft_masks[0] = bit_alloc(nentry);
}
- ft->ft_tmpmask = bit_alloc(nentry);
+ critical_exit();
- /*
- * In the local transmit case the table truly is
- * just a cache - so everything is eligible for
- * replacement after 5s of non-use
- */
- if (flags & FL_HASH_ALL) {
- ft->ft_udp_idle = V_flowtable_udp_expire;
- ft->ft_syn_idle = V_flowtable_syn_expire;
- ft->ft_fin_wait_idle = V_flowtable_fin_wait_expire;
- ft->ft_tcp_idle = V_flowtable_fin_wait_expire;
- } else {
- ft->ft_udp_idle = ft->ft_fin_wait_idle =
- ft->ft_syn_idle = ft->ft_tcp_idle = 30;
-
- }
+ FLOWSTAT_INC(ft, ft_misses);
- /*
- * hook in to the cleaner list
- */
- if (V_flow_list_head == NULL)
- V_flow_list_head = ft;
- else {
- fttail = V_flow_list_head;
- while (fttail->ft_next != NULL)
- fttail = fttail->ft_next;
- fttail->ft_next = ft;
- }
-
- return (ft);
+ return (flowtable_insert(ft, hash, key, keylen, fibnum));
}
/*
- * The rest of the code is devoted to garbage collection of expired entries.
- * It is a new additon made necessary by the switch to dynamically allocating
- * flow tables.
- *
+ * used by the bit_alloc macro
*/
+#define calloc(count, size) malloc((count)*(size), M_FTABLE, M_WAITOK | M_ZERO)
+/*
+ * Allocate the per-table storage of 'ft': an array of ft_size per-CPU
+ * list heads, one occupancy bitmask per CPU, and the scratch bitmask
+ * ft_tmpmask.  ft->ft_size must be set by the caller.  All allocations
+ * use M_WAITOK.
+ */
static void
-fle_free(struct flentry *fle, struct flowtable *ft)
+flowtable_alloc(struct flowtable *ft)
{
- struct rtentry *rt;
- struct llentry *lle;
- rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
- lle = __DEVOLATILE(struct llentry *, fle->f_lle);
- if (rt != NULL)
- RTFREE(rt);
- if (lle != NULL)
- LLE_FREE(lle);
- flow_free(fle, ft);
+ int i;
+
+ /* Size the array by its own element type. */
+ ft->ft_table = malloc(ft->ft_size * sizeof(*ft->ft_table),
+ M_FTABLE, M_WAITOK);
+ for (i = 0; i < ft->ft_size; i++)
+ ft->ft_table[i] = uma_zalloc(pcpu_zone_ptr, M_WAITOK | M_ZERO);
+
+ ft->ft_masks = uma_zalloc(pcpu_zone_ptr, M_WAITOK);
+ /*
+ * ft_masks is not zeroed, so every CPU that the cleaner later
+ * visits with CPU_FOREACH() must get a mask here; counting up to
+ * mp_ncpus would miss CPUs when the ids are not contiguous.
+ */
+ CPU_FOREACH(i) {
+ bitstr_t **b;
+
+ b = zpcpu_get_cpu(ft->ft_masks, i);
+ *b = bit_alloc(ft->ft_size);
+ }
+ ft->ft_tmpmask = bit_alloc(ft->ft_size);
}
+#undef calloc
static void
-flowtable_free_stale(struct flowtable *ft, struct rtentry *rt)
+flowtable_free_stale(struct flowtable *ft, struct rtentry *rt, int maxidle)
{
- int curbit = 0, count, tmpsize;
- struct flentry *fle, **flehead, *fleprev;
- struct flentry *flefreehead, *flefreetail, *fletmp;
+ struct flist *flist, freelist;
+ struct flentry *fle, *fle1, *fleprev;
bitstr_t *mask, *tmpmask;
- struct flowtable_stats *fs = &ft->ft_stats[curcpu];
+ int curbit, tmpsize;
- flefreehead = flefreetail = NULL;
+ SLIST_INIT(&freelist);
mask = flowtable_mask(ft);
tmpmask = ft->ft_tmpmask;
tmpsize = ft->ft_size;
memcpy(tmpmask, mask, ft->ft_size/8);
+ curbit = 0;
/*
* XXX Note to self, bit_ffs operates at the byte level
* and thus adds gratuitous overhead
@@ -1425,131 +785,96 @@ flowtable_free_stale(struct flowtable *ft, struct rtentry *rt)
break;
}
- FL_ENTRY_LOCK(ft, curbit);
- flehead = flowtable_entry(ft, curbit);
- fle = fleprev = *flehead;
+ FLOWSTAT_INC(ft, ft_free_checks);
- fs->ft_free_checks++;
+ critical_enter();
+ flist = flowtable_list(ft, curbit);
#ifdef DIAGNOSTIC
- if (fle == NULL && curbit > 0) {
+ if (SLIST_EMPTY(flist) && curbit > 0) {
log(LOG_ALERT,
"warning bit=%d set, but no fle found\n",
curbit);
}
-#endif
- while (fle != NULL) {
- if (rt != NULL) {
- if (__DEVOLATILE(struct rtentry *, fle->f_rt) != rt) {
- fleprev = fle;
- fle = fle->f_next;
- continue;
- }
- } else if (!flow_stale(ft, fle)) {
+#endif
+ SLIST_FOREACH_SAFE(fle, flist, f_next, fle1) {
+ if (rt != NULL && fle->f_rt != rt) {
fleprev = fle;
- fle = fle->f_next;
continue;
}
- /*
- * delete head of the list
- */
- if (fleprev == *flehead) {
- fletmp = fleprev;
- if (fle == fleprev) {
- fleprev = *flehead = fle->f_next;
- } else
- fleprev = *flehead = fle;
- fle = fle->f_next;
- } else {
- /*
- * don't advance fleprev
- */
- fletmp = fle;
- fleprev->f_next = fle->f_next;
- fle = fleprev->f_next;
+ if (!flow_stale(ft, fle, maxidle)) {
+ fleprev = fle;
+ continue;
}
- if (flefreehead == NULL)
- flefreehead = flefreetail = fletmp;
- else {
- flefreetail->f_next = fletmp;
- flefreetail = fletmp;
- }
- fletmp->f_next = NULL;
+ if (fle == SLIST_FIRST(flist))
+ SLIST_REMOVE_HEAD(flist, f_next);
+ else
+ SLIST_REMOVE_AFTER(fleprev, f_next);
+ SLIST_INSERT_HEAD(&freelist, fle, f_next);
}
- if (*flehead == NULL)
+ if (SLIST_EMPTY(flist))
bit_clear(mask, curbit);
- FL_ENTRY_UNLOCK(ft, curbit);
+ critical_exit();
+
bit_clear(tmpmask, curbit);
tmpmask += (curbit / 8);
tmpsize -= (curbit / 8) * 8;
bit_ffs(tmpmask, tmpsize, &curbit);
}
- count = 0;
- while ((fle = flefreehead) != NULL) {
- flefreehead = fle->f_next;
- count++;
- fs->ft_frees++;
- fle_free(fle, ft);
+
+ SLIST_FOREACH_SAFE(fle, &freelist, f_next, fle1) {
+ FLOWSTAT_INC(ft, ft_frees);
+ if (fle->f_rt != NULL)
+ RTFREE(fle->f_rt);
+ if (fle->f_lle != NULL)
+ LLE_FREE(fle->f_lle);
+ uma_zfree(flow_zone, fle);
}
- if (V_flowtable_debug && count)
- log(LOG_DEBUG, "freed %d flow entries\n", count);
}
-void
-flowtable_route_flush(struct flowtable *ft, struct rtentry *rt)
+static void
+flowtable_clean_vnet(struct flowtable *ft, struct rtentry *rt, int maxidle)
{
int i;
- if (ft->ft_flags & FL_PCPU) {
- CPU_FOREACH(i) {
- if (smp_started == 1) {
- thread_lock(curthread);
- sched_bind(curthread, i);
- thread_unlock(curthread);
- }
+ CPU_FOREACH(i) {
+ if (smp_started == 1) {
+ thread_lock(curthread);
+ sched_bind(curthread, i);
+ thread_unlock(curthread);
+ }
- flowtable_free_stale(ft, rt);
+ flowtable_free_stale(ft, rt, maxidle);
- if (smp_started == 1) {
- thread_lock(curthread);
- sched_unbind(curthread);
- thread_unlock(curthread);
- }
+ if (smp_started == 1) {
+ thread_lock(curthread);
+ sched_unbind(curthread);
+ thread_unlock(curthread);
}
- } else {
- flowtable_free_stale(ft, rt);
}
}
-static void
-flowtable_clean_vnet(void)
+void
+flowtable_route_flush(sa_family_t sa, struct rtentry *rt)
{
struct flowtable *ft;
- int i;
- ft = V_flow_list_head;
- while (ft != NULL) {
- if (ft->ft_flags & FL_PCPU) {
- CPU_FOREACH(i) {
- if (smp_started == 1) {
- thread_lock(curthread);
- sched_bind(curthread, i);
- thread_unlock(curthread);
- }
-
- flowtable_free_stale(ft, NULL);
-
- if (smp_started == 1) {
- thread_lock(curthread);
- sched_unbind(curthread);
- thread_unlock(curthread);
- }
- }
- } else {
- flowtable_free_stale(ft, NULL);
- }
- ft = ft->ft_next;
+ switch (sa) {
+#ifdef INET
+ case AF_INET:
+ ft = &V_ip4_ft;
+ break;
+#endif
+#ifdef INET6
+ case AF_INET6:
+ ft = &V_ip6_ft;
+ break;
+#endif
+ default:
+ panic("%s: sa %d", __func__, sa);
}
+
+ flowtable_clean_vnet(ft, rt, 0);
}
static void
@@ -1562,18 +887,33 @@ flowtable_cleaner(void)
log(LOG_INFO, "flowtable cleaner started\n");
td = curthread;
while (1) {
+ uint32_t flowclean_freq, maxidle;
+
+ /*
+ * The maximum idle time, as well as frequency are arbitrary.
+ */
+ if (flow_full())
+ maxidle = 5;
+ else
+ maxidle = 30;
+
VNET_LIST_RLOCK();
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter);
- flowtable_clean_vnet();
+#ifdef INET
+ flowtable_clean_vnet(&V_ip4_ft, NULL, maxidle);
+#endif
+#ifdef INET6
+ flowtable_clean_vnet(&V_ip6_ft, NULL, maxidle);
+#endif
CURVNET_RESTORE();
}
VNET_LIST_RUNLOCK();
- /*
- * The 10 second interval between cleaning checks
- * is arbitrary
- */
+ if (flow_full())
+ flowclean_freq = 4*hz;
+ else
+ flowclean_freq = 20*hz;
mtx_lock(&flowclean_lock);
thread_lock(td);
sched_prio(td, PPAUSE);
@@ -1606,91 +946,106 @@ static struct kproc_desc flow_kp = {
};
SYSINIT(flowcleaner, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &flow_kp);
-static void
-flowtable_init_vnet(const void *unused __unused)
+static int
+flowtable_get_size(char *name)
{
+ int size;
+
+ if (TUNABLE_INT_FETCH(name, &size)) {
+ if (size < 256)
+ size = 256;
+ if (!powerof2(size)) {
+ printf("%s must be power of 2\n", name);
+ size = 2048;
+ }
+ } else {
+ /*
+ * round up to the next power of 2
+ */
+ size = 1 << fls((1024 + maxusers * 64) - 1);
+ }
- V_flowtable_nmbflows = 1024 + maxusers * 64 * mp_ncpus;
- V_flow_ipv4_zone = uma_zcreate("ip4flow", sizeof(struct flentry_v4),
- NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);
- V_flow_ipv6_zone = uma_zcreate("ip6flow", sizeof(struct flentry_v6),
- NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);
- uma_zone_set_max(V_flow_ipv4_zone, V_flowtable_nmbflows);
- uma_zone_set_max(V_flow_ipv6_zone, V_flowtable_nmbflows);
- V_flowtable_ready = 1;
+ return (size);
}
-VNET_SYSINIT(flowtable_init_vnet, SI_SUB_SMP, SI_ORDER_ANY,
- flowtable_init_vnet, NULL);
static void
flowtable_init(const void *unused __unused)
{
+ flow_hashjitter = arc4random();
+
+ flow_zone = uma_zcreate("flows", sizeof(struct flentry),
+ NULL, NULL, NULL, NULL, (64-1), UMA_ZONE_MAXBUCKET);
+ uma_zone_set_max(flow_zone, 1024 + maxusers * 64 * mp_ncpus);
+
cv_init(&flowclean_c_cv, "c_flowcleanwait");
cv_init(&flowclean_f_cv, "f_flowcleanwait");
mtx_init(&flowclean_lock, "flowclean lock", NULL, MTX_DEF);
EVENTHANDLER_REGISTER(ifnet_departure_event, flowtable_flush, NULL,
EVENTHANDLER_PRI_ANY);
- flowclean_freq = 20*hz;
}
-SYSINIT(flowtable_init, SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST,
+SYSINIT(flowtable_init, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST,
flowtable_init, NULL);
+#ifdef INET
+static SYSCTL_NODE(_net_flowtable, OID_AUTO, ip4, CTLFLAG_RD, NULL,
+ "Flowtable for IPv4");
+
+static VNET_PCPUSTAT_DEFINE(struct flowtable_stat, ip4_ftstat);
+VNET_PCPUSTAT_SYSINIT(ip4_ftstat);
+VNET_PCPUSTAT_SYSUNINIT(ip4_ftstat);
+SYSCTL_VNET_PCPUSTAT(_net_flowtable_ip4, OID_AUTO, stat, struct flowtable_stat,
+ ip4_ftstat, "Flowtable statistics for IPv4 "
+ "(struct flowtable_stat, net/flowtable.h)");
-#ifdef VIMAGE
static void
-flowtable_uninit(const void *unused __unused)
+flowtable_init_vnet_v4(const void *unused __unused)
{
- V_flowtable_ready = 0;
- uma_zdestroy(V_flow_ipv4_zone);
- uma_zdestroy(V_flow_ipv6_zone);
+ V_ip4_ft.ft_size = flowtable_get_size("net.flowtable.ip4.size");
+ V_ip4_ft.ft_stat = VNET(ip4_ftstat);
+ flowtable_alloc(&V_ip4_ft);
}
+VNET_SYSINIT(ft_vnet_v4, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
+ flowtable_init_vnet_v4, NULL);
+#endif /* INET */
-VNET_SYSUNINIT(flowtable_uninit, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY,
- flowtable_uninit, NULL);
-#endif
+#ifdef INET6
+static SYSCTL_NODE(_net_flowtable, OID_AUTO, ip6, CTLFLAG_RD, NULL,
+ "Flowtable for IPv6");
-#ifdef DDB
-static uint32_t *
-flowtable_get_hashkey(struct flentry *fle)
-{
- uint32_t *hashkey;
+static VNET_PCPUSTAT_DEFINE(struct flowtable_stat, ip6_ftstat);
+VNET_PCPUSTAT_SYSINIT(ip6_ftstat);
+VNET_PCPUSTAT_SYSUNINIT(ip6_ftstat);
+SYSCTL_VNET_PCPUSTAT(_net_flowtable_ip6, OID_AUTO, stat, struct flowtable_stat,
+ ip6_ftstat, "Flowtable statistics for IPv6 "
+ "(struct flowtable_stat, net/flowtable.h)");
- if (fle->f_flags & FL_IPV6)
- hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
- else
- hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
+static void
+flowtable_init_vnet_v6(const void *unused __unused)
+{
- return (hashkey);
+ V_ip6_ft.ft_size = flowtable_get_size("net.flowtable.ip6.size");
+ V_ip6_ft.ft_stat = VNET(ip6_ftstat);
+ flowtable_alloc(&V_ip6_ft);
}
+VNET_SYSINIT(flowtable_init_vnet_v6, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
+ flowtable_init_vnet_v6, NULL);
+#endif /* INET6 */
+#ifdef DDB
+/*
+ * DDB helper: return the occupancy bitmask that belongs to CPU 'cpuid'.
+ * ft_masks is a per-CPU slot holding a pointer to the bitmask, so the
+ * CPU's slot is located first and then dereferenced, the same way
+ * flowtable_mask() does it for the current CPU.
+ */
static bitstr_t *
flowtable_mask_pcpu(struct flowtable *ft, int cpuid)
{
- bitstr_t *mask;
- if (ft->ft_flags & FL_PCPU)
- mask = ft->ft_masks[cpuid];
- else
- mask = ft->ft_masks[0];
-
- return (mask);
+ return (*(bitstr_t **)zpcpu_get_cpu(ft->ft_masks, cpuid));
}
-static struct flentry **
-flowtable_entry_pcpu(struct flowtable *ft, uint32_t hash, int cpuid)
+/*
+ * DDB helper: return the flow list of bucket 'hash % ft_size' as seen by
+ * CPU 'cpuid'.  Each ft_table[] element is itself the per-CPU base
+ * pointer, so it is passed to zpcpu_get_cpu() directly, mirroring
+ * flowtable_list().
+ */
+static struct flist *
+flowtable_list_pcpu(struct flowtable *ft, uint32_t hash, int cpuid)
{
- struct flentry **fle;
- int index = (hash % ft->ft_size);
- if (ft->ft_flags & FL_PCPU) {
- fle = &ft->ft_table.pcpu[cpuid][index];
- } else {
- fle = &ft->ft_table.global[index];
- }
-
- return (fle);
+ return (zpcpu_get_cpu(ft->ft_table[hash % ft->ft_size], cpuid));
}
static void
@@ -1698,40 +1053,58 @@ flow_show(struct flowtable *ft, struct flentry *fle)
{
int idle_time;
int rt_valid, ifp_valid;
- uint16_t sport, dport;
- uint32_t *hashkey;
- char saddr[4*sizeof "123"], daddr[4*sizeof "123"];
volatile struct rtentry *rt;
struct ifnet *ifp = NULL;
+ uint32_t *hashkey = fle->f_key;
idle_time = (int)(time_uptime - fle->f_uptime);
rt = fle->f_rt;
rt_valid = rt != NULL;
- if (rt_valid)
+ if (rt_valid)
ifp = rt->rt_ifp;
ifp_valid = ifp != NULL;
- hashkey = flowtable_get_hashkey(fle);
- if (fle->f_flags & FL_IPV6)
- goto skipaddr;
-
- inet_ntoa_r(*(struct in_addr *) &hashkey[2], daddr);
- if (ft->ft_flags & FL_HASH_ALL) {
- inet_ntoa_r(*(struct in_addr *) &hashkey[1], saddr);
- sport = ntohs(((uint16_t *)hashkey)[0]);
- dport = ntohs(((uint16_t *)hashkey)[1]);
- db_printf("%s:%d->%s:%d",
- saddr, sport, daddr,
- dport);
- } else
+
+#ifdef INET
+ if (ft == &V_ip4_ft) {
+ char daddr[4*sizeof "123"];
+#ifdef FLOWTABLE_HASH_ALL
+ char saddr[4*sizeof "123"];
+ uint16_t sport, dport;
+#endif
+
+ inet_ntoa_r(*(struct in_addr *) &hashkey[0], daddr);
+#ifdef FLOWTABLE_HASH_ALL
+ inet_ntoa_r(*(struct in_addr *) &hashkey[1], saddr);
+ dport = ntohs((uint16_t)(hashkey[2] >> 16));
+ sport = ntohs((uint16_t)(hashkey[2] & 0xffff));
+ db_printf("%s:%d->%s:%d", saddr, sport, daddr, dport);
+#else
db_printf("%s ", daddr);
-
-skipaddr:
+#endif
+ }
+#endif /* INET */
+#ifdef INET6
+ if (ft == &V_ip6_ft) {
+#ifdef FLOWTABLE_HASH_ALL
+ db_printf("\n\tkey=%08x:%08x:%08x%08x:%08x:%08x%08x:%08x:%08x",
+ hashkey[0], hashkey[1], hashkey[2],
+ hashkey[3], hashkey[4], hashkey[5],
+ hashkey[6], hashkey[7], hashkey[8]);
+#else
+ db_printf("\n\tkey=%08x:%08x:%08x ",
+ hashkey[0], hashkey[1], hashkey[2]);
+#endif
+ }
+#endif /* INET6 */
+
+ db_printf("hash=%08x idle_time=%03d"
+ "\n\tfibnum=%02d rt=%p",
+ fle->f_hash, idle_time, fle->f_fibnum, fle->f_rt);
+
+#ifdef FLOWTABLE_HASH_ALL
if (fle->f_flags & FL_STALE)
db_printf(" FL_STALE ");
- if (fle->f_flags & FL_TCP)
- db_printf(" FL_TCP ");
- if (fle->f_flags & FL_UDP)
- db_printf(" FL_UDP ");
+#endif
if (rt_valid) {
if (rt->rt_flags & RTF_UP)
db_printf(" RTF_UP ");
@@ -1740,21 +1113,10 @@ skipaddr:
if (ifp->if_flags & IFF_LOOPBACK)
db_printf(" IFF_LOOPBACK ");
if (ifp->if_flags & IFF_UP)
- db_printf(" IFF_UP ");
+ db_printf(" IFF_UP ");
if (ifp->if_flags & IFF_POINTOPOINT)
- db_printf(" IFF_POINTOPOINT ");
+ db_printf(" IFF_POINTOPOINT ");
}
- if (fle->f_flags & FL_IPV6)
- db_printf("\n\tkey=%08x:%08x:%08x%08x:%08x:%08x%08x:%08x:%08x",
- hashkey[0], hashkey[1], hashkey[2],
- hashkey[3], hashkey[4], hashkey[5],
- hashkey[6], hashkey[7], hashkey[8]);
- else
- db_printf("\n\tkey=%08x:%08x:%08x ",
- hashkey[0], hashkey[1], hashkey[2]);
- db_printf("hash=%08x idle_time=%03d"
- "\n\tfibnum=%02d rt=%p",
- fle->f_fhash, idle_time, fle->f_fibnum, fle->f_rt);
db_printf("\n");
}
@@ -1762,7 +1124,6 @@ static void
flowtable_show(struct flowtable *ft, int cpuid)
{
int curbit = 0;
- struct flentry *fle, **flehead;
bitstr_t *mask, *tmpmask;
if (cpuid != -1)
@@ -1776,43 +1137,32 @@ flowtable_show(struct flowtable *ft, int cpuid)
*/
bit_ffs(tmpmask, ft->ft_size, &curbit);
while (curbit != -1) {
+ struct flist *flist;
+ struct flentry *fle;
+
if (curbit >= ft->ft_size || curbit < -1) {
db_printf("warning: bad curbit value %d \n",
curbit);
break;
}
- flehead = flowtable_entry_pcpu(ft, curbit, cpuid);
- fle = *flehead;
+ flist = flowtable_list_pcpu(ft, curbit, cpuid);
- while (fle != NULL) {
+ SLIST_FOREACH(fle, flist, f_next)
flow_show(ft, fle);
- fle = fle->f_next;
- continue;
- }
bit_clear(tmpmask, curbit);
bit_ffs(tmpmask, ft->ft_size, &curbit);
}
}
static void
-flowtable_show_vnet(void)
+flowtable_show_vnet(struct flowtable *ft)
{
- struct flowtable *ft;
+
int i;
- ft = V_flow_list_head;
- while (ft != NULL) {
- printf("name: %s\n", ft->ft_name);
- if (ft->ft_flags & FL_PCPU) {
- CPU_FOREACH(i) {
- flowtable_show(ft, i);
- }
- } else {
- flowtable_show(ft, -1);
- }
- ft = ft->ft_next;
- }
+ CPU_FOREACH(i)
+ flowtable_show(ft, i);
}
DB_SHOW_COMMAND(flowtables, db_show_flowtables)
@@ -1824,7 +1174,14 @@ DB_SHOW_COMMAND(flowtables, db_show_flowtables)
#ifdef VIMAGE
db_printf("vnet %p\n", vnet_iter);
#endif
- flowtable_show_vnet();
+#ifdef INET
+ printf("IPv4:\n");
+ flowtable_show_vnet(&V_ip4_ft);
+#endif
+#ifdef INET6
+ printf("IPv6:\n");
+ flowtable_show_vnet(&V_ip6_ft);
+#endif
CURVNET_RESTORE();
}
}
diff --git a/sys/net/flowtable.h b/sys/net/flowtable.h
index d810fa3..5a1d927 100644
--- a/sys/net/flowtable.h
+++ b/sys/net/flowtable.h
@@ -1,83 +1,56 @@
-/**************************************************************************
-
-Copyright (c) 2008-2010, BitGravity Inc.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- 1. Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
-
- 2. Neither the name of the BitGravity Corporation nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-
-$FreeBSD$
-
-***************************************************************************/
+/*-
+ * Copyright (c) 2014 Gleb Smirnoff <glebius@FreeBSD.org>
+ * Copyright (c) 2008-2010, BitGravity Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Neither the name of the BitGravity Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ *
+ */
#ifndef _NET_FLOWTABLE_H_
#define _NET_FLOWTABLE_H_
-#ifdef _KERNEL
-
-#define FL_HASH_ALL (1<<0) /* hash 4-tuple + protocol */
-#define FL_PCPU (1<<1) /* pcpu cache */
-#define FL_NOAUTO (1<<2) /* don't automatically add flentry on miss */
-#define FL_IPV6 (1<<9)
-
-#define FL_TCP (1<<11)
-#define FL_SCTP (1<<12)
-#define FL_UDP (1<<13)
-#define FL_DEBUG (1<<14)
-#define FL_DEBUG_ALL (1<<15)
-
-struct flowtable;
-struct flentry;
-struct route;
-struct route_in6;
+struct flowtable_stat {
+ uint64_t ft_collisions;
+ uint64_t ft_misses;
+ uint64_t ft_free_checks;
+ uint64_t ft_frees;
+ uint64_t ft_hits;
+ uint64_t ft_lookups;
+ uint64_t ft_fail_lle_invalid;
+ uint64_t ft_inserts;
+};
-VNET_DECLARE(struct flowtable *, ip_ft);
-#define V_ip_ft VNET(ip_ft)
-
-VNET_DECLARE(struct flowtable *, ip6_ft);
-#define V_ip6_ft VNET(ip6_ft)
-
-struct flowtable *flowtable_alloc(char *name, int nentry, int flags);
+#ifdef _KERNEL
/*
- * Given a flow table, look up the L3 and L2 information and
- * return it in the route.
- *
+ * Given a flow table, look up the L3 and L2 information
+ * and return it in the route.
*/
-struct flentry *flowtable_lookup_mbuf(struct flowtable *ft, struct mbuf *m, int af);
-
-struct flentry *flowtable_lookup(struct flowtable *ft, struct sockaddr_storage *ssa,
- struct sockaddr_storage *dsa, uint32_t fibnum, int flags);
-
-int kern_flowtable_insert(struct flowtable *ft, struct sockaddr_storage *ssa,
- struct sockaddr_storage *dsa, struct route *ro, uint32_t fibnum, int flags);
-
-void flow_invalidate(struct flentry *fl);
-void flowtable_route_flush(struct flowtable *ft, struct rtentry *rt);
-
-void flow_to_route(struct flentry *fl, struct route *ro);
-
-void flow_to_route_in6(struct flentry *fl, struct route_in6 *ro);
-
+int flowtable_lookup(sa_family_t, struct mbuf *, struct route *);
+void flowtable_route_flush(sa_family_t, struct rtentry *);
#endif /* _KERNEL */
-#endif
+#endif /* !_NET_FLOWTABLE_H_ */
diff --git a/sys/net/route.c b/sys/net/route.c
index 20fe181..bb99496 100644
--- a/sys/net/route.c
+++ b/sys/net/route.c
@@ -1298,18 +1298,7 @@ rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt,
}
#ifdef FLOWTABLE
else if (rt0 != NULL) {
- switch (dst->sa_family) {
-#ifdef INET6
- case AF_INET6:
- flowtable_route_flush(V_ip6_ft, rt0);
- break;
-#endif
-#ifdef INET
- case AF_INET:
- flowtable_route_flush(V_ip_ft, rt0);
- break;
-#endif
- }
+ flowtable_route_flush(dst->sa_family, rt0);
RTFREE(rt0);
}
#endif
diff --git a/sys/netinet/ip_input.c b/sys/netinet/ip_input.c
index c265d02..cde30ee 100644
--- a/sys/netinet/ip_input.c
+++ b/sys/netinet/ip_input.c
@@ -62,7 +62,6 @@ __FBSDID("$FreeBSD$");
#include <net/route.h>
#include <net/netisr.h>
#include <net/vnet.h>
-#include <net/flowtable.h>
#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
@@ -198,16 +197,6 @@ SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW,
"IP stealth mode, no TTL decrementation on forwarding");
#endif
-#ifdef FLOWTABLE
-static VNET_DEFINE(int, ip_output_flowtable_size) = 2048;
-VNET_DEFINE(struct flowtable *, ip_ft);
-#define V_ip_output_flowtable_size VNET(ip_output_flowtable_size)
-
-SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, output_flowtable_size, CTLFLAG_RDTUN,
- &VNET_NAME(ip_output_flowtable_size), 2048,
- "number of entries in the per-cpu output flow caches");
-#endif
-
static void ip_freef(struct ipqhead *, struct ipq *);
/*
@@ -309,24 +298,6 @@ ip_init(void)
printf("%s: WARNING: unable to register pfil hook, "
"error %d\n", __func__, i);
-#ifdef FLOWTABLE
- if (TUNABLE_INT_FETCH("net.inet.ip.output_flowtable_size",
- &V_ip_output_flowtable_size)) {
- if (V_ip_output_flowtable_size < 256)
- V_ip_output_flowtable_size = 256;
- if (!powerof2(V_ip_output_flowtable_size)) {
- printf("flowtable must be power of 2 size\n");
- V_ip_output_flowtable_size = 2048;
- }
- } else {
- /*
- * round up to the next power of 2
- */
- V_ip_output_flowtable_size = 1 << fls((1024 + maxusers * 64)-1);
- }
- V_ip_ft = flowtable_alloc("ipv4", V_ip_output_flowtable_size, FL_PCPU);
-#endif
-
/* Skip initialization of globals for non-default instances. */
if (!IS_DEFAULT_VNET(curvnet))
return;
diff --git a/sys/netinet/ip_output.c b/sys/netinet/ip_output.c
index 7764bc3..2d8be1b 100644
--- a/sys/netinet/ip_output.c
+++ b/sys/netinet/ip_output.c
@@ -32,6 +32,7 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
+#include "opt_inet.h"
#include "opt_ipfw.h"
#include "opt_ipsec.h"
#include "opt_kdtrace.h"
@@ -154,19 +155,8 @@ ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
}
#ifdef FLOWTABLE
- if (ro->ro_rt == NULL) {
- struct flentry *fle;
-
- /*
- * The flow table returns route entries valid for up to 30
- * seconds; we rely on the remainder of ip_output() taking no
- * longer than that long for the stability of ro_rt. The
- * flow ID assignment must have happened before this point.
- */
- fle = flowtable_lookup_mbuf(V_ip_ft, m, AF_INET);
- if (fle != NULL)
- flow_to_route(fle, ro);
- }
+ if (ro->ro_rt == NULL)
+ (void )flowtable_lookup(AF_INET, m, ro);
#endif
if (opt) {
diff --git a/sys/netinet6/in6_proto.c b/sys/netinet6/in6_proto.c
index 965de60..b0631ae 100644
--- a/sys/netinet6/in6_proto.c
+++ b/sys/netinet6/in6_proto.c
@@ -126,10 +126,6 @@ __FBSDID("$FreeBSD$");
#include <netinet6/ip6protosw.h>
-#ifdef FLOWTABLE
-#include <net/flowtable.h>
-#endif
-
/*
* TCP/IP protocol family: IP6, ICMP6, UDP, TCP.
*/
@@ -575,16 +571,6 @@ SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_STEALTH, stealth, CTLFLAG_RW,
&VNET_NAME(ip6stealth), 0, "");
#endif
-#ifdef FLOWTABLE
-VNET_DEFINE(int, ip6_output_flowtable_size) = 2048;
-VNET_DEFINE(struct flowtable *, ip6_ft);
-#define V_ip6_output_flowtable_size VNET(ip6_output_flowtable_size)
-
-SYSCTL_VNET_INT(_net_inet6_ip6, OID_AUTO, output_flowtable_size, CTLFLAG_RDTUN,
- &VNET_NAME(ip6_output_flowtable_size), 2048,
- "number of entries in the per-cpu output flow caches");
-#endif
-
/* net.inet6.icmp6 */
SYSCTL_VNET_INT(_net_inet6_icmp6, ICMPV6CTL_REDIRACCEPT, rediraccept,
CTLFLAG_RW, &VNET_NAME(icmp6_rediraccept), 0, "");
diff --git a/sys/netinet6/ip6_input.c b/sys/netinet6/ip6_input.c
index 8f70741..12249db 100644
--- a/sys/netinet6/ip6_input.c
+++ b/sys/netinet6/ip6_input.c
@@ -119,12 +119,6 @@ __FBSDID("$FreeBSD$");
#include <netinet6/ip6protosw.h>
-#ifdef FLOWTABLE
-#include <net/flowtable.h>
-VNET_DECLARE(int, ip6_output_flowtable_size);
-#define V_ip6_output_flowtable_size VNET(ip6_output_flowtable_size)
-#endif
-
extern struct domain inet6domain;
u_char ip6_protox[IPPROTO_MAX];
@@ -194,24 +188,6 @@ ip6_init(void)
nd6_init();
frag6_init();
-#ifdef FLOWTABLE
- if (TUNABLE_INT_FETCH("net.inet6.ip6.output_flowtable_size",
- &V_ip6_output_flowtable_size)) {
- if (V_ip6_output_flowtable_size < 256)
- V_ip6_output_flowtable_size = 256;
- if (!powerof2(V_ip6_output_flowtable_size)) {
- printf("flowtable must be power of 2 size\n");
- V_ip6_output_flowtable_size = 2048;
- }
- } else {
- /*
- * round up to the next power of 2
- */
- V_ip6_output_flowtable_size = 1 << fls((1024 + maxusers * 64)-1);
- }
- V_ip6_ft = flowtable_alloc("ipv6", V_ip6_output_flowtable_size, FL_IPV6|FL_PCPU);
-#endif
-
V_ip6_desync_factor = arc4random() % MAX_TEMP_DESYNC_FACTOR;
/* Skip global initialization stuff for non-default instances. */
diff --git a/sys/netinet6/ip6_output.c b/sys/netinet6/ip6_output.c
index 0d55b66..171a918 100644
--- a/sys/netinet6/ip6_output.c
+++ b/sys/netinet6/ip6_output.c
@@ -521,19 +521,8 @@ skip_ipsec2:;
ro = &opt->ip6po_route;
dst = (struct sockaddr_in6 *)&ro->ro_dst;
#ifdef FLOWTABLE
- if (ro->ro_rt == NULL) {
- struct flentry *fle;
-
- /*
- * The flow table returns route entries valid for up to 30
- * seconds; we rely on the remainder of ip_output() taking no
- * longer than that long for the stability of ro_rt. The
- * flow ID assignment must have happened before this point.
- */
- fle = flowtable_lookup_mbuf(V_ip6_ft, m, AF_INET6);
- if (fle != NULL)
- flow_to_route_in6(fle, ro);
- }
+ if (ro->ro_rt == NULL)
+ (void )flowtable_lookup(AF_INET6, m, (struct route *)ro);
#endif
again:
/*
diff --git a/usr.bin/netstat/Makefile b/usr.bin/netstat/Makefile
index 1071f0e..1644aab 100644
--- a/usr.bin/netstat/Makefile
+++ b/usr.bin/netstat/Makefile
@@ -5,7 +5,8 @@
PROG= netstat
SRCS= if.c inet.c main.c mbuf.c mroute.c netisr.c route.c \
- unix.c atalk.c mroute6.c ipsec.c bpf.c pfkey.c sctp.c
+ unix.c atalk.c mroute6.c ipsec.c bpf.c pfkey.c sctp.c \
+ flowtable.c
WARNS?= 3
CFLAGS+=-fno-strict-aliasing
diff --git a/usr.bin/netstat/flowtable.c b/usr.bin/netstat/flowtable.c
new file mode 100644
index 0000000..a3d5dd5
--- /dev/null
+++ b/usr.bin/netstat/flowtable.c
@@ -0,0 +1,84 @@
+/*-
+ * Copyright (c) 2014 Gleb Smirnoff <glebius@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include <sys/param.h>
+#include <sys/sysctl.h>
+#include <net/flowtable.h>
+#include <err.h>
+#include <stdint.h>
+#include <stdio.h>
+#include "netstat.h"
+
+/*
+ * Print flowtable statistics.
+ */
+
+static void
+print_stats(struct flowtable_stat *stat)
+{
+
+#define p(f, m) if (stat->f || sflag <= 1) \
+ printf(m, (uintmax_t)stat->f, plural(stat->f))
+#define p2(f, m) if (stat->f || sflag <= 1) \
+ printf(m, (uintmax_t)stat->f, plurales(stat->f))
+
+ p(ft_lookups, "\t%ju lookup%s\n");
+ p(ft_hits, "\t%ju hit%s\n");
+ p2(ft_misses, "\t%ju miss%s\n");
+ p(ft_inserts, "\t%ju insert%s\n");
+ p(ft_collisions, "\t%ju collision%s\n");
+ p(ft_free_checks, "\t%ju free check%s\n");
+ p(ft_frees, "\t%ju free%s\n");
+ p(ft_fail_lle_invalid,
+ "\t%ju lookup%s with not resolved Layer 2 address\n");
+
+#undef p2
+#undef p
+}
+
+void
+flowtable_stats(void)
+{
+ struct flowtable_stat stat;
+ size_t len = sizeof(stat);
+
+ if (!live)
+ return;
+
+ if (sysctlbyname("net.flowtable.ip4.stat", &stat, &len, NULL, 0) == 0) {
+ printf("flowtable for IPv4:\n");
+ print_stats(&stat);
+ }
+
+ if (sysctlbyname("net.flowtable.ip6.stat", &stat, &len, NULL, 0) == 0) {
+ printf("flowtable for IPv6:\n");
+ print_stats(&stat);
+ }
+}
diff --git a/usr.bin/netstat/main.c b/usr.bin/netstat/main.c
index feb97dc..5c952ad 100644
--- a/usr.bin/netstat/main.c
+++ b/usr.bin/netstat/main.c
@@ -556,9 +556,10 @@ main(int argc, char *argv[])
exit(0);
}
if (rflag) {
- if (sflag)
+ if (sflag) {
rt_stats(nl[N_RTSTAT].n_value, nl[N_RTTRASH].n_value);
- else
+ flowtable_stats();
+ } else
routepr(nl[N_RTREE].n_value, fib);
exit(0);
}
diff --git a/usr.bin/netstat/netstat.h b/usr.bin/netstat/netstat.h
index 114c84c..44bce94 100644
--- a/usr.bin/netstat/netstat.h
+++ b/usr.bin/netstat/netstat.h
@@ -124,6 +124,7 @@ void intpr(int, void (*)(char *));
void pr_rthdr(int);
void pr_family(int);
void rt_stats(u_long, u_long);
+void flowtable_stats(void);
char *ipx_pnet(struct sockaddr *);
char *ipx_phost(struct sockaddr *);
char *ns_phost(struct sockaddr *);
OpenPOWER on IntegriCloud