diff options
-rw-r--r-- | sys/conf/options | 1 | ||||
-rw-r--r-- | sys/net/flowtable.c | 1971 | ||||
-rw-r--r-- | sys/net/flowtable.h | 119 | ||||
-rw-r--r-- | sys/net/route.c | 13 | ||||
-rw-r--r-- | sys/netinet/ip_input.c | 29 | ||||
-rw-r--r-- | sys/netinet/ip_output.c | 16 | ||||
-rw-r--r-- | sys/netinet6/in6_proto.c | 14 | ||||
-rw-r--r-- | sys/netinet6/ip6_input.c | 24 | ||||
-rw-r--r-- | sys/netinet6/ip6_output.c | 15 | ||||
-rw-r--r-- | usr.bin/netstat/Makefile | 3 | ||||
-rw-r--r-- | usr.bin/netstat/flowtable.c | 84 | ||||
-rw-r--r-- | usr.bin/netstat/main.c | 5 | ||||
-rw-r--r-- | usr.bin/netstat/netstat.h | 1 |
13 files changed, 807 insertions, 1488 deletions
diff --git a/sys/conf/options b/sys/conf/options index 642064d..8a288fe 100644 --- a/sys/conf/options +++ b/sys/conf/options @@ -438,6 +438,7 @@ TCP_SIGNATURE opt_inet.h VLAN_ARRAY opt_vlan.h XBONEHACK FLOWTABLE opt_route.h +FLOWTABLE_HASH_ALL opt_route.h # # SCTP diff --git a/sys/net/flowtable.c b/sys/net/flowtable.c index 32b953c..873ec36 100644 --- a/sys/net/flowtable.c +++ b/sys/net/flowtable.c @@ -1,31 +1,30 @@ -/************************************************************************** - -Copyright (c) 2008-2010, BitGravity Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - 2. Neither the name of the BitGravity Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - -***************************************************************************/ +/*- + * Copyright (c) 2014 Gleb Smirnoff <glebius@FreeBSD.org> + * Copyright (c) 2008-2010, BitGravity Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Neither the name of the BitGravity Corporation nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ #include "opt_route.h" #include "opt_mpath.h" @@ -36,29 +35,32 @@ POSSIBILITY OF SUCH DAMAGE. #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); -#include <sys/param.h> +#include <sys/param.h> #include <sys/types.h> #include <sys/bitstring.h> #include <sys/condvar.h> #include <sys/callout.h> #include <sys/hash.h> -#include <sys/kernel.h> +#include <sys/kernel.h> #include <sys/kthread.h> #include <sys/limits.h> #include <sys/malloc.h> #include <sys/mbuf.h> +#include <sys/pcpu.h> #include <sys/proc.h> +#include <sys/queue.h> #include <sys/sbuf.h> #include <sys/sched.h> #include <sys/smp.h> #include <sys/socket.h> #include <sys/syslog.h> #include <sys/sysctl.h> +#include <vm/uma.h> #include <net/if.h> #include <net/if_llatbl.h> #include <net/if_var.h> -#include <net/route.h> +#include <net/route.h> #include <net/flowtable.h> #include <net/vnet.h> @@ -70,156 +72,79 @@ __FBSDID("$FreeBSD$"); #ifdef INET6 #include <netinet/ip6.h> #endif +#ifdef FLOWTABLE_HASH_ALL #include <netinet/tcp.h> #include <netinet/udp.h> #include <netinet/sctp.h> +#endif #include <ddb/ddb.h> -struct ipv4_tuple { - uint16_t ip_sport; /* source port */ - uint16_t ip_dport; /* destination port */ - in_addr_t ip_saddr; /* source address */ - in_addr_t ip_daddr; /* destination address */ -}; - -union ipv4_flow { - struct ipv4_tuple ipf_ipt; - uint32_t ipf_key[3]; -}; +#ifdef FLOWTABLE_HASH_ALL +#define KEY_PORTS (sizeof(uint16_t) * 2) +#define KEY_ADDRS 2 +#else +#define KEY_PORTS 0 +#define KEY_ADDRS 1 +#endif -struct ipv6_tuple { - uint16_t ip_sport; /* source port */ - uint16_t ip_dport; /* destination port */ - struct in6_addr ip_saddr; /* source address */ - struct in6_addr ip_daddr; /* destination address */ -}; +#ifdef INET6 +#define KEY_ADDR_LEN sizeof(struct in6_addr) +#else +#define KEY_ADDR_LEN sizeof(struct in_addr) +#endif -union ipv6_flow { - struct ipv6_tuple ipf_ipt; - uint32_t ipf_key[9]; -}; +#define KEYLEN ((KEY_ADDR_LEN * KEY_ADDRS + KEY_PORTS) / sizeof(uint32_t)) struct flentry { - volatile uint32_t f_fhash; /* hash flowing forward */ - uint16_t f_flags; /* flow flags */ - uint8_t f_pad; - uint8_t f_proto; /* protocol */ - uint32_t f_fibnum; /* fib index */ + uint32_t f_hash; /* hash flowing forward */ + uint32_t f_key[KEYLEN]; /* address(es and ports) */ uint32_t f_uptime; /* uptime at last access */ - struct flentry *f_next; /* pointer to collision entry */ - volatile struct rtentry *f_rt; /* rtentry for flow */ - volatile struct llentry *f_lle; /* llentry for flow */ -}; - -struct flentry_v4 { - struct flentry fl_entry; - union ipv4_flow fl_flow; -}; - -struct flentry_v6 { - struct flentry fl_entry; - union ipv6_flow fl_flow; -}; - -#define fl_fhash fl_entry.fl_fhash -#define fl_flags fl_entry.fl_flags -#define fl_proto fl_entry.fl_proto -#define fl_uptime fl_entry.fl_uptime -#define fl_rt fl_entry.fl_rt -#define fl_lle fl_entry.fl_lle - -#define SECS_PER_HOUR 3600 -#define SECS_PER_DAY (24*SECS_PER_HOUR) - -#define SYN_IDLE 300 -#define UDP_IDLE 300 -#define FIN_WAIT_IDLE 600 -#define TCP_IDLE SECS_PER_DAY - - -typedef void fl_lock_t(struct flowtable *, uint32_t); -typedef void fl_rtalloc_t(struct route *, uint32_t, u_int); - -union flentryp { - struct flentry **global; - struct flentry **pcpu[MAXCPU]; + uint16_t f_fibnum; /* fib index */ +#ifdef FLOWTABLE_HASH_ALL + uint8_t f_proto; /* protocol */ + uint8_t f_flags; /* stale? */ +#define FL_STALE 1 +#endif + SLIST_ENTRY(flentry) f_next; /* pointer to collision entry */ + struct rtentry *f_rt; /* rtentry for flow */ + struct llentry *f_lle; /* llentry for flow */ }; +#undef KEYLEN -struct flowtable_stats { - uint64_t ft_collisions; - uint64_t ft_allocated; - uint64_t ft_misses; - uint64_t ft_max_depth; - uint64_t ft_free_checks; - uint64_t ft_frees; - uint64_t ft_hits; - uint64_t ft_lookups; -} __aligned(CACHE_LINE_SIZE); +SLIST_HEAD(flist, flentry); +/* Make sure we can use pcpu_zone_ptr for struct flist. */ +CTASSERT(sizeof(struct flist) == sizeof(void *)); struct flowtable { - struct flowtable_stats ft_stats[MAXCPU]; + counter_u64_t *ft_stat; int ft_size; - int ft_lock_count; - uint32_t ft_flags; - char *ft_name; - fl_lock_t *ft_lock; - fl_lock_t *ft_unlock; - fl_rtalloc_t *ft_rtalloc; /* - * XXX need to pad out - */ - struct mtx *ft_locks; - union flentryp ft_table; - bitstr_t *ft_masks[MAXCPU]; + * ft_table is a malloc(9)ed array of pointers. Pointers point to + * memory from UMA_ZONE_PCPU zone. + * ft_masks is per-cpu pointer itself. Each instance points + * to a malloc(9)ed bitset, that is private to corresponding CPU. + */ + struct flist **ft_table; + bitstr_t **ft_masks; bitstr_t *ft_tmpmask; - struct flowtable *ft_next; +}; - uint32_t ft_count __aligned(CACHE_LINE_SIZE); - uint32_t ft_udp_idle __aligned(CACHE_LINE_SIZE); - uint32_t ft_fin_wait_idle; - uint32_t ft_syn_idle; - uint32_t ft_tcp_idle; - boolean_t ft_full; -} __aligned(CACHE_LINE_SIZE); +#define FLOWSTAT_ADD(ft, name, v) \ + counter_u64_add((ft)->ft_stat[offsetof(struct flowtable_stat, name) / sizeof(uint64_t)], (v)) +#define FLOWSTAT_INC(ft, name) FLOWSTAT_ADD(ft, name, 1) static struct proc *flowcleanerproc; -static VNET_DEFINE(struct flowtable *, flow_list_head); -static VNET_DEFINE(uint32_t, flow_hashjitter); -static VNET_DEFINE(uma_zone_t, flow_ipv4_zone); -static VNET_DEFINE(uma_zone_t, flow_ipv6_zone); - -#define V_flow_list_head VNET(flow_list_head) -#define V_flow_hashjitter VNET(flow_hashjitter) -#define V_flow_ipv4_zone VNET(flow_ipv4_zone) -#define V_flow_ipv6_zone VNET(flow_ipv6_zone) - +static uint32_t flow_hashjitter; static struct cv flowclean_f_cv; static struct cv flowclean_c_cv; static struct mtx flowclean_lock; static uint32_t flowclean_cycles; -static uint32_t flowclean_freq; - -#ifdef FLOWTABLE_DEBUG -#define FLDPRINTF(ft, flags, fmt, ...) \ -do { \ - if ((ft)->ft_flags & (flags)) \ - printf((fmt), __VA_ARGS__); \ -} while (0); \ - -#else -#define FLDPRINTF(ft, flags, fmt, ...) - -#endif - /* * TODO: - * - Make flowtable stats per-cpu, aggregated at sysctl call time, - * to avoid extra cache evictions caused by incrementing a shared - * counter - * - add sysctls to resize && flush flow tables + * - add sysctls to resize && flush flow tables * - Add per flowtable sysctls for statistics and configuring timeouts * - add saturation counter to rtentry to support per-packet load-balancing * add flag to indicate round-robin flow, add list lookup from head @@ -230,396 +155,117 @@ do { \ * - support explicit connection state (currently only ad-hoc for DSR) * - idetach() cleanup for options VIMAGE builds. */ -VNET_DEFINE(int, flowtable_enable) = 1; -static VNET_DEFINE(int, flowtable_debug); -static VNET_DEFINE(int, flowtable_syn_expire) = SYN_IDLE; -static VNET_DEFINE(int, flowtable_udp_expire) = UDP_IDLE; -static VNET_DEFINE(int, flowtable_fin_wait_expire) = FIN_WAIT_IDLE; -static VNET_DEFINE(int, flowtable_tcp_expire) = TCP_IDLE; -static VNET_DEFINE(int, flowtable_nmbflows); -static VNET_DEFINE(int, flowtable_ready) = 0; - -#define V_flowtable_enable VNET(flowtable_enable) -#define V_flowtable_debug VNET(flowtable_debug) -#define V_flowtable_syn_expire VNET(flowtable_syn_expire) -#define V_flowtable_udp_expire VNET(flowtable_udp_expire) -#define V_flowtable_fin_wait_expire VNET(flowtable_fin_wait_expire) -#define V_flowtable_tcp_expire VNET(flowtable_tcp_expire) -#define V_flowtable_nmbflows VNET(flowtable_nmbflows) -#define V_flowtable_ready VNET(flowtable_ready) - -static SYSCTL_NODE(_net_inet, OID_AUTO, flowtable, CTLFLAG_RD, NULL, - "flowtable"); -SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, debug, CTLFLAG_RW, - &VNET_NAME(flowtable_debug), 0, "print debug info."); -SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, enable, CTLFLAG_RW, - &VNET_NAME(flowtable_enable), 0, "enable flowtable caching."); - -/* - * XXX This does not end up updating timeouts at runtime - * and only reflects the value for the last table added :-/ - */ -SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, syn_expire, CTLFLAG_RW, - &VNET_NAME(flowtable_syn_expire), 0, - "seconds after which to remove syn allocated flow."); -SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, udp_expire, CTLFLAG_RW, - &VNET_NAME(flowtable_udp_expire), 0, - "seconds after which to remove flow allocated to UDP."); -SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, fin_wait_expire, CTLFLAG_RW, - &VNET_NAME(flowtable_fin_wait_expire), 0, - "seconds after which to remove a flow in FIN_WAIT."); -SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, tcp_expire, CTLFLAG_RW, - &VNET_NAME(flowtable_tcp_expire), 0, - "seconds after which to remove flow allocated to a TCP connection."); - - -/* - * Maximum number of flows that can be allocated of a given type. - * - * The table is allocated at boot time (for the pure caching case - * there is no reason why this could not be changed at runtime) - * and thus (currently) needs to be set with a tunable. - */ -static int -sysctl_nmbflows(SYSCTL_HANDLER_ARGS) -{ - int error, newnmbflows; - - newnmbflows = V_flowtable_nmbflows; - error = sysctl_handle_int(oidp, &newnmbflows, 0, req); - if (error == 0 && req->newptr) { - if (newnmbflows > V_flowtable_nmbflows) { - V_flowtable_nmbflows = newnmbflows; - uma_zone_set_max(V_flow_ipv4_zone, - V_flowtable_nmbflows); - uma_zone_set_max(V_flow_ipv6_zone, - V_flowtable_nmbflows); - } else - error = EINVAL; - } - return (error); -} -SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, nmbflows, - CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_nmbflows, "IU", - "Maximum number of flows allowed"); - - - -#define FS_PRINT(sb, field) sbuf_printf((sb), "\t%s: %jd\n", #field, fs->ft_##field) - -static void -fs_print(struct sbuf *sb, struct flowtable_stats *fs) -{ - - FS_PRINT(sb, collisions); - FS_PRINT(sb, allocated); - FS_PRINT(sb, misses); - FS_PRINT(sb, max_depth); - FS_PRINT(sb, free_checks); - FS_PRINT(sb, frees); - FS_PRINT(sb, hits); - FS_PRINT(sb, lookups); -} - -static void -flowtable_show_stats(struct sbuf *sb, struct flowtable *ft) -{ - int i; - struct flowtable_stats fs, *pfs; - - if (ft->ft_flags & FL_PCPU) { - bzero(&fs, sizeof(fs)); - pfs = &fs; - CPU_FOREACH(i) { - pfs->ft_collisions += ft->ft_stats[i].ft_collisions; - pfs->ft_allocated += ft->ft_stats[i].ft_allocated; - pfs->ft_misses += ft->ft_stats[i].ft_misses; - pfs->ft_free_checks += ft->ft_stats[i].ft_free_checks; - pfs->ft_frees += ft->ft_stats[i].ft_frees; - pfs->ft_hits += ft->ft_stats[i].ft_hits; - pfs->ft_lookups += ft->ft_stats[i].ft_lookups; - if (ft->ft_stats[i].ft_max_depth > pfs->ft_max_depth) - pfs->ft_max_depth = ft->ft_stats[i].ft_max_depth; - } - } else { - pfs = &ft->ft_stats[0]; - } - fs_print(sb, pfs); -} - -static int -sysctl_flowtable_stats(SYSCTL_HANDLER_ARGS) -{ - struct flowtable *ft; - struct sbuf *sb; - int error; - - sb = sbuf_new(NULL, NULL, 64*1024, SBUF_FIXEDLEN); - - ft = V_flow_list_head; - while (ft != NULL) { - sbuf_printf(sb, "\ntable name: %s\n", ft->ft_name); - flowtable_show_stats(sb, ft); - ft = ft->ft_next; - } - sbuf_finish(sb); - error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1); - sbuf_delete(sb); - - return (error); -} -SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, stats, CTLTYPE_STRING|CTLFLAG_RD, - NULL, 0, sysctl_flowtable_stats, "A", "flowtable statistics"); - - -#ifndef RADIX_MPATH -static void -rtalloc_ign_wrapper(struct route *ro, uint32_t hash, u_int fibnum) -{ - - rtalloc_ign_fib(ro, 0, fibnum); -} +#ifdef INET +static VNET_DEFINE(struct flowtable, ip4_ft); +#define V_ip4_ft VNET(ip4_ft) +#endif +#ifdef INET6 +static VNET_DEFINE(struct flowtable, ip6_ft); +#define V_ip6_ft VNET(ip6_ft) #endif -static void -flowtable_global_lock(struct flowtable *table, uint32_t hash) -{ - int lock_index = (hash)&(table->ft_lock_count - 1); - - mtx_lock(&table->ft_locks[lock_index]); -} - -static void -flowtable_global_unlock(struct flowtable *table, uint32_t hash) -{ - int lock_index = (hash)&(table->ft_lock_count - 1); - - mtx_unlock(&table->ft_locks[lock_index]); -} - -static void -flowtable_pcpu_lock(struct flowtable *table, uint32_t hash) -{ - - critical_enter(); -} - -static void -flowtable_pcpu_unlock(struct flowtable *table, uint32_t hash) -{ - - critical_exit(); -} - -#define FL_ENTRY_INDEX(table, hash)((hash) % (table)->ft_size) -#define FL_ENTRY(table, hash) *flowtable_entry((table), (hash)) -#define FL_ENTRY_LOCK(table, hash) (table)->ft_lock((table), (hash)) -#define FL_ENTRY_UNLOCK(table, hash) (table)->ft_unlock((table), (hash)) - -#define FL_STALE (1<<8) -#define FL_OVERWRITE (1<<10) - -void -flow_invalidate(struct flentry *fle) -{ - - fle->f_flags |= FL_STALE; -} - -static __inline int -proto_to_flags(uint8_t proto) -{ - int flag; +static uma_zone_t flow_zone; - switch (proto) { - case IPPROTO_TCP: - flag = FL_TCP; - break; - case IPPROTO_SCTP: - flag = FL_SCTP; - break; - case IPPROTO_UDP: - flag = FL_UDP; - break; - default: - flag = 0; - break; - } +static VNET_DEFINE(int, flowtable_enable) = 1; +#define V_flowtable_enable VNET(flowtable_enable) - return (flag); -} +static SYSCTL_NODE(_net, OID_AUTO, flowtable, CTLFLAG_RD, NULL, + "flowtable"); +SYSCTL_VNET_INT(_net_flowtable, OID_AUTO, enable, CTLFLAG_RW, + &VNET_NAME(flowtable_enable), 0, "enable flowtable caching."); +SYSCTL_UMA_MAX(_net_flowtable, OID_AUTO, maxflows, CTLFLAG_RW, + &flow_zone, "Maximum number of flows allowed"); -static __inline int -flags_to_proto(int flags) -{ - int proto, protoflags; +static MALLOC_DEFINE(M_FTABLE, "flowtable", "flowtable hashes and bitstrings"); - protoflags = flags & (FL_TCP|FL_SCTP|FL_UDP); - switch (protoflags) { - case FL_TCP: - proto = IPPROTO_TCP; - break; - case FL_SCTP: - proto = IPPROTO_SCTP; - break; - case FL_UDP: - proto = IPPROTO_UDP; - break; - default: - proto = 0; - break; - } - return (proto); -} +static struct flentry * +flowtable_lookup_common(struct flowtable *, uint32_t *, int, uint32_t); #ifdef INET -#ifdef FLOWTABLE_DEBUG -static void -ipv4_flow_print_tuple(int flags, int proto, struct sockaddr_in *ssin, - struct sockaddr_in *dsin) -{ - char saddr[4*sizeof "123"], daddr[4*sizeof "123"]; - - if (flags & FL_HASH_ALL) { - inet_ntoa_r(ssin->sin_addr, saddr); - inet_ntoa_r(dsin->sin_addr, daddr); - printf("proto=%d %s:%d->%s:%d\n", - proto, saddr, ntohs(ssin->sin_port), daddr, - ntohs(dsin->sin_port)); - } else { - inet_ntoa_r(*(struct in_addr *) &dsin->sin_addr, daddr); - printf("proto=%d %s\n", proto, daddr); - } - -} -#endif - -static int -ipv4_mbuf_demarshal(struct flowtable *ft, struct mbuf *m, - struct sockaddr_in *ssin, struct sockaddr_in *dsin, uint16_t *flags) +static struct flentry * +flowtable_lookup_ipv4(struct mbuf *m, struct route *ro) { + struct flentry *fle; + struct sockaddr_in *sin; struct ip *ip; - uint8_t proto; + uint32_t fibnum; +#ifdef FLOWTABLE_HASH_ALL + uint32_t key[3]; int iphlen; - struct tcphdr *th; - struct udphdr *uh; - struct sctphdr *sh; uint16_t sport, dport; + uint8_t proto; +#endif - proto = sport = dport = 0; ip = mtod(m, struct ip *); - dsin->sin_family = AF_INET; - dsin->sin_len = sizeof(*dsin); - dsin->sin_addr = ip->ip_dst; - ssin->sin_family = AF_INET; - ssin->sin_len = sizeof(*ssin); - ssin->sin_addr = ip->ip_src; - proto = ip->ip_p; - if ((*flags & FL_HASH_ALL) == 0) { - FLDPRINTF(ft, FL_DEBUG_ALL, "skip port check flags=0x%x ", - *flags); - goto skipports; - } + if (ip->ip_src.s_addr == ip->ip_dst.s_addr || + (ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || + (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) + return (NULL); - iphlen = ip->ip_hl << 2; /* XXX options? */ + fibnum = M_GETFIB(m); + +#ifdef FLOWTABLE_HASH_ALL + iphlen = ip->ip_hl << 2; + proto = ip->ip_p; switch (proto) { - case IPPROTO_TCP: - th = (struct tcphdr *)((caddr_t)ip + iphlen); + case IPPROTO_TCP: { + struct tcphdr *th; + + th = (struct tcphdr *)((char *)ip + iphlen); sport = th->th_sport; dport = th->th_dport; - if ((*flags & FL_HASH_ALL) && - (th->th_flags & (TH_RST|TH_FIN))) - *flags |= FL_STALE; - break; - case IPPROTO_UDP: - uh = (struct udphdr *)((caddr_t)ip + iphlen); + if (th->th_flags & (TH_RST|TH_FIN)) + fibnum |= (FL_STALE << 24); + break; + } + case IPPROTO_UDP: { + struct udphdr *uh; + + uh = (struct udphdr *)((char *)ip + iphlen); sport = uh->uh_sport; dport = uh->uh_dport; - break; - case IPPROTO_SCTP: - sh = (struct sctphdr *)((caddr_t)ip + iphlen); + break; + } + case IPPROTO_SCTP: { + struct sctphdr *sh; + + sh = (struct sctphdr *)((char *)ip + iphlen); sport = sh->src_port; dport = sh->dest_port; - break; + /* XXXGL: handle stale? */ + break; + } default: - FLDPRINTF(ft, FL_DEBUG_ALL, "proto=0x%x not supported\n", proto); - return (ENOTSUP); - /* no port - hence not a protocol we care about */ + sport = dport = 0; break; - } -skipports: - *flags |= proto_to_flags(proto); - ssin->sin_port = sport; - dsin->sin_port = dport; - return (0); -} + key[0] = ip->ip_dst.s_addr; + key[1] = ip->ip_src.s_addr; + key[2] = (dport << 16) | sport; + fibnum |= proto << 16; -static uint32_t -ipv4_flow_lookup_hash_internal( - struct sockaddr_in *ssin, struct sockaddr_in *dsin, - uint32_t *key, uint16_t flags) -{ - uint16_t sport, dport; - uint8_t proto; - int offset = 0; - - if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0)) - return (0); - proto = flags_to_proto(flags); - sport = dport = key[2] = key[1] = key[0] = 0; - if ((ssin != NULL) && (flags & FL_HASH_ALL)) { - key[1] = ssin->sin_addr.s_addr; - sport = ssin->sin_port; - } - if (dsin != NULL) { - key[2] = dsin->sin_addr.s_addr; - dport = dsin->sin_port; - } - if (flags & FL_HASH_ALL) { - ((uint16_t *)key)[0] = sport; - ((uint16_t *)key)[1] = dport; - } else - offset = V_flow_hashjitter + proto; + fle = flowtable_lookup_common(&V_ip4_ft, key, 3 * sizeof(uint32_t), + fibnum); - return (jenkins_hash32(key, 3, offset)); -} +#else /* !FLOWTABLE_HASH_ALL */ -static struct flentry * -flowtable_lookup_mbuf4(struct flowtable *ft, struct mbuf *m) -{ - struct sockaddr_storage ssa, dsa; - uint16_t flags; - struct sockaddr_in *dsin, *ssin; - - dsin = (struct sockaddr_in *)&dsa; - ssin = (struct sockaddr_in *)&ssa; - bzero(dsin, sizeof(*dsin)); - bzero(ssin, sizeof(*ssin)); - flags = ft->ft_flags; - if (ipv4_mbuf_demarshal(ft, m, ssin, dsin, &flags) != 0) - return (NULL); + fle = flowtable_lookup_common(&V_ip4_ft, (uint32_t *)&ip->ip_dst, + sizeof(struct in_addr), fibnum); - return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags)); -} +#endif /* FLOWTABLE_HASH_ALL */ -void -flow_to_route(struct flentry *fle, struct route *ro) -{ - uint32_t *hashkey = NULL; - struct sockaddr_in *sin; + if (fle == NULL) + return (NULL); sin = (struct sockaddr_in *)&ro->ro_dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); - hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key; - sin->sin_addr.s_addr = hashkey[2]; - ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt); - ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle); - ro->ro_flags |= RT_NORTREF; + sin->sin_addr = ip->ip_dst; + + return (fle); } #endif /* INET */ @@ -633,9 +279,8 @@ flow_to_route(struct flentry *fle, struct route *ro) #define PULLUP_TO(_len, p, T) \ do { \ int x = (_len) + sizeof(T); \ - if ((m)->m_len < x) { \ - goto receive_failed; \ - } \ + if ((m)->m_len < x) \ + return (NULL); \ p = (mtod(m, char *) + (_len)); \ } while (0) @@ -643,26 +288,35 @@ do { \ #define SCTP(p) ((struct sctphdr *)(p)) #define UDP(p) ((struct udphdr *)(p)) -static int -ipv6_mbuf_demarshal(struct flowtable *ft, struct mbuf *m, - struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6, uint16_t *flags) +static struct flentry * +flowtable_lookup_ipv6(struct mbuf *m, struct route *ro) { + struct flentry *fle; + struct sockaddr_in6 *sin6; struct ip6_hdr *ip6; - uint8_t proto; + uint32_t fibnum; +#ifdef FLOWTABLE_HASH_ALL + uint32_t key[9]; + void *ulp; int hlen; - uint16_t src_port, dst_port; + uint16_t sport, dport; u_short offset; - void *ulp; + uint8_t proto; +#else + uint32_t key[4]; +#endif - offset = hlen = src_port = dst_port = 0; - ulp = NULL; ip6 = mtod(m, struct ip6_hdr *); - hlen = sizeof(struct ip6_hdr); - proto = ip6->ip6_nxt; + if (in6_localaddr(&ip6->ip6_dst)) + return (NULL); - if ((*flags & FL_HASH_ALL) == 0) - goto skipports; + fibnum = M_GETFIB(m); +#ifdef FLOWTABLE_HASH_ALL + hlen = sizeof(struct ip6_hdr); + proto = ip6->ip6_nxt; + offset = sport = dport = 0; + ulp = NULL; while (ulp == NULL) { switch (proto) { case IPPROTO_ICMPV6: @@ -675,21 +329,21 @@ ipv6_mbuf_demarshal(struct flowtable *ft, struct mbuf *m, break; case IPPROTO_TCP: PULLUP_TO(hlen, ulp, struct tcphdr); - dst_port = TCP(ulp)->th_dport; - src_port = TCP(ulp)->th_sport; - if ((*flags & FL_HASH_ALL) && - (TCP(ulp)->th_flags & (TH_RST|TH_FIN))) - *flags |= FL_STALE; + dport = TCP(ulp)->th_dport; + sport = TCP(ulp)->th_sport; + if (TCP(ulp)->th_flags & (TH_RST|TH_FIN)) + fibnum |= (FL_STALE << 24); break; case IPPROTO_SCTP: PULLUP_TO(hlen, ulp, struct sctphdr); - src_port = SCTP(ulp)->src_port; - dst_port = SCTP(ulp)->dest_port; + dport = SCTP(ulp)->src_port; + sport = SCTP(ulp)->dest_port; + /* XXXGL: handle stale? */ break; case IPPROTO_UDP: PULLUP_TO(hlen, ulp, struct udphdr); - dst_port = UDP(ulp)->uh_dport; - src_port = UDP(ulp)->uh_sport; + dport = UDP(ulp)->uh_dport; + sport = UDP(ulp)->uh_sport; break; case IPPROTO_HOPOPTS: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_hbh); @@ -698,7 +352,7 @@ ipv6_mbuf_demarshal(struct flowtable *ft, struct mbuf *m, ulp = NULL; break; case IPPROTO_ROUTING: /* RFC 2460 */ - PULLUP_TO(hlen, ulp, struct ip6_rthdr); + PULLUP_TO(hlen, ulp, struct ip6_rthdr); hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3; proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt; ulp = NULL; @@ -729,689 +383,395 @@ ipv6_mbuf_demarshal(struct flowtable *ft, struct mbuf *m, } } - if (src_port == 0) { - receive_failed: - return (ENOTSUP); - } - -skipports: - dsin6->sin6_family = AF_INET6; - dsin6->sin6_len = sizeof(*dsin6); - dsin6->sin6_port = dst_port; - memcpy(&dsin6->sin6_addr, &ip6->ip6_dst, sizeof(struct in6_addr)); + bcopy(&ip6->ip6_dst, &key[0], sizeof(struct in6_addr)); + bcopy(&ip6->ip6_src, &key[4], sizeof(struct in6_addr)); + key[8] = (dport << 16) | sport; + fibnum |= proto << 16; - ssin6->sin6_family = AF_INET6; - ssin6->sin6_len = sizeof(*ssin6); - ssin6->sin6_port = src_port; - memcpy(&ssin6->sin6_addr, &ip6->ip6_src, sizeof(struct in6_addr)); - *flags |= proto_to_flags(proto); + fle = flowtable_lookup_common(&V_ip6_ft, key, 9 * sizeof(uint32_t), + fibnum); +#else /* !FLOWTABLE_HASH_ALL */ + bcopy(&ip6->ip6_dst, &key[0], sizeof(struct in6_addr)); + fle = flowtable_lookup_common(&V_ip6_ft, key, sizeof(struct in6_addr), + fibnum); +#endif /* FLOWTABLE_HASH_ALL */ - return (0); -} - -#define zero_key(key) \ -do { \ - key[0] = 0; \ - key[1] = 0; \ - key[2] = 0; \ - key[3] = 0; \ - key[4] = 0; \ - key[5] = 0; \ - key[6] = 0; \ - key[7] = 0; \ - key[8] = 0; \ -} while (0) - -static uint32_t -ipv6_flow_lookup_hash_internal( - struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6, - uint32_t *key, uint16_t flags) -{ - uint16_t sport, dport; - uint8_t proto; - int offset = 0; - - if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0)) - return (0); - - proto = flags_to_proto(flags); - zero_key(key); - sport = dport = 0; - if (dsin6 != NULL) { - memcpy(&key[1], &dsin6->sin6_addr, sizeof(struct in6_addr)); - dport = dsin6->sin6_port; - } - if ((ssin6 != NULL) && (flags & FL_HASH_ALL)) { - memcpy(&key[5], &ssin6->sin6_addr, sizeof(struct in6_addr)); - sport = ssin6->sin6_port; - } - if (flags & FL_HASH_ALL) { - ((uint16_t *)key)[0] = sport; - ((uint16_t *)key)[1] = dport; - } else - offset = V_flow_hashjitter + proto; - - return (jenkins_hash32(key, 9, offset)); -} - -static struct flentry * -flowtable_lookup_mbuf6(struct flowtable *ft, struct mbuf *m) -{ - struct sockaddr_storage ssa, dsa; - struct sockaddr_in6 *dsin6, *ssin6; - uint16_t flags; - - dsin6 = (struct sockaddr_in6 *)&dsa; - ssin6 = (struct sockaddr_in6 *)&ssa; - bzero(dsin6, sizeof(*dsin6)); - bzero(ssin6, sizeof(*ssin6)); - flags = ft->ft_flags; - - if (ipv6_mbuf_demarshal(ft, m, ssin6, dsin6, &flags) != 0) + if (fle == NULL) return (NULL); - return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags)); -} - -void -flow_to_route_in6(struct flentry *fle, struct route_in6 *ro) -{ - uint32_t *hashkey = NULL; - struct sockaddr_in6 *sin6; - sin6 = (struct sockaddr_in6 *)&ro->ro_dst; - sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(*sin6); - hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key; - memcpy(&sin6->sin6_addr, &hashkey[5], sizeof (struct in6_addr)); - ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt); - ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle); - ro->ro_flags |= RT_NORTREF; + bcopy(&ip6->ip6_dst, &sin6->sin6_addr, sizeof(struct in6_addr)); + + return (fle); } #endif /* INET6 */ static bitstr_t * flowtable_mask(struct flowtable *ft) { - bitstr_t *mask; - if (ft->ft_flags & FL_PCPU) - mask = ft->ft_masks[curcpu]; - else - mask = ft->ft_masks[0]; + /* + * flowtable_free_stale() calls w/o critical section, but + * with sched_bind(). Since pointer is stable throughout + * ft lifetime, it is safe, otherwise... + * + * CRITICAL_ASSERT(curthread); + */ - return (mask); + return (*(bitstr_t **)zpcpu_get(ft->ft_masks)); } -static struct flentry ** -flowtable_entry(struct flowtable *ft, uint32_t hash) +static struct flist * +flowtable_list(struct flowtable *ft, uint32_t hash) { - struct flentry **fle; - int index = (hash % ft->ft_size); - if (ft->ft_flags & FL_PCPU) { - KASSERT(&ft->ft_table.pcpu[curcpu][0] != NULL, ("pcpu not set")); - fle = &ft->ft_table.pcpu[curcpu][index]; - } else { - KASSERT(&ft->ft_table.global[0] != NULL, ("global not set")); - fle = &ft->ft_table.global[index]; - } - - return (fle); + CRITICAL_ASSERT(curthread); + return (zpcpu_get(ft->ft_table[hash % ft->ft_size])); } static int -flow_stale(struct flowtable *ft, struct flentry *fle) +flow_stale(struct flowtable *ft, struct flentry *fle, int maxidle) { - time_t idle_time; - - if ((fle->f_fhash == 0) - || ((fle->f_rt->rt_flags & RTF_HOST) && - ((fle->f_rt->rt_flags & (RTF_UP)) - != (RTF_UP))) - || (fle->f_rt->rt_ifp == NULL) - || !RT_LINK_IS_UP(fle->f_rt->rt_ifp)) - return (1); - idle_time = time_uptime - fle->f_uptime; - - if ((fle->f_flags & FL_STALE) || - ((fle->f_flags & (TH_SYN|TH_ACK|TH_FIN)) == 0 - && (idle_time > ft->ft_udp_idle)) || - ((fle->f_flags & TH_FIN) - && (idle_time > ft->ft_fin_wait_idle)) || - ((fle->f_flags & (TH_SYN|TH_ACK)) == TH_SYN - && (idle_time > ft->ft_syn_idle)) || - ((fle->f_flags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK) - && (idle_time > ft->ft_tcp_idle)) || - ((fle->f_rt->rt_flags & RTF_UP) == 0 || - (fle->f_rt->rt_ifp == NULL))) + if (((fle->f_rt->rt_flags & RTF_HOST) && + ((fle->f_rt->rt_flags & (RTF_UP)) != (RTF_UP))) || + (fle->f_rt->rt_ifp == NULL) || + !RT_LINK_IS_UP(fle->f_rt->rt_ifp) || + (fle->f_lle->la_flags & LLE_VALID) == 0) return (1); - return (0); -} + if (time_uptime - fle->f_uptime > maxidle) + return (1); -static void -flowtable_set_hashkey(struct flentry *fle, uint32_t *key) -{ - uint32_t *hashkey; - int i, nwords; +#ifdef FLOWTABLE_HASH_ALL + if (fle->f_flags & FL_STALE) + return (1); +#endif - if (fle->f_flags & FL_IPV6) { - nwords = 9; - hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key; - } else { - nwords = 3; - hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key; - } - - for (i = 0; i < nwords; i++) - hashkey[i] = key[i]; + return (0); } -static struct flentry * -flow_alloc(struct flowtable *ft) +static int +flow_full(void) { - struct flentry *newfle; - uma_zone_t zone; + int count, max; - newfle = NULL; - zone = (ft->ft_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone; + count = uma_zone_get_cur(flow_zone); + max = uma_zone_get_max(flow_zone); - newfle = uma_zalloc(zone, M_NOWAIT | M_ZERO); - if (newfle != NULL) - atomic_add_int(&ft->ft_count, 1); - return (newfle); + return (count > (max - (max >> 3))); } -static void -flow_free(struct flentry *fle, struct flowtable *ft) +static int +flow_matches(struct flentry *fle, uint32_t *key, int keylen, uint32_t fibnum) { - uma_zone_t zone; +#ifdef FLOWTABLE_HASH_ALL + uint8_t proto; - zone = (ft->ft_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone; - atomic_add_int(&ft->ft_count, -1); - uma_zfree(zone, fle); -} + proto = (fibnum >> 16) & 0xff; + fibnum &= 0xffff; +#endif -static int -flow_full(struct flowtable *ft) -{ - boolean_t full; - uint32_t count; - - full = ft->ft_full; - count = ft->ft_count; - - if (full && (count < (V_flowtable_nmbflows - (V_flowtable_nmbflows >> 3)))) - ft->ft_full = FALSE; - else if (!full && (count > (V_flowtable_nmbflows - (V_flowtable_nmbflows >> 5)))) - ft->ft_full = TRUE; - - if (full && !ft->ft_full) { - flowclean_freq = 4*hz; - if ((ft->ft_flags & FL_HASH_ALL) == 0) - ft->ft_udp_idle = ft->ft_fin_wait_idle = - ft->ft_syn_idle = ft->ft_tcp_idle = 5; - cv_broadcast(&flowclean_c_cv); - } else if (!full && ft->ft_full) { - flowclean_freq = 20*hz; - if ((ft->ft_flags & FL_HASH_ALL) == 0) - ft->ft_udp_idle = ft->ft_fin_wait_idle = - ft->ft_syn_idle = ft->ft_tcp_idle = 30; - } + CRITICAL_ASSERT(curthread); - return (ft->ft_full); + /* Microoptimization for IPv4: don't use bcmp(). */ + if (((keylen == sizeof(uint32_t) && (fle->f_key[0] != key[0])) || + (bcmp(fle->f_key, key, keylen) == 0)) && + fibnum == fle->f_fibnum && +#ifdef FLOWTABLE_HASH_ALL + proto == fle->f_proto && +#endif + (fle->f_rt->rt_flags & RTF_UP) && + fle->f_rt->rt_ifp != NULL && + (fle->f_lle->la_flags & LLE_VALID)) + return (1); + + return (0); } -static int +static struct flentry * flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key, - uint32_t fibnum, struct route *ro, uint16_t flags) + int keylen, uint32_t fibnum0) { - struct flentry *fle, *fletail, *newfle, **flep; - struct flowtable_stats *fs = &ft->ft_stats[curcpu]; - int depth; +#ifdef INET6 + struct route_in6 sro6; +#endif +#ifdef INET + struct route sro; +#endif + struct route *ro = NULL; + struct rtentry *rt; + struct lltable *lt = NULL; + struct llentry *lle; + struct sockaddr_storage *l3addr; + struct ifnet *ifp; + struct flist *flist; + struct flentry *fle, *iter; bitstr_t *mask; + uint16_t fibnum = fibnum0; +#ifdef FLOWTABLE_HASH_ALL uint8_t proto; - newfle = flow_alloc(ft); - if (newfle == NULL) - return (ENOMEM); - - newfle->f_flags |= (flags & FL_IPV6); - proto = flags_to_proto(flags); - - FL_ENTRY_LOCK(ft, hash); - mask = flowtable_mask(ft); - flep = flowtable_entry(ft, hash); - fletail = fle = *flep; + proto = (fibnum0 >> 16) & 0xff; + fibnum = fibnum0 & 0xffff; +#endif - if (fle == NULL) { - bit_set(mask, FL_ENTRY_INDEX(ft, hash)); - *flep = fle = newfle; - goto skip; - } - - depth = 0; - fs->ft_collisions++; /* - * find end of list and make sure that we were not - * preempted by another thread handling this flow + * This bit of code ends up locking the + * same route 3 times (just like ip_output + ether_output) + * - at lookup + * - in rt_check when called by arpresolve + * - dropping the refcount for the rtentry + * + * This could be consolidated to one if we wrote a variant + * of arpresolve with an rt_check variant that expected to + * receive the route locked */ - while (fle != NULL) { - if (fle->f_fhash == hash && !flow_stale(ft, fle)) { - /* - * there was either a hash collision - * or we lost a race to insert - */ - FL_ENTRY_UNLOCK(ft, hash); - flow_free(newfle, ft); - - if (flags & FL_OVERWRITE) - goto skip; - return (EEXIST); - } - /* - * re-visit this double condition XXX - */ - if (fletail->f_next != NULL) - fletail = fle->f_next; - - depth++; - fle = fle->f_next; - } - - if (depth > fs->ft_max_depth) - fs->ft_max_depth = depth; - fletail->f_next = newfle; - fle = newfle; -skip: - flowtable_set_hashkey(fle, key); - - fle->f_proto = proto; - fle->f_rt = ro->ro_rt; - fle->f_lle = ro->ro_lle; - fle->f_fhash = hash; - fle->f_fibnum = fibnum; - fle->f_uptime = time_uptime; - FL_ENTRY_UNLOCK(ft, hash); - return (0); -} - -int -kern_flowtable_insert(struct flowtable *ft, - struct sockaddr_storage *ssa, struct sockaddr_storage *dsa, - struct route *ro, uint32_t fibnum, int flags) -{ - uint32_t key[9], hash; +#ifdef INET + if (ft == &V_ip4_ft) { + struct sockaddr_in *sin; - flags = (ft->ft_flags | flags | FL_OVERWRITE); - hash = 0; + ro = &sro; + bzero(&sro.ro_dst, sizeof(sro.ro_dst)); -#ifdef INET - if (ssa->ss_family == AF_INET) - hash = ipv4_flow_lookup_hash_internal((struct sockaddr_in *)ssa, - (struct sockaddr_in *)dsa, key, flags); + sin = (struct sockaddr_in *)&sro.ro_dst; + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + sin->sin_addr.s_addr = key[0]; + } #endif #ifdef INET6 - if (ssa->ss_family == AF_INET6) - hash = ipv6_flow_lookup_hash_internal((struct sockaddr_in6 *)ssa, - (struct sockaddr_in6 *)dsa, key, flags); -#endif - if (ro->ro_rt == NULL || ro->ro_lle == NULL) - return (EINVAL); - - FLDPRINTF(ft, FL_DEBUG, - "kern_flowtable_insert: key=%x:%x:%x hash=%x fibnum=%d flags=%x\n", - key[0], key[1], key[2], hash, fibnum, flags); - return (flowtable_insert(ft, hash, key, fibnum, ro, flags)); -} + if (ft == &V_ip6_ft) { + struct sockaddr_in6 *sin6; -static int -flowtable_key_equal(struct flentry *fle, uint32_t *key) -{ - uint32_t *hashkey; - int i, nwords; + ro = (struct route *)&sro6; + sin6 = &sro6.ro_dst; - if (fle->f_flags & FL_IPV6) { - nwords = 9; - hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key; - } else { - nwords = 3; - hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key; + bzero(sin6, sizeof(*sin6)); + sin6->sin6_family = AF_INET6; + sin6->sin6_len = sizeof(*sin6); + bcopy(key, &sin6->sin6_addr, sizeof(struct in6_addr)); } +#endif - for (i = 0; i < nwords; i++) - if (hashkey[i] != key[i]) - return (0); + ro->ro_rt = NULL; +#ifdef RADIX_MPATH + rtalloc_mpath_fib(ro, hash, fibnum); +#else + rtalloc_ign_fib(ro, 0, fibnum); +#endif + if (ro->ro_rt == NULL) + return (NULL); - return (1); -} + rt = ro->ro_rt; + ifp = rt->rt_ifp; -struct flentry * -flowtable_lookup_mbuf(struct flowtable *ft, struct mbuf *m, int af) -{ - struct flentry *fle = NULL; + if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) { + RTFREE(rt); + return (NULL); + } #ifdef INET - if (af == AF_INET) - fle = flowtable_lookup_mbuf4(ft, m); + if (ft == &V_ip4_ft) + lt = LLTABLE(ifp); #endif #ifdef INET6 - if (af == AF_INET6) - fle = flowtable_lookup_mbuf6(ft, m); -#endif - if (fle != NULL && m != NULL && (m->m_flags & M_FLOWID) == 0) { - m->m_flags |= M_FLOWID; - m->m_pkthdr.flowid = fle->f_fhash; - } - return (fle); -} - -struct flentry * -flowtable_lookup(struct flowtable *ft, struct sockaddr_storage *ssa, - struct sockaddr_storage *dsa, uint32_t fibnum, int flags) -{ - uint32_t key[9], hash; - struct flentry *fle; - struct flowtable_stats *fs = &ft->ft_stats[curcpu]; - uint8_t proto = 0; - int error = 0; - struct rtentry *rt; - struct llentry *lle; - struct route sro, *ro; - struct route_in6 sro6; + if (ft == &V_ip6_ft) + lt = LLTABLE6(ifp); +#endif - sro.ro_rt = sro6.ro_rt = NULL; - sro.ro_lle = sro6.ro_lle = NULL; - ro = NULL; - hash = 0; - flags |= ft->ft_flags; - proto = flags_to_proto(flags); -#ifdef INET - if (ssa->ss_family == AF_INET) { - struct sockaddr_in *ssin, *dsin; + if (rt->rt_flags & RTF_GATEWAY) + l3addr = (struct sockaddr_storage *)rt->rt_gateway; + else + l3addr = (struct sockaddr_storage *)&ro->ro_dst; + lle = llentry_alloc(ifp, lt, l3addr); - ro = &sro; - memcpy(&ro->ro_dst, dsa, sizeof(struct sockaddr_in)); - /* - * The harvested source and destination addresses - * may contain port information if the packet is - * from a transport protocol (e.g. TCP/UDP). The - * port field must be cleared before performing - * a route lookup. - */ - ((struct sockaddr_in *)&ro->ro_dst)->sin_port = 0; - dsin = (struct sockaddr_in *)dsa; - ssin = (struct sockaddr_in *)ssa; - if ((dsin->sin_addr.s_addr == ssin->sin_addr.s_addr) || - (ntohl(dsin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || - (ntohl(ssin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) - return (NULL); - - hash = ipv4_flow_lookup_hash_internal(ssin, dsin, key, flags); + if (lle == NULL) { + RTFREE(rt); + return (NULL); } -#endif -#ifdef INET6 - if (ssa->ss_family == AF_INET6) { - struct sockaddr_in6 *ssin6, *dsin6; - ro = (struct route *)&sro6; - memcpy(&sro6.ro_dst, dsa, - sizeof(struct sockaddr_in6)); - ((struct sockaddr_in6 *)&ro->ro_dst)->sin6_port = 0; - dsin6 = (struct sockaddr_in6 *)dsa; - ssin6 = (struct sockaddr_in6 *)ssa; - - flags |= FL_IPV6; - hash = ipv6_flow_lookup_hash_internal(ssin6, dsin6, key, flags); - } -#endif - /* - * Ports are zero and this isn't a transmit cache - * - thus not a protocol for which we need to keep - * state - * FL_HASH_ALL => key[0] != 0 for TCP || UDP || SCTP - */ - if (hash == 0 || (key[0] == 0 && (ft->ft_flags & FL_HASH_ALL))) + /* Don't insert the entry if the ARP hasn't yet finished resolving. */ + if ((lle->la_flags & LLE_VALID) == 0) { + RTFREE(rt); + LLE_FREE(lle); + FLOWSTAT_INC(ft, ft_fail_lle_invalid); return (NULL); + } - fs->ft_lookups++; - FL_ENTRY_LOCK(ft, hash); - if ((fle = FL_ENTRY(ft, hash)) == NULL) { - FL_ENTRY_UNLOCK(ft, hash); - goto uncached; + fle = uma_zalloc(flow_zone, M_NOWAIT | M_ZERO); + if (fle == NULL) { + RTFREE(rt); + LLE_FREE(lle); + return (NULL); } -keycheck: - rt = __DEVOLATILE(struct rtentry *, fle->f_rt); - lle = __DEVOLATILE(struct llentry *, fle->f_lle); - if ((rt != NULL) - && lle != NULL - && fle->f_fhash == hash - && flowtable_key_equal(fle, key) - && (proto == fle->f_proto) - && (fibnum == fle->f_fibnum) - && (rt->rt_flags & RTF_UP) - && (rt->rt_ifp != NULL) - && (lle->la_flags & LLE_VALID)) { - fs->ft_hits++; - fle->f_uptime = time_uptime; - fle->f_flags |= flags; - FL_ENTRY_UNLOCK(ft, hash); - return (fle); - } else if (fle->f_next != NULL) { - fle = fle->f_next; - goto keycheck; + + fle->f_hash = hash; + bcopy(key, &fle->f_key, keylen); + fle->f_rt = rt; + fle->f_lle = lle; + fle->f_fibnum = fibnum; + fle->f_uptime = time_uptime; +#ifdef FLOWTABLE_HASH_ALL + fle->f_proto = proto; + fle->f_flags = fibnum0 >> 24; +#endif + + critical_enter(); + mask = flowtable_mask(ft); + flist = flowtable_list(ft, hash); + + if (SLIST_EMPTY(flist)) { + bit_set(mask, (hash % ft->ft_size)); + SLIST_INSERT_HEAD(flist, fle, f_next); + goto skip; } - FL_ENTRY_UNLOCK(ft, hash); -uncached: - if (flags & FL_NOAUTO || flow_full(ft)) - return (NULL); - fs->ft_misses++; /* - * This bit of code ends up locking the - * same route 3 times (just like ip_output + ether_output) - * - at lookup - * - in rt_check when called by arpresolve - * - dropping the refcount for the rtentry - * - * This could be consolidated to one if we wrote a variant - * of arpresolve with an rt_check variant that expected to - * receive the route locked + * find end of list and make sure that we were not + * preempted by another thread handling this flow */ - -#ifdef INVARIANTS - if ((ro->ro_dst.sa_family != AF_INET) && - (ro->ro_dst.sa_family != AF_INET6)) - panic("sa_family == %d\n", ro->ro_dst.sa_family); + SLIST_FOREACH(iter, flist, f_next) { + KASSERT(iter->f_hash % ft->ft_size == hash % ft->ft_size, + ("%s: wrong hash", __func__)); + if (flow_matches(iter, key, keylen, fibnum)) { + /* + * We probably migrated to an other CPU after + * lookup in flowtable_lookup_common() failed. + * It appeared that this CPU already has flow + * entry. + */ + iter->f_uptime = time_uptime; +#ifdef FLOWTABLE_HASH_ALL + iter->f_flags |= fibnum >> 24; #endif - - ft->ft_rtalloc(ro, hash, fibnum); - if (ro->ro_rt == NULL) - error = ENETUNREACH; - else { - struct llentry *lle = NULL; - struct sockaddr_storage *l3addr; - struct rtentry *rt = ro->ro_rt; - struct ifnet *ifp = rt->rt_ifp; - - if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) { - RTFREE(rt); - ro->ro_rt = NULL; - return (NULL); + critical_exit(); + FLOWSTAT_INC(ft, ft_collisions); + uma_zfree(flow_zone, fle); + return (iter); } -#ifdef INET6 - if (ssa->ss_family == AF_INET6) { - struct sockaddr_in6 *dsin6; - - dsin6 = (struct sockaddr_in6 *)dsa; - if (in6_localaddr(&dsin6->sin6_addr)) { - RTFREE(rt); - ro->ro_rt = NULL; - return (NULL); - } + } - if (rt->rt_flags & RTF_GATEWAY) - l3addr = (struct sockaddr_storage *)rt->rt_gateway; - - else - l3addr = (struct sockaddr_storage *)&ro->ro_dst; - lle = llentry_alloc(ifp, LLTABLE6(ifp), l3addr); - } -#endif + SLIST_INSERT_HEAD(flist, fle, f_next); +skip: + critical_exit(); + FLOWSTAT_INC(ft, ft_inserts); + + return (fle); +} + +int +flowtable_lookup(sa_family_t sa, struct mbuf *m, struct route *ro) +{ + struct flentry *fle; + + if (V_flowtable_enable == 0) + return (ENXIO); + + switch (sa) { #ifdef INET - if (ssa->ss_family == AF_INET) { - if (rt->rt_flags & RTF_GATEWAY) - l3addr = (struct sockaddr_storage *)rt->rt_gateway; - else - l3addr = (struct sockaddr_storage *)&ro->ro_dst; - lle = llentry_alloc(ifp, LLTABLE(ifp), l3addr); - } - + case AF_INET: + fle = flowtable_lookup_ipv4(m, ro); + break; +#endif +#ifdef INET6 + case AF_INET6: + fle = flowtable_lookup_ipv6(m, ro); + break; #endif - ro->ro_lle = lle; + default: + panic("%s: sa %d", __func__, sa); + } - if (lle == NULL) { - RTFREE(rt); - ro->ro_rt = NULL; - return (NULL); - } - error = flowtable_insert(ft, hash, key, fibnum, ro, flags); + if (fle == NULL) + return (EHOSTUNREACH); - if (error) { - RTFREE(rt); - LLE_FREE(lle); - ro->ro_rt = NULL; - ro->ro_lle = NULL; - } - } + if (!(m->m_flags & M_FLOWID)) { + m->m_flags |= M_FLOWID; + m->m_pkthdr.flowid = fle->f_hash; + } + + ro->ro_rt = fle->f_rt; + ro->ro_lle = fle->f_lle; + ro->ro_flags |= RT_NORTREF; - return ((error) ? NULL : fle); + return (0); } -/* - * used by the bit_alloc macro - */ -#define calloc(count, size) malloc((count)*(size), M_DEVBUF, M_WAITOK|M_ZERO) - -struct flowtable * -flowtable_alloc(char *name, int nentry, int flags) +static struct flentry * +flowtable_lookup_common(struct flowtable *ft, uint32_t *key, int keylen, + uint32_t fibnum) { - struct flowtable *ft, *fttail; - int i; - - if (V_flow_hashjitter == 0) - V_flow_hashjitter = arc4random(); + struct flist *flist; + struct flentry *fle; + uint32_t hash; - KASSERT(nentry > 0, ("nentry must be > 0, is %d\n", nentry)); + FLOWSTAT_INC(ft, ft_lookups); - ft = malloc(sizeof(struct flowtable), - M_RTABLE, M_WAITOK | M_ZERO); + hash = jenkins_hash32(key, keylen / sizeof(uint32_t), flow_hashjitter); - ft->ft_name = name; - ft->ft_flags = flags; - ft->ft_size = nentry; -#ifdef RADIX_MPATH - ft->ft_rtalloc = rtalloc_mpath_fib; -#else - ft->ft_rtalloc = rtalloc_ign_wrapper; + critical_enter(); + flist = flowtable_list(ft, hash); + SLIST_FOREACH(fle, flist, f_next) { + KASSERT(fle->f_hash % ft->ft_size == hash % ft->ft_size, + ("%s: wrong hash", __func__)); + if (flow_matches(fle, key, keylen, fibnum)) { + fle->f_uptime = time_uptime; +#ifdef FLOWTABLE_HASH_ALL + fle->f_flags |= fibnum >> 24; #endif - if (flags & FL_PCPU) { - ft->ft_lock = flowtable_pcpu_lock; - ft->ft_unlock = flowtable_pcpu_unlock; - - for (i = 0; i <= mp_maxid; i++) { - ft->ft_table.pcpu[i] = - malloc(nentry*sizeof(struct flentry *), - M_RTABLE, M_WAITOK | M_ZERO); - ft->ft_masks[i] = bit_alloc(nentry); + critical_exit(); + FLOWSTAT_INC(ft, ft_hits); + return (fle); } - } else { - ft->ft_lock_count = 2*(powerof2(mp_maxid + 1) ? (mp_maxid + 1): - (fls(mp_maxid + 1) << 1)); - - ft->ft_lock = flowtable_global_lock; - ft->ft_unlock = flowtable_global_unlock; - ft->ft_table.global = - malloc(nentry*sizeof(struct flentry *), - M_RTABLE, M_WAITOK | M_ZERO); - ft->ft_locks = malloc(ft->ft_lock_count*sizeof(struct mtx), - M_RTABLE, M_WAITOK | M_ZERO); - for (i = 0; i < ft->ft_lock_count; i++) - mtx_init(&ft->ft_locks[i], "flow", NULL, MTX_DEF|MTX_DUPOK); - - ft->ft_masks[0] = bit_alloc(nentry); } - ft->ft_tmpmask = bit_alloc(nentry); + critical_exit(); - /* - * In the local transmit case the table truly is - * just a cache - so everything is eligible for - * replacement after 5s of non-use - */ - if (flags & FL_HASH_ALL) { - ft->ft_udp_idle = V_flowtable_udp_expire; - ft->ft_syn_idle = V_flowtable_syn_expire; - ft->ft_fin_wait_idle = V_flowtable_fin_wait_expire; - ft->ft_tcp_idle = V_flowtable_fin_wait_expire; - } else { - ft->ft_udp_idle = ft->ft_fin_wait_idle = - ft->ft_syn_idle = ft->ft_tcp_idle = 30; - - } + FLOWSTAT_INC(ft, ft_misses); - /* - * hook in to the cleaner list - */ - if (V_flow_list_head == NULL) - V_flow_list_head = ft; - else { - fttail = V_flow_list_head; - while (fttail->ft_next != NULL) - fttail = fttail->ft_next; - fttail->ft_next = ft; - } - - return (ft); + return (flowtable_insert(ft, hash, key, keylen, fibnum)); } /* - * The rest of the code is devoted to garbage collection of expired entries. - * It is a new additon made necessary by the switch to dynamically allocating - * flow tables. - * + * used by the bit_alloc macro */ +#define calloc(count, size) malloc((count)*(size), M_FTABLE, M_WAITOK | M_ZERO) static void -fle_free(struct flentry *fle, struct flowtable *ft) +flowtable_alloc(struct flowtable *ft) { - struct rtentry *rt; - struct llentry *lle; - rt = __DEVOLATILE(struct rtentry *, fle->f_rt); - lle = __DEVOLATILE(struct llentry *, fle->f_lle); - if (rt != NULL) - RTFREE(rt); - if (lle != NULL) - LLE_FREE(lle); - flow_free(fle, ft); + ft->ft_table = malloc(ft->ft_size * sizeof(struct flist), + M_FTABLE, M_WAITOK); + for (int i = 0; i < ft->ft_size; i++) + ft->ft_table[i] = uma_zalloc(pcpu_zone_ptr, M_WAITOK | M_ZERO); + + ft->ft_masks = uma_zalloc(pcpu_zone_ptr, M_WAITOK); + for (int i = 0; i < mp_ncpus; i++) { + bitstr_t **b; + + b = zpcpu_get_cpu(ft->ft_masks, i); + *b = bit_alloc(ft->ft_size); + } + ft->ft_tmpmask = bit_alloc(ft->ft_size); } +#undef calloc static void -flowtable_free_stale(struct flowtable *ft, struct rtentry *rt) +flowtable_free_stale(struct flowtable *ft, struct rtentry *rt, int maxidle) { - int curbit = 0, count, tmpsize; - struct flentry *fle, **flehead, *fleprev; - struct flentry *flefreehead, *flefreetail, *fletmp; + struct flist *flist, freelist; + struct flentry *fle, *fle1, *fleprev; bitstr_t *mask, *tmpmask; - struct flowtable_stats *fs = &ft->ft_stats[curcpu]; + int curbit, tmpsize; - flefreehead = flefreetail = NULL; + SLIST_INIT(&freelist); mask = flowtable_mask(ft); tmpmask = ft->ft_tmpmask; tmpsize = ft->ft_size; memcpy(tmpmask, mask, ft->ft_size/8); + curbit = 0; /* * XXX Note to self, bit_ffs operates at the byte level * and thus adds gratuitous overhead @@ -1425,131 +785,96 @@ flowtable_free_stale(struct flowtable *ft, struct rtentry *rt) break; } - FL_ENTRY_LOCK(ft, curbit); - flehead = flowtable_entry(ft, curbit); - fle = fleprev = *flehead; + FLOWSTAT_INC(ft, ft_free_checks); - fs->ft_free_checks++; + critical_enter(); + flist = flowtable_list(ft, curbit); #ifdef DIAGNOSTIC - if (fle == NULL && curbit > 0) { + if (SLIST_EMPTY(flist) && curbit > 0) { log(LOG_ALERT, "warning bit=%d set, but no fle found\n", curbit); } -#endif - while (fle != NULL) { - if (rt != NULL) { - if (__DEVOLATILE(struct rtentry *, fle->f_rt) != rt) { - fleprev = fle; - fle = fle->f_next; - continue; - } - } else if (!flow_stale(ft, fle)) { +#endif + SLIST_FOREACH_SAFE(fle, flist, f_next, fle1) { + if (rt != NULL && fle->f_rt != rt) { fleprev = fle; - fle = fle->f_next; continue; } - /* - * delete head of the list - */ - if (fleprev == *flehead) { - fletmp = fleprev; - if (fle == fleprev) { - fleprev = *flehead = fle->f_next; - } else - fleprev = *flehead = fle; - fle = fle->f_next; - } else { - /* - * don't advance fleprev - */ - fletmp = fle; - fleprev->f_next = fle->f_next; - fle = fleprev->f_next; + if (!flow_stale(ft, fle, maxidle)) { + fleprev = fle; + continue; } - if (flefreehead == NULL) - flefreehead = flefreetail = fletmp; - else { - flefreetail->f_next = fletmp; - flefreetail = fletmp; - } - fletmp->f_next = NULL; + if (fle == SLIST_FIRST(flist)) + SLIST_REMOVE_HEAD(flist, f_next); + else + SLIST_REMOVE_AFTER(fleprev, f_next); + SLIST_INSERT_HEAD(&freelist, fle, f_next); } - if (*flehead == NULL) + if (SLIST_EMPTY(flist)) bit_clear(mask, curbit); - FL_ENTRY_UNLOCK(ft, curbit); + critical_exit(); + bit_clear(tmpmask, curbit); tmpmask += (curbit / 8); tmpsize -= (curbit / 8) * 8; bit_ffs(tmpmask, tmpsize, &curbit); } - count = 0; - while ((fle = flefreehead) != NULL) { - flefreehead = fle->f_next; - count++; - fs->ft_frees++; - fle_free(fle, ft); + + SLIST_FOREACH_SAFE(fle, &freelist, f_next, fle1) { + FLOWSTAT_INC(ft, ft_frees); + if (fle->f_rt != NULL) + RTFREE(fle->f_rt); + if (fle->f_lle != NULL) + LLE_FREE(fle->f_lle); + uma_zfree(flow_zone, fle); } - if (V_flowtable_debug && count) - log(LOG_DEBUG, "freed %d flow entries\n", count); } -void -flowtable_route_flush(struct flowtable *ft, struct rtentry *rt) +static void +flowtable_clean_vnet(struct flowtable *ft, struct rtentry *rt, int maxidle) { int i; - if (ft->ft_flags & FL_PCPU) { - CPU_FOREACH(i) { - if (smp_started == 1) { - thread_lock(curthread); - sched_bind(curthread, i); - thread_unlock(curthread); - } + CPU_FOREACH(i) { + if (smp_started == 1) { + thread_lock(curthread); + sched_bind(curthread, i); + thread_unlock(curthread); + } - flowtable_free_stale(ft, rt); + flowtable_free_stale(ft, rt, maxidle); - if (smp_started == 1) { - thread_lock(curthread); - sched_unbind(curthread); - thread_unlock(curthread); - } + if (smp_started == 1) { + thread_lock(curthread); + sched_unbind(curthread); + thread_unlock(curthread); } - } else { - flowtable_free_stale(ft, rt); } } -static void -flowtable_clean_vnet(void) +void +flowtable_route_flush(sa_family_t sa, struct rtentry *rt) { struct flowtable *ft; - int i; - ft = V_flow_list_head; - while (ft != NULL) { - if (ft->ft_flags & FL_PCPU) { - CPU_FOREACH(i) { - if (smp_started == 1) { - thread_lock(curthread); - sched_bind(curthread, i); - thread_unlock(curthread); - } - - flowtable_free_stale(ft, NULL); - - if (smp_started == 1) { - thread_lock(curthread); - sched_unbind(curthread); - thread_unlock(curthread); - } - } - } else { - flowtable_free_stale(ft, NULL); - } - ft = ft->ft_next; + switch (sa) { +#ifdef INET + case AF_INET: + ft = &V_ip4_ft; + break; +#endif +#ifdef INET6 + case AF_INET6: + ft = &V_ip6_ft; + break; +#endif + default: + panic("%s: sa %d", __func__, sa); } + + flowtable_clean_vnet(ft, rt, 0); } static void @@ -1562,18 +887,33 @@ flowtable_cleaner(void) log(LOG_INFO, "flowtable cleaner started\n"); td = curthread; while (1) { + uint32_t flowclean_freq, maxidle; + + /* + * The maximum idle time, as well as frequency are arbitrary. + */ + if (flow_full()) + maxidle = 5; + else + maxidle = 30; + VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); - flowtable_clean_vnet(); +#ifdef INET + flowtable_clean_vnet(&V_ip4_ft, NULL, maxidle); +#endif +#ifdef INET6 + flowtable_clean_vnet(&V_ip6_ft, NULL, maxidle); +#endif CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); - /* - * The 10 second interval between cleaning checks - * is arbitrary - */ + if (flow_full()) + flowclean_freq = 4*hz; + else + flowclean_freq = 20*hz; mtx_lock(&flowclean_lock); thread_lock(td); sched_prio(td, PPAUSE); @@ -1606,91 +946,106 @@ static struct kproc_desc flow_kp = { }; SYSINIT(flowcleaner, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &flow_kp); -static void -flowtable_init_vnet(const void *unused __unused) +static int +flowtable_get_size(char *name) { + int size; + + if (TUNABLE_INT_FETCH(name, &size)) { + if (size < 256) + size = 256; + if (!powerof2(size)) { + printf("%s must be power of 2\n", name); + size = 2048; + } + } else { + /* + * round up to the next power of 2 + */ + size = 1 << fls((1024 + maxusers * 64) - 1); + } - V_flowtable_nmbflows = 1024 + maxusers * 64 * mp_ncpus; - V_flow_ipv4_zone = uma_zcreate("ip4flow", sizeof(struct flentry_v4), - NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET); - V_flow_ipv6_zone = uma_zcreate("ip6flow", sizeof(struct flentry_v6), - NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET); - uma_zone_set_max(V_flow_ipv4_zone, V_flowtable_nmbflows); - uma_zone_set_max(V_flow_ipv6_zone, V_flowtable_nmbflows); - V_flowtable_ready = 1; + return (size); } -VNET_SYSINIT(flowtable_init_vnet, SI_SUB_SMP, SI_ORDER_ANY, - flowtable_init_vnet, NULL); static void flowtable_init(const void *unused __unused) { + flow_hashjitter = arc4random(); + + flow_zone = uma_zcreate("flows", sizeof(struct flentry), + NULL, NULL, NULL, NULL, (64-1), UMA_ZONE_MAXBUCKET); + uma_zone_set_max(flow_zone, 1024 + maxusers * 64 * mp_ncpus); + cv_init(&flowclean_c_cv, "c_flowcleanwait"); cv_init(&flowclean_f_cv, "f_flowcleanwait"); mtx_init(&flowclean_lock, "flowclean lock", NULL, MTX_DEF); EVENTHANDLER_REGISTER(ifnet_departure_event, flowtable_flush, NULL, EVENTHANDLER_PRI_ANY); - flowclean_freq = 20*hz; } -SYSINIT(flowtable_init, SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST, +SYSINIT(flowtable_init, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST, flowtable_init, NULL); +#ifdef INET +static SYSCTL_NODE(_net_flowtable, OID_AUTO, ip4, CTLFLAG_RD, NULL, + "Flowtable for IPv4"); + +static VNET_PCPUSTAT_DEFINE(struct flowtable_stat, ip4_ftstat); +VNET_PCPUSTAT_SYSINIT(ip4_ftstat); +VNET_PCPUSTAT_SYSUNINIT(ip4_ftstat); +SYSCTL_VNET_PCPUSTAT(_net_flowtable_ip4, OID_AUTO, stat, struct flowtable_stat, + ip4_ftstat, "Flowtable statistics for IPv4 " + "(struct flowtable_stat, net/flowtable.h)"); -#ifdef VIMAGE static void -flowtable_uninit(const void *unused __unused) +flowtable_init_vnet_v4(const void *unused __unused) { - V_flowtable_ready = 0; - uma_zdestroy(V_flow_ipv4_zone); - uma_zdestroy(V_flow_ipv6_zone); + V_ip4_ft.ft_size = flowtable_get_size("net.flowtable.ip4.size"); + V_ip4_ft.ft_stat = VNET(ip4_ftstat); + flowtable_alloc(&V_ip4_ft); } +VNET_SYSINIT(ft_vnet_v4, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, + flowtable_init_vnet_v4, NULL); +#endif /* INET */ -VNET_SYSUNINIT(flowtable_uninit, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY, - flowtable_uninit, NULL); -#endif +#ifdef INET6 +static SYSCTL_NODE(_net_flowtable, OID_AUTO, ip6, CTLFLAG_RD, NULL, + "Flowtable for IPv6"); -#ifdef DDB -static uint32_t * -flowtable_get_hashkey(struct flentry *fle) -{ - uint32_t *hashkey; +static VNET_PCPUSTAT_DEFINE(struct flowtable_stat, ip6_ftstat); +VNET_PCPUSTAT_SYSINIT(ip6_ftstat); +VNET_PCPUSTAT_SYSUNINIT(ip6_ftstat); +SYSCTL_VNET_PCPUSTAT(_net_flowtable_ip6, OID_AUTO, stat, struct flowtable_stat, + ip6_ftstat, "Flowtable statistics for IPv6 " + "(struct flowtable_stat, net/flowtable.h)"); - if (fle->f_flags & FL_IPV6) - hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key; - else - hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key; +static void +flowtable_init_vnet_v6(const void *unused __unused) +{ - return (hashkey); + V_ip6_ft.ft_size = flowtable_get_size("net.flowtable.ip6.size"); + V_ip6_ft.ft_stat = VNET(ip6_ftstat); + flowtable_alloc(&V_ip6_ft); } +VNET_SYSINIT(flowtable_init_vnet_v6, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, + flowtable_init_vnet_v6, NULL); +#endif /* INET6 */ +#ifdef DDB static bitstr_t * flowtable_mask_pcpu(struct flowtable *ft, int cpuid) { - bitstr_t *mask; - if (ft->ft_flags & FL_PCPU) - mask = ft->ft_masks[cpuid]; - else - mask = ft->ft_masks[0]; - - return (mask); + return (zpcpu_get_cpu(*ft->ft_masks, cpuid)); } -static struct flentry ** -flowtable_entry_pcpu(struct flowtable *ft, uint32_t hash, int cpuid) +static struct flist * +flowtable_list_pcpu(struct flowtable *ft, uint32_t hash, int cpuid) { - struct flentry **fle; - int index = (hash % ft->ft_size); - if (ft->ft_flags & FL_PCPU) { - fle = &ft->ft_table.pcpu[cpuid][index]; - } else { - fle = &ft->ft_table.global[index]; - } - - return (fle); + return (zpcpu_get_cpu(&ft->ft_table[hash % ft->ft_size], cpuid)); } static void @@ -1698,40 +1053,58 @@ flow_show(struct flowtable *ft, struct flentry *fle) { int idle_time; int rt_valid, ifp_valid; - uint16_t sport, dport; - uint32_t *hashkey; - char saddr[4*sizeof "123"], daddr[4*sizeof "123"]; volatile struct rtentry *rt; struct ifnet *ifp = NULL; + uint32_t *hashkey = fle->f_key; idle_time = (int)(time_uptime - fle->f_uptime); rt = fle->f_rt; rt_valid = rt != NULL; - if (rt_valid) + if (rt_valid) ifp = rt->rt_ifp; ifp_valid = ifp != NULL; - hashkey = flowtable_get_hashkey(fle); - if (fle->f_flags & FL_IPV6) - goto skipaddr; - - inet_ntoa_r(*(struct in_addr *) &hashkey[2], daddr); - if (ft->ft_flags & FL_HASH_ALL) { - inet_ntoa_r(*(struct in_addr *) &hashkey[1], saddr); - sport = ntohs(((uint16_t *)hashkey)[0]); - dport = ntohs(((uint16_t *)hashkey)[1]); - db_printf("%s:%d->%s:%d", - saddr, sport, daddr, - dport); - } else + +#ifdef INET + if (ft == &V_ip4_ft) { + char daddr[4*sizeof "123"]; +#ifdef FLOWTABLE_HASH_ALL + char saddr[4*sizeof "123"]; + uint16_t sport, dport; +#endif + + inet_ntoa_r(*(struct in_addr *) &hashkey[0], daddr); +#ifdef FLOWTABLE_HASH_ALL + inet_ntoa_r(*(struct in_addr *) &hashkey[1], saddr); + dport = ntohs((uint16_t)(hashkey[2] >> 16)); + sport = ntohs((uint16_t)(hashkey[2] & 0xffff)); + db_printf("%s:%d->%s:%d", saddr, sport, daddr, dport); +#else db_printf("%s ", daddr); - -skipaddr: +#endif + } +#endif /* INET */ +#ifdef INET6 + if (ft == &V_ip6_ft) { +#ifdef FLOWTABLE_HASH_ALL + db_printf("\n\tkey=%08x:%08x:%08x%08x:%08x:%08x%08x:%08x:%08x", + hashkey[0], hashkey[1], hashkey[2], + hashkey[3], hashkey[4], hashkey[5], + hashkey[6], hashkey[7], hashkey[8]); +#else + db_printf("\n\tkey=%08x:%08x:%08x ", + hashkey[0], hashkey[1], hashkey[2]); +#endif + } +#endif /* INET6 */ + + db_printf("hash=%08x idle_time=%03d" + "\n\tfibnum=%02d rt=%p", + fle->f_hash, idle_time, fle->f_fibnum, fle->f_rt); + +#ifdef FLOWTABLE_HASH_ALL if (fle->f_flags & FL_STALE) db_printf(" FL_STALE "); - if (fle->f_flags & FL_TCP) - db_printf(" FL_TCP "); - if (fle->f_flags & FL_UDP) - db_printf(" FL_UDP "); +#endif if (rt_valid) { if (rt->rt_flags & RTF_UP) db_printf(" RTF_UP "); @@ -1740,21 +1113,10 @@ skipaddr: if (ifp->if_flags & IFF_LOOPBACK) db_printf(" IFF_LOOPBACK "); if (ifp->if_flags & IFF_UP) - db_printf(" IFF_UP "); + db_printf(" IFF_UP "); if (ifp->if_flags & IFF_POINTOPOINT) - db_printf(" IFF_POINTOPOINT "); + db_printf(" IFF_POINTOPOINT "); } - if (fle->f_flags & FL_IPV6) - db_printf("\n\tkey=%08x:%08x:%08x%08x:%08x:%08x%08x:%08x:%08x", - hashkey[0], hashkey[1], hashkey[2], - hashkey[3], hashkey[4], hashkey[5], - hashkey[6], hashkey[7], hashkey[8]); - else - db_printf("\n\tkey=%08x:%08x:%08x ", - hashkey[0], hashkey[1], hashkey[2]); - db_printf("hash=%08x idle_time=%03d" - "\n\tfibnum=%02d rt=%p", - fle->f_fhash, idle_time, fle->f_fibnum, fle->f_rt); db_printf("\n"); } @@ -1762,7 +1124,6 @@ static void flowtable_show(struct flowtable *ft, int cpuid) { int curbit = 0; - struct flentry *fle, **flehead; bitstr_t *mask, *tmpmask; if (cpuid != -1) @@ -1776,43 +1137,32 @@ flowtable_show(struct flowtable *ft, int cpuid) */ bit_ffs(tmpmask, ft->ft_size, &curbit); while (curbit != -1) { + struct flist *flist; + struct flentry *fle; + if (curbit >= ft->ft_size || curbit < -1) { db_printf("warning: bad curbit value %d \n", curbit); break; } - flehead = flowtable_entry_pcpu(ft, curbit, cpuid); - fle = *flehead; + flist = flowtable_list_pcpu(ft, curbit, cpuid); - while (fle != NULL) { + SLIST_FOREACH(fle, flist, f_next) flow_show(ft, fle); - fle = fle->f_next; - continue; - } bit_clear(tmpmask, curbit); bit_ffs(tmpmask, ft->ft_size, &curbit); } } static void -flowtable_show_vnet(void) +flowtable_show_vnet(struct flowtable *ft) { - struct flowtable *ft; + int i; - ft = V_flow_list_head; - while (ft != NULL) { - printf("name: %s\n", ft->ft_name); - if (ft->ft_flags & FL_PCPU) { - CPU_FOREACH(i) { - flowtable_show(ft, i); - } - } else { - flowtable_show(ft, -1); - } - ft = ft->ft_next; - } + CPU_FOREACH(i) + flowtable_show(ft, i); } DB_SHOW_COMMAND(flowtables, db_show_flowtables) @@ -1824,7 +1174,14 @@ DB_SHOW_COMMAND(flowtables, db_show_flowtables) #ifdef VIMAGE db_printf("vnet %p\n", vnet_iter); #endif - flowtable_show_vnet(); +#ifdef INET + printf("IPv4:\n"); + flowtable_show_vnet(&V_ip4_ft); +#endif +#ifdef INET6 + printf("IPv6:\n"); + flowtable_show_vnet(&V_ip6_ft); +#endif CURVNET_RESTORE(); } } diff --git a/sys/net/flowtable.h b/sys/net/flowtable.h index d810fa3..5a1d927 100644 --- a/sys/net/flowtable.h +++ b/sys/net/flowtable.h @@ -1,83 +1,56 @@ -/************************************************************************** - -Copyright (c) 2008-2010, BitGravity Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - 2. Neither the name of the BitGravity Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - -$FreeBSD$ - -***************************************************************************/ +/*- + * Copyright (c) 2014 Gleb Smirnoff <glebius@FreeBSD.org> + * Copyright (c) 2008-2010, BitGravity Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Neither the name of the BitGravity Corporation nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + * + */ #ifndef _NET_FLOWTABLE_H_ #define _NET_FLOWTABLE_H_ -#ifdef _KERNEL - -#define FL_HASH_ALL (1<<0) /* hash 4-tuple + protocol */ -#define FL_PCPU (1<<1) /* pcpu cache */ -#define FL_NOAUTO (1<<2) /* don't automatically add flentry on miss */ -#define FL_IPV6 (1<<9) - -#define FL_TCP (1<<11) -#define FL_SCTP (1<<12) -#define FL_UDP (1<<13) -#define FL_DEBUG (1<<14) -#define FL_DEBUG_ALL (1<<15) - -struct flowtable; -struct flentry; -struct route; -struct route_in6; +struct flowtable_stat { + uint64_t ft_collisions; + uint64_t ft_misses; + uint64_t ft_free_checks; + uint64_t ft_frees; + uint64_t ft_hits; + uint64_t ft_lookups; + uint64_t ft_fail_lle_invalid; + uint64_t ft_inserts; +}; -VNET_DECLARE(struct flowtable *, ip_ft); -#define V_ip_ft VNET(ip_ft) - -VNET_DECLARE(struct flowtable *, ip6_ft); -#define V_ip6_ft VNET(ip6_ft) - -struct flowtable *flowtable_alloc(char *name, int nentry, int flags); +#ifdef _KERNEL /* - * Given a flow table, look up the L3 and L2 information and - * return it in the route. - * + * Given a flow table, look up the L3 and L2 information + * and return it in the route. */ -struct flentry *flowtable_lookup_mbuf(struct flowtable *ft, struct mbuf *m, int af); - -struct flentry *flowtable_lookup(struct flowtable *ft, struct sockaddr_storage *ssa, - struct sockaddr_storage *dsa, uint32_t fibnum, int flags); - -int kern_flowtable_insert(struct flowtable *ft, struct sockaddr_storage *ssa, - struct sockaddr_storage *dsa, struct route *ro, uint32_t fibnum, int flags); - -void flow_invalidate(struct flentry *fl); -void flowtable_route_flush(struct flowtable *ft, struct rtentry *rt); - -void flow_to_route(struct flentry *fl, struct route *ro); - -void flow_to_route_in6(struct flentry *fl, struct route_in6 *ro); - +int flowtable_lookup(sa_family_t, struct mbuf *, struct route *); +void flowtable_route_flush(sa_family_t, struct rtentry *); #endif /* _KERNEL */ -#endif +#endif /* !_NET_FLOWTABLE_H_ */ diff --git a/sys/net/route.c b/sys/net/route.c index 20fe181..bb99496 100644 --- a/sys/net/route.c +++ b/sys/net/route.c @@ -1298,18 +1298,7 @@ rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt, } #ifdef FLOWTABLE else if (rt0 != NULL) { - switch (dst->sa_family) { -#ifdef INET6 - case AF_INET6: - flowtable_route_flush(V_ip6_ft, rt0); - break; -#endif -#ifdef INET - case AF_INET: - flowtable_route_flush(V_ip_ft, rt0); - break; -#endif - } + flowtable_route_flush(dst->sa_family, rt0); RTFREE(rt0); } #endif diff --git a/sys/netinet/ip_input.c b/sys/netinet/ip_input.c index c265d02..cde30ee 100644 --- a/sys/netinet/ip_input.c +++ b/sys/netinet/ip_input.c @@ -62,7 +62,6 @@ __FBSDID("$FreeBSD$"); #include <net/route.h> #include <net/netisr.h> #include <net/vnet.h> -#include <net/flowtable.h> #include <netinet/in.h> #include <netinet/in_kdtrace.h> @@ -198,16 +197,6 @@ SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW, "IP stealth mode, no TTL decrementation on forwarding"); #endif -#ifdef FLOWTABLE -static VNET_DEFINE(int, ip_output_flowtable_size) = 2048; -VNET_DEFINE(struct flowtable *, ip_ft); -#define V_ip_output_flowtable_size VNET(ip_output_flowtable_size) - -SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, output_flowtable_size, CTLFLAG_RDTUN, - &VNET_NAME(ip_output_flowtable_size), 2048, - "number of entries in the per-cpu output flow caches"); -#endif - static void ip_freef(struct ipqhead *, struct ipq *); /* @@ -309,24 +298,6 @@ ip_init(void) printf("%s: WARNING: unable to register pfil hook, " "error %d\n", __func__, i); -#ifdef FLOWTABLE - if (TUNABLE_INT_FETCH("net.inet.ip.output_flowtable_size", - &V_ip_output_flowtable_size)) { - if (V_ip_output_flowtable_size < 256) - V_ip_output_flowtable_size = 256; - if (!powerof2(V_ip_output_flowtable_size)) { - printf("flowtable must be power of 2 size\n"); - V_ip_output_flowtable_size = 2048; - } - } else { - /* - * round up to the next power of 2 - */ - V_ip_output_flowtable_size = 1 << fls((1024 + maxusers * 64)-1); - } - V_ip_ft = flowtable_alloc("ipv4", V_ip_output_flowtable_size, FL_PCPU); -#endif - /* Skip initialization of globals for non-default instances. */ if (!IS_DEFAULT_VNET(curvnet)) return; diff --git a/sys/netinet/ip_output.c b/sys/netinet/ip_output.c index 7764bc3..2d8be1b 100644 --- a/sys/netinet/ip_output.c +++ b/sys/netinet/ip_output.c @@ -32,6 +32,7 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); +#include "opt_inet.h" #include "opt_ipfw.h" #include "opt_ipsec.h" #include "opt_kdtrace.h" @@ -154,19 +155,8 @@ ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, } #ifdef FLOWTABLE - if (ro->ro_rt == NULL) { - struct flentry *fle; - - /* - * The flow table returns route entries valid for up to 30 - * seconds; we rely on the remainder of ip_output() taking no - * longer than that long for the stability of ro_rt. The - * flow ID assignment must have happened before this point. - */ - fle = flowtable_lookup_mbuf(V_ip_ft, m, AF_INET); - if (fle != NULL) - flow_to_route(fle, ro); - } + if (ro->ro_rt == NULL) + (void )flowtable_lookup(AF_INET, m, ro); #endif if (opt) { diff --git a/sys/netinet6/in6_proto.c b/sys/netinet6/in6_proto.c index 965de60..b0631ae 100644 --- a/sys/netinet6/in6_proto.c +++ b/sys/netinet6/in6_proto.c @@ -126,10 +126,6 @@ __FBSDID("$FreeBSD$"); #include <netinet6/ip6protosw.h> -#ifdef FLOWTABLE -#include <net/flowtable.h> -#endif - /* * TCP/IP protocol family: IP6, ICMP6, UDP, TCP. */ @@ -575,16 +571,6 @@ SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_STEALTH, stealth, CTLFLAG_RW, &VNET_NAME(ip6stealth), 0, ""); #endif -#ifdef FLOWTABLE -VNET_DEFINE(int, ip6_output_flowtable_size) = 2048; -VNET_DEFINE(struct flowtable *, ip6_ft); -#define V_ip6_output_flowtable_size VNET(ip6_output_flowtable_size) - -SYSCTL_VNET_INT(_net_inet6_ip6, OID_AUTO, output_flowtable_size, CTLFLAG_RDTUN, - &VNET_NAME(ip6_output_flowtable_size), 2048, - "number of entries in the per-cpu output flow caches"); -#endif - /* net.inet6.icmp6 */ SYSCTL_VNET_INT(_net_inet6_icmp6, ICMPV6CTL_REDIRACCEPT, rediraccept, CTLFLAG_RW, &VNET_NAME(icmp6_rediraccept), 0, ""); diff --git a/sys/netinet6/ip6_input.c b/sys/netinet6/ip6_input.c index 8f70741..12249db 100644 --- a/sys/netinet6/ip6_input.c +++ b/sys/netinet6/ip6_input.c @@ -119,12 +119,6 @@ __FBSDID("$FreeBSD$"); #include <netinet6/ip6protosw.h> -#ifdef FLOWTABLE -#include <net/flowtable.h> -VNET_DECLARE(int, ip6_output_flowtable_size); -#define V_ip6_output_flowtable_size VNET(ip6_output_flowtable_size) -#endif - extern struct domain inet6domain; u_char ip6_protox[IPPROTO_MAX]; @@ -194,24 +188,6 @@ ip6_init(void) nd6_init(); frag6_init(); -#ifdef FLOWTABLE - if (TUNABLE_INT_FETCH("net.inet6.ip6.output_flowtable_size", - &V_ip6_output_flowtable_size)) { - if (V_ip6_output_flowtable_size < 256) - V_ip6_output_flowtable_size = 256; - if (!powerof2(V_ip6_output_flowtable_size)) { - printf("flowtable must be power of 2 size\n"); - V_ip6_output_flowtable_size = 2048; - } - } else { - /* - * round up to the next power of 2 - */ - V_ip6_output_flowtable_size = 1 << fls((1024 + maxusers * 64)-1); - } - V_ip6_ft = flowtable_alloc("ipv6", V_ip6_output_flowtable_size, FL_IPV6|FL_PCPU); -#endif - V_ip6_desync_factor = arc4random() % MAX_TEMP_DESYNC_FACTOR; /* Skip global initialization stuff for non-default instances. */ diff --git a/sys/netinet6/ip6_output.c b/sys/netinet6/ip6_output.c index 0d55b66..171a918 100644 --- a/sys/netinet6/ip6_output.c +++ b/sys/netinet6/ip6_output.c @@ -521,19 +521,8 @@ skip_ipsec2:; ro = &opt->ip6po_route; dst = (struct sockaddr_in6 *)&ro->ro_dst; #ifdef FLOWTABLE - if (ro->ro_rt == NULL) { - struct flentry *fle; - - /* - * The flow table returns route entries valid for up to 30 - * seconds; we rely on the remainder of ip_output() taking no - * longer than that long for the stability of ro_rt. The - * flow ID assignment must have happened before this point. - */ - fle = flowtable_lookup_mbuf(V_ip6_ft, m, AF_INET6); - if (fle != NULL) - flow_to_route_in6(fle, ro); - } + if (ro->ro_rt == NULL) + (void )flowtable_lookup(AF_INET6, m, (struct route *)ro); #endif again: /* diff --git a/usr.bin/netstat/Makefile b/usr.bin/netstat/Makefile index 1071f0e..1644aab 100644 --- a/usr.bin/netstat/Makefile +++ b/usr.bin/netstat/Makefile @@ -5,7 +5,8 @@ PROG= netstat SRCS= if.c inet.c main.c mbuf.c mroute.c netisr.c route.c \ - unix.c atalk.c mroute6.c ipsec.c bpf.c pfkey.c sctp.c + unix.c atalk.c mroute6.c ipsec.c bpf.c pfkey.c sctp.c \ + flowtable.c WARNS?= 3 CFLAGS+=-fno-strict-aliasing diff --git a/usr.bin/netstat/flowtable.c b/usr.bin/netstat/flowtable.c new file mode 100644 index 0000000..a3d5dd5 --- /dev/null +++ b/usr.bin/netstat/flowtable.c @@ -0,0 +1,84 @@ +/*- + * Copyright (c) 2014 Gleb Smirnoff <glebius@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); +#include <sys/param.h> +#include <sys/sysctl.h> +#include <net/flowtable.h> +#include <err.h> +#include <stdint.h> +#include <stdio.h> +#include "netstat.h" + +/* + * Print flowtable statistics. + */ + +static void +print_stats(struct flowtable_stat *stat) +{ + +#define p(f, m) if (stat->f || sflag <= 1) \ + printf(m, (uintmax_t)stat->f, plural(stat->f)) +#define p2(f, m) if (stat->f || sflag <= 1) \ + printf(m, (uintmax_t)stat->f, plurales(stat->f)) + + p(ft_lookups, "\t%ju lookup%s\n"); + p(ft_hits, "\t%ju hit%s\n"); + p2(ft_misses, "\t%ju miss%s\n"); + p(ft_inserts, "\t%ju insert%s\n"); + p(ft_collisions, "\t%ju collision%s\n"); + p(ft_free_checks, "\t%ju free check%s\n"); + p(ft_frees, "\t%ju free%s\n"); + p(ft_fail_lle_invalid, + "\t%ju lookup%s with not resolved Layer 2 address\n"); + +#undef p2 +#undef p +} + +void +flowtable_stats(void) +{ + struct flowtable_stat stat; + size_t len = sizeof(stat); + + if (!live) + return; + + if (sysctlbyname("net.flowtable.ip4.stat", &stat, &len, NULL, 0) == 0) { + printf("flowtable for IPv4:\n"); + print_stats(&stat); + } + + if (sysctlbyname("net.flowtable.ip6.stat", &stat, &len, NULL, 0) == 0) { + printf("flowtable for IPv6:\n"); + print_stats(&stat); + } +} diff --git a/usr.bin/netstat/main.c b/usr.bin/netstat/main.c index feb97dc..5c952ad 100644 --- a/usr.bin/netstat/main.c +++ b/usr.bin/netstat/main.c @@ -556,9 +556,10 @@ main(int argc, char *argv[]) exit(0); } if (rflag) { - if (sflag) + if (sflag) { rt_stats(nl[N_RTSTAT].n_value, nl[N_RTTRASH].n_value); - else + flowtable_stats(); + } else routepr(nl[N_RTREE].n_value, fib); exit(0); } diff --git a/usr.bin/netstat/netstat.h b/usr.bin/netstat/netstat.h index 114c84c..44bce94 100644 --- a/usr.bin/netstat/netstat.h +++ b/usr.bin/netstat/netstat.h @@ -124,6 +124,7 @@ void intpr(int, void (*)(char *)); void pr_rthdr(int); void pr_family(int); void rt_stats(u_long, u_long); +void flowtable_stats(void); char *ipx_pnet(struct sockaddr *); char *ipx_phost(struct sockaddr *); char *ns_phost(struct sockaddr *); |