diff options
Diffstat (limited to 'sys/net/flowtable.c')
-rw-r--r-- | sys/net/flowtable.c | 870 |
1 files changed, 715 insertions, 155 deletions
diff --git a/sys/net/flowtable.c b/sys/net/flowtable.c index b7ec578..2e9f045 100644 --- a/sys/net/flowtable.c +++ b/sys/net/flowtable.c @@ -1,6 +1,6 @@ /************************************************************************** -Copyright (c) 2008-2009, BitGravity Inc. +Copyright (c) 2008-2010, BitGravity Inc. All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,6 +30,8 @@ POSSIBILITY OF SUCH DAMAGE. #include "opt_route.h" #include "opt_mpath.h" #include "opt_ddb.h" +#include "opt_inet.h" +#include "opt_inet6.h" #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); @@ -45,6 +47,7 @@ __FBSDID("$FreeBSD$"); #include <sys/malloc.h> #include <sys/mbuf.h> #include <sys/proc.h> +#include <sys/sbuf.h> #include <sys/sched.h> #include <sys/smp.h> #include <sys/socket.h> @@ -63,6 +66,9 @@ __FBSDID("$FreeBSD$"); #include <netinet/in_var.h> #include <netinet/if_ether.h> #include <netinet/ip.h> +#ifdef INET6 +#include <netinet/ip6.h> +#endif #include <netinet/tcp.h> #include <netinet/udp.h> #include <netinet/sctp.h> @@ -140,31 +146,39 @@ union flentryp { struct flentry **pcpu[MAXCPU]; }; +struct flowtable_stats { + uint64_t ft_collisions; + uint64_t ft_allocated; + uint64_t ft_misses; + uint64_t ft_max_depth; + uint64_t ft_free_checks; + uint64_t ft_frees; + uint64_t ft_hits; + uint64_t ft_lookups; +} __aligned(128); + struct flowtable { + struct flowtable_stats ft_stats[MAXCPU]; int ft_size; int ft_lock_count; uint32_t ft_flags; - uint32_t ft_collisions; - uint32_t ft_allocated; - uint32_t ft_misses; - uint64_t ft_hits; uint32_t ft_udp_idle; uint32_t ft_fin_wait_idle; uint32_t ft_syn_idle; uint32_t ft_tcp_idle; + char *ft_name; fl_lock_t *ft_lock; fl_lock_t *ft_unlock; fl_rtalloc_t *ft_rtalloc; struct mtx *ft_locks; - union flentryp ft_table; bitstr_t *ft_masks[MAXCPU]; bitstr_t *ft_tmpmask; struct flowtable *ft_next; -}; +} __aligned(128); static struct proc *flowcleanerproc; static VNET_DEFINE(struct flowtable *, flow_list_head); @@ -181,12 +195,24 @@ static struct cv flowclean_cv; static struct mtx flowclean_lock; static uint32_t flowclean_cycles; +#ifdef FLOWTABLE_DEBUG +#define FLDPRINTF(ft, flags, fmt, ...) \ +do { \ + if ((ft)->ft_flags & (flags)) \ + printf((fmt), __VA_ARGS__); \ +} while (0); \ + +#else +#define FLDPRINTF(ft, flags, fmt, ...) + +#endif + + /* * TODO: * - Make flowtable stats per-cpu, aggregated at sysctl call time, * to avoid extra cache evictions caused by incrementing a shared * counter - * - add IPv6 support to flow lookup * - add sysctls to resize && flush flow tables * - Add per flowtable sysctls for statistics and configuring timeouts * - add saturation counter to rtentry to support per-packet load-balancing @@ -200,13 +226,6 @@ static uint32_t flowclean_cycles; */ VNET_DEFINE(int, flowtable_enable) = 1; static VNET_DEFINE(int, flowtable_debug); -static VNET_DEFINE(int, flowtable_hits); -static VNET_DEFINE(int, flowtable_lookups); -static VNET_DEFINE(int, flowtable_misses); -static VNET_DEFINE(int, flowtable_frees); -static VNET_DEFINE(int, flowtable_free_checks); -static VNET_DEFINE(int, flowtable_max_depth); -static VNET_DEFINE(int, flowtable_collisions); static VNET_DEFINE(int, flowtable_syn_expire) = SYN_IDLE; static VNET_DEFINE(int, flowtable_udp_expire) = UDP_IDLE; static VNET_DEFINE(int, flowtable_fin_wait_expire) = FIN_WAIT_IDLE; @@ -216,13 +235,6 @@ static VNET_DEFINE(int, flowtable_ready) = 0; #define V_flowtable_enable VNET(flowtable_enable) #define V_flowtable_debug VNET(flowtable_debug) -#define V_flowtable_hits VNET(flowtable_hits) -#define V_flowtable_lookups VNET(flowtable_lookups) -#define V_flowtable_misses VNET(flowtable_misses) -#define V_flowtable_frees VNET(flowtable_frees) -#define V_flowtable_free_checks VNET(flowtable_free_checks) -#define V_flowtable_max_depth VNET(flowtable_max_depth) -#define V_flowtable_collisions VNET(flowtable_collisions) #define V_flowtable_syn_expire VNET(flowtable_syn_expire) #define V_flowtable_udp_expire VNET(flowtable_udp_expire) #define V_flowtable_fin_wait_expire VNET(flowtable_fin_wait_expire) @@ -235,20 +247,6 @@ SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, debug, CTLFLAG_RW, &VNET_NAME(flowtable_debug), 0, "print debug info."); SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, enable, CTLFLAG_RW, &VNET_NAME(flowtable_enable), 0, "enable flowtable caching."); -SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, hits, CTLFLAG_RD, - &VNET_NAME(flowtable_hits), 0, "# flowtable hits."); -SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, lookups, CTLFLAG_RD, - &VNET_NAME(flowtable_lookups), 0, "# flowtable lookups."); -SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, misses, CTLFLAG_RD, - &VNET_NAME(flowtable_misses), 0, "#flowtable misses."); -SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, frees, CTLFLAG_RD, - &VNET_NAME(flowtable_frees), 0, "#flows freed."); -SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, free_checks, CTLFLAG_RD, - &VNET_NAME(flowtable_free_checks), 0, "#flows free checks."); -SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, max_depth, CTLFLAG_RD, - &VNET_NAME(flowtable_max_depth), 0, "max collision list length."); -SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, collisions, CTLFLAG_RD, - &VNET_NAME(flowtable_collisions), 0, "#flowtable collisions."); /* * XXX This does not end up updating timeouts at runtime @@ -298,6 +296,77 @@ SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, nmbflows, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_nmbflows, "IU", "Maximum number of flows allowed"); + + +#define FS_PRINT(sb, field) sbuf_printf((sb), "\t%s=%jd", #field, fs->ft_##field) + +static void +fs_print(struct flowtable_stats *fs) +{ + struct sbuf *sb; + + sb = sbuf_new(NULL, NULL, 32*1024, SBUF_FIXEDLEN); + + FS_PRINT(sb, collisions); + FS_PRINT(sb, allocated); + FS_PRINT(sb, misses); + FS_PRINT(sb, max_depth); + FS_PRINT(sb, free_checks); + FS_PRINT(sb, frees); + FS_PRINT(sb, hits); + FS_PRINT(sb, lookups); + sbuf_finish(sb); + +} + +static void +flowtable_show_stats(struct flowtable *ft) +{ + int i; + struct flowtable_stats fs, *pfs; + + if (ft->ft_flags & FL_PCPU) { + bzero(&fs, sizeof(fs)); + pfs = &fs; + for (i = 0; i <= mp_maxid; i++) { + if (CPU_ABSENT(i)) + continue; + pfs->ft_collisions += ft->ft_stats[i].ft_collisions; + pfs->ft_allocated += ft->ft_stats[i].ft_allocated; + pfs->ft_misses += ft->ft_stats[i].ft_misses; + pfs->ft_free_checks += ft->ft_stats[i].ft_free_checks; + pfs->ft_frees += ft->ft_stats[i].ft_frees; + pfs->ft_hits += ft->ft_stats[i].ft_hits; + pfs->ft_lookups += ft->ft_stats[i].ft_lookups; + if (ft->ft_stats[i].ft_max_depth > pfs->ft_max_depth) + pfs->ft_max_depth = ft->ft_stats[i].ft_max_depth; + } + } else { + pfs = &ft->ft_stats[0]; + } + + fs_print(pfs); +} + +static int +sysctl_flowtable_stats(SYSCTL_HANDLER_ARGS) +{ + struct flowtable *ft; + + ft = V_flow_list_head; + while (ft != NULL) { + printf("name: %s\n", ft->ft_name); + flowtable_show_stats(ft); + ft = ft->ft_next; + } + + return (0); +} +SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, stats, + CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_flowtable_stats, "IU", + "flowtable statistics"); + + #ifndef RADIX_MPATH static void in_rtalloc_ign_wrapper(struct route *ro, uint32_t hash, u_int fibnum) @@ -342,52 +411,122 @@ flowtable_pcpu_unlock(struct flowtable *table, uint32_t hash) #define FL_ENTRY_LOCK(table, hash) (table)->ft_lock((table), (hash)) #define FL_ENTRY_UNLOCK(table, hash) (table)->ft_unlock((table), (hash)) -#define FL_STALE (1<<8) -#define FL_IPV6 (1<<9) +#define FL_STALE (1<<8) +#define FL_IPV6 (1<<9) +#define FL_OVERWRITE (1<<10) -static uint32_t -ipv4_flow_lookup_hash_internal(struct mbuf *m, struct route *ro, - uint32_t *key, uint16_t *flags, uint8_t *protop) +void +flow_invalidate(struct flentry *fle) { - uint16_t sport = 0, dport = 0; - struct ip *ip = NULL; - uint8_t proto = 0; + + fle->f_flags |= FL_STALE; +} + +static __inline int +proto_to_flags(uint8_t proto) +{ + int flag; + + switch (proto) { + case IPPROTO_TCP: + flag = FL_TCP; + break; + case IPPROTO_SCTP: + flag = FL_SCTP; + break; + case IPPROTO_UDP: + flag = FL_UDP; + break; + default: + flag = 0; + break; + } + + return (flag); +} + +static __inline int +flags_to_proto(int flags) +{ + int proto, protoflags; + + protoflags = flags & (FL_TCP|FL_SCTP|FL_UDP); + switch (protoflags) { + case FL_TCP: + proto = IPPROTO_TCP; + break; + case FL_SCTP: + proto = IPPROTO_SCTP; + break; + case FL_UDP: + proto = IPPROTO_UDP; + break; + default: + proto = 0; + break; + } + return (proto); +} + +#ifdef INET +#ifdef FLOWTABLE_DEBUG +static void +ipv4_flow_print_tuple(int flags, int proto, struct sockaddr_in *ssin, + struct sockaddr_in *dsin) +{ + char saddr[4*sizeof "123"], daddr[4*sizeof "123"]; + + if (flags & FL_HASH_ALL) { + inet_ntoa_r(ssin->sin_addr, saddr); + inet_ntoa_r(dsin->sin_addr, daddr); + printf("proto=%d %s:%d->%s:%d\n", + proto, saddr, ntohs(ssin->sin_port), daddr, + ntohs(dsin->sin_port)); + } else { + inet_ntoa_r(*(struct in_addr *) &dsin->sin_addr, daddr); + printf("proto=%d %s\n", proto, daddr); + } + +} +#endif + +static int +ipv4_mbuf_demarshal(struct flowtable *ft, struct mbuf *m, + struct sockaddr_in *ssin, struct sockaddr_in *dsin, uint16_t *flags) +{ + struct ip *ip; + uint8_t proto; int iphlen; - uint32_t hash; - struct sockaddr_in *sin; struct tcphdr *th; struct udphdr *uh; struct sctphdr *sh; + uint16_t sport, dport; - if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0)) - return (0); - - key[1] = key[0] = 0; - sin = (struct sockaddr_in *)&ro->ro_dst; - if (m != NULL) { - ip = mtod(m, struct ip *); - sin->sin_family = AF_INET; - sin->sin_len = sizeof(*sin); - sin->sin_addr = ip->ip_dst; - } else - *flags &= ~FL_HASH_PORTS; - - key[2] = sin->sin_addr.s_addr; + proto = sport = dport = 0; + ip = mtod(m, struct ip *); + dsin->sin_family = AF_INET; + dsin->sin_len = sizeof(*dsin); + dsin->sin_addr = ip->ip_dst; + ssin->sin_family = AF_INET; + ssin->sin_len = sizeof(*ssin); + ssin->sin_addr = ip->ip_src; - if ((*flags & FL_HASH_PORTS) == 0) + proto = ip->ip_p; + if ((*flags & FL_HASH_ALL) == 0) { + FLDPRINTF(ft, FL_DEBUG_ALL, "skip port check flags=0x%x ", + *flags); goto skipports; + } - proto = ip->ip_p; iphlen = ip->ip_hl << 2; /* XXX options? */ - key[1] = ip->ip_src.s_addr; - + switch (proto) { case IPPROTO_TCP: th = (struct tcphdr *)((caddr_t)ip + iphlen); - sport = ntohs(th->th_sport); - dport = ntohs(th->th_dport); - *flags |= th->th_flags; - if (*flags & TH_RST) + sport = th->th_sport; + dport = th->th_dport; + if ((*flags & FL_HASH_ALL) && + (th->th_flags & (TH_RST|TH_FIN))) *flags |= FL_STALE; break; case IPPROTO_UDP: @@ -401,38 +540,288 @@ ipv4_flow_lookup_hash_internal(struct mbuf *m, struct route *ro, dport = sh->dest_port; break; default: - if (*flags & FL_HASH_PORTS) - goto noop; + FLDPRINTF(ft, FL_DEBUG_ALL, "proto=0x%x not supported\n", proto); + return (ENOTSUP); /* no port - hence not a protocol we care about */ break; } - *protop = proto; - /* - * If this is a transmit route cache then - * hash all flows to a given destination to - * the same bucket - */ - if ((*flags & FL_HASH_PORTS) == 0) - proto = sport = dport = 0; +skipports: + *flags |= proto_to_flags(proto); + ssin->sin_port = sport; + dsin->sin_port = dport; + return (0); +} - ((uint16_t *)key)[0] = sport; - ((uint16_t *)key)[1] = dport; +static uint32_t +ipv4_flow_lookup_hash_internal( + struct sockaddr_in *ssin, struct sockaddr_in *dsin, + uint32_t *key, uint16_t flags) +{ + uint16_t sport, dport; + uint8_t proto; + int offset = 0; -skipports: - hash = jenkins_hashword(key, 3, V_flow_hashjitter + proto); - if (m != NULL && (m->m_flags & M_FLOWID) == 0) { - m->m_flags |= M_FLOWID; - m->m_pkthdr.flowid = hash; + if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0)) + return (0); + proto = flags_to_proto(flags); + sport = dport = key[2] = key[1] = key[0] = 0; + if ((ssin != NULL) && (flags & FL_HASH_ALL)) { + key[1] = ssin->sin_addr.s_addr; + sport = ssin->sin_port; + } + if (dsin != NULL) { + key[2] = dsin->sin_addr.s_addr; + dport = dsin->sin_port; + } + if (flags & FL_HASH_ALL) { + ((uint16_t *)key)[0] = sport; + ((uint16_t *)key)[1] = dport; + } else + offset = V_flow_hashjitter + proto; + + return (jenkins_hashword(key, 3, offset)); +} + +static struct flentry * +flowtable_lookup_mbuf4(struct flowtable *ft, struct mbuf *m) +{ + struct sockaddr_storage ssa, dsa; + uint16_t flags; + struct sockaddr_in *dsin, *ssin; + + dsin = (struct sockaddr_in *)&dsa; + ssin = (struct sockaddr_in *)&ssa; + flags = ft->ft_flags; + if (ipv4_mbuf_demarshal(ft, m, ssin, dsin, &flags) != 0) + return (NULL); + + return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags)); +} + +void +flow_to_route(struct flentry *fle, struct route *ro) +{ + uint32_t *hashkey = NULL; + struct sockaddr_in *sin; + + sin = (struct sockaddr_in *)&ro->ro_dst; + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key; + sin->sin_addr.s_addr = hashkey[2]; + ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt); + ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle); +} +#endif /* INET */ + +#ifdef INET6 +/* + * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous, + * then it sets p to point at the offset "len" in the mbuf. WARNING: the + * pointer might become stale after other pullups (but we never use it + * this way). + */ +#define PULLUP_TO(_len, p, T) \ +do { \ + int x = (_len) + sizeof(T); \ + if ((m)->m_len < x) { \ + goto receive_failed; \ + } \ + p = (mtod(m, char *) + (_len)); \ +} while (0) + +#define TCP(p) ((struct tcphdr *)(p)) +#define SCTP(p) ((struct sctphdr *)(p)) +#define UDP(p) ((struct udphdr *)(p)) + +static int +ipv6_mbuf_demarshal(struct flowtable *ft, struct mbuf *m, + struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6, uint16_t *flags) +{ + struct ip6_hdr *ip6; + uint8_t proto; + int hlen; + uint16_t src_port, dst_port; + u_short offset; + void *ulp; + + offset = hlen = src_port = dst_port = 0; + ulp = NULL; + ip6 = mtod(m, struct ip6_hdr *); + hlen = sizeof(struct ip6_hdr); + proto = ip6->ip6_nxt; + + if ((*flags & FL_HASH_ALL) == 0) + goto skipports; + + while (ulp == NULL) { + switch (proto) { + case IPPROTO_ICMPV6: + case IPPROTO_OSPFIGP: + case IPPROTO_PIM: + case IPPROTO_CARP: + case IPPROTO_ESP: + case IPPROTO_NONE: + ulp = ip6; + break; + case IPPROTO_TCP: + PULLUP_TO(hlen, ulp, struct tcphdr); + dst_port = TCP(ulp)->th_dport; + src_port = TCP(ulp)->th_sport; + if ((*flags & FL_HASH_ALL) && + (TCP(ulp)->th_flags & (TH_RST|TH_FIN))) + *flags |= FL_STALE; + break; + case IPPROTO_SCTP: + PULLUP_TO(hlen, ulp, struct sctphdr); + src_port = SCTP(ulp)->src_port; + dst_port = SCTP(ulp)->dest_port; + break; + case IPPROTO_UDP: + PULLUP_TO(hlen, ulp, struct udphdr); + dst_port = UDP(ulp)->uh_dport; + src_port = UDP(ulp)->uh_sport; + break; + case IPPROTO_HOPOPTS: /* RFC 2460 */ + PULLUP_TO(hlen, ulp, struct ip6_hbh); + hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; + proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; + ulp = NULL; + break; + case IPPROTO_ROUTING: /* RFC 2460 */ + PULLUP_TO(hlen, ulp, struct ip6_rthdr); + hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3; + proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt; + ulp = NULL; + break; + case IPPROTO_FRAGMENT: /* RFC 2460 */ + PULLUP_TO(hlen, ulp, struct ip6_frag); + hlen += sizeof (struct ip6_frag); + proto = ((struct ip6_frag *)ulp)->ip6f_nxt; + offset = ((struct ip6_frag *)ulp)->ip6f_offlg & + IP6F_OFF_MASK; + ulp = NULL; + break; + case IPPROTO_DSTOPTS: /* RFC 2460 */ + PULLUP_TO(hlen, ulp, struct ip6_hbh); + hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; + proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; + ulp = NULL; + break; + case IPPROTO_AH: /* RFC 2402 */ + PULLUP_TO(hlen, ulp, struct ip6_ext); + hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2; + proto = ((struct ip6_ext *)ulp)->ip6e_nxt; + ulp = NULL; + break; + default: + PULLUP_TO(hlen, ulp, struct ip6_ext); + break; + } + } + + if (src_port == 0) { + receive_failed: + return (ENOTSUP); } - return (hash); -noop: - *protop = proto; +skipports: + dsin6->sin6_family = AF_INET6; + dsin6->sin6_len = sizeof(*dsin6); + dsin6->sin6_port = dst_port; + memcpy(&dsin6->sin6_addr, &ip6->ip6_dst, sizeof(struct in6_addr)); + + ssin6->sin6_family = AF_INET6; + ssin6->sin6_len = sizeof(*ssin6); + ssin6->sin6_port = src_port; + memcpy(&ssin6->sin6_addr, &ip6->ip6_src, sizeof(struct in6_addr)); + *flags |= proto_to_flags(proto); + return (0); } +#define zero_key(key) \ +do { \ + key[0] = 0; \ + key[1] = 0; \ + key[2] = 0; \ + key[3] = 0; \ + key[4] = 0; \ + key[5] = 0; \ + key[6] = 0; \ + key[7] = 0; \ + key[8] = 0; \ +} while (0) + +static uint32_t +ipv6_flow_lookup_hash_internal( + struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6, + uint32_t *key, uint16_t flags) +{ + uint16_t sport, dport; + uint8_t proto; + int offset = 0; + + if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0)) + return (0); + + proto = flags_to_proto(flags); + zero_key(key); + sport = dport = 0; + if (dsin6 != NULL) { + memcpy(&key[1], &dsin6->sin6_addr, sizeof(struct in6_addr)); + dport = dsin6->sin6_port; + } + if ((ssin6 != NULL) && (flags & FL_HASH_ALL)) { + memcpy(&key[5], &ssin6->sin6_addr, sizeof(struct in6_addr)); + sport = ssin6->sin6_port; + } + if (flags & FL_HASH_ALL) { + ((uint16_t *)key)[0] = sport; + ((uint16_t *)key)[1] = dport; + } else + offset = V_flow_hashjitter + proto; + + return (jenkins_hashword(key, 9, offset)); +} + +static struct flentry * +flowtable_lookup_mbuf6(struct flowtable *ft, struct mbuf *m) +{ + struct sockaddr_storage ssa, dsa; + struct sockaddr_in6 *dsin6, *ssin6; + uint16_t flags; + + dsin6 = (struct sockaddr_in6 *)&dsa; + ssin6 = (struct sockaddr_in6 *)&ssa; + flags = ft->ft_flags; + + if (ipv6_mbuf_demarshal(ft, m, ssin6, dsin6, &flags) != 0) + return (NULL); + + return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags)); +} + +void +flow_to_route_in6(struct flentry *fle, struct route_in6 *ro) +{ + uint32_t *hashkey = NULL; + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *)&ro->ro_dst; + + sin6->sin6_family = AF_INET6; + sin6->sin6_len = sizeof(*sin6); + hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key; + memcpy(&sin6->sin6_addr, &hashkey[5], sizeof (struct in6_addr)); + ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt); + ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle); + +} +#endif /* INET6 */ + static bitstr_t * flowtable_mask(struct flowtable *ft) { @@ -512,14 +901,30 @@ flowtable_set_hashkey(struct flentry *fle, uint32_t *key) hashkey[i] = key[i]; } + +static uint32_t * +flowtable_get_hashkey(struct flentry *fle) +{ + uint32_t *hashkey; + + if (fle->f_flags & FL_IPV6) + hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key; + else + hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key; + + return (hashkey); +} + static int flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key, - uint8_t proto, uint32_t fibnum, struct route *ro, uint16_t flags) + uint32_t fibnum, struct route *ro, uint16_t flags) { struct flentry *fle, *fletail, *newfle, **flep; + struct flowtable_stats *fs = &ft->ft_stats[curcpu]; int depth; uma_zone_t flezone; bitstr_t *mask; + uint8_t proto; flezone = (flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone; newfle = uma_zalloc(flezone, M_NOWAIT | M_ZERO); @@ -527,7 +932,8 @@ flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key, return (ENOMEM); newfle->f_flags |= (flags & FL_IPV6); - + proto = flags_to_proto(flags); + FL_ENTRY_LOCK(ft, hash); mask = flowtable_mask(ft); flep = flowtable_entry(ft, hash); @@ -540,7 +946,7 @@ flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key, } depth = 0; - V_flowtable_collisions++; + fs->ft_collisions++; /* * find end of list and make sure that we were not * preempted by another thread handling this flow @@ -554,6 +960,9 @@ flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key, FL_ENTRY_UNLOCK(ft, hash); uma_zfree((newfle->f_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone, newfle); + + if (flags & FL_OVERWRITE) + goto skip; return (EEXIST); } /* @@ -566,8 +975,8 @@ flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key, fle = fle->f_next; } - if (depth > V_flowtable_max_depth) - V_flowtable_max_depth = depth; + if (depth > fs->ft_max_depth) + fs->ft_max_depth = depth; fletail->f_next = newfle; fle = newfle; skip: @@ -583,6 +992,35 @@ skip: return (0); } +int +kern_flowtable_insert(struct flowtable *ft, + struct sockaddr_storage *ssa, struct sockaddr_storage *dsa, + struct route *ro, uint32_t fibnum, int flags) +{ + uint32_t key[9], hash; + + flags = (ft->ft_flags | flags | FL_OVERWRITE); + hash = 0; + +#ifdef INET + if (ssa->ss_family == AF_INET) + hash = ipv4_flow_lookup_hash_internal((struct sockaddr_in *)ssa, + (struct sockaddr_in *)dsa, key, flags); +#endif +#ifdef INET6 + if (ssa->ss_family == AF_INET6) + hash = ipv6_flow_lookup_hash_internal((struct sockaddr_in6 *)ssa, + (struct sockaddr_in6 *)dsa, key, flags); +#endif + if (ro->ro_rt == NULL || ro->ro_lle == NULL) + return (EINVAL); + + FLDPRINTF(ft, FL_DEBUG, + "kern_flowtable_insert: key=%x:%x:%x hash=%x fibnum=%d flags=%x\n", + key[0], key[1], key[2], hash, fibnum, flags); + return (flowtable_insert(ft, hash, key, fibnum, ro, flags)); +} + static int flowtable_key_equal(struct flentry *fle, uint32_t *key) { @@ -596,7 +1034,7 @@ flowtable_key_equal(struct flentry *fle, uint32_t *key) nwords = 3; hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key; } - + for (i = 0; i < nwords; i++) if (hashkey[i] != key[i]) return (0); @@ -604,44 +1042,86 @@ flowtable_key_equal(struct flentry *fle, uint32_t *key) return (1); } -int -flowtable_lookup(struct flowtable *ft, struct mbuf *m, struct route *ro, uint32_t fibnum) +struct flentry * +flowtable_lookup_mbuf(struct flowtable *ft, struct mbuf *m, int af) +{ + struct flentry *fle = NULL; + +#ifdef INET + if (af == AF_INET) + fle = flowtable_lookup_mbuf4(ft, m); +#endif +#ifdef INET6 + if (af == AF_INET6) + fle = flowtable_lookup_mbuf6(ft, m); +#endif + if (fle != NULL && m != NULL && (m->m_flags & M_FLOWID) == 0) { + m->m_flags |= M_FLOWID; + m->m_pkthdr.flowid = fle->f_fhash; + } + return (fle); +} + +struct flentry * +flowtable_lookup(struct flowtable *ft, struct sockaddr_storage *ssa, + struct sockaddr_storage *dsa, uint32_t fibnum, int flags) { uint32_t key[9], hash; struct flentry *fle; - uint16_t flags; + struct flowtable_stats *fs = &ft->ft_stats[curcpu]; uint8_t proto = 0; int error = 0; struct rtentry *rt; struct llentry *lle; - - flags = ft->ft_flags; - ro->ro_rt = NULL; - ro->ro_lle = NULL; - - /* - * The internal hash lookup is the only IPv4 specific bit - * remaining - * - * XXX BZ: to add IPv6 support just add a check for the - * address type in m and ro and an equivalent ipv6 lookup - * function - the rest of the code should automatically - * handle an ipv6 flow (note that m can be NULL in which - * case ro will be set) - */ - hash = ipv4_flow_lookup_hash_internal(m, ro, key, - &flags, &proto); - + struct route sro, *ro; + struct route_in6 sro6; + + sro.ro_rt = sro6.ro_rt = NULL; + sro.ro_lle = sro6.ro_lle = NULL; + ro = NULL; + hash = 0; + flags |= ft->ft_flags; + proto = flags_to_proto(flags); +#ifdef INET + if (ssa->ss_family == AF_INET) { + struct sockaddr_in *ssin, *dsin; + + ro = &sro; + memcpy(&ro->ro_dst, dsa, sizeof(struct sockaddr_in)); + dsin = (struct sockaddr_in *)dsa; + ssin = (struct sockaddr_in *)ssa; + if ((dsin->sin_addr.s_addr == ssin->sin_addr.s_addr) || + (ntohl(dsin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || + (ntohl(ssin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) + return (NULL); + + hash = ipv4_flow_lookup_hash_internal(ssin, dsin, key, flags); + } +#endif +#ifdef INET6 + if (ssa->ss_family == AF_INET6) { + struct sockaddr_in6 *ssin6, *dsin6; + + ro = (struct route *)&sro6; + memcpy(&sro6.ro_dst, dsa, + sizeof(struct sockaddr_in6)); + dsin6 = (struct sockaddr_in6 *)dsa; + ssin6 = (struct sockaddr_in6 *)ssa; + + flags |= FL_IPV6; + hash = ipv6_flow_lookup_hash_internal(ssin6, dsin6, key, flags); + } +#endif /* * Ports are zero and this isn't a transmit cache * - thus not a protocol for which we need to keep * state - * FL_HASH_PORTS => key[0] != 0 for TCP || UDP || SCTP + * FL_HASH_ALL => key[0] != 0 for TCP || UDP || SCTP */ - if (hash == 0 || (key[0] == 0 && (ft->ft_flags & FL_HASH_PORTS))) - return (ENOENT); + if (hash == 0 || (key[0] == 0 && (ft->ft_flags & FL_HASH_ALL))) + return (NULL); - V_flowtable_lookups++; + fs->ft_lookups++; FL_ENTRY_LOCK(ft, hash); if ((fle = FL_ENTRY(ft, hash)) == NULL) { FL_ENTRY_UNLOCK(ft, hash); @@ -657,21 +1137,21 @@ keycheck: && (fibnum == fle->f_fibnum) && (rt->rt_flags & RTF_UP) && (rt->rt_ifp != NULL)) { - V_flowtable_hits++; + fs->ft_hits++; fle->f_uptime = time_uptime; fle->f_flags |= flags; - ro->ro_rt = rt; - ro->ro_lle = lle; FL_ENTRY_UNLOCK(ft, hash); - return (0); + return (fle); } else if (fle->f_next != NULL) { fle = fle->f_next; goto keycheck; } FL_ENTRY_UNLOCK(ft, hash); - uncached: - V_flowtable_misses++; + if (flags & FL_NOAUTO) + return (NULL); + + fs->ft_misses++; /* * This bit of code ends up locking the * same route 3 times (just like ip_output + ether_output) @@ -684,36 +1164,64 @@ uncached: * receive the route locked */ +#ifdef INVARIANTS + if ((ro->ro_dst.sa_family != AF_INET) && + (ro->ro_dst.sa_family != AF_INET6)) + panic("sa_family == %d\n", ro->ro_dst.sa_family); +#endif + ft->ft_rtalloc(ro, hash, fibnum); if (ro->ro_rt == NULL) error = ENETUNREACH; else { struct llentry *lle = NULL; - struct sockaddr *l3addr; + struct sockaddr_storage *l3addr; struct rtentry *rt = ro->ro_rt; struct ifnet *ifp = rt->rt_ifp; if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) { RTFREE(rt); ro->ro_rt = NULL; - return (ENOENT); + return (NULL); } +#ifdef INET6 + if (ssa->ss_family == AF_INET6) { + struct sockaddr_in6 *dsin6; + + dsin6 = (struct sockaddr_in6 *)dsa; + if (in6_localaddr(&dsin6->sin6_addr)) { + RTFREE(rt); + ro->ro_rt = NULL; + return (NULL); + } - if (rt->rt_flags & RTF_GATEWAY) - l3addr = rt->rt_gateway; - else - l3addr = &ro->ro_dst; - llentry_update(&lle, LLTABLE(ifp), l3addr, ifp); + if (rt->rt_flags & RTF_GATEWAY) + l3addr = (struct sockaddr_storage *)rt->rt_gateway; + + else + l3addr = (struct sockaddr_storage *)&ro->ro_dst; + llentry_update(&lle, LLTABLE6(ifp), l3addr, ifp); + } +#endif +#ifdef INET + if (ssa->ss_family == AF_INET) { + if (rt->rt_flags & RTF_GATEWAY) + l3addr = (struct sockaddr_storage *)rt->rt_gateway; + else + l3addr = (struct sockaddr_storage *)&ro->ro_dst; + llentry_update(&lle, LLTABLE(ifp), l3addr, ifp); + } + +#endif ro->ro_lle = lle; if (lle == NULL) { RTFREE(rt); ro->ro_rt = NULL; - return (ENOENT); + return (NULL); } - error = flowtable_insert(ft, hash, key, proto, fibnum, - ro, flags); - + error = flowtable_insert(ft, hash, key, fibnum, ro, flags); + if (error) { RTFREE(rt); LLE_FREE(lle); @@ -722,7 +1230,7 @@ uncached: } } - return (error); + return ((error) ? NULL : fle); } /* @@ -731,7 +1239,7 @@ uncached: #define calloc(count, size) malloc((count)*(size), M_DEVBUF, M_WAITOK|M_ZERO) struct flowtable * -flowtable_alloc(int nentry, int flags) +flowtable_alloc(char *name, int nentry, int flags) { struct flowtable *ft, *fttail; int i; @@ -743,7 +1251,8 @@ flowtable_alloc(int nentry, int flags) ft = malloc(sizeof(struct flowtable), M_RTABLE, M_WAITOK | M_ZERO); - + + ft->ft_name = name; ft->ft_flags = flags; ft->ft_size = nentry; #ifdef RADIX_MPATH @@ -784,7 +1293,7 @@ flowtable_alloc(int nentry, int flags) * just a cache - so everything is eligible for * replacement after 5s of non-use */ - if (flags & FL_HASH_PORTS) { + if (flags & FL_HASH_ALL) { ft->ft_udp_idle = V_flowtable_udp_expire; ft->ft_syn_idle = V_flowtable_syn_expire; ft->ft_fin_wait_idle = V_flowtable_fin_wait_expire; @@ -837,7 +1346,8 @@ flowtable_free_stale(struct flowtable *ft, struct rtentry *rt) struct flentry *fle, **flehead, *fleprev; struct flentry *flefreehead, *flefreetail, *fletmp; bitstr_t *mask, *tmpmask; - + struct flowtable_stats *fs = &ft->ft_stats[curcpu]; + flefreehead = flefreetail = NULL; mask = flowtable_mask(ft); tmpmask = ft->ft_tmpmask; @@ -854,12 +1364,12 @@ flowtable_free_stale(struct flowtable *ft, struct rtentry *rt) curbit); break; } - + FL_ENTRY_LOCK(ft, curbit); flehead = flowtable_entry(ft, curbit); fle = fleprev = *flehead; - V_flowtable_free_checks++; + fs->ft_free_checks++; #ifdef DIAGNOSTIC if (fle == NULL && curbit > 0) { log(LOG_ALERT, @@ -897,7 +1407,7 @@ flowtable_free_stale(struct flowtable *ft, struct rtentry *rt) fleprev->f_next = fle->f_next; fle = fleprev->f_next; } - + if (flefreehead == NULL) flefreehead = flefreetail = fletmp; else { @@ -916,7 +1426,7 @@ flowtable_free_stale(struct flowtable *ft, struct rtentry *rt) while ((fle = flefreehead) != NULL) { flefreehead = fle->f_next; count++; - V_flowtable_frees++; + fs->ft_frees++; fle_free(fle); } if (V_flowtable_debug && count) @@ -927,6 +1437,7 @@ void flowtable_route_flush(struct flowtable *ft, struct rtentry *rt) { int i; + if (ft->ft_flags & FL_PCPU) { for (i = 0; i <= mp_maxid; i++) { if (CPU_ABSENT(i)) @@ -1017,7 +1528,7 @@ static void flowtable_flush(void *unused __unused) { uint64_t start; - + mtx_lock(&flowclean_lock); start = flowclean_cycles; while (start == flowclean_cycles) { @@ -1109,17 +1620,64 @@ static void flow_show(struct flowtable *ft, struct flentry *fle) { int idle_time; - int rt_valid; + int rt_valid, ifp_valid; + uint16_t sport, dport; + uint32_t *hashkey; + char saddr[4*sizeof "123"], daddr[4*sizeof "123"]; + volatile struct rtentry *rt; + struct ifnet *ifp = NULL; idle_time = (int)(time_uptime - fle->f_uptime); - rt_valid = fle->f_rt != NULL; - db_printf("hash=0x%08x idle_time=%03d rt=%p ifp=%p", - fle->f_fhash, idle_time, - fle->f_rt, rt_valid ? fle->f_rt->rt_ifp : NULL); - if (rt_valid && (fle->f_rt->rt_flags & RTF_UP)) - db_printf(" RTF_UP "); + rt = fle->f_rt; + rt_valid = rt != NULL; + if (rt_valid) + ifp = rt->rt_ifp; + ifp_valid = ifp != NULL; + hashkey = flowtable_get_hashkey(fle); + if (fle->f_flags & FL_IPV6) + goto skipaddr; + + inet_ntoa_r(*(struct in_addr *) &hashkey[2], daddr); + if (ft->ft_flags & FL_HASH_ALL) { + inet_ntoa_r(*(struct in_addr *) &hashkey[1], saddr); + sport = ntohs(((uint16_t *)hashkey)[0]); + dport = ntohs(((uint16_t *)hashkey)[1]); + db_printf("%s:%d->%s:%d", + saddr, sport, daddr, + dport); + } else + db_printf("%s ", daddr); + +skipaddr: if (fle->f_flags & FL_STALE) db_printf(" FL_STALE "); + if (fle->f_flags & FL_TCP) + db_printf(" FL_TCP "); + if (fle->f_flags & FL_UDP) + db_printf(" FL_UDP "); + if (rt_valid) { + if (rt->rt_flags & RTF_UP) + db_printf(" RTF_UP "); + } + if (ifp_valid) { + if (ifp->if_flags & IFF_LOOPBACK) + db_printf(" IFF_LOOPBACK "); + if (ifp->if_flags & IFF_UP) + db_printf(" IFF_UP "); + if (ifp->if_flags & IFF_POINTOPOINT) + db_printf(" IFF_POINTOPOINT "); + } + if (fle->f_flags & FL_IPV6) + db_printf("\n\tkey=%08x:%08x:%08x%08x:%08x:%08x%08x:%08x:%08x", + hashkey[0], hashkey[1], hashkey[2], + hashkey[3], hashkey[4], hashkey[5], + hashkey[6], hashkey[7], hashkey[8]); + else + db_printf("\n\tkey=%08x:%08x:%08x ", + hashkey[0], hashkey[1], hashkey[2]); + db_printf("hash=%08x idle_time=%03d" + "\n\tfibnum=%02d rt=%p", + fle->f_fhash, idle_time, fle->f_fibnum, fle->f_rt); db_printf("\n"); } @@ -1130,7 +1688,8 @@ flowtable_show(struct flowtable *ft, int cpuid) struct flentry *fle, **flehead; bitstr_t *mask, *tmpmask; - db_printf("cpu: %d\n", cpuid); + if (cpuid != -1) + db_printf("cpu: %d\n", cpuid); mask = flowtable_mask_pcpu(ft, cpuid); tmpmask = ft->ft_tmpmask; memcpy(tmpmask, mask, ft->ft_size/8); @@ -1167,6 +1726,7 @@ flowtable_show_vnet(void) ft = V_flow_list_head; while (ft != NULL) { + printf("name: %s\n", ft->ft_name); if (ft->ft_flags & FL_PCPU) { for (i = 0; i <= mp_maxid; i++) { if (CPU_ABSENT(i)) @@ -1174,7 +1734,7 @@ flowtable_show_vnet(void) flowtable_show(ft, i); } } else { - flowtable_show(ft, 0); + flowtable_show(ft, -1); } ft = ft->ft_next; } |