diff options
author | grehan <grehan@FreeBSD.org> | 2013-07-19 03:27:04 +0000 |
---|---|---|
committer | grehan <grehan@FreeBSD.org> | 2013-07-19 03:27:04 +0000 |
commit | 749888363db1320e1cc3818fcd71e411c3a955ca (patch) | |
tree | e4dd80ba19e005ead5dfb35b03225d5fb4926884 /sys/netinet | |
parent | c8195f5331ccad33ad4e265362523f51b96abd5c (diff) | |
parent | 61d6ee86f76db7ef69b0748be4d4b6e14ffc7109 (diff) | |
download | FreeBSD-src-749888363db1320e1cc3818fcd71e411c3a955ca.zip FreeBSD-src-749888363db1320e1cc3818fcd71e411c3a955ca.tar.gz |
IFC @ r253461
Diffstat (limited to 'sys/netinet')
-rw-r--r-- | sys/netinet/icmp6.h | 76 | ||||
-rw-r--r-- | sys/netinet/icmp_var.h | 11 | ||||
-rw-r--r-- | sys/netinet/if_ether.c | 13 | ||||
-rw-r--r-- | sys/netinet/in.c | 3 | ||||
-rw-r--r-- | sys/netinet/in_pcb.c | 2 | ||||
-rw-r--r-- | sys/netinet/ip_carp.c | 27 | ||||
-rw-r--r-- | sys/netinet/ip_carp.h | 5 | ||||
-rw-r--r-- | sys/netinet/ip_icmp.c | 13 | ||||
-rw-r--r-- | sys/netinet/ip_input.c | 72 | ||||
-rw-r--r-- | sys/netinet/ip_mroute.c | 21 | ||||
-rw-r--r-- | sys/netinet/ip_mroute.h | 29 | ||||
-rw-r--r-- | sys/netinet/ip_var.h | 47 | ||||
-rw-r--r-- | sys/netinet/pim_var.h | 25 | ||||
-rw-r--r-- | sys/netinet/sctp_constants.h | 4 | ||||
-rw-r--r-- | sys/netinet/sctp_input.c | 16 | ||||
-rw-r--r-- | sys/netinet/sctp_pcb.c | 4 | ||||
-rw-r--r-- | sys/netinet/sctp_sysctl.c | 6 | ||||
-rw-r--r-- | sys/netinet/sctp_sysctl.h | 7 | ||||
-rw-r--r-- | sys/netinet/tcp_input.c | 95 | ||||
-rw-r--r-- | sys/netinet/tcp_syncache.c | 590 | ||||
-rw-r--r-- | sys/netinet/tcp_syncache.h | 32 | ||||
-rw-r--r-- | sys/netinet/tcp_var.h | 113 | ||||
-rw-r--r-- | sys/netinet/udp_usrreq.c | 13 | ||||
-rw-r--r-- | sys/netinet/udp_var.h | 40 |
24 files changed, 632 insertions, 632 deletions
diff --git a/sys/netinet/icmp6.h b/sys/netinet/icmp6.h index 5483721..15f4c2d 100644 --- a/sys/netinet/icmp6.h +++ b/sys/netinet/icmp6.h @@ -555,39 +555,39 @@ do { \ * of the internet control message protocol version 6. */ struct icmp6errstat { - u_quad_t icp6errs_dst_unreach_noroute; - u_quad_t icp6errs_dst_unreach_admin; - u_quad_t icp6errs_dst_unreach_beyondscope; - u_quad_t icp6errs_dst_unreach_addr; - u_quad_t icp6errs_dst_unreach_noport; - u_quad_t icp6errs_packet_too_big; - u_quad_t icp6errs_time_exceed_transit; - u_quad_t icp6errs_time_exceed_reassembly; - u_quad_t icp6errs_paramprob_header; - u_quad_t icp6errs_paramprob_nextheader; - u_quad_t icp6errs_paramprob_option; - u_quad_t icp6errs_redirect; /* we regard redirect as an error here */ - u_quad_t icp6errs_unknown; + uint64_t icp6errs_dst_unreach_noroute; + uint64_t icp6errs_dst_unreach_admin; + uint64_t icp6errs_dst_unreach_beyondscope; + uint64_t icp6errs_dst_unreach_addr; + uint64_t icp6errs_dst_unreach_noport; + uint64_t icp6errs_packet_too_big; + uint64_t icp6errs_time_exceed_transit; + uint64_t icp6errs_time_exceed_reassembly; + uint64_t icp6errs_paramprob_header; + uint64_t icp6errs_paramprob_nextheader; + uint64_t icp6errs_paramprob_option; + uint64_t icp6errs_redirect; /* we regard redirect as an error here */ + uint64_t icp6errs_unknown; }; struct icmp6stat { /* statistics related to icmp6 packets generated */ - u_quad_t icp6s_error; /* # of calls to icmp6_error */ - u_quad_t icp6s_canterror; /* no error 'cuz old was icmp */ - u_quad_t icp6s_toofreq; /* no error 'cuz rate limitation */ - u_quad_t icp6s_outhist[256]; + uint64_t icp6s_error; /* # of calls to icmp6_error */ + uint64_t icp6s_canterror; /* no error 'cuz old was icmp */ + uint64_t icp6s_toofreq; /* no error 'cuz rate limitation */ + uint64_t icp6s_outhist[256]; /* statistics related to input message processed */ - u_quad_t icp6s_badcode; /* icmp6_code out of range */ - u_quad_t icp6s_tooshort; /* packet < sizeof(struct icmp6_hdr) */ - u_quad_t icp6s_checksum; /* bad checksum */ - u_quad_t icp6s_badlen; /* calculated bound mismatch */ + uint64_t icp6s_badcode; /* icmp6_code out of range */ + uint64_t icp6s_tooshort; /* packet < sizeof(struct icmp6_hdr) */ + uint64_t icp6s_checksum; /* bad checksum */ + uint64_t icp6s_badlen; /* calculated bound mismatch */ /* * number of responses: this member is inherited from netinet code, but * for netinet6 code, it is already available in icp6s_outhist[]. */ - u_quad_t icp6s_reflect; - u_quad_t icp6s_inhist[256]; - u_quad_t icp6s_nd_toomanyopt; /* too many ND options */ + uint64_t icp6s_reflect; + uint64_t icp6s_inhist[256]; + uint64_t icp6s_nd_toomanyopt; /* too many ND options */ struct icmp6errstat icp6s_outerrhist; #define icp6s_odst_unreach_noroute \ icp6s_outerrhist.icp6errs_dst_unreach_noroute @@ -607,29 +607,33 @@ struct icmp6stat { #define icp6s_oparamprob_option icp6s_outerrhist.icp6errs_paramprob_option #define icp6s_oredirect icp6s_outerrhist.icp6errs_redirect #define icp6s_ounknown icp6s_outerrhist.icp6errs_unknown - u_quad_t icp6s_pmtuchg; /* path MTU changes */ - u_quad_t icp6s_nd_badopt; /* bad ND options */ - u_quad_t icp6s_badns; /* bad neighbor solicitation */ - u_quad_t icp6s_badna; /* bad neighbor advertisement */ - u_quad_t icp6s_badrs; /* bad router advertisement */ - u_quad_t icp6s_badra; /* bad router advertisement */ - u_quad_t icp6s_badredirect; /* bad redirect message */ + uint64_t icp6s_pmtuchg; /* path MTU changes */ + uint64_t icp6s_nd_badopt; /* bad ND options */ + uint64_t icp6s_badns; /* bad neighbor solicitation */ + uint64_t icp6s_badna; /* bad neighbor advertisement */ + uint64_t icp6s_badrs; /* bad router advertisement */ + uint64_t icp6s_badra; /* bad router advertisement */ + uint64_t icp6s_badredirect; /* bad redirect message */ }; #ifdef _KERNEL +#include <sys/counter.h> + +VNET_PCPUSTAT_DECLARE(struct icmp6stat, icmp6stat); /* * In-kernel consumers can use these accessor macros directly to update * stats. */ -#define ICMP6STAT_ADD(name, val) V_icmp6stat.name += (val) +#define ICMP6STAT_ADD(name, val) \ + VNET_PCPUSTAT_ADD(struct icmp6stat, icmp6stat, name, (val)) #define ICMP6STAT_INC(name) ICMP6STAT_ADD(name, 1) /* * Kernel module consumers must use this accessor macro. */ void kmod_icmp6stat_inc(int statnum); -#define KMOD_ICMP6STAT_INC(name) \ - kmod_icmp6stat_inc(offsetof(struct icmp6stat, name) / sizeof(u_quad_t)) +#define KMOD_ICMP6STAT_INC(name) \ + kmod_icmp6stat_inc(offsetof(struct icmp6stat, name) / sizeof(uint64_t)) #endif /* @@ -688,7 +692,9 @@ void icmp6_mtudisc_update(struct ip6ctlparam *, int); #define icmp6_ifstat_inc(ifp, tag) \ do { \ if (ifp) \ - ((struct in6_ifextra *)((ifp)->if_afdata[AF_INET6]))->icmp6_ifstat->tag++; \ + counter_u64_add(((struct in6_ifextra *) \ + ((ifp)->if_afdata[AF_INET6]))->icmp6_ifstat[\ + offsetof(struct icmp6_ifstat, tag) / sizeof(uint64_t)], 1);\ } while (/*CONSTCOND*/ 0) #define icmp6_ifoutstat_inc(ifp, type, code) \ diff --git a/sys/netinet/icmp_var.h b/sys/netinet/icmp_var.h index d939cc2..809879d 100644 --- a/sys/netinet/icmp_var.h +++ b/sys/netinet/icmp_var.h @@ -58,11 +58,15 @@ struct icmpstat { }; #ifdef _KERNEL +#include <sys/counter.h> + +VNET_PCPUSTAT_DECLARE(struct icmpstat, icmpstat); /* * In-kernel consumers can use these accessor macros directly to update * stats. */ -#define ICMPSTAT_ADD(name, val) V_icmpstat.name += (val) +#define ICMPSTAT_ADD(name, val) \ + VNET_PCPUSTAT_ADD(struct icmpstat, icmpstat, name, (val)) #define ICMPSTAT_INC(name) ICMPSTAT_ADD(name, 1) /* @@ -70,7 +74,7 @@ struct icmpstat { */ void kmod_icmpstat_inc(int statnum); #define KMOD_ICMPSTAT_INC(name) \ - kmod_icmpstat_inc(offsetof(struct icmpstat, name) / sizeof(u_long)) + kmod_icmpstat_inc(offsetof(struct icmpstat, name) / sizeof(uint64_t)) #endif /* @@ -91,9 +95,6 @@ void kmod_icmpstat_inc(int statnum); #ifdef _KERNEL SYSCTL_DECL(_net_inet_icmp); -VNET_DECLARE(struct icmpstat, icmpstat); /* icmp statistics. */ -#define V_icmpstat VNET(icmpstat) - extern int badport_bandlim(int); #define BANDLIM_UNLIMITED -1 #define BANDLIM_ICMP_UNREACH 0 diff --git a/sys/netinet/if_ether.c b/sys/netinet/if_ether.c index ad31557..675e0dd 100644 --- a/sys/netinet/if_ether.c +++ b/sys/netinet/if_ether.c @@ -89,7 +89,12 @@ VNET_DEFINE(int, useloopback) = 1; /* use loopback interface for static VNET_DEFINE(int, arp_proxyall) = 0; static VNET_DEFINE(int, arpt_down) = 20; /* keep incomplete entries for * 20 seconds */ -VNET_DEFINE(struct arpstat, arpstat); /* ARP statistics, see if_arp.h */ +VNET_PCPUSTAT_DEFINE(struct arpstat, arpstat); /* ARP statistics, see if_arp.h */ +VNET_PCPUSTAT_SYSINIT(arpstat); + +#ifdef VIMAGE +VNET_PCPUSTAT_SYSUNINIT(arpstat); +#endif /* VIMAGE */ static VNET_DEFINE(int, arp_maxhold) = 1; @@ -97,7 +102,6 @@ static VNET_DEFINE(int, arp_maxhold) = 1; #define V_arpt_down VNET(arpt_down) #define V_arp_maxtries VNET(arp_maxtries) #define V_arp_proxyall VNET(arp_proxyall) -#define V_arpstat VNET(arpstat) #define V_arp_maxhold VNET(arp_maxhold) SYSCTL_VNET_INT(_net_link_ether_inet, OID_AUTO, max_age, CTLFLAG_RW, @@ -115,9 +119,8 @@ SYSCTL_VNET_INT(_net_link_ether_inet, OID_AUTO, proxyall, CTLFLAG_RW, SYSCTL_VNET_INT(_net_link_ether_inet, OID_AUTO, wait, CTLFLAG_RW, &VNET_NAME(arpt_down), 0, "Incomplete ARP entry lifetime in seconds"); -SYSCTL_VNET_STRUCT(_net_link_ether_arp, OID_AUTO, stats, CTLFLAG_RW, - &VNET_NAME(arpstat), arpstat, - "ARP statistics (struct arpstat, net/if_arp.h)"); +SYSCTL_VNET_PCPUSTAT(_net_link_ether_arp, OID_AUTO, stats, struct arpstat, + arpstat, "ARP statistics (struct arpstat, net/if_arp.h)"); SYSCTL_VNET_INT(_net_link_ether_inet, OID_AUTO, maxhold, CTLFLAG_RW, &VNET_NAME(arp_maxhold), 0, "Number of packets to hold per ARP entry"); diff --git a/sys/netinet/in.c b/sys/netinet/in.c index 4e73b95..363f671 100644 --- a/sys/netinet/in.c +++ b/sys/netinet/in.c @@ -85,9 +85,6 @@ SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, no_same_prefix, CTLFLAG_RW, VNET_DECLARE(struct inpcbinfo, ripcbinfo); #define V_ripcbinfo VNET(ripcbinfo) -VNET_DECLARE(struct arpstat, arpstat); /* ARP statistics, see if_arp.h */ -#define V_arpstat VNET(arpstat) - /* * Return 1 if an internet address is for a ``local'' host * (one to which we have a connection). diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c index 3506b74..eb15a38 100644 --- a/sys/netinet/in_pcb.c +++ b/sys/netinet/in_pcb.c @@ -554,7 +554,7 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, * and a multicast address is bound on both * new and duplicated sockets. */ - if (so->so_options & SO_REUSEADDR) + if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0) reuseport = SO_REUSEADDR|SO_REUSEPORT; } else if (sin->sin_addr.s_addr != INADDR_ANY) { sin->sin_port = 0; /* yech... */ diff --git a/sys/netinet/ip_carp.c b/sys/netinet/ip_carp.c index 43c387f..9228a8f 100644 --- a/sys/netinet/ip_carp.c +++ b/sys/netinet/ip_carp.c @@ -50,6 +50,7 @@ __FBSDID("$FreeBSD$"); #include <sys/sysctl.h> #include <sys/syslog.h> #include <sys/taskqueue.h> +#include <sys/counter.h> #include <net/ethernet.h> #include <net/fddi.h> @@ -209,9 +210,25 @@ SYSCTL_INT(_net_inet_carp, OID_AUTO, senderr_demotion_factor, CTLFLAG_RW, SYSCTL_INT(_net_inet_carp, OID_AUTO, ifdown_demotion_factor, CTLFLAG_RW, &carp_ifdown_adj, 0, "Interface down demotion factor adjustment"); -static struct carpstats carpstats; -SYSCTL_STRUCT(_net_inet_carp, OID_AUTO, stats, CTLFLAG_RW, &carpstats, - carpstats, "CARP statistics (struct carpstats, netinet/ip_carp.h)"); +static counter_u64_t carpstats[sizeof(struct carpstats) / sizeof(uint64_t)]; +#define CARPSTATS_ADD(name, val) \ + counter_u64_add(carpstats[offsetof(struct carpstats, name) / \ + sizeof(uint64_t)], (val)) +#define CARPSTATS_INC(name) CARPSTATS_ADD(name, 1) + +static int +carpstats_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct carpstats s; + + COUNTER_ARRAY_COPY(carpstats, &s, sizeof(s) / sizeof(uint64_t)); + if (req->newptr) + COUNTER_ARRAY_ZERO(carpstats, sizeof(s) / sizeof(uint64_t)); + return (SYSCTL_OUT(req, &s, sizeof(s))); +} +SYSCTL_PROC(_net_inet_carp, OID_AUTO, stats, CTLTYPE_OPAQUE | CTLFLAG_RW, + NULL, 0, carpstats_sysctl, "I", + "CARP statistics (struct carpstats, netinet/ip_carp.h)"); #define CARP_LOCK_INIT(sc) mtx_init(&(sc)->sc_mtx, "carp_softc", \ NULL, MTX_DEF) @@ -2084,6 +2101,8 @@ carp_mod_cleanup(void) mtx_unlock(&carp_mtx); taskqueue_drain(taskqueue_swi, &carp_sendall_task); mtx_destroy(&carp_mtx); + COUNTER_ARRAY_FREE(carpstats, + sizeof(struct carpstats) / sizeof(uint64_t)); } static int @@ -2093,6 +2112,8 @@ carp_mod_load(void) mtx_init(&carp_mtx, "carp_mtx", NULL, MTX_DEF); LIST_INIT(&carp_list); + COUNTER_ARRAY_ALLOC(carpstats, + sizeof(struct carpstats) / sizeof(uint64_t), M_WAITOK); carp_get_vhid_p = carp_get_vhid; carp_forus_p = carp_forus; carp_output_p = carp_output; diff --git a/sys/netinet/ip_carp.h b/sys/netinet/ip_carp.h index de71185..9f03d58 100644 --- a/sys/netinet/ip_carp.h +++ b/sys/netinet/ip_carp.h @@ -117,11 +117,6 @@ struct carpstats { uint64_t carps_preempt; /* if enabled, preemptions */ }; -#ifdef _KERNEL -#define CARPSTATS_ADD(name, val) carpstats.name += (val) -#define CARPSTATS_INC(name) CARPSTATS_ADD(name, 1) -#endif - /* * Configuration structure for SIOCSVH SIOCGVH */ diff --git a/sys/netinet/ip_icmp.c b/sys/netinet/ip_icmp.c index 39d6f7e..deabf44 100644 --- a/sys/netinet/ip_icmp.c +++ b/sys/netinet/ip_icmp.c @@ -92,9 +92,14 @@ SYSCTL_VNET_INT(_net_inet_icmp, OID_AUTO, icmplim_output, CTLFLAG_RW, "Enable logging of ICMP response rate limiting"); #ifdef INET -VNET_DEFINE(struct icmpstat, icmpstat); -SYSCTL_VNET_STRUCT(_net_inet_icmp, ICMPCTL_STATS, stats, CTLFLAG_RW, - &VNET_NAME(icmpstat), icmpstat, ""); +VNET_PCPUSTAT_DEFINE(struct icmpstat, icmpstat); +VNET_PCPUSTAT_SYSINIT(icmpstat); +SYSCTL_VNET_PCPUSTAT(_net_inet_icmp, ICMPCTL_STATS, stats, struct icmpstat, + icmpstat, "ICMP statistics (struct icmpstat, netinet/icmp_var.h)"); + +#ifdef VIMAGE +VNET_PCPUSTAT_SYSUNINIT(icmpstat); +#endif /* VIMAGE */ static VNET_DEFINE(int, icmpmaskrepl) = 0; #define V_icmpmaskrepl VNET(icmpmaskrepl) @@ -197,7 +202,7 @@ void kmod_icmpstat_inc(int statnum) { - (*((u_long *)&V_icmpstat + statnum))++; + counter_u64_add(VNET(icmpstat)[statnum], 1); } /* diff --git a/sys/netinet/ip_input.c b/sys/netinet/ip_input.c index 0268ebc..219f362 100644 --- a/sys/netinet/ip_input.c +++ b/sys/netinet/ip_input.c @@ -208,73 +208,17 @@ SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, output_flowtable_size, CTLFLAG_RDTUN, static void ip_freef(struct ipqhead *, struct ipq *); /* - * IP statistics are stored in struct ipstat_p, which is - * an "array" of counter(9)s. Although it isn't a real - * array, we treat it as array to reduce code bloat. + * IP statistics are stored in the "array" of counter(9)s. */ -VNET_DEFINE(struct ipstat_p, ipstatp); - -static void -vnet_ipstatp_init(const void *unused) -{ - counter_u64_t *c; - int i; - - for (i = 0, c = (counter_u64_t *)&V_ipstatp; - i < sizeof(V_ipstatp) / sizeof(counter_u64_t); - i++, c++) { - *c = counter_u64_alloc(M_WAITOK); - counter_u64_zero(*c); - } -} -VNET_SYSINIT(vnet_ipstatp_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, - vnet_ipstatp_init, NULL); +VNET_PCPUSTAT_DEFINE(struct ipstat, ipstat); +VNET_PCPUSTAT_SYSINIT(ipstat); +SYSCTL_VNET_PCPUSTAT(_net_inet_ip, IPCTL_STATS, stats, struct ipstat, ipstat, + "IP statistics (struct ipstat, netinet/ip_var.h)"); #ifdef VIMAGE -static void -vnet_ipstatp_uninit(const void *unused) -{ - counter_u64_t *c; - int i; - - for (i = 0, c = (counter_u64_t *)&V_ipstatp; - i < sizeof(V_ipstatp) / sizeof(counter_u64_t); - i++, c++) - counter_u64_free(*c); -} -VNET_SYSUNINIT(vnet_ipstatp_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, - vnet_ipstatp_uninit, NULL); +VNET_PCPUSTAT_SYSUNINIT(ipstat); #endif /* VIMAGE */ -static int -ipstat_sysctl(SYSCTL_HANDLER_ARGS) -{ - struct ipstat ipstat; - counter_u64_t *c; - uint64_t *v; - int i; - - for (i = 0, c = (counter_u64_t *)&V_ipstatp, v = (uint64_t *)&ipstat; - i < sizeof(V_ipstatp) / sizeof(counter_u64_t); - i++, c++, v++) { - *v = counter_u64_fetch(*c); - /* - * Old interface allowed to rewrite 'struct ipstat', and - * netstat(1) used it to zero the structure. To keep - * compatibility with old netstat(1) we will zero out - * statistics on every write attempt, however we no longer - * support writing arbitrary fake values to the statistics. - */ - if (req->newptr) - counter_u64_zero(*c); - } - - return (SYSCTL_OUT(req, &ipstat, sizeof(ipstat))); -} -SYSCTL_VNET_PROC(_net_inet_ip, IPCTL_STATS, stats, CTLTYPE_OPAQUE | CTLFLAG_RW, - NULL, 0, ipstat_sysctl, "I", - "IP statistics (struct ipstat, netinet/ip_var.h)"); - /* * Kernel module interface for updating ipstat. The argument is an index * into ipstat treated as an array. @@ -283,14 +227,14 @@ void kmod_ipstat_inc(int statnum) { - counter_u64_add(*((counter_u64_t *)&V_ipstatp + statnum), 1); + counter_u64_add(VNET(ipstat)[statnum], 1); } void kmod_ipstat_dec(int statnum) { - counter_u64_add(*((counter_u64_t *)&V_ipstatp + statnum), -1); + counter_u64_add(VNET(ipstat)[statnum], -1); } static int diff --git a/sys/netinet/ip_mroute.c b/sys/netinet/ip_mroute.c index b2b52c0..23f1be7 100644 --- a/sys/netinet/ip_mroute.c +++ b/sys/netinet/ip_mroute.c @@ -93,6 +93,7 @@ __FBSDID("$FreeBSD$"); #include <sys/syslog.h> #include <sys/systm.h> #include <sys/time.h> +#include <sys/counter.h> #include <net/if.h> #include <net/netisr.h> @@ -145,11 +146,11 @@ static struct mtx mrouter_mtx; static int ip_mrouter_cnt; /* # of vnets with active mrouters */ static int ip_mrouter_unloading; /* Allow no more V_ip_mrouter sockets */ -static VNET_DEFINE(struct mrtstat, mrtstat); -#define V_mrtstat VNET(mrtstat) -SYSCTL_VNET_STRUCT(_net_inet_ip, OID_AUTO, mrtstat, CTLFLAG_RW, - &VNET_NAME(mrtstat), mrtstat, - "IPv4 Multicast Forwarding Statistics (struct mrtstat, " +static VNET_PCPUSTAT_DEFINE(struct mrtstat, mrtstat); +VNET_PCPUSTAT_SYSINIT(mrtstat); +VNET_PCPUSTAT_SYSUNINIT(mrtstat); +SYSCTL_VNET_PCPUSTAT(_net_inet_ip, OID_AUTO, mrtstat, struct mrtstat, + mrtstat, "IPv4 Multicast Forwarding Statistics (struct mrtstat, " "netinet/ip_mroute.h)"); static VNET_DEFINE(u_long, mfchash); @@ -225,13 +226,13 @@ static VNET_DEFINE(struct callout, bw_upcalls_ch); #define BW_UPCALLS_PERIOD (hz) /* periodical flush of bw upcalls */ -static VNET_DEFINE(struct pimstat, pimstat); -#define V_pimstat VNET(pimstat) +static VNET_PCPUSTAT_DEFINE(struct pimstat, pimstat); +VNET_PCPUSTAT_SYSINIT(pimstat); +VNET_PCPUSTAT_SYSUNINIT(pimstat); SYSCTL_NODE(_net_inet, IPPROTO_PIM, pim, CTLFLAG_RW, 0, "PIM"); -SYSCTL_VNET_STRUCT(_net_inet_pim, PIMCTL_STATS, stats, CTLFLAG_RD, - &VNET_NAME(pimstat), pimstat, - "PIM Statistics (struct pimstat, netinet/pim_var.h)"); +SYSCTL_VNET_PCPUSTAT(_net_inet_pim, PIMCTL_STATS, stats, struct pimstat, + pimstat, "PIM Statistics (struct pimstat, netinet/pim_var.h)"); static u_long pim_squelch_wholepkt = 0; SYSCTL_ULONG(_net_inet_pim, OID_AUTO, squelch_wholepkt, CTLFLAG_RW, diff --git a/sys/netinet/ip_mroute.h b/sys/netinet/ip_mroute.h index e945b92..65f7d83 100644 --- a/sys/netinet/ip_mroute.h +++ b/sys/netinet/ip_mroute.h @@ -206,23 +206,24 @@ struct bw_upcall { * The kernel's multicast routing statistics. */ struct mrtstat { - u_long mrts_mfc_lookups; /* # forw. cache hash table hits */ - u_long mrts_mfc_misses; /* # forw. cache hash table misses */ - u_long mrts_upcalls; /* # calls to multicast routing daemon */ - u_long mrts_no_route; /* no route for packet's origin */ - u_long mrts_bad_tunnel; /* malformed tunnel options */ - u_long mrts_cant_tunnel; /* no room for tunnel options */ - u_long mrts_wrong_if; /* arrived on wrong interface */ - u_long mrts_upq_ovflw; /* upcall Q overflow */ - u_long mrts_cache_cleanups; /* # entries with no upcalls */ - u_long mrts_drop_sel; /* pkts dropped selectively */ - u_long mrts_q_overflow; /* pkts dropped - Q overflow */ - u_long mrts_pkt2large; /* pkts dropped - size > BKT SIZE */ - u_long mrts_upq_sockfull; /* upcalls dropped - socket full */ + uint64_t mrts_mfc_lookups; /* # forw. cache hash table hits */ + uint64_t mrts_mfc_misses; /* # forw. cache hash table misses */ + uint64_t mrts_upcalls; /* # calls to multicast routing daemon */ + uint64_t mrts_no_route; /* no route for packet's origin */ + uint64_t mrts_bad_tunnel; /* malformed tunnel options */ + uint64_t mrts_cant_tunnel; /* no room for tunnel options */ + uint64_t mrts_wrong_if; /* arrived on wrong interface */ + uint64_t mrts_upq_ovflw; /* upcall Q overflow */ + uint64_t mrts_cache_cleanups; /* # entries with no upcalls */ + uint64_t mrts_drop_sel; /* pkts dropped selectively */ + uint64_t mrts_q_overflow; /* pkts dropped - Q overflow */ + uint64_t mrts_pkt2large; /* pkts dropped - size > BKT SIZE */ + uint64_t mrts_upq_sockfull; /* upcalls dropped - socket full */ }; #ifdef _KERNEL -#define MRTSTAT_ADD(name, val) V_mrtstat.name += (val) +#define MRTSTAT_ADD(name, val) \ + VNET_PCPUSTAT_ADD(struct mrtstat, mrtstat, name, (val)) #define MRTSTAT_INC(name) MRTSTAT_ADD(name, 1) #endif diff --git a/sys/netinet/ip_var.h b/sys/netinet/ip_var.h index 7f5181e..1ac1bd2 100644 --- a/sys/netinet/ip_var.h +++ b/sys/netinet/ip_var.h @@ -133,46 +133,13 @@ struct ipstat { #include <sys/counter.h> #include <net/vnet.h> -/* Should match 'struct ipstat' above. */ -struct ipstat_p { - counter_u64_t ips_total; - counter_u64_t ips_badsum; - counter_u64_t ips_tooshort; - counter_u64_t ips_toosmall; - counter_u64_t ips_badhlen; - counter_u64_t ips_badlen; - counter_u64_t ips_fragments; - counter_u64_t ips_fragdropped; - counter_u64_t ips_fragtimeout; - counter_u64_t ips_forward; - counter_u64_t ips_fastforward; - counter_u64_t ips_cantforward; - counter_u64_t ips_redirectsent; - counter_u64_t ips_noproto; - counter_u64_t ips_delivered; - counter_u64_t ips_localout; - counter_u64_t ips_odropped; - counter_u64_t ips_reassembled; - counter_u64_t ips_fragmented; - counter_u64_t ips_ofragments; - counter_u64_t ips_cantfrag; - counter_u64_t ips_badoptions; - counter_u64_t ips_noroute; - counter_u64_t ips_badvers; - counter_u64_t ips_rawout; - counter_u64_t ips_toolong; - counter_u64_t ips_notmember; - counter_u64_t ips_nogif; - counter_u64_t ips_badaddr; -}; -VNET_DECLARE(struct ipstat_p, ipstatp); -#define V_ipstatp VNET(ipstatp) - +VNET_PCPUSTAT_DECLARE(struct ipstat, ipstat); /* * In-kernel consumers can use these accessor macros directly to update * stats. */ -#define IPSTAT_ADD(name, val) counter_u64_add(V_ipstatp.name, (val)) +#define IPSTAT_ADD(name, val) \ + VNET_PCPUSTAT_ADD(struct ipstat, ipstat, name, (val)) #define IPSTAT_SUB(name, val) IPSTAT_ADD(name, -(val)) #define IPSTAT_INC(name) IPSTAT_ADD(name, 1) #define IPSTAT_DEC(name) IPSTAT_SUB(name, 1) @@ -181,11 +148,11 @@ VNET_DECLARE(struct ipstat_p, ipstatp); * Kernel module consumers must use this accessor macro. */ void kmod_ipstat_inc(int statnum); -#define KMOD_IPSTAT_INC(name) \ - kmod_ipstat_inc(offsetof(struct ipstat_p, name) / sizeof(counter_u64_t)) +#define KMOD_IPSTAT_INC(name) \ + kmod_ipstat_inc(offsetof(struct ipstat, name) / sizeof(uint64_t)) void kmod_ipstat_dec(int statnum); -#define KMOD_IPSTAT_DEC(name) \ - kmod_ipstat_dec(offsetof(struct ipstat_p, name) / sizeof(counter_u64_t)) +#define KMOD_IPSTAT_DEC(name) \ + kmod_ipstat_dec(offsetof(struct ipstat, name) / sizeof(uint64_t)) /* flags passed to ip_output as last parameter */ #define IP_FORWARDING 0x1 /* most of ip header exists */ diff --git a/sys/netinet/pim_var.h b/sys/netinet/pim_var.h index 41657b6..1fdfb10 100644 --- a/sys/netinet/pim_var.h +++ b/sys/netinet/pim_var.h @@ -46,21 +46,22 @@ * PIM statistics kept in the kernel */ struct pimstat { - u_quad_t pims_rcv_total_msgs; /* total PIM messages received */ - u_quad_t pims_rcv_total_bytes; /* total PIM bytes received */ - u_quad_t pims_rcv_tooshort; /* rcvd with too few bytes */ - u_quad_t pims_rcv_badsum; /* rcvd with bad checksum */ - u_quad_t pims_rcv_badversion; /* rcvd bad PIM version */ - u_quad_t pims_rcv_registers_msgs; /* rcvd regs. msgs (data only) */ - u_quad_t pims_rcv_registers_bytes; /* rcvd regs. bytes (data only) */ - u_quad_t pims_rcv_registers_wrongiif; /* rcvd regs. on wrong iif */ - u_quad_t pims_rcv_badregisters; /* rcvd invalid registers */ - u_quad_t pims_snd_registers_msgs; /* sent regs. msgs (data only) */ - u_quad_t pims_snd_registers_bytes; /* sent regs. bytes (data only) */ + uint64_t pims_rcv_total_msgs; /* total PIM messages received */ + uint64_t pims_rcv_total_bytes; /* total PIM bytes received */ + uint64_t pims_rcv_tooshort; /* rcvd with too few bytes */ + uint64_t pims_rcv_badsum; /* rcvd with bad checksum */ + uint64_t pims_rcv_badversion; /* rcvd bad PIM version */ + uint64_t pims_rcv_registers_msgs; /* rcvd regs. msgs (data only) */ + uint64_t pims_rcv_registers_bytes; /* rcvd regs. bytes (data only) */ + uint64_t pims_rcv_registers_wrongiif; /* rcvd regs. on wrong iif */ + uint64_t pims_rcv_badregisters; /* rcvd invalid registers */ + uint64_t pims_snd_registers_msgs; /* sent regs. msgs (data only) */ + uint64_t pims_snd_registers_bytes; /* sent regs. bytes (data only) */ }; #ifdef _KERNEL -#define PIMSTAT_ADD(name, val) V_pimstat.name += (val) +#define PIMSTAT_ADD(name, val) \ + VNET_PCPUSTAT_ADD(struct pimstat, pimstat, name, (val)) #define PIMSTAT_INC(name) PIMSTAT_ADD(name, 1) #endif diff --git a/sys/netinet/sctp_constants.h b/sys/netinet/sctp_constants.h index dd6b6b2..58ca808 100644 --- a/sys/netinet/sctp_constants.h +++ b/sys/netinet/sctp_constants.h @@ -521,9 +521,6 @@ __FBSDID("$FreeBSD$"); /* How long a cookie lives in milli-seconds */ #define SCTP_DEFAULT_COOKIE_LIFE 60000 -/* resource limit of streams */ -#define MAX_SCTP_STREAMS 2048 - /* Maximum the mapping array will grow to (TSN mapping array) */ #define SCTP_MAPPING_ARRAY 512 @@ -658,6 +655,7 @@ __FBSDID("$FreeBSD$"); /* How many streams I request initally by default */ #define SCTP_OSTREAM_INITIAL 10 +#define SCTP_ISTREAM_INITIAL 2048 /* * How many smallest_mtu's need to increase before a window update sack is diff --git a/sys/netinet/sctp_input.c b/sys/netinet/sctp_input.c index c19464f..f65c262 100644 --- a/sys/netinet/sctp_input.c +++ b/sys/netinet/sctp_input.c @@ -389,9 +389,10 @@ sctp_process_init(struct sctp_init_chunk *cp, struct sctp_tcb *stcb) } SCTP_FREE(asoc->strmin, SCTP_M_STRMI); } - asoc->streamincnt = ntohs(init->num_outbound_streams); - if (asoc->streamincnt > MAX_SCTP_STREAMS) { - asoc->streamincnt = MAX_SCTP_STREAMS; + if (asoc->max_inbound_streams > ntohs(init->num_outbound_streams)) { + asoc->streamincnt = ntohs(init->num_outbound_streams); + } else { + asoc->streamincnt = asoc->max_inbound_streams; } SCTP_MALLOC(asoc->strmin, struct sctp_stream_in *, asoc->streamincnt * sizeof(struct sctp_stream_in), SCTP_M_STRMI); @@ -403,11 +404,6 @@ sctp_process_init(struct sctp_init_chunk *cp, struct sctp_tcb *stcb) for (i = 0; i < asoc->streamincnt; i++) { asoc->strmin[i].stream_no = i; asoc->strmin[i].last_sequence_delivered = 0xffff; - /* - * U-stream ranges will be set when the cookie is unpacked. - * Or for the INIT sender they are un set (if pr-sctp not - * supported) when the INIT-ACK arrives. - */ TAILQ_INIT(&asoc->strmin[i].inqueue); asoc->strmin[i].delivery_started = 0; } @@ -5709,7 +5705,7 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt #ifdef INET case AF_INET: if (ipsec4_in_reject(m, &inp->ip_inp.inp)) { - MODULE_GLOBAL(ipsec4stat).in_polvio++; + IPSECSTAT_INC(in_polvio); SCTP_STAT_INCR(sctps_hdrops); goto out; } @@ -5718,7 +5714,7 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt #ifdef INET6 case AF_INET6: if (ipsec6_in_reject(m, &inp->ip_inp.inp)) { - MODULE_GLOBAL(ipsec6stat).in_polvio++; + IPSEC6STAT_INC(in_polvio); SCTP_STAT_INCR(sctps_hdrops); goto out; } diff --git a/sys/netinet/sctp_pcb.c b/sys/netinet/sctp_pcb.c index 00f685f..95d8771 100644 --- a/sys/netinet/sctp_pcb.c +++ b/sys/netinet/sctp_pcb.c @@ -2503,9 +2503,6 @@ sctp_inpcb_alloc(struct socket *so, uint32_t vrf_id) m->initial_rto = SCTP_BASE_SYSCTL(sctp_rto_initial_default); m->initial_init_rto_max = SCTP_BASE_SYSCTL(sctp_init_rto_max_default); m->sctp_sack_freq = SCTP_BASE_SYSCTL(sctp_sack_freq_default); - - m->max_open_streams_intome = MAX_SCTP_STREAMS; - m->max_init_times = SCTP_BASE_SYSCTL(sctp_init_rtx_max_default); m->max_send_times = SCTP_BASE_SYSCTL(sctp_assoc_rtx_max_default); m->def_net_failure = SCTP_BASE_SYSCTL(sctp_path_rtx_max_default); @@ -2517,6 +2514,7 @@ sctp_inpcb_alloc(struct socket *so, uint32_t vrf_id) m->sctp_default_cc_module = SCTP_BASE_SYSCTL(sctp_default_cc_module); m->sctp_default_ss_module = SCTP_BASE_SYSCTL(sctp_default_ss_module); + m->max_open_streams_intome = SCTP_BASE_SYSCTL(sctp_nr_incoming_streams_default); /* number of streams to pre-open on a association */ m->pre_open_stream_count = SCTP_BASE_SYSCTL(sctp_nr_outgoing_streams_default); diff --git a/sys/netinet/sctp_sysctl.c b/sys/netinet/sctp_sysctl.c index debb8cc..53e9b04 100644 --- a/sys/netinet/sctp_sysctl.c +++ b/sys/netinet/sctp_sysctl.c @@ -81,6 +81,7 @@ sctp_init_sysctls() SCTP_BASE_SYSCTL(sctp_path_rtx_max_default) = SCTPCTL_PATH_RTX_MAX_DEFAULT; SCTP_BASE_SYSCTL(sctp_path_pf_threshold) = SCTPCTL_PATH_PF_THRESHOLD_DEFAULT; SCTP_BASE_SYSCTL(sctp_add_more_threshold) = SCTPCTL_ADD_MORE_ON_OUTPUT_DEFAULT; + SCTP_BASE_SYSCTL(sctp_nr_incoming_streams_default) = SCTPCTL_INCOMING_STREAMS_DEFAULT; SCTP_BASE_SYSCTL(sctp_nr_outgoing_streams_default) = SCTPCTL_OUTGOING_STREAMS_DEFAULT; SCTP_BASE_SYSCTL(sctp_cmt_on_off) = SCTPCTL_CMT_ON_OFF_DEFAULT; /* EY */ @@ -623,6 +624,7 @@ sysctl_sctp_check(SYSCTL_HANDLER_ARGS) RANGECHK(SCTP_BASE_SYSCTL(sctp_path_rtx_max_default), SCTPCTL_PATH_RTX_MAX_MIN, SCTPCTL_PATH_RTX_MAX_MAX); RANGECHK(SCTP_BASE_SYSCTL(sctp_path_pf_threshold), SCTPCTL_PATH_PF_THRESHOLD_MIN, SCTPCTL_PATH_PF_THRESHOLD_MAX); RANGECHK(SCTP_BASE_SYSCTL(sctp_add_more_threshold), SCTPCTL_ADD_MORE_ON_OUTPUT_MIN, SCTPCTL_ADD_MORE_ON_OUTPUT_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_nr_incoming_streams_default), SCTPCTL_INCOMING_STREAMS_MIN, SCTPCTL_INCOMING_STREAMS_MAX); RANGECHK(SCTP_BASE_SYSCTL(sctp_nr_outgoing_streams_default), SCTPCTL_OUTGOING_STREAMS_MIN, SCTPCTL_OUTGOING_STREAMS_MAX); RANGECHK(SCTP_BASE_SYSCTL(sctp_cmt_on_off), SCTPCTL_CMT_ON_OFF_MIN, SCTPCTL_CMT_ON_OFF_MAX); /* EY */ @@ -965,6 +967,10 @@ SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, add_more_on_output, CTLTYPE_UINT | CT &SCTP_BASE_SYSCTL(sctp_add_more_threshold), 0, sysctl_sctp_check, "IU", SCTPCTL_ADD_MORE_ON_OUTPUT_DESC); +SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, incoming_streams, CTLTYPE_UINT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_nr_incoming_streams_default), 0, sysctl_sctp_check, "IU", + SCTPCTL_INCOMING_STREAMS_DESC); + SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, outgoing_streams, CTLTYPE_UINT | CTLFLAG_RW, &SCTP_BASE_SYSCTL(sctp_nr_outgoing_streams_default), 0, sysctl_sctp_check, "IU", SCTPCTL_OUTGOING_STREAMS_DESC); diff --git a/sys/netinet/sctp_sysctl.h b/sys/netinet/sctp_sysctl.h index 4ec3715..8090373 100644 --- a/sys/netinet/sctp_sysctl.h +++ b/sys/netinet/sctp_sysctl.h @@ -72,6 +72,7 @@ struct sctp_sysctl { uint32_t sctp_path_rtx_max_default; uint32_t sctp_path_pf_threshold; uint32_t sctp_add_more_threshold; + uint32_t sctp_nr_incoming_streams_default; uint32_t sctp_nr_outgoing_streams_default; uint32_t sctp_cmt_on_off; uint32_t sctp_cmt_use_dac; @@ -322,6 +323,12 @@ struct sctp_sysctl { #define SCTPCTL_ADD_MORE_ON_OUTPUT_MAX 0xFFFFFFFF #define SCTPCTL_ADD_MORE_ON_OUTPUT_DEFAULT SCTP_DEFAULT_ADD_MORE +/* incoming_streams: Default number of incoming streams */ +#define SCTPCTL_INCOMING_STREAMS_DESC "Default number of incoming streams" +#define SCTPCTL_INCOMING_STREAMS_MIN 1 +#define SCTPCTL_INCOMING_STREAMS_MAX 65535 +#define SCTPCTL_INCOMING_STREAMS_DEFAULT SCTP_ISTREAM_INITIAL + /* outgoing_streams: Default number of outgoing streams */ #define SCTPCTL_OUTGOING_STREAMS_DESC "Default number of outgoing streams" #define SCTPCTL_OUTGOING_STREAMS_MIN 1 diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c index 89f7eb4..7999263 100644 --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -240,67 +240,16 @@ static void inline hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to); /* - * TCP statistics are stored in struct tcpstat_p, which is - * an "array" of counter(9)s. Although it isn't a real - * array, we treat it as array to reduce code bloat. + * TCP statistics are stored in an "array" of counter(9)s. */ -VNET_DEFINE(struct tcpstat_p, tcpstatp); - -static void -vnet_tcpstatp_init(const void *unused) -{ - counter_u64_t *c; - int i; - - for (i = 0, c = (counter_u64_t *)&V_tcpstatp; - i < sizeof(V_tcpstatp) / sizeof(counter_u64_t); - i++, c++) { - *c = counter_u64_alloc(M_WAITOK); - counter_u64_zero(*c); - } -} -VNET_SYSINIT(vnet_tcpstatp_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, - vnet_tcpstatp_init, NULL); +VNET_PCPUSTAT_DEFINE(struct tcpstat, tcpstat); +VNET_PCPUSTAT_SYSINIT(tcpstat); +SYSCTL_VNET_PCPUSTAT(_net_inet_tcp, TCPCTL_STATS, stats, struct tcpstat, + tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); #ifdef VIMAGE -static void -vnet_tcpstatp_uninit(const void *unused) -{ - counter_u64_t *c; - int i; - - for (i = 0, c = (counter_u64_t *)&V_tcpstatp; - i < sizeof(V_tcpstatp) / sizeof(counter_u64_t); - i++, c++) - counter_u64_free(*c); -} -VNET_SYSUNINIT(vnet_tcpstatp_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, - vnet_tcpstatp_uninit, NULL); +VNET_PCPUSTAT_SYSUNINIT(tcpstat); #endif /* VIMAGE */ - -static int -tcpstat_sysctl(SYSCTL_HANDLER_ARGS) -{ - struct tcpstat tcpstat; - counter_u64_t *c; - uint64_t *v; - int i; - - for (i = 0, c = (counter_u64_t *)&V_tcpstatp, v = (uint64_t *)&tcpstat; - i < sizeof(V_tcpstatp) / sizeof(counter_u64_t); - i++, c++, v++) { - *v = counter_u64_fetch(*c); - if (req->newptr) - counter_u64_zero(*c); - } - - return (SYSCTL_OUT(req, &tcpstat, sizeof(tcpstat))); -} - -SYSCTL_VNET_PROC(_net_inet_tcp, TCPCTL_STATS, stats, CTLTYPE_OPAQUE | - CTLFLAG_RW, NULL, 0, tcpstat_sysctl, "I", - "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); - /* * Kernel module interface for updating tcpstat. The argument is an index * into tcpstat treated as an array. @@ -309,7 +258,7 @@ void kmod_tcpstat_inc(int statnum) { - counter_u64_add(*((counter_u64_t *)&V_tcpstatp + statnum), 1); + counter_u64_add(VNET(tcpstat)[statnum], 1); } /* @@ -1497,6 +1446,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, int thflags, acked, ourfinisacked, needoutput = 0; int rstreason, todrop, win; u_long tiwin; + char *s; + struct in_conninfo *inc; struct tcpopt to; #ifdef TCPDEBUG @@ -1509,6 +1460,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, short ostate = 0; #endif thflags = th->th_flags; + inc = &tp->t_inpcb->inp_inc; tp->sackhint.last_sack_ack = 0; /* @@ -1597,6 +1549,24 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks())) to.to_tsecr = 0; } + /* + * If timestamps were negotiated during SYN/ACK they should + * appear on every segment during this session and vice versa. + */ + if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) { + if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { + log(LOG_DEBUG, "%s; %s: Timestamp missing, " + "no action\n", s, __func__); + free(s, M_TCPLOG); + } + } + if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) { + if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { + log(LOG_DEBUG, "%s; %s: Timestamp not expected, " + "no action\n", s, __func__); + free(s, M_TCPLOG); + } + } /* * Process options only when we get SYN/ACK back. The SYN case @@ -2264,15 +2234,14 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, */ if ((so->so_state & SS_NOFDREF) && tp->t_state > TCPS_CLOSE_WAIT && tlen) { - char *s; - KASSERT(ti_locked == TI_WLOCKED, ("%s: SS_NOFDEREF && " "CLOSE_WAIT && tlen ti_locked %d", __func__, ti_locked)); INP_INFO_WLOCK_ASSERT(&V_tcbinfo); - if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, NULL))) { - log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data after socket " - "was closed, sending RST and removing tcpcb\n", + if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { + log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data " + "after socket was closed, " + "sending RST and removing tcpcb\n", s, __func__, tcpstates[tp->t_state], tlen); free(s, M_TCPLOG); } diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c index 441c269..cd7b424 100644 --- a/sys/netinet/tcp_syncache.c +++ b/sys/netinet/tcp_syncache.c @@ -1,12 +1,12 @@ /*- * Copyright (c) 2001 McAfee, Inc. - * Copyright (c) 2006 Andre Oppermann, Internet Business Solutions AG + * Copyright (c) 2006,2013 Andre Oppermann, Internet Business Solutions AG * All rights reserved. * * This software was developed for the FreeBSD Project by Jonathan Lemon * and McAfee Research, the Security Research Division of McAfee, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the - * DARPA CHATS research program. + * DARPA CHATS research program. [2001 McAfee, Inc.] * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -47,7 +47,6 @@ __FBSDID("$FreeBSD$"); #include <sys/mutex.h> #include <sys/malloc.h> #include <sys/mbuf.h> -#include <sys/md5.h> #include <sys/proc.h> /* for proc0 declaration */ #include <sys/random.h> #include <sys/socket.h> @@ -55,6 +54,9 @@ __FBSDID("$FreeBSD$"); #include <sys/syslog.h> #include <sys/ucred.h> +#include <sys/md5.h> +#include <crypto/siphash/siphash.h> + #include <vm/uma.h> #include <net/if.h> @@ -127,12 +129,20 @@ static int syncache_sysctl_count(SYSCTL_HANDLER_ARGS); static void syncache_timeout(struct syncache *sc, struct syncache_head *sch, int docallout); static void syncache_timer(void *); -static void syncookie_generate(struct syncache_head *, struct syncache *, - u_int32_t *); + +static uint32_t syncookie_mac(struct in_conninfo *, tcp_seq, uint8_t, + uint8_t *, uintptr_t); +static tcp_seq syncookie_generate(struct syncache_head *, struct syncache *); static struct syncache *syncookie_lookup(struct in_conninfo *, struct syncache_head *, - struct syncache *, struct tcpopt *, struct tcphdr *, + struct syncache *, struct tcphdr *, struct tcpopt *, struct socket *); +static void syncookie_reseed(void *); +#ifdef INVARIANTS +static int syncookie_cmp(struct in_conninfo *inc, struct syncache_head *sch, + struct syncache *sc, struct tcphdr *th, struct tcpopt *to, + struct socket *lso); +#endif /* * Transmit the SYN,ACK fewer times than TCP_MAXRXTSHIFT specifies. @@ -252,17 +262,19 @@ syncache_init(void) V_tcp_syncache.hashbase = malloc(V_tcp_syncache.hashsize * sizeof(struct syncache_head), M_SYNCACHE, M_WAITOK | M_ZERO); - /* Initialize the hash buckets. */ - for (i = 0; i < V_tcp_syncache.hashsize; i++) { #ifdef VIMAGE - V_tcp_syncache.hashbase[i].sch_vnet = curvnet; + V_tcp_syncache.vnet = curvnet; #endif + + /* Initialize the hash buckets. */ + for (i = 0; i < V_tcp_syncache.hashsize; i++) { TAILQ_INIT(&V_tcp_syncache.hashbase[i].sch_bucket); mtx_init(&V_tcp_syncache.hashbase[i].sch_mtx, "tcp_sc_head", NULL, MTX_DEF); callout_init_mtx(&V_tcp_syncache.hashbase[i].sch_timer, &V_tcp_syncache.hashbase[i].sch_mtx, 0); V_tcp_syncache.hashbase[i].sch_length = 0; + V_tcp_syncache.hashbase[i].sch_sc = &V_tcp_syncache; } /* Create the syncache entry zone. */ @@ -270,6 +282,13 @@ syncache_init(void) NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); V_tcp_syncache.cache_limit = uma_zone_set_max(V_tcp_syncache.zone, V_tcp_syncache.cache_limit); + + /* Start the SYN cookie reseeder callout. */ + callout_init(&V_tcp_syncache.secret.reseed, 1); + arc4rand(V_tcp_syncache.secret.key[0], SYNCOOKIE_SECRET_SIZE, 0); + arc4rand(V_tcp_syncache.secret.key[1], SYNCOOKIE_SECRET_SIZE, 0); + callout_reset(&V_tcp_syncache.secret.reseed, SYNCOOKIE_LIFETIME * hz, + syncookie_reseed, &V_tcp_syncache); } #ifdef VIMAGE @@ -303,6 +322,8 @@ syncache_destroy(void) /* Free the allocated global resources. */ uma_zdestroy(V_tcp_syncache.zone); free(V_tcp_syncache.hashbase, M_SYNCACHE); + + callout_drain(&V_tcp_syncache.secret.reseed); } #endif @@ -414,7 +435,7 @@ syncache_timer(void *xsch) int tick = ticks; char *s; - CURVNET_SET(sch->sch_vnet); + CURVNET_SET(sch->sch_sc->vnet); /* NB: syncache_head has already been locked by the callout. */ SCH_LOCK_ASSERT(sch); @@ -927,6 +948,16 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, sc = syncache_lookup(inc, &sch); /* returns locked sch */ SCH_LOCK_ASSERT(sch); + +#ifdef INVARIANTS + /* + * Test code for syncookies comparing the syncache stored + * values with the reconstructed values from the cookie. + */ + if (sc != NULL) + syncookie_cmp(inc, sch, sc, th, to, *lsop); +#endif + if (sc == NULL) { /* * There is no syncache entry, so see if this ACK is @@ -946,7 +977,7 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, goto failed; } bzero(&scs, sizeof(scs)); - sc = syncookie_lookup(inc, sch, &scs, to, th, *lsop); + sc = syncookie_lookup(inc, sch, &scs, th, to, *lsop); SCH_UNLOCK(sch); if (sc == NULL) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) @@ -992,12 +1023,32 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, goto failed; } + /* + * If timestamps were not negotiated during SYN/ACK they + * must not appear on any segment during this session. + */ if (!(sc->sc_flags & SCF_TIMESTAMP) && (to->to_flags & TOF_TS)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Timestamp not expected, " "segment rejected\n", s, __func__); goto failed; } + + /* + * If timestamps were negotiated during SYN/ACK they should + * appear on every segment during this session. + * XXXAO: This is only informal as there have been unverified + * reports of non-compliants stacks. + */ + if ((sc->sc_flags & SCF_TIMESTAMP) && !(to->to_flags & TOF_TS)) { + if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { + log(LOG_DEBUG, "%s; %s: Timestamp missing, " + "no action\n", s, __func__); + free(s, M_TCPLOG); + s = NULL; + } + } + /* * If timestamps were negotiated the reflected timestamp * must be equal to what we actually sent in the SYN|ACK. @@ -1053,7 +1104,6 @@ syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, struct syncache *sc = NULL; struct syncache_head *sch; struct mbuf *ipopts = NULL; - u_int32_t flowtmp; u_int ltflags; int win, sb_hiwat, ip_ttl, ip_tos; char *s; @@ -1294,19 +1344,17 @@ syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, if ((th->th_flags & (TH_ECE|TH_CWR)) && V_tcp_do_ecn) sc->sc_flags |= SCF_ECN; - if (V_tcp_syncookies) { - syncookie_generate(sch, sc, &flowtmp); -#ifdef INET6 - if (autoflowlabel) - sc->sc_flowlabel = flowtmp; -#endif - } else { + if (V_tcp_syncookies) + sc->sc_iss = syncookie_generate(sch, sc); #ifdef INET6 - if (autoflowlabel) - sc->sc_flowlabel = - (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK); -#endif + if (autoflowlabel) { + if (V_tcp_syncookies) + sc->sc_flowlabel = sc->sc_iss; + else + sc->sc_flowlabel = ip6_randomflowlabel(); + sc->sc_flowlabel = htonl(sc->sc_flowlabel) & IPV6_FLOWLABEL_MASK; } +#endif SCH_UNLOCK(sch); /* @@ -1529,265 +1577,385 @@ syncache_respond(struct syncache *sc) } /* - * The purpose of SYN cookies is to avoid keeping track of all SYN's we - * receive and to be able to handle SYN floods from bogus source addresses - * (where we will never receive any reply). SYN floods try to exhaust all - * our memory and available slots in the SYN cache table to cause a denial - * of service to legitimate users of the local host. + * The purpose of syncookies is to handle spoofed SYN flooding DoS attacks + * that exceed the capacity of the syncache by avoiding the storage of any + * of the SYNs we receive. Syncookies defend against blind SYN flooding + * attacks where the attacker does not have access to our responses. * - * The idea of SYN cookies is to encode and include all necessary information - * about the connection setup state within the SYN-ACK we send back and thus - * to get along without keeping any local state until the ACK to the SYN-ACK - * arrives (if ever). Everything we need to know should be available from - * the information we encoded in the SYN-ACK. + * Syncookies encode and include all necessary information about the + * connection setup within the SYN|ACK that we send back. That way we + * can avoid keeping any local state until the ACK to our SYN|ACK returns + * (if ever). Normally the syncache and syncookies are running in parallel + * with the latter taking over when the former is exhausted. When matching + * syncache entry is found the syncookie is ignored. * - * More information about the theory behind SYN cookies and its first - * discussion and specification can be found at: - * http://cr.yp.to/syncookies.html (overview) - * http://cr.yp.to/syncookies/archive (gory details) + * The only reliable information persisting the 3WHS is our inital sequence + * number ISS of 32 bits. Syncookies embed a cryptographically sufficient + * strong hash (MAC) value and a few bits of TCP SYN options in the ISS + * of our SYN|ACK. The MAC can be recomputed when the ACK to our SYN|ACK + * returns and signifies a legitimate connection if it matches the ACK. * - * This implementation extends the orginal idea and first implementation - * of FreeBSD by using not only the initial sequence number field to store - * information but also the timestamp field if present. This way we can - * keep track of the entire state we need to know to recreate the session in - * its original form. Almost all TCP speakers implement RFC1323 timestamps - * these days. For those that do not we still have to live with the known - * shortcomings of the ISN only SYN cookies. + * The available space of 32 bits to store the hash and to encode the SYN + * option information is very tight and we should have at least 24 bits for + * the MAC to keep the number of guesses by blind spoofing reasonably high. * - * Cookie layers: + * SYN option information we have to encode to fully restore a connection: + * MSS: is imporant to chose an optimal segment size to avoid IP level + * fragmentation along the path. The common MSS values can be encoded + * in a 3-bit table. Uncommon values are captured by the next lower value + * in the table leading to a slight increase in packetization overhead. + * WSCALE: is necessary to allow large windows to be used for high delay- + * bandwidth product links. Not scaling the window when it was initially + * negotiated is bad for performance as lack of scaling further decreases + * the apparent available send window. We only need to encode the WSCALE + * we received from the remote end. Our end can be recalculated at any + * time. The common WSCALE values can be encoded in a 3-bit table. + * Uncommon values are captured by the next lower value in the table + * making us under-estimate the available window size halving our + * theoretically possible maximum throughput for that connection. + * SACK: Greatly assists in packet loss recovery and requires 1 bit. + * TIMESTAMP and SIGNATURE is not encoded because they are permanent options + * that are included in all segments on a connection. We enable them when + * the ACK has them. * - * Initial sequence number we send: - * 31|................................|0 - * DDDDDDDDDDDDDDDDDDDDDDDDDMMMRRRP - * D = MD5 Digest (first dword) - * M = MSS index - * R = Rotation of secret - * P = Odd or Even secret + * Security of syncookies and attack vectors: * - * The MD5 Digest is computed with over following parameters: - * a) randomly rotated secret - * b) struct in_conninfo containing the remote/local ip/port (IPv4&IPv6) - * c) the received initial sequence number from remote host - * d) the rotation offset and odd/even bit + * The MAC is computed over (faddr||laddr||fport||lport||irs||flags||secmod) + * together with the gloabl secret to make it unique per connection attempt. + * Thus any change of any of those parameters results in a different MAC output + * in an unpredictable way unless a collision is encountered. 24 bits of the + * MAC are embedded into the ISS. * - * Timestamp we send: - * 31|................................|0 - * DDDDDDDDDDDDDDDDDDDDDDSSSSRRRRA5 - * D = MD5 Digest (third dword) (only as filler) - * S = Requested send window scale - * R = Requested receive window scale - * A = SACK allowed - * 5 = TCP-MD5 enabled (not implemented yet) - * XORed with MD5 Digest (forth dword) + * To prevent replay attacks two rotating global secrets are updated with a + * new random value every 15 seconds. The life-time of a syncookie is thus + * 15-30 seconds. * - * The timestamp isn't cryptographically secure and doesn't need to be. - * The double use of the MD5 digest dwords ties it to a specific remote/ - * local host/port, remote initial sequence number and our local time - * limited secret. A received timestamp is reverted (XORed) and then - * the contained MD5 dword is compared to the computed one to ensure the - * timestamp belongs to the SYN-ACK we sent. The other parameters may - * have been tampered with but this isn't different from supplying bogus - * values in the SYN in the first place. + * Vector 1: Attacking the secret. This requires finding a weakness in the + * MAC itself or the way it is used here. The attacker can do a chosen plain + * text attack by varying and testing the all parameters under his control. + * The strength depends on the size and randomness of the secret, and the + * cryptographic security of the MAC function. Due to the constant updating + * of the secret the attacker has at most 29.999 seconds to find the secret + * and launch spoofed connections. After that he has to start all over again. * - * Some problems with SYN cookies remain however: - * Consider the problem of a recreated (and retransmitted) cookie. If the - * original SYN was accepted, the connection is established. The second - * SYN is inflight, and if it arrives with an ISN that falls within the - * receive window, the connection is killed. + * Vector 2: Collision attack on the MAC of a single ACK. With a 24 bit MAC + * size an average of 4,823 attempts are required for a 50% chance of success + * to spoof a single syncookie (birthday collision paradox). However the + * attacker is blind and doesn't know if one of his attempts succeeded unless + * he has a side channel to interfere success from. A single connection setup + * success average of 90% requires 8,790 packets, 99.99% requires 17,578 packets. + * This many attempts are required for each one blind spoofed connection. For + * every additional spoofed connection he has to launch another N attempts. + * Thus for a sustained rate 100 spoofed connections per second approximately + * 1,800,000 packets per second would have to be sent. * - * Notes: - * A heuristic to determine when to accept syn cookies is not necessary. - * An ACK flood would cause the syncookie verification to be attempted, - * but a SYN flood causes syncookies to be generated. Both are of equal - * cost, so there's no point in trying to optimize the ACK flood case. - * Also, if you don't process certain ACKs for some reason, then all someone - * would have to do is launch a SYN and ACK flood at the same time, which - * would stop cookie verification and defeat the entire purpose of syncookies. + * NB: The MAC function should be fast so that it doesn't become a CPU + * exhaustion attack vector itself. + * + * References: + * RFC4987 TCP SYN Flooding Attacks and Common Mitigations + * SYN cookies were first proposed by cryptographer Dan J. Bernstein in 1996 + * http://cr.yp.to/syncookies.html (overview) + * http://cr.yp.to/syncookies/archive (details) + * + * + * Schematic construction of a syncookie enabled Initial Sequence Number: + * 0 1 2 3 + * 12345678901234567890123456789012 + * |xxxxxxxxxxxxxxxxxxxxxxxxWWWMMMSP| + * + * x 24 MAC (truncated) + * W 3 Send Window Scale index + * M 3 MSS index + * S 1 SACK permitted + * P 1 Odd/even secret */ -static int tcp_sc_msstab[] = { 0, 256, 468, 536, 996, 1452, 1460, 8960 }; -static void -syncookie_generate(struct syncache_head *sch, struct syncache *sc, - u_int32_t *flowlabel) +/* + * Distribution and probability of certain MSS values. Those in between are + * rounded down to the next lower one. + * [An Analysis of TCP Maximum Segment Sizes, S. Alcock and R. Nelson, 2011] + * .2% .3% 5% 7% 7% 20% 15% 45% + */ +static int tcp_sc_msstab[] = { 216, 536, 1200, 1360, 1400, 1440, 1452, 1460 }; + +/* + * Distribution and probability of certain WSCALE values. We have to map the + * (send) window scale (shift) option with a range of 0-14 from 4 bits into 3 + * bits based on prevalence of certain values. Where we don't have an exact + * match for are rounded down to the next lower one letting us under-estimate + * the true available window. At the moment this would happen only for the + * very uncommon values 3, 5 and those above 8 (more than 16MB socket buffer + * and window size). The absence of the WSCALE option (no scaling in either + * direction) is encoded with index zero. + * [WSCALE values histograms, Allman, 2012] + * X 10 10 35 5 6 14 10% by host + * X 11 4 5 5 18 49 3% by connections + */ +static int tcp_sc_wstab[] = { 0, 0, 1, 2, 4, 6, 7, 8 }; + +/* + * Compute the MAC for the SYN cookie. SIPHASH-2-4 is chosen for its speed + * and good cryptographic properties. + */ +static uint32_t +syncookie_mac(struct in_conninfo *inc, tcp_seq irs, uint8_t flags, + uint8_t *secbits, uintptr_t secmod) { - MD5_CTX ctx; - u_int32_t md5_buffer[MD5_DIGEST_LENGTH / sizeof(u_int32_t)]; - u_int32_t data; - u_int32_t *secbits; - u_int off, pmss, mss; - int i; + SIPHASH_CTX ctx; + uint32_t siphash[2]; + + SipHash24_Init(&ctx); + SipHash_SetKey(&ctx, secbits); + switch (inc->inc_flags & INC_ISIPV6) { +#ifdef INET + case 0: + SipHash_Update(&ctx, &inc->inc_faddr, sizeof(inc->inc_faddr)); + SipHash_Update(&ctx, &inc->inc_laddr, sizeof(inc->inc_laddr)); + break; +#endif +#ifdef INET6 + case INC_ISIPV6: + SipHash_Update(&ctx, &inc->inc6_faddr, sizeof(inc->inc6_faddr)); + SipHash_Update(&ctx, &inc->inc6_laddr, sizeof(inc->inc6_laddr)); + break; +#endif + } + SipHash_Update(&ctx, &inc->inc_fport, sizeof(inc->inc_fport)); + SipHash_Update(&ctx, &inc->inc_lport, sizeof(inc->inc_lport)); + SipHash_Update(&ctx, &flags, sizeof(flags)); + SipHash_Update(&ctx, &secmod, sizeof(secmod)); + SipHash_Final((u_int8_t *)&siphash, &ctx); + + return (siphash[0] ^ siphash[1]); +} + +static tcp_seq +syncookie_generate(struct syncache_head *sch, struct syncache *sc) +{ + u_int i, mss, secbit, wscale; + uint32_t iss, hash; + uint8_t *secbits; + union syncookie cookie; SCH_LOCK_ASSERT(sch); - /* Which of the two secrets to use. */ - secbits = sch->sch_oddeven ? - sch->sch_secbits_odd : sch->sch_secbits_even; - - /* Reseed secret if too old. */ - if (sch->sch_reseed < time_uptime) { - sch->sch_oddeven = sch->sch_oddeven ? 0 : 1; /* toggle */ - secbits = sch->sch_oddeven ? - sch->sch_secbits_odd : sch->sch_secbits_even; - for (i = 0; i < SYNCOOKIE_SECRET_SIZE; i++) - secbits[i] = arc4random(); - sch->sch_reseed = time_uptime + SYNCOOKIE_LIFETIME; + cookie.cookie = 0; + + /* Map our computed MSS into the 3-bit index. */ + mss = min(tcp_mssopt(&sc->sc_inc), max(sc->sc_peer_mss, V_tcp_minmss)); + for (i = sizeof(tcp_sc_msstab) / sizeof(*tcp_sc_msstab) - 1; + tcp_sc_msstab[i] > mss && i > 0; + i--) + ; + cookie.flags.mss_idx = i; + + /* + * Map the send window scale into the 3-bit index but only if + * the wscale option was received. + */ + if (sc->sc_flags & SCF_WINSCALE) { + wscale = sc->sc_requested_s_scale; + for (i = sizeof(tcp_sc_wstab) / sizeof(*tcp_sc_wstab) - 1; + tcp_sc_wstab[i] > wscale && i > 0; + i--) + ; + cookie.flags.wscale_idx = i; } - /* Secret rotation offset. */ - off = sc->sc_iss & 0x7; /* iss was randomized before */ - - /* Maximum segment size calculation. */ - pmss = - max( min(sc->sc_peer_mss, tcp_mssopt(&sc->sc_inc)), V_tcp_minmss); - for (mss = sizeof(tcp_sc_msstab) / sizeof(int) - 1; mss > 0; mss--) - if (tcp_sc_msstab[mss] <= pmss) - break; - - /* Fold parameters and MD5 digest into the ISN we will send. */ - data = sch->sch_oddeven;/* odd or even secret, 1 bit */ - data |= off << 1; /* secret offset, derived from iss, 3 bits */ - data |= mss << 4; /* mss, 3 bits */ - - MD5Init(&ctx); - MD5Update(&ctx, ((u_int8_t *)secbits) + off, - SYNCOOKIE_SECRET_SIZE * sizeof(*secbits) - off); - MD5Update(&ctx, secbits, off); - MD5Update(&ctx, &sc->sc_inc, sizeof(sc->sc_inc)); - MD5Update(&ctx, &sc->sc_irs, sizeof(sc->sc_irs)); - MD5Update(&ctx, &data, sizeof(data)); - MD5Final((u_int8_t *)&md5_buffer, &ctx); - - data |= (md5_buffer[0] << 7); - sc->sc_iss = data; + /* Can we do SACK? */ + if (sc->sc_flags & SCF_SACK) + cookie.flags.sack_ok = 1; -#ifdef INET6 - *flowlabel = md5_buffer[1] & IPV6_FLOWLABEL_MASK; -#endif + /* Which of the two secrets to use. */ + secbit = sch->sch_sc->secret.oddeven & 0x1; + cookie.flags.odd_even = secbit; - /* Additional parameters are stored in the timestamp if present. */ + secbits = sch->sch_sc->secret.key[secbit]; + hash = syncookie_mac(&sc->sc_inc, sc->sc_irs, cookie.cookie, secbits, + (uintptr_t)sch); + + /* + * Put the flags into the hash and XOR them to get better ISS number + * variance. This doesn't enhance the cryptographic strength and is + * done to prevent the 8 cookie bits from showing up directly on the + * wire. + */ + iss = hash & ~0xff; + iss |= cookie.cookie ^ (hash >> 24); + + /* Randomize the timestamp. */ if (sc->sc_flags & SCF_TIMESTAMP) { - data = ((sc->sc_flags & SCF_SIGNATURE) ? 1 : 0); /* TCP-MD5, 1 bit */ - data |= ((sc->sc_flags & SCF_SACK) ? 1 : 0) << 1; /* SACK, 1 bit */ - data |= sc->sc_requested_s_scale << 2; /* SWIN scale, 4 bits */ - data |= sc->sc_requested_r_scale << 6; /* RWIN scale, 4 bits */ - data |= md5_buffer[2] << 10; /* more digest bits */ - data ^= md5_buffer[3]; - sc->sc_ts = data; - sc->sc_tsoff = data - tcp_ts_getticks(); /* after XOR */ + sc->sc_ts = arc4random(); + sc->sc_tsoff = sc->sc_ts - tcp_ts_getticks(); } TCPSTAT_INC(tcps_sc_sendcookie); + return (iss); } static struct syncache * syncookie_lookup(struct in_conninfo *inc, struct syncache_head *sch, - struct syncache *sc, struct tcpopt *to, struct tcphdr *th, - struct socket *so) + struct syncache *sc, struct tcphdr *th, struct tcpopt *to, + struct socket *lso) { - MD5_CTX ctx; - u_int32_t md5_buffer[MD5_DIGEST_LENGTH / sizeof(u_int32_t)]; - u_int32_t data = 0; - u_int32_t *secbits; + uint32_t hash; + uint8_t *secbits; tcp_seq ack, seq; - int off, mss, wnd, flags; + int wnd, wscale = 0; + union syncookie cookie; SCH_LOCK_ASSERT(sch); /* - * Pull information out of SYN-ACK/ACK and - * revert sequence number advances. + * Pull information out of SYN-ACK/ACK and revert sequence number + * advances. */ ack = th->th_ack - 1; seq = th->th_seq - 1; - off = (ack >> 1) & 0x7; - mss = (ack >> 4) & 0x7; - flags = ack & 0x7f; - - /* Which of the two secrets to use. */ - secbits = (flags & 0x1) ? sch->sch_secbits_odd : sch->sch_secbits_even; /* - * The secret wasn't updated for the lifetime of a syncookie, - * so this SYN-ACK/ACK is either too old (replay) or totally bogus. + * Unpack the flags containing enough information to restore the + * connection. */ - if (sch->sch_reseed + SYNCOOKIE_LIFETIME < time_uptime) { - return (NULL); - } + cookie.cookie = (ack & 0xff) ^ (ack >> 24); - /* Recompute the digest so we can compare it. */ - MD5Init(&ctx); - MD5Update(&ctx, ((u_int8_t *)secbits) + off, - SYNCOOKIE_SECRET_SIZE * sizeof(*secbits) - off); - MD5Update(&ctx, secbits, off); - MD5Update(&ctx, inc, sizeof(*inc)); - MD5Update(&ctx, &seq, sizeof(seq)); - MD5Update(&ctx, &flags, sizeof(flags)); - MD5Final((u_int8_t *)&md5_buffer, &ctx); - - /* Does the digest part of or ACK'ed ISS match? */ - if ((ack & (~0x7f)) != (md5_buffer[0] << 7)) - return (NULL); + /* Which of the two secrets to use. */ + secbits = sch->sch_sc->secret.key[cookie.flags.odd_even]; - /* Does the digest part of our reflected timestamp match? */ - if (to->to_flags & TOF_TS) { - data = md5_buffer[3] ^ to->to_tsecr; - if ((data & (~0x3ff)) != (md5_buffer[2] << 10)) - return (NULL); - } + hash = syncookie_mac(inc, seq, cookie.cookie, secbits, (uintptr_t)sch); + + /* The recomputed hash matches the ACK if this was a genuine cookie. */ + if ((ack & ~0xff) != (hash & ~0xff)) + return (NULL); /* Fill in the syncache values. */ + sc->sc_flags = 0; bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo)); sc->sc_ipopts = NULL; sc->sc_irs = seq; sc->sc_iss = ack; + switch (inc->inc_flags & INC_ISIPV6) { +#ifdef INET + case 0: + sc->sc_ip_ttl = sotoinpcb(lso)->inp_ip_ttl; + sc->sc_ip_tos = sotoinpcb(lso)->inp_ip_tos; + break; +#endif #ifdef INET6 - if (inc->inc_flags & INC_ISIPV6) { - if (sotoinpcb(so)->inp_flags & IN6P_AUTOFLOWLABEL) - sc->sc_flowlabel = md5_buffer[1] & IPV6_FLOWLABEL_MASK; - } else + case INC_ISIPV6: + if (sotoinpcb(lso)->inp_flags & IN6P_AUTOFLOWLABEL) + sc->sc_flowlabel = sc->sc_iss & IPV6_FLOWLABEL_MASK; + break; #endif - { - sc->sc_ip_ttl = sotoinpcb(so)->inp_ip_ttl; - sc->sc_ip_tos = sotoinpcb(so)->inp_ip_tos; } - /* Additional parameters that were encoded in the timestamp. */ - if (data) { + sc->sc_peer_mss = tcp_sc_msstab[cookie.flags.mss_idx]; + + /* We can simply recompute receive window scale we sent earlier. */ + while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < sb_max) + wscale++; + + /* Only use wscale if it was enabled in the orignal SYN. */ + if (cookie.flags.wscale_idx > 0) { + sc->sc_requested_r_scale = wscale; + sc->sc_requested_s_scale = tcp_sc_wstab[cookie.flags.wscale_idx]; + sc->sc_flags |= SCF_WINSCALE; + } + + wnd = sbspace(&lso->so_rcv); + wnd = imax(wnd, 0); + wnd = imin(wnd, TCP_MAXWIN); + sc->sc_wnd = wnd; + + if (cookie.flags.sack_ok) + sc->sc_flags |= SCF_SACK; + + if (to->to_flags & TOF_TS) { sc->sc_flags |= SCF_TIMESTAMP; sc->sc_tsreflect = to->to_tsval; sc->sc_ts = to->to_tsecr; sc->sc_tsoff = to->to_tsecr - tcp_ts_getticks(); - sc->sc_flags |= (data & 0x1) ? SCF_SIGNATURE : 0; - sc->sc_flags |= ((data >> 1) & 0x1) ? SCF_SACK : 0; - sc->sc_requested_s_scale = min((data >> 2) & 0xf, - TCP_MAX_WINSHIFT); - sc->sc_requested_r_scale = min((data >> 6) & 0xf, - TCP_MAX_WINSHIFT); - if (sc->sc_requested_s_scale || sc->sc_requested_r_scale) - sc->sc_flags |= SCF_WINSCALE; - } else - sc->sc_flags |= SCF_NOOPT; + } - wnd = sbspace(&so->so_rcv); - wnd = imax(wnd, 0); - wnd = imin(wnd, TCP_MAXWIN); - sc->sc_wnd = wnd; + if (to->to_flags & TOF_SIGNATURE) + sc->sc_flags |= SCF_SIGNATURE; sc->sc_rxmits = 0; - sc->sc_peer_mss = tcp_sc_msstab[mss]; TCPSTAT_INC(tcps_sc_recvcookie); return (sc); } +#ifdef INVARIANTS +static int +syncookie_cmp(struct in_conninfo *inc, struct syncache_head *sch, + struct syncache *sc, struct tcphdr *th, struct tcpopt *to, + struct socket *lso) +{ + struct syncache scs, *scx; + char *s; + + bzero(&scs, sizeof(scs)); + scx = syncookie_lookup(inc, sch, &scs, th, to, lso); + + if ((s = tcp_log_addrs(inc, th, NULL, NULL)) == NULL) + return (0); + + if (scx != NULL) { + if (sc->sc_peer_mss != scx->sc_peer_mss) + log(LOG_DEBUG, "%s; %s: mss different %i vs %i\n", + s, __func__, sc->sc_peer_mss, scx->sc_peer_mss); + + if (sc->sc_requested_r_scale != scx->sc_requested_r_scale) + log(LOG_DEBUG, "%s; %s: rwscale different %i vs %i\n", + s, __func__, sc->sc_requested_r_scale, + scx->sc_requested_r_scale); + + if (sc->sc_requested_s_scale != scx->sc_requested_s_scale) + log(LOG_DEBUG, "%s; %s: swscale different %i vs %i\n", + s, __func__, sc->sc_requested_s_scale, + scx->sc_requested_s_scale); + + if ((sc->sc_flags & SCF_SACK) != (scx->sc_flags & SCF_SACK)) + log(LOG_DEBUG, "%s; %s: SACK different\n", s, __func__); + } + + if (s != NULL) + free(s, M_TCPLOG); + return (0); +} +#endif /* INVARIANTS */ + +static void +syncookie_reseed(void *arg) +{ + struct tcp_syncache *sc = arg; + uint8_t *secbits; + int secbit; + + /* + * Reseeding the secret doesn't have to be protected by a lock. + * It only must be ensured that the new random values are visible + * to all CPUs in a SMP environment. The atomic with release + * semantics ensures that. + */ + secbit = (sc->secret.oddeven & 0x1) ? 0 : 1; + secbits = sc->secret.key[secbit]; + arc4rand(secbits, SYNCOOKIE_SECRET_SIZE, 0); + atomic_add_rel_int(&sc->secret.oddeven, 1); + + /* Reschedule ourself. */ + callout_schedule(&sc->secret.reseed, SYNCOOKIE_LIFETIME * hz); +} + /* * Returns the current number of syncache entries. This number * will probably change before you get around to calling * syncache_pcblist. */ - int syncache_pcbcount(void) { diff --git a/sys/netinet/tcp_syncache.h b/sys/netinet/tcp_syncache.h index 1ff8a46..520b484 100644 --- a/sys/netinet/tcp_syncache.h +++ b/sys/netinet/tcp_syncache.h @@ -90,20 +90,23 @@ struct syncache { #define SCF_SACK 0x80 /* send SACK option */ #define SCF_ECN 0x100 /* send ECN setup packet */ -#define SYNCOOKIE_SECRET_SIZE 8 /* dwords */ -#define SYNCOOKIE_LIFETIME 16 /* seconds */ - struct syncache_head { - struct vnet *sch_vnet; struct mtx sch_mtx; TAILQ_HEAD(sch_head, syncache) sch_bucket; struct callout sch_timer; int sch_nextc; u_int sch_length; - u_int sch_oddeven; - u_int32_t sch_secbits_odd[SYNCOOKIE_SECRET_SIZE]; - u_int32_t sch_secbits_even[SYNCOOKIE_SECRET_SIZE]; - u_int sch_reseed; /* time_uptime, seconds */ + struct tcp_syncache *sch_sc; +}; + +#define SYNCOOKIE_SECRET_SIZE 16 +#define SYNCOOKIE_LIFETIME 15 /* seconds */ + +struct syncookie_secret { + volatile u_int oddeven; + uint8_t key[2][SYNCOOKIE_SECRET_SIZE]; + struct callout reseed; + u_int lifetime; }; struct tcp_syncache { @@ -115,6 +118,19 @@ struct tcp_syncache { u_int cache_limit; u_int rexmt_limit; u_int hash_secret; + struct vnet *vnet; + struct syncookie_secret secret; +}; + +/* Internal use for the syncookie functions. */ +union syncookie { + uint8_t cookie; + struct { + uint8_t odd_even:1, + sack_ok:1, + wscale_idx:3, + mss_idx:3; + } flags; }; #endif /* _KERNEL */ diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h index 0445d8c..cfba5d9 100644 --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -514,119 +514,15 @@ struct tcpstat { }; #ifdef _KERNEL - #include <sys/counter.h> -/* Should match 'struct tcpstat' above. */ -struct tcpstat_p { - counter_u64_t tcps_connattempt; - counter_u64_t tcps_accepts; - counter_u64_t tcps_connects; - counter_u64_t tcps_drops; - counter_u64_t tcps_conndrops; - counter_u64_t tcps_minmssdrops; - counter_u64_t tcps_closed; - counter_u64_t tcps_segstimed; - counter_u64_t tcps_rttupdated; - counter_u64_t tcps_delack; - counter_u64_t tcps_timeoutdrop; - counter_u64_t tcps_rexmttimeo; - counter_u64_t tcps_persisttimeo; - counter_u64_t tcps_keeptimeo; - counter_u64_t tcps_keepprobe; - counter_u64_t tcps_keepdrops; - counter_u64_t tcps_sndtotal; - counter_u64_t tcps_sndpack; - counter_u64_t tcps_sndbyte; - counter_u64_t tcps_sndrexmitpack; - counter_u64_t tcps_sndrexmitbyte; - counter_u64_t tcps_sndrexmitbad; - counter_u64_t tcps_sndacks; - counter_u64_t tcps_sndprobe; - counter_u64_t tcps_sndurg; - counter_u64_t tcps_sndwinup; - counter_u64_t tcps_sndctrl; - counter_u64_t tcps_rcvtotal; - counter_u64_t tcps_rcvpack; - counter_u64_t tcps_rcvbyte; - counter_u64_t tcps_rcvbadsum; - counter_u64_t tcps_rcvbadoff; - counter_u64_t tcps_rcvmemdrop; - counter_u64_t tcps_rcvshort; - counter_u64_t tcps_rcvduppack; - counter_u64_t tcps_rcvdupbyte; - counter_u64_t tcps_rcvpartduppack; - counter_u64_t tcps_rcvpartdupbyte; - counter_u64_t tcps_rcvoopack; - counter_u64_t tcps_rcvoobyte; - counter_u64_t tcps_rcvpackafterwin; - counter_u64_t tcps_rcvbyteafterwin; - counter_u64_t tcps_rcvafterclose; - counter_u64_t tcps_rcvwinprobe; - counter_u64_t tcps_rcvdupack; - counter_u64_t tcps_rcvacktoomuch; - counter_u64_t tcps_rcvackpack; - counter_u64_t tcps_rcvackbyte; - counter_u64_t tcps_rcvwinupd; - counter_u64_t tcps_pawsdrop; - counter_u64_t tcps_predack; - counter_u64_t tcps_preddat; - counter_u64_t tcps_pcbcachemiss; - counter_u64_t tcps_cachedrtt; - counter_u64_t tcps_cachedrttvar; - counter_u64_t tcps_cachedssthresh; - counter_u64_t tcps_usedrtt; - counter_u64_t tcps_usedrttvar; - counter_u64_t tcps_usedssthresh; - counter_u64_t tcps_persistdrop; - counter_u64_t tcps_badsyn; - counter_u64_t tcps_mturesent; - counter_u64_t tcps_listendrop; - counter_u64_t tcps_badrst; - counter_u64_t tcps_sc_added; - counter_u64_t tcps_sc_retransmitted; - counter_u64_t tcps_sc_dupsyn; - counter_u64_t tcps_sc_dropped; - counter_u64_t tcps_sc_completed; - counter_u64_t tcps_sc_bucketoverflow; - counter_u64_t tcps_sc_cacheoverflow; - counter_u64_t tcps_sc_reset; - counter_u64_t tcps_sc_stale; - counter_u64_t tcps_sc_aborted; - counter_u64_t tcps_sc_badack; - counter_u64_t tcps_sc_unreach; - counter_u64_t tcps_sc_zonefail; - counter_u64_t tcps_sc_sendcookie; - counter_u64_t tcps_sc_recvcookie; - counter_u64_t tcps_hc_added; - counter_u64_t tcps_hc_bucketoverflow; - counter_u64_t tcps_finwait2_drops; - counter_u64_t tcps_sack_recovery_episode; - counter_u64_t tcps_sack_rexmits; - counter_u64_t tcps_sack_rexmit_bytes; - counter_u64_t tcps_sack_rcv_blocks; - counter_u64_t tcps_sack_send_blocks; - counter_u64_t tcps_sack_sboverflow; - counter_u64_t tcps_ecn_ce; - counter_u64_t tcps_ecn_ect0; - counter_u64_t tcps_ecn_ect1; - counter_u64_t tcps_ecn_shs; - counter_u64_t tcps_ecn_rcwnd; - counter_u64_t tcps_sig_rcvgoodsig; - counter_u64_t tcps_sig_rcvbadsig; - counter_u64_t tcps_sig_err_buildsig; - counter_u64_t tcps_sig_err_sigopt; - counter_u64_t tcps_sig_err_nosigopt; -}; - -VNET_DECLARE(struct tcpstat_p, tcpstatp); /* tcp statistics */ -#define V_tcpstatp VNET(tcpstatp) - +VNET_PCPUSTAT_DECLARE(struct tcpstat, tcpstat); /* tcp statistics */ /* * In-kernel consumers can use these accessor macros directly to update * stats. */ -#define TCPSTAT_ADD(name, val) counter_u64_add(V_tcpstatp.name, (val)) +#define TCPSTAT_ADD(name, val) \ + VNET_PCPUSTAT_ADD(struct tcpstat, tcpstat, name, (val)) #define TCPSTAT_INC(name) TCPSTAT_ADD(name, 1) /* @@ -634,8 +530,7 @@ VNET_DECLARE(struct tcpstat_p, tcpstatp); /* tcp statistics */ */ void kmod_tcpstat_inc(int statnum); #define KMOD_TCPSTAT_INC(name) \ - kmod_tcpstat_inc(offsetof(struct tcpstat_p, name) / \ - sizeof(counter_u64_t)) + kmod_tcpstat_inc(offsetof(struct tcpstat, name) / sizeof(uint64_t)) /* * TCP specific helper hook point identifiers. diff --git a/sys/netinet/udp_usrreq.c b/sys/netinet/udp_usrreq.c index 6bc94c3..982a2db 100644 --- a/sys/netinet/udp_usrreq.c +++ b/sys/netinet/udp_usrreq.c @@ -143,11 +143,14 @@ static VNET_DEFINE(uma_zone_t, udpcb_zone); #define UDBHASHSIZE 128 #endif -VNET_DEFINE(struct udpstat, udpstat); /* from udp_var.h */ -SYSCTL_VNET_STRUCT(_net_inet_udp, UDPCTL_STATS, stats, CTLFLAG_RW, - &VNET_NAME(udpstat), udpstat, - "UDP statistics (struct udpstat, netinet/udp_var.h)"); +VNET_PCPUSTAT_DEFINE(struct udpstat, udpstat); /* from udp_var.h */ +VNET_PCPUSTAT_SYSINIT(udpstat); +SYSCTL_VNET_PCPUSTAT(_net_inet_udp, UDPCTL_STATS, stats, struct udpstat, + udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)"); +#ifdef VIMAGE +VNET_PCPUSTAT_SYSUNINIT(udpstat); +#endif /* VIMAGE */ #ifdef INET static void udp_detach(struct socket *so); static int udp_output(struct inpcb *, struct mbuf *, struct sockaddr *, @@ -207,7 +210,7 @@ void kmod_udpstat_inc(int statnum) { - (*((u_long *)&V_udpstat + statnum))++; + counter_u64_add(VNET(udpstat)[statnum], 1); } int diff --git a/sys/netinet/udp_var.h b/sys/netinet/udp_var.h index e4a6668..0c26b88 100644 --- a/sys/netinet/udp_var.h +++ b/sys/netinet/udp_var.h @@ -75,38 +75,42 @@ struct udpcb { struct udpstat { /* input statistics: */ - u_long udps_ipackets; /* total input packets */ - u_long udps_hdrops; /* packet shorter than header */ - u_long udps_badsum; /* checksum error */ - u_long udps_nosum; /* no checksum */ - u_long udps_badlen; /* data length larger than packet */ - u_long udps_noport; /* no socket on port */ - u_long udps_noportbcast; /* of above, arrived as broadcast */ - u_long udps_fullsock; /* not delivered, input socket full */ - u_long udpps_pcbcachemiss; /* input packets missing pcb cache */ - u_long udpps_pcbhashmiss; /* input packets not for hashed pcb */ + uint64_t udps_ipackets; /* total input packets */ + uint64_t udps_hdrops; /* packet shorter than header */ + uint64_t udps_badsum; /* checksum error */ + uint64_t udps_nosum; /* no checksum */ + uint64_t udps_badlen; /* data length larger than packet */ + uint64_t udps_noport; /* no socket on port */ + uint64_t udps_noportbcast; /* of above, arrived as broadcast */ + uint64_t udps_fullsock; /* not delivered, input socket full */ + uint64_t udpps_pcbcachemiss; /* input packets missing pcb cache */ + uint64_t udpps_pcbhashmiss; /* input packets not for hashed pcb */ /* output statistics: */ - u_long udps_opackets; /* total output packets */ - u_long udps_fastout; /* output packets on fast path */ + uint64_t udps_opackets; /* total output packets */ + uint64_t udps_fastout; /* output packets on fast path */ /* of no socket on port, arrived as multicast */ - u_long udps_noportmcast; - u_long udps_filtermcast; /* blocked by multicast filter */ + uint64_t udps_noportmcast; + uint64_t udps_filtermcast; /* blocked by multicast filter */ }; #ifdef _KERNEL +#include <sys/counter.h> + +VNET_PCPUSTAT_DECLARE(struct udpstat, udpstat); /* * In-kernel consumers can use these accessor macros directly to update * stats. */ -#define UDPSTAT_ADD(name, val) V_udpstat.name += (val) +#define UDPSTAT_ADD(name, val) \ + VNET_PCPUSTAT_ADD(struct udpstat, udpstat, name, (val)) #define UDPSTAT_INC(name) UDPSTAT_ADD(name, 1) /* * Kernel module consumers must use this accessor macro. */ void kmod_udpstat_inc(int statnum); -#define KMOD_UDPSTAT_INC(name) \ - kmod_udpstat_inc(offsetof(struct udpstat, name) / sizeof(u_long)) +#define KMOD_UDPSTAT_INC(name) \ + kmod_udpstat_inc(offsetof(struct udpstat, name) / sizeof(uint64_t)) #endif /* @@ -140,10 +144,8 @@ VNET_DECLARE(struct inpcbinfo, udbinfo); extern u_long udp_sendspace; extern u_long udp_recvspace; VNET_DECLARE(int, udp_cksum); -VNET_DECLARE(struct udpstat, udpstat); VNET_DECLARE(int, udp_blackhole); #define V_udp_cksum VNET(udp_cksum) -#define V_udpstat VNET(udpstat) #define V_udp_blackhole VNET(udp_blackhole) extern int udp_log_in_vain; |