diff options
author | bms <bms@FreeBSD.org> | 2009-03-09 17:53:05 +0000 |
---|---|---|
committer | bms <bms@FreeBSD.org> | 2009-03-09 17:53:05 +0000 |
commit | 71233409ea6a2f4d751847c05e7aad9375278d94 (patch) | |
tree | 3d95180b6661648cd67dba62e6daefc7c8661793 | |
parent | 2173e9258f5b3400a58f42bd91fd01c1efc4441d (diff) | |
download | FreeBSD-src-71233409ea6a2f4d751847c05e7aad9375278d94.zip FreeBSD-src-71233409ea6a2f4d751847c05e7aad9375278d94.tar.gz |
Merge IGMPv3 and Source-Specific Multicast (SSM) to the FreeBSD
IPv4 stack.
Diffs are minimized against p4.
PCS has been used for some protocol verification; more widespread
testing of recorded sources in Group-and-Source queries is needed.
sizeof(struct igmpstat) has changed.
__FreeBSD_version is bumped to 800070.
-rw-r--r-- | UPDATING | 39 | ||||
-rw-r--r-- | share/man/man4/Makefile | 1 | ||||
-rw-r--r-- | share/man/man4/ip.4 | 18 | ||||
-rw-r--r-- | share/man/man4/multicast.4 | 1 | ||||
-rw-r--r-- | sys/netinet/if_ether.c | 3 | ||||
-rw-r--r-- | sys/netinet/igmp.c | 3719 | ||||
-rw-r--r-- | sys/netinet/igmp_var.h | 197 | ||||
-rw-r--r-- | sys/netinet/in.c | 110 | ||||
-rw-r--r-- | sys/netinet/in.h | 1 | ||||
-rw-r--r-- | sys/netinet/in_mcast.c | 2413 | ||||
-rw-r--r-- | sys/netinet/in_proto.c | 1 | ||||
-rw-r--r-- | sys/netinet/in_var.h | 330 | ||||
-rw-r--r-- | sys/netinet/ip_input.c | 14 | ||||
-rw-r--r-- | sys/netinet/ip_var.h | 19 | ||||
-rw-r--r-- | sys/netinet/raw_ip.c | 58 | ||||
-rw-r--r-- | sys/netinet/udp_usrreq.c | 69 | ||||
-rw-r--r-- | sys/netinet/vinet.h | 34 | ||||
-rw-r--r-- | sys/sys/param.h | 2 | ||||
-rw-r--r-- | sys/sys/vimage.h | 14 | ||||
-rw-r--r-- | usr.bin/netstat/inet.c | 121 | ||||
-rw-r--r-- | usr.sbin/ifmcstat/Makefile | 4 | ||||
-rw-r--r-- | usr.sbin/ifmcstat/ifmcstat.8 | 22 | ||||
-rw-r--r-- | usr.sbin/ifmcstat/ifmcstat.c | 433 |
23 files changed, 6225 insertions, 1398 deletions
@@ -23,6 +23,45 @@ NOTE TO PEOPLE WHO THINK THAT FreeBSD 8.x IS SLOW: ln -s aj /etc/malloc.conf.) 20090309: + IGMPv3 and Source-Specific Multicast (SSM) have been merged + to the IPv4 stack. VIMAGE hooks are in but not yet used. + + For kernel developers, the most important changes are that the + ip_output() and ip_input() paths no longer take the IN_MULTI_LOCK(), + and this lock has been downgraded to a non-recursive mutex. + + Transport protocols (UDP, Raw IP) are now responsible for filtering + inbound multicast traffic according to group membership and source + filters. The imo_multicast_filter() KPI exists for this purpose. + Transports which do not use multicast (SCTP, TCP) already reject + multicast by default. Forwarding and receive performance may improve + as a mutex acquisition is no longer needed in the ip_input() + low-level input path. in_addmulti() and in_delmulti() are shimmed + to new KPIs which exist to support SSM in-kernel. + + For application developers, it is recommended that loopback of + multicast datagrams be disabled for best performance, as this + will still cause the lock to be taken for each looped-back + datagram transmission. The net.inet.ip.mcast.loop sysctl may + be tuned to 0 to disable loopback by default; it defaults to 1 + to preserve the existing behaviour. + + For systems administrators, to obtain best performance with + multicast reception and multiple groups, it is always recommended + that a card with a suitably precise hash filter is used. Hash + collisions will still result in the lock being taken within the + transport protocol input path to check group membership. + + If deploying FreeBSD in an environment with IGMP snooping switches, + it is recommended that the net.inet.igmp.sendlocal sysctl remain + enabled; this forces 224.0.0.0/24 group membership to be announced + via IGMP. + + The size of 'struct igmpstat' has changed; netstat needs to be + recompiled to reflect this. + Bump __FreeBSD_version to 800070. 
+ +20090309: libusb20.so.1 is now installed as libusb.so.1 and the ports system updated to use it. This requires a buildworld/installworld in order to update the library and dependencies (usbconfig, etc). Its advisable to diff --git a/share/man/man4/Makefile b/share/man/man4/Makefile index 80ce734..e5af601 100644 --- a/share/man/man4/Makefile +++ b/share/man/man4/Makefile @@ -128,6 +128,7 @@ MAN= aac.4 \ if_bridge.4 \ ifmib.4 \ igb.4 \ + igmp.4 \ iic.4 \ iicbb.4 \ iicbus.4 \ diff --git a/share/man/man4/ip.4 b/share/man/man4/ip.4 index 6945475..72d0490 100644 --- a/share/man/man4/ip.4 +++ b/share/man/man4/ip.4 @@ -32,7 +32,7 @@ .\" @(#)ip.4 8.2 (Berkeley) 11/30/93 .\" $FreeBSD$ .\" -.Dd February 13, 2009 +.Dd March 9, 2009 .Dt IP 4 .Os .Sh NAME @@ -466,13 +466,19 @@ setsockopt(s, IPPROTO_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); .Pp This option improves performance for applications that may have no more than one -instance on a single host (such as a router daemon), by eliminating +instance on a single host (such as a routing daemon), by eliminating the overhead of receiving their own transmissions. It should generally not be used by applications for which there may be more than one instance on a single host (such as a conferencing program) or for which the sender does not belong to the destination group (such as a time querying program). .Pp +The sysctl setting +.Va net.inet.ip.mcast.loop +controls the default setting of the +.Dv IP_MULTICAST_LOOP +socket option for new sockets. +.Pp A multicast datagram sent with an initial TTL greater than 1 may be delivered to the sending host on a different interface from that on which it was sent, if the host belongs to the destination group on that other interface. @@ -650,6 +656,13 @@ documented in RFC 3678. For management of source filter lists using this API, please refer to .Xr sourcefilter 3 . 
+.Pp +The sysctl settings +.Va net.inet.ip.mcast.maxsocksrc +and +.Va net.inet.ip.mcast.maxgrpsrc +are used to specify an upper limit on the number of per-socket and per-group +source filter entries which the kernel may allocate. .\"----------------------- .Ss "Raw IP Sockets" .Pp @@ -795,6 +808,7 @@ field was not equal to the length of the datagram written to the socket. .Xr send 2 , .Xr byteorder 3 , .Xr icmp 4 , +.Xr igmp 4 , .Xr inet 4 , .Xr intro 4 , .Xr multicast 4 , diff --git a/share/man/man4/multicast.4 b/share/man/man4/multicast.4 index eefa33d..4fbe5b6 100644 --- a/share/man/man4/multicast.4 +++ b/share/man/man4/multicast.4 @@ -956,6 +956,7 @@ after the previous upcall. .Xr socket 2 , .Xr sourcefilter 3 , .Xr icmp6 4 , +.Xr igmp 4 , .Xr inet 4 , .Xr inet6 4 , .Xr intro 4 , diff --git a/sys/netinet/if_ether.c b/sys/netinet/if_ether.c index 385212c..a918415 100644 --- a/sys/netinet/if_ether.c +++ b/sys/netinet/if_ether.c @@ -81,7 +81,8 @@ __FBSDID("$FreeBSD$"); #define SIN(s) ((struct sockaddr_in *)s) #define SDL(s) ((struct sockaddr_dl *)s) -#define LLTABLE(ifp) ((struct lltable *)(ifp)->if_afdata[AF_INET]) +#define LLTABLE(ifp) \ + ((struct in_ifinfo *)(ifp)->if_afdata[AF_INET])->ii_llt SYSCTL_DECL(_net_link_ether); SYSCTL_NODE(_net_link_ether, PF_INET, inet, CTLFLAG_RW, 0, ""); diff --git a/sys/netinet/igmp.c b/sys/netinet/igmp.c index 95e56b2..31e8306 100644 --- a/sys/netinet/igmp.c +++ b/sys/netinet/igmp.c @@ -1,4 +1,5 @@ /*- + * Copyright (c) 2007-2009 Bruce Simpson. * Copyright (c) 1988 Stephen Deering. * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. @@ -35,11 +36,13 @@ /* * Internet Group Management Protocol (IGMP) routines. + * [RFC1112, RFC2236, RFC3376] * * Written by Steve Deering, Stanford, May 1988. * Modified by Rosen Sharma, Stanford, Aug 1994. * Modified by Bill Fenner, Xerox PARC, Feb 1995. * Modified to fully comply to IGMPv2 by Bill Fenner, Oct 1995. 
+ * Significantly rewritten for IGMPv3, VIMAGE, and SMP by Bruce Simpson. * * MULTICAST Revision: 3.5.1.4 */ @@ -52,6 +55,7 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> +#include <sys/module.h> #include <sys/malloc.h> #include <sys/mbuf.h> #include <sys/socket.h> @@ -59,8 +63,11 @@ __FBSDID("$FreeBSD$"); #include <sys/kernel.h> #include <sys/sysctl.h> #include <sys/vimage.h> +#include <sys/ktr.h> +#include <sys/condvar.h> #include <net/if.h> +#include <net/netisr.h> #include <net/route.h> #include <net/vnet.h> @@ -78,125 +85,1349 @@ __FBSDID("$FreeBSD$"); #include <security/mac/mac_framework.h> -static MALLOC_DEFINE(M_IGMP, "igmp", "igmp state"); - -static struct router_info *find_rti(struct ifnet *ifp); -static void igmp_sendpkt(struct in_multi *, int, unsigned long); +#ifndef KTR_IGMPV3 +#define KTR_IGMPV3 KTR_SUBSYS +#endif -#ifdef VIMAGE_GLOBALS -static struct igmpstat igmpstat; +static struct igmp_ifinfo * + igi_alloc_locked(struct ifnet *); +static void igi_delete_locked(const struct ifnet *); +static void igmp_dispatch_queue(struct ifqueue *, int, const int); +static void igmp_fasttimo_vnet(void); +static void igmp_final_leave(struct in_multi *, struct igmp_ifinfo *); +static int igmp_handle_state_change(struct in_multi *, + struct igmp_ifinfo *); +static int igmp_initial_join(struct in_multi *, struct igmp_ifinfo *); +static int igmp_input_v1_query(struct ifnet *, const struct ip *); +static int igmp_input_v2_query(struct ifnet *, const struct ip *, + const struct igmp *); +static int igmp_input_v3_query(struct ifnet *, const struct ip *, + /*const*/ struct igmpv3 *); +static int igmp_input_v3_group_query(struct in_multi *, + struct igmp_ifinfo *, int, /*const*/ struct igmpv3 *); +static int igmp_input_v1_report(struct ifnet *, /*const*/ struct ip *, + /*const*/ struct igmp *); +static int igmp_input_v2_report(struct ifnet *, /*const*/ struct ip *, + /*const*/ struct igmp *); +static void igmp_intr(struct mbuf *); +static int 
igmp_isgroupreported(const struct in_addr); +static struct mbuf * + igmp_ra_alloc(void); +#ifdef KTR +static char * igmp_rec_type_to_str(const int); #endif +static void igmp_set_version(struct igmp_ifinfo *, const int); +static void igmp_slowtimo_vnet(void); +static void igmp_sysinit(void); +static int igmp_v1v2_queue_report(struct in_multi *, const int); +static void igmp_v1v2_process_group_timer(struct in_multi *, const int); +static void igmp_v1v2_process_querier_timers(struct igmp_ifinfo *); +static void igmp_v2_update_group(struct in_multi *, const int); +static void igmp_v3_cancel_link_timers(struct igmp_ifinfo *); +static void igmp_v3_dispatch_general_query(struct igmp_ifinfo *); +static struct mbuf * + igmp_v3_encap_report(struct ifnet *, struct mbuf *); +static int igmp_v3_enqueue_group_record(struct ifqueue *, + struct in_multi *, const int, const int, const int); +static int igmp_v3_enqueue_filter_change(struct ifqueue *, + struct in_multi *); +static void igmp_v3_process_group_timers(struct igmp_ifinfo *, + struct ifqueue *, struct ifqueue *, struct in_multi *, + const int); +static int igmp_v3_merge_state_changes(struct in_multi *, + struct ifqueue *); +static void igmp_v3_suppress_group_record(struct in_multi *); +static int sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS); +static int sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS); +static int sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS); -SYSCTL_V_STRUCT(V_NET, vnet_inet, _net_inet_igmp, IGMPCTL_STATS, - stats, CTLFLAG_RW, igmpstat, igmpstat, ""); +#ifdef VIMAGE +static vnet_attach_fn vnet_igmp_iattach; +static vnet_detach_fn vnet_igmp_idetach; +#else +static int vnet_igmp_iattach(const void *); +static int vnet_igmp_idetach(const void *); +#endif /* VIMAGE */ /* - * igmp_mtx protects all mutable global variables in igmp.c, as well as the - * data fields in struct router_info. 
In general, a router_info structure - * will be valid as long as the referencing struct in_multi is valid, so no - * reference counting is used. We allow unlocked reads of router_info data - * when accessed via an in_multi read-only. + * System-wide globals. + * + * Unlocked access to these is OK, except for the global IGMP output + * queue. The IGMP subsystem lock ends up being system-wide for the moment, + * because all VIMAGEs have to share a global output queue, as netisrs + * themselves are not virtualized. + * + * Locking: + * * The permitted lock order is: IN_MULTI_LOCK, IGMP_LOCK, IF_ADDR_LOCK. + * Any may be taken independently; if any are held at the same + * time, the above lock order must be followed. + * * All output is delegated to the netisr to handle IFF_NEEDSGIANT. + * Most of the time, direct dispatch will be fine. + * * IN_MULTI_LOCK covers in_multi. + * * IGMP_LOCK covers igmp_ifinfo and any global variables in this file, + * including the output queue. + * * IF_ADDR_LOCK covers if_multiaddrs, which is used for a variety of + * per-link state iterators. + * * igmp_ifinfo is valid as long as PF_INET is attached to the interface, + * therefore it is not refcounted. + * We allow unlocked reads of igmp_ifinfo when accessed via in_multi. + * + * Reference counting + * * IGMP acquires its own reference every time an in_multi is passed to + * it and the group is being joined for the first time. + * * IGMP releases its reference(s) on in_multi in a deferred way, + * because the operations which process the release run as part of + * a loop whose control variables are directly affected by the release + * (that, and not recursing on the IF_ADDR_LOCK). + * + * VIMAGE: Each in_multi corresponds to an ifp, and each ifp corresponds + * to a vnet in ifp->if_vnet. 
+ * + */ +struct mtx igmp_mtx; +int mpsafe_igmp = 0; +SYSCTL_INT(_debug, OID_AUTO, mpsafe_igmp, CTLFLAG_RDTUN, &mpsafe_igmp, 0, + "Enable SMP-safe IGMPv3"); + +struct mbuf *m_raopt; /* Router Alert option */ +MALLOC_DEFINE(M_IGMP, "igmp", "igmp state"); + +/* + * Global netisr output queue. + * This is only used as a last resort if we cannot directly dispatch. + * As IN_MULTI_LOCK is no longer in the bottom half of IP, we can do + * this, providing mpsafe_igmp is set. If it is not, we take Giant, + * and queueing is forced. + */ +struct ifqueue igmpoq; + +/* + * VIMAGE-wide globals. + * + * The IGMPv3 timers themselves need to run per-image, however, + * protosw timers run globally (see tcp). + * An ifnet can only be in one vimage at a time, and the loopback + * ifnet, loif, is itself virtualized. + * It would otherwise be possible to seriously hose IGMP state, + * and create inconsistencies in upstream multicast routing, if you have + * multiple VIMAGEs running on the same link joining different multicast + * groups, UNLESS the "primary IP address" is different. This is because + * IGMP for IPv4 does not force link-local addresses to be used for each + * node, unlike MLD for IPv6. + * Obviously the IGMPv3 per-interface state has per-vimage granularity + * also as a result. + * + * FUTURE: Stop using IFP_TO_IA/INADDR_ANY, and use source address selection + * policy to control the address used by IGMP on the link. 
*/ #ifdef VIMAGE_GLOBALS -static SLIST_HEAD(, router_info) router_info_head; -#endif -static struct mtx igmp_mtx; -static int igmp_timers_are_running; +int interface_timers_running; /* IGMPv3 general query response */ +int state_change_timers_running; /* IGMPv3 state-change retransmit */ +int current_state_timers_running; /* IGMPv1/v2 host report; + * IGMPv3 g/sg query response */ + +LIST_HEAD(, igmp_ifinfo) igi_head; +struct igmpstat igmpstat; +struct timeval igmp_gsrdelay; + +int igmp_recvifkludge; +int igmp_sendra; +int igmp_sendlocal; +int igmp_v1enable; +int igmp_v2enable; +int igmp_legacysupp; +int igmp_default_version; +#endif /* VIMAGE_GLOBALS */ /* - * XXXRW: can we define these such that these can be made const? In any - * case, these shouldn't be changed after igmp_init() and therefore don't - * need locking. + * Virtualized sysctls. */ -static u_long igmp_all_hosts_group; -static u_long igmp_all_rtrs_group; +SYSCTL_V_STRUCT(V_NET, vnet_inet, _net_inet_igmp, IGMPCTL_STATS, stats, + CTLFLAG_RW, igmpstat, igmpstat, ""); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_igmp, OID_AUTO, recvifkludge, + CTLFLAG_RW, igmp_recvifkludge, 0, + "Rewrite IGMPv1/v2 reports from 0.0.0.0 to contain subnet address"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_igmp, OID_AUTO, sendra, + CTLFLAG_RW, igmp_sendra, 0, + "Send IP Router Alert option in IGMPv2/v3 messages"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_igmp, OID_AUTO, sendlocal, + CTLFLAG_RW, igmp_sendlocal, 0, + "Send IGMP membership reports for 224.0.0.0/24 groups"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_igmp, OID_AUTO, v1enable, + CTLFLAG_RW, igmp_v1enable, 0, + "Enable backwards compatibility with IGMPv1"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_igmp, OID_AUTO, v2enable, + CTLFLAG_RW, igmp_v2enable, 0, + "Enable backwards compatibility with IGMPv2"); +SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_igmp, OID_AUTO, legacysupp, + CTLFLAG_RW, igmp_legacysupp, 0, + "Allow v1/v2 reports to suppress v3 group responses"); 
+SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_igmp, OID_AUTO, default_version, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, igmp_default_version, 0, + sysctl_igmp_default_version, "I", + "Default version of IGMP to run on each interface"); +SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_igmp, OID_AUTO, gsrdelay, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, igmp_gsrdelay.tv_sec, 0, + sysctl_igmp_gsr, "I", + "Rate limit for IGMPv3 Group-and-Source queries in seconds"); -static struct mbuf *router_alert; -static struct route igmprt; +/* + * Non-virtualized sysctls. + */ +SYSCTL_NODE(_net_inet_igmp, OID_AUTO, ifinfo, CTLFLAG_RD | CTLFLAG_MPSAFE, + sysctl_igmp_ifinfo, "Per-interface IGMPv3 state"); -#ifdef IGMP_DEBUG -#define IGMP_PRINTF(x) printf(x) -#else -#define IGMP_PRINTF(x) +static __inline void +igmp_save_context(struct mbuf *m, struct ifnet *ifp) +{ + +#ifdef VIMAGE + m->m_pkthdr.header = ifp->if_vnet; +#endif /* VIMAGE */ + m->m_pkthdr.flowid = ifp->if_index; +} + +static __inline void +igmp_scrub_context(struct mbuf *m) +{ + + m->m_pkthdr.header = NULL; + m->m_pkthdr.flowid = 0; +} + +#ifdef KTR +static __inline char * +inet_ntoa_haddr(in_addr_t haddr) +{ + struct in_addr ia; + + ia.s_addr = htonl(haddr); + return (inet_ntoa(ia)); +} #endif -void -igmp_init(void) +/* + * Restore context from a queued IGMP output chain. + * Return saved ifindex. + * + * VIMAGE: The assertion is there to make sure that we + * actually called CURVNET_SET() with what's in the mbuf chain. + */ +static __inline uint32_t +igmp_restore_context(struct mbuf *m) { - INIT_VNET_INET(curvnet); - struct ipoption *ra; + +#ifdef notyet +#if defined(VIMAGE) && defined(INVARIANTS) + KASSERT(curvnet == (m->m_pkthdr.header), + ("%s: called when curvnet was not restored", __func__)); +#endif +#endif + return (m->m_pkthdr.flowid); +} + +/* + * Retrieve or set default IGMP version. + * + * VIMAGE: Assume curvnet set by caller. + * SMPng: NOTE: Serialized by IGMP lock. 
+ */ +static int +sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS) +{ + int error; + int new; + + error = sysctl_wire_old_buffer(req, sizeof(int)); + if (error) + return (error); + + IGMP_LOCK(); + + new = V_igmp_default_version; + + error = sysctl_handle_int(oidp, &new, 0, req); + if (error || !req->newptr) + goto out_locked; + + if (new < IGMP_VERSION_1 || new > IGMP_VERSION_3) { + error = EINVAL; + goto out_locked; + } + + CTR2(KTR_IGMPV3, "change igmp_default_version from %d to %d", + V_igmp_default_version, new); + + V_igmp_default_version = new; + +out_locked: + IGMP_UNLOCK(); + return (error); +} + +/* + * Retrieve or set threshold between group-source queries in seconds. + * + * VIMAGE: Assume curvnet set by caller. + * SMPng: NOTE: Serialized by IGMP lock. + */ +static int +sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS) +{ + int error; + int i; + + error = sysctl_wire_old_buffer(req, sizeof(int)); + if (error) + return (error); + + IGMP_LOCK(); + + i = V_igmp_gsrdelay.tv_sec; + + error = sysctl_handle_int(oidp, &i, 0, req); + if (error || !req->newptr) + goto out_locked; + + if (i < -1 || i >= 60) { + error = EINVAL; + goto out_locked; + } + + CTR2(KTR_IGMPV3, "change igmp_gsrdelay from %d to %d", + V_igmp_gsrdelay.tv_sec, i); + V_igmp_gsrdelay.tv_sec = i; + +out_locked: + IGMP_UNLOCK(); + return (error); +} + +/* + * Expose struct igmp_ifinfo to userland, keyed by ifindex. + * For use by ifmcstat(8). + * + * SMPng: NOTE: Does an unlocked ifindex space read. + * VIMAGE: Assume curvnet set by caller. The node handler itself + * is not directly virtualized. 
+ */ +static int +sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS) +{ + INIT_VNET_NET(curvnet); + int *name; + int error; + u_int namelen; + struct ifnet *ifp; + struct igmp_ifinfo *igi; + + name = (int *)arg1; + namelen = arg2; + + if (req->newptr != NULL) + return (EPERM); + + if (namelen != 1) + return (EINVAL); + + error = sysctl_wire_old_buffer(req, sizeof(struct igmp_ifinfo)); + if (error) + return (error); + + IN_MULTI_LOCK(); + IGMP_LOCK(); + + if (name[0] <= 0 || name[0] > V_if_index) { + error = ENOENT; + goto out_locked; + } + + error = ENOENT; + + ifp = ifnet_byindex(name[0]); + if (ifp == NULL) + goto out_locked; + + LIST_FOREACH(igi, &V_igi_head, igi_link) { + if (ifp == igi->igi_ifp) { + error = SYSCTL_OUT(req, igi, + sizeof(struct igmp_ifinfo)); + break; + } + } + +out_locked: + IGMP_UNLOCK(); + IN_MULTI_UNLOCK(); + return (error); +} + +/* + * Dispatch an entire queue of pending packet chains + * using the netisr. + * VIMAGE: Assumes the vnet pointer has been set. + */ +static void +igmp_dispatch_queue(struct ifqueue *ifq, int limit, const int loop) +{ + struct mbuf *m; + + for (;;) { + _IF_DEQUEUE(ifq, m); + if (m == NULL) + break; + CTR3(KTR_IGMPV3, "%s: dispatch %p from %p", __func__, ifq, m); + if (loop) + m->m_flags |= M_IGMP_LOOP; + netisr_dispatch(NETISR_IGMP, m); + if (--limit == 0) + break; + } +} + +/* + * Filter outgoing IGMP report state by group. + * + * Reports are ALWAYS suppressed for ALL-HOSTS (224.0.0.1). + * If the net.inet.igmp.sendlocal sysctl is 0, then IGMP reports are + * disabled for all groups in the 224.0.0.0/24 link-local scope. However, + * this may break certain IGMP snooping switches which rely on the old + * report behaviour. + * + * Return zero if the given group is one for which IGMP reports + * should be suppressed, or non-zero if reports should be issued. 
+ */ +static __inline int +igmp_isgroupreported(const struct in_addr addr) +{ + + if (in_allhosts(addr) || + ((!V_igmp_sendlocal && IN_LOCAL_GROUP(ntohl(addr.s_addr))))) + return (0); + + return (1); +} + +/* + * Construct a Router Alert option to use in outgoing packets. + */ +static struct mbuf * +igmp_ra_alloc(void) +{ + struct mbuf *m; + struct ipoption *p; + + MGET(m, M_DONTWAIT, MT_DATA); + p = mtod(m, struct ipoption *); + p->ipopt_dst.s_addr = INADDR_ANY; + p->ipopt_list[0] = IPOPT_RA; /* Router Alert Option */ + p->ipopt_list[1] = 0x04; /* 4 bytes long */ + p->ipopt_list[2] = IPOPT_EOL; /* End of IP option list */ + p->ipopt_list[3] = 0x00; /* pad byte */ + m->m_len = sizeof(p->ipopt_dst) + p->ipopt_list[1]; + + return (m); +} + +/* + * Attach IGMP when PF_INET is attached to an interface. + * + * VIMAGE: Currently we set the vnet pointer, although it is + * likely that it was already set by our caller. + */ +struct igmp_ifinfo * +igmp_domifattach(struct ifnet *ifp) +{ + struct igmp_ifinfo *igi; + + CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", + __func__, ifp, ifp->if_xname); + + CURVNET_SET(ifp->if_vnet); + IGMP_LOCK(); + + igi = igi_alloc_locked(ifp); + if (!(ifp->if_flags & IFF_MULTICAST)) + igi->igi_flags |= IGIF_SILENT; + + IGMP_UNLOCK(); + CURVNET_RESTORE(); + + return (igi); +} + +/* + * VIMAGE: assume curvnet set by caller. + */ +static struct igmp_ifinfo * +igi_alloc_locked(/*const*/ struct ifnet *ifp) +{ + struct igmp_ifinfo *igi; + + IGMP_LOCK_ASSERT(); + + igi = malloc(sizeof(struct igmp_ifinfo), M_IGMP, M_NOWAIT|M_ZERO); + if (igi == NULL) + goto out; + + igi->igi_ifp = ifp; + igi->igi_version = V_igmp_default_version; + igi->igi_flags = 0; + igi->igi_rv = IGMP_RV_INIT; + igi->igi_qi = IGMP_QI_INIT; + igi->igi_qri = IGMP_QRI_INIT; + igi->igi_uri = IGMP_URI_INIT; + + SLIST_INIT(&igi->igi_relinmhead); /* - * To avoid byte-swapping the same value over and over again. + * Responses to general queries are subject to bounds. 
*/ - igmp_all_hosts_group = htonl(INADDR_ALLHOSTS_GROUP); - igmp_all_rtrs_group = htonl(INADDR_ALLRTRS_GROUP); + IFQ_SET_MAXLEN(&igi->igi_gq, IGMP_MAX_RESPONSE_PACKETS); + + LIST_INSERT_HEAD(&V_igi_head, igi, igi_link); - igmp_timers_are_running = 0; + CTR2(KTR_IGMPV3, "allocate igmp_ifinfo for ifp %p(%s)", + ifp, ifp->if_xname); + +out: + return (igi); +} +/* + * Hook for ifdetach. + * + * NOTE: Some finalization tasks need to run before the protocol domain + * is detached, but also before the link layer does its cleanup. + * + * SMPNG: igmp_ifdetach() needs to take IF_ADDR_LOCK(). + * + * VIMAGE: curvnet should have been set by caller, but let's not assume + * that for now. + */ +void +igmp_ifdetach(struct ifnet *ifp) +{ + struct igmp_ifinfo *igi; + struct ifmultiaddr *ifma; + struct in_multi *inm, *tinm; + + CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp, + ifp->if_xname); + + CURVNET_SET(ifp->if_vnet); + + IGMP_LOCK(); + + igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; + if (igi->igi_version == IGMP_VERSION_3) { + IF_ADDR_LOCK(ifp); + TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { + if (ifma->ifma_addr->sa_family != AF_INET) + continue; + inm = (struct in_multi *)ifma->ifma_protospec; + if (inm->inm_state == IGMP_LEAVING_MEMBER) { + SLIST_INSERT_HEAD(&igi->igi_relinmhead, + inm, inm_nrele); + } + inm_clear_recorded(inm); + } + IF_ADDR_UNLOCK(ifp); + /* + * Free the in_multi reference(s) for this IGMP lifecycle. + */ + SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, inm_nrele, + tinm) { + SLIST_REMOVE_HEAD(&igi->igi_relinmhead, inm_nrele); + inm_release_locked(inm); + } + } + + IGMP_UNLOCK(); + +#ifdef VIMAGE /* - * Construct a Router Alert option to use in outgoing packets. + * Plug the potential race which may occur when a VIMAGE + * is detached and we are forced to queue pending IGMP output for + * output netisr processing due to !mpsafe_igmp. 
In this case it + * is possible that igmp_intr() is about to see mbuf chains with + * invalid cached curvnet pointers. + * This is a rare condition, so just blow them all away. + * FUTURE: This may in fact not be needed, because IFF_NEEDSGIANT + * is being removed in 8.x and the netisr may then be eliminated; + * it is needed only if VIMAGE and IFF_NEEDSGIANT need to co-exist */ - MGET(router_alert, M_DONTWAIT, MT_DATA); - ra = mtod(router_alert, struct ipoption *); - ra->ipopt_dst.s_addr = 0; - ra->ipopt_list[0] = IPOPT_RA; /* Router Alert Option */ - ra->ipopt_list[1] = 0x04; /* 4 bytes long */ - ra->ipopt_list[2] = 0x00; - ra->ipopt_list[3] = 0x00; - router_alert->m_len = sizeof(ra->ipopt_dst) + ra->ipopt_list[1]; + if (!mpsafe_igmp) { + int drops; + + IF_LOCK(&igmpoq); + drops = igmpoq.ifq_len; + _IF_DRAIN(&igmpoq); + IF_UNLOCK(&igmpoq); + if (bootverbose && drops) { + printf("%s: dropped %d pending IGMP output packets\n", + __func__, drops); + } + } +#endif /* VIMAGE */ + + CURVNET_RESTORE(); +} + +/* + * Hook for domifdetach. + * + * VIMAGE: curvnet should have been set by caller, but let's not assume + * that for now. + */ +void +igmp_domifdetach(struct ifnet *ifp) +{ + struct igmp_ifinfo *igi; + + CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", + __func__, ifp, ifp->if_xname); + + CURVNET_SET(ifp->if_vnet); + IGMP_LOCK(); + + igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; + igi_delete_locked(ifp); + + IGMP_UNLOCK(); + CURVNET_RESTORE(); +} + +static void +igi_delete_locked(const struct ifnet *ifp) +{ + struct igmp_ifinfo *igi, *tigi; + + CTR3(KTR_IGMPV3, "%s: freeing igmp_ifinfo for ifp %p(%s)", + __func__, ifp, ifp->if_xname); + + IGMP_LOCK_ASSERT(); + + LIST_FOREACH_SAFE(igi, &V_igi_head, igi_link, tigi) { + if (igi->igi_ifp == ifp) { + /* + * Free deferred General Query responses. 
+ */ + _IF_DRAIN(&igi->igi_gq); + + LIST_REMOVE(igi, igi_link); + + KASSERT(SLIST_EMPTY(&igi->igi_relinmhead), + ("%s: there are dangling in_multi references", + __func__)); - mtx_init(&igmp_mtx, "igmp_mtx", NULL, MTX_DEF); - SLIST_INIT(&V_router_info_head); + free(igi, M_IGMP); + return; + } + } + +#ifdef INVARIANTS + panic("%s: igmp_ifinfo not found for ifp %p\n", __func__, ifp); +#endif } -static struct router_info * -find_rti(struct ifnet *ifp) +/* + * Process a received IGMPv1 query. + * Return non-zero if the message should be dropped. + * + * VIMAGE: The curvnet pointer is derived from the input ifp. + */ +static int +igmp_input_v1_query(struct ifnet *ifp, const struct ip *ip) { INIT_VNET_INET(ifp->if_vnet); - struct router_info *rti; + struct ifmultiaddr *ifma; + struct igmp_ifinfo *igi; + struct in_multi *inm; + + /* + * IGMPv1 General Queries SHOULD always addressed to 224.0.0.1. + * igmp_group is always ignored. Do not drop it as a userland + * daemon may wish to see it. + */ + if (!in_allhosts(ip->ip_dst)) { + ++V_igmpstat.igps_rcv_badqueries; + return (0); + } + + ++V_igmpstat.igps_rcv_gen_queries; + + /* + * Switch to IGMPv1 host compatibility mode. + */ + IN_MULTI_LOCK(); + IGMP_LOCK(); + + igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; + KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp)); + + if (igi->igi_flags & IGIF_LOOPBACK) { + CTR2(KTR_IGMPV3, "ignore v1 query on IGIF_LOOPBACK ifp %p(%s)", + ifp, ifp->if_xname); + goto out_locked; + } + + igmp_set_version(igi, IGMP_VERSION_1); + + CTR2(KTR_IGMPV3, "process v1 query on ifp %p(%s)", ifp, ifp->if_xname); + + /* + * Start the timers in all of our group records + * for the interface on which the query arrived, + * except those which are already running. 
+ */ + IF_ADDR_LOCK(ifp); + TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { + if (ifma->ifma_addr->sa_family != AF_INET) + continue; + inm = (struct in_multi *)ifma->ifma_protospec; + if (inm->inm_timer != 0) + continue; + switch (inm->inm_state) { + case IGMP_NOT_MEMBER: + case IGMP_SILENT_MEMBER: + break; + case IGMP_G_QUERY_PENDING_MEMBER: + case IGMP_SG_QUERY_PENDING_MEMBER: + case IGMP_REPORTING_MEMBER: + case IGMP_IDLE_MEMBER: + case IGMP_LAZY_MEMBER: + case IGMP_SLEEPING_MEMBER: + case IGMP_AWAKENING_MEMBER: + inm->inm_state = IGMP_REPORTING_MEMBER; + inm->inm_timer = IGMP_RANDOM_DELAY( + IGMP_V1V2_MAX_RI * PR_FASTHZ); + V_current_state_timers_running = 1; + break; + case IGMP_LEAVING_MEMBER: + break; + } + } + IF_ADDR_UNLOCK(ifp); + +out_locked: + IGMP_UNLOCK(); + IN_MULTI_UNLOCK(); + + return (0); +} + +/* + * Process a received IGMPv2 general or group-specific query. + */ +static int +igmp_input_v2_query(struct ifnet *ifp, const struct ip *ip, + const struct igmp *igmp) +{ + struct ifmultiaddr *ifma; + struct igmp_ifinfo *igi; + struct in_multi *inm; + uint16_t timer; + + /* + * Perform lazy allocation of IGMP link info if required, + * and switch to IGMPv2 host compatibility mode. + */ + IN_MULTI_LOCK(); + IGMP_LOCK(); + + igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; + KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp)); + + if (igi->igi_flags & IGIF_LOOPBACK) { + CTR2(KTR_IGMPV3, "ignore v2 query on IGIF_LOOPBACK ifp %p(%s)", + ifp, ifp->if_xname); + goto out_locked; + } + + igmp_set_version(igi, IGMP_VERSION_2); + + timer = igmp->igmp_code * PR_FASTHZ / IGMP_TIMER_SCALE; + if (timer == 0) + timer = 1; + + if (!in_nullhost(igmp->igmp_group)) { + /* + * IGMPv2 Group-Specific Query. + * If this is a group-specific IGMPv2 query, we need only + * look up the single group to process it. 
+ */ + inm = inm_lookup(ifp, igmp->igmp_group); + if (inm != NULL) { + CTR3(KTR_IGMPV3, "process v2 query %s on ifp %p(%s)", + inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); + igmp_v2_update_group(inm, timer); + } + ++V_igmpstat.igps_rcv_group_queries; + } else { + /* + * IGMPv2 General Query. + * If this was not sent to the all-hosts group, ignore it. + */ + if (in_allhosts(ip->ip_dst)) { + /* + * For each reporting group joined on this + * interface, kick the report timer. + */ + CTR2(KTR_IGMPV3, + "process v2 general query on ifp %p(%s)", + ifp, ifp->if_xname); + + IF_ADDR_LOCK(ifp); + TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { + if (ifma->ifma_addr->sa_family != AF_INET) + continue; + inm = (struct in_multi *)ifma->ifma_protospec; + igmp_v2_update_group(inm, timer); + } + IF_ADDR_UNLOCK(ifp); + } + ++V_igmpstat.igps_rcv_gen_queries; + } + +out_locked: + IGMP_UNLOCK(); + IN_MULTI_UNLOCK(); + + return (0); +} + +/* + * Update the report timer on a group in response to an IGMPv2 query. + * + * If we are becoming the reporting member for this group, start the timer. + * If we already are the reporting member for this group, and timer is + * below the threshold, reset it. + * + * We may be updating the group for the first time since we switched + * to IGMPv3. If we are, then we must clear any recorded source lists, + * and transition to REPORTING state; the group timer is overloaded + * for group and group-source query responses. + * + * Unlike IGMPv3, the delay per group should be jittered + * to avoid bursts of IGMPv2 reports. 
+ */ +static void +igmp_v2_update_group(struct in_multi *inm, const int timer) +{ + + CTR4(KTR_IGMPV3, "%s: %s/%s timer=%d", __func__, + inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname, timer); + + IN_MULTI_LOCK_ASSERT(); + + switch (inm->inm_state) { + case IGMP_NOT_MEMBER: + case IGMP_SILENT_MEMBER: + break; + case IGMP_REPORTING_MEMBER: + if (inm->inm_timer != 0 && + inm->inm_timer <= timer) { + CTR1(KTR_IGMPV3, "%s: REPORTING and timer running, " + "skipping.", __func__); + break; + } + /* FALLTHROUGH */ + case IGMP_SG_QUERY_PENDING_MEMBER: + case IGMP_G_QUERY_PENDING_MEMBER: + case IGMP_IDLE_MEMBER: + case IGMP_LAZY_MEMBER: + case IGMP_AWAKENING_MEMBER: + CTR1(KTR_IGMPV3, "%s: ->REPORTING", __func__); + inm->inm_state = IGMP_REPORTING_MEMBER; + inm->inm_timer = IGMP_RANDOM_DELAY(timer); + V_current_state_timers_running = 1; + break; + case IGMP_SLEEPING_MEMBER: + CTR1(KTR_IGMPV3, "%s: ->AWAKENING", __func__); + inm->inm_state = IGMP_AWAKENING_MEMBER; + break; + case IGMP_LEAVING_MEMBER: + break; + } +} + +/* + * Process a received IGMPv3 general, group-specific or + * group-and-source-specific query. + * Assumes m has already been pulled up to the full IGMP message length. + * Return 0 if successful, otherwise an appropriate error code is returned. + */ +static int +igmp_input_v3_query(struct ifnet *ifp, const struct ip *ip, + /*const*/ struct igmpv3 *igmpv3) +{ + struct igmp_ifinfo *igi; + struct in_multi *inm; + uint32_t maxresp, nsrc, qqi; + uint16_t timer; + uint8_t qrv; + + CTR2(KTR_IGMPV3, "process v3 query on ifp %p(%s)", ifp, ifp->if_xname); + + maxresp = igmpv3->igmp_code; /* in 1/10ths of a second */ + if (maxresp >= 128) { + maxresp = IGMP_MANT(igmpv3->igmp_code) << + (IGMP_EXP(igmpv3->igmp_code) + 3); + } + + /* + * Robustness must never be less than 2 for on-wire IGMPv3. + * FIXME: Check if ifp has IGIF_LOOPBACK set, as we make + * an exception for interfaces whose IGMPv3 state changes + * are redirected to loopback (e.g. MANET). 
+ */ + qrv = IGMP_QRV(igmpv3->igmp_misc); + if (qrv < 2) { + CTR3(KTR_IGMPV3, "%s: clamping qrv %d to %d", __func__, + qrv, IGMP_RV_INIT); + qrv = IGMP_RV_INIT; + } + + qqi = igmpv3->igmp_qqi; + if (qqi >= 128) { + maxresp = IGMP_MANT(igmpv3->igmp_qqi) << + (IGMP_EXP(igmpv3->igmp_qqi) + 3); + } + + timer = maxresp * PR_FASTHZ / IGMP_TIMER_SCALE; + if (timer == 0) + timer = 1; + + nsrc = ntohs(igmpv3->igmp_numsrc); + + IN_MULTI_LOCK(); + IGMP_LOCK(); + + igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; + KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp)); - mtx_assert(&igmp_mtx, MA_OWNED); - IGMP_PRINTF("[igmp.c, _find_rti] --> entering \n"); - SLIST_FOREACH(rti, &V_router_info_head, rti_list) { - if (rti->rti_ifp == ifp) { - IGMP_PRINTF( - "[igmp.c, _find_rti] --> found old entry \n"); - return (rti); + if (igi->igi_flags & IGIF_LOOPBACK) { + CTR2(KTR_IGMPV3, "ignore v3 query on IGIF_LOOPBACK ifp %p(%s)", + ifp, ifp->if_xname); + goto out_locked; + } + + igmp_set_version(igi, IGMP_VERSION_3); + + igi->igi_rv = qrv; + igi->igi_qi = qqi; + igi->igi_qri = maxresp; + + CTR4(KTR_IGMPV3, "%s: qrv %d qi %d qri %d", __func__, qrv, qqi, + maxresp); + + if (in_nullhost(igmpv3->igmp_group)) { + /* + * IGMPv3 General Query. + * Schedule a current-state report on this ifp for + * all groups, possibly containing source lists. + */ + ++V_igmpstat.igps_rcv_gen_queries; + + if (!in_allhosts(ip->ip_dst) || nsrc > 0) { + /* + * General Queries SHOULD be directed to 224.0.0.1. + * A general query with a source list has undefined + * behaviour; discard it. + */ + ++V_igmpstat.igps_rcv_badqueries; + goto out_locked; + } + + CTR2(KTR_IGMPV3, "process v3 general query on ifp %p(%s)", + ifp, ifp->if_xname); + + /* + * If there is a pending General Query response + * scheduled earlier than the selected delay, do + * not schedule any other reports. + * Otherwise, reset the interface timer. 
+ */ + if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer) { + igi->igi_v3_timer = IGMP_RANDOM_DELAY(timer); + V_interface_timers_running = 1; + } + } else { + /* + * IGMPv3 Group-specific or Group-and-source-specific Query. + * + * Group-source-specific queries are throttled on + * a per-group basis to defeat denial-of-service attempts. + * Queries for groups we are not a member of on this + * link are simply ignored. + */ + inm = inm_lookup(ifp, igmpv3->igmp_group); + if (inm == NULL) + goto out_locked; + if (nsrc > 0) { + ++V_igmpstat.igps_rcv_gsr_queries; + if (!ratecheck(&inm->inm_lastgsrtv, + &V_igmp_gsrdelay)) { + CTR1(KTR_IGMPV3, "%s: GS query throttled.", + __func__); + ++V_igmpstat.igps_drop_gsr_queries; + goto out_locked; + } + } else { + ++V_igmpstat.igps_rcv_group_queries; } + CTR3(KTR_IGMPV3, "process v3 %s query on ifp %p(%s)", + inet_ntoa(igmpv3->igmp_group), ifp, ifp->if_xname); + /* + * If there is a pending General Query response + * scheduled sooner than the selected delay, no + * further report need be scheduled. + * Otherwise, prepare to respond to the + * group-specific or group-and-source query. + */ + if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer) + igmp_input_v3_group_query(inm, igi, timer, igmpv3); } - rti = malloc(sizeof *rti, M_IGMP, M_NOWAIT); - if (rti == NULL) { - IGMP_PRINTF("[igmp.c, _find_rti] --> no memory for entry\n"); - return (NULL); + +out_locked: + IGMP_UNLOCK(); + IN_MULTI_UNLOCK(); + + return (0); +} + +/* + * Process a recieved IGMPv3 group-specific or group-and-source-specific + * query. + * Return <0 if any error occured. Currently this is ignored. 
+ */ +static int +igmp_input_v3_group_query(struct in_multi *inm, struct igmp_ifinfo *igi, + int timer, /*const*/ struct igmpv3 *igmpv3) +{ + int retval; + uint16_t nsrc; + + IN_MULTI_LOCK_ASSERT(); + IGMP_LOCK_ASSERT(); + + retval = 0; + + switch (inm->inm_state) { + case IGMP_NOT_MEMBER: + case IGMP_SILENT_MEMBER: + case IGMP_SLEEPING_MEMBER: + case IGMP_LAZY_MEMBER: + case IGMP_AWAKENING_MEMBER: + case IGMP_IDLE_MEMBER: + case IGMP_LEAVING_MEMBER: + return (retval); + break; + case IGMP_REPORTING_MEMBER: + case IGMP_G_QUERY_PENDING_MEMBER: + case IGMP_SG_QUERY_PENDING_MEMBER: + break; + } + + nsrc = ntohs(igmpv3->igmp_numsrc); + + /* + * Deal with group-specific queries upfront. + * If any group query is already pending, purge any recorded + * source-list state if it exists, and schedule a query response + * for this group-specific query. + */ + if (nsrc == 0) { + if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER || + inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) { + inm_clear_recorded(inm); + timer = min(inm->inm_timer, timer); + } + inm->inm_state = IGMP_G_QUERY_PENDING_MEMBER; + inm->inm_timer = IGMP_RANDOM_DELAY(timer); + V_current_state_timers_running = 1; + return (retval); + } + + /* + * Deal with the case where a group-and-source-specific query has + * been received but a group-specific query is already pending. + */ + if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER) { + timer = min(inm->inm_timer, timer); + inm->inm_timer = IGMP_RANDOM_DELAY(timer); + V_current_state_timers_running = 1; + return (retval); + } + + /* + * Finally, deal with the case where a group-and-source-specific + * query has been received, where a response to a previous g-s-r + * query exists, or none exists. + * In this case, we need to parse the source-list which the Querier + * has provided us with and check if we have any source list filter + * entries at T1 for these sources. If we do not, there is no need + * schedule a report and the query may be dropped. 
+ * If we do, we must record them and schedule a current-state + * report for those sources. + * FIXME: Handling source lists larger than 1 mbuf requires that + * we pass the mbuf chain pointer down to this function, and use + * m_getptr() to walk the chain. + */ + if (inm->inm_nsrc > 0) { + const struct in_addr *ap; + int i, nrecorded; + + ap = (const struct in_addr *)(igmpv3 + 1); + nrecorded = 0; + for (i = 0; i < nsrc; i++, ap++) { + retval = inm_record_source(inm, ap->s_addr); + if (retval < 0) + break; + nrecorded += retval; + } + if (nrecorded > 0) { + CTR1(KTR_IGMPV3, + "%s: schedule response to SG query", __func__); + inm->inm_state = IGMP_SG_QUERY_PENDING_MEMBER; + inm->inm_timer = IGMP_RANDOM_DELAY(timer); + V_current_state_timers_running = 1; + } } - rti->rti_ifp = ifp; - rti->rti_type = IGMP_V2_ROUTER; - rti->rti_time = 0; - SLIST_INSERT_HEAD(&V_router_info_head, rti, rti_list); - IGMP_PRINTF("[igmp.c, _find_rti] --> created an entry \n"); - return (rti); + + return (retval); +} + +/* + * Process a received IGMPv1 host membership report. + * + * NOTE: 0.0.0.0 workaround breaks const correctness. + */ +static int +igmp_input_v1_report(struct ifnet *ifp, /*const*/ struct ip *ip, + /*const*/ struct igmp *igmp) +{ + struct in_ifaddr *ia; + struct in_multi *inm; + + ++V_igmpstat.igps_rcv_reports; + + if (ifp->if_flags & IFF_LOOPBACK) + return (0); + + if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr) || + !in_hosteq(igmp->igmp_group, ip->ip_dst))) { + ++V_igmpstat.igps_rcv_badreports; + return (EINVAL); + } + + /* + * RFC 3376, Section 4.2.13, 9.2, 9.3: + * Booting clients may use the source address 0.0.0.0. Some + * IGMP daemons may not know how to use IP_RECVIF to determine + * the interface upon which this message was received. + * Replace 0.0.0.0 with the subnet address if told to do so. 
+ */ + if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) { + IFP_TO_IA(ifp, ia); + if (ia != NULL) + ip->ip_src.s_addr = htonl(ia->ia_subnet); + } + + CTR3(KTR_IGMPV3, "process v1 report %s on ifp %p(%s)", + inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); + + /* + * IGMPv1 report suppression. + * If we are a member of this group, and our membership should be + * reported, stop our group timer and transition to the 'lazy' state. + */ + IN_MULTI_LOCK(); + inm = inm_lookup(ifp, igmp->igmp_group); + if (inm != NULL) { + struct igmp_ifinfo *igi; + + igi = inm->inm_igi; + if (igi == NULL) { + KASSERT(igi != NULL, + ("%s: no igi for ifp %p", __func__, ifp)); + goto out_locked; + } + + ++V_igmpstat.igps_rcv_ourreports; + + /* + * If we are in IGMPv3 host mode, do not allow the + * other host's IGMPv1 report to suppress our reports + * unless explicitly configured to do so. + */ + if (igi->igi_version == IGMP_VERSION_3) { + if (V_igmp_legacysupp) + igmp_v3_suppress_group_record(inm); + goto out_locked; + } + + inm->inm_timer = 0; + + switch (inm->inm_state) { + case IGMP_NOT_MEMBER: + case IGMP_SILENT_MEMBER: + break; + case IGMP_IDLE_MEMBER: + case IGMP_LAZY_MEMBER: + case IGMP_AWAKENING_MEMBER: + CTR3(KTR_IGMPV3, + "report suppressed for %s on ifp %p(%s)", + inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); + case IGMP_SLEEPING_MEMBER: + inm->inm_state = IGMP_SLEEPING_MEMBER; + break; + case IGMP_REPORTING_MEMBER: + CTR3(KTR_IGMPV3, + "report suppressed for %s on ifp %p(%s)", + inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); + if (igi->igi_version == IGMP_VERSION_1) + inm->inm_state = IGMP_LAZY_MEMBER; + else if (igi->igi_version == IGMP_VERSION_2) + inm->inm_state = IGMP_SLEEPING_MEMBER; + break; + case IGMP_G_QUERY_PENDING_MEMBER: + case IGMP_SG_QUERY_PENDING_MEMBER: + case IGMP_LEAVING_MEMBER: + break; + } + } + +out_locked: + IN_MULTI_UNLOCK(); + + return (0); +} + +/* + * Process a received IGMPv2 host membership report. 
+ * + * NOTE: 0.0.0.0 workaround breaks const correctness. + */ +static int +igmp_input_v2_report(struct ifnet *ifp, /*const*/ struct ip *ip, + /*const*/ struct igmp *igmp) +{ + struct in_ifaddr *ia; + struct in_multi *inm; + + /* + * Make sure we don't hear our own membership report. Fast + * leave requires knowing that we are the only member of a + * group. + */ + IFP_TO_IA(ifp, ia); + if (ia != NULL && in_hosteq(ip->ip_src, IA_SIN(ia)->sin_addr)) + return (0); + + ++V_igmpstat.igps_rcv_reports; + + if (ifp->if_flags & IFF_LOOPBACK) + return (0); + + if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) || + !in_hosteq(igmp->igmp_group, ip->ip_dst)) { + ++V_igmpstat.igps_rcv_badreports; + return (EINVAL); + } + + /* + * RFC 3376, Section 4.2.13, 9.2, 9.3: + * Booting clients may use the source address 0.0.0.0. Some + * IGMP daemons may not know how to use IP_RECVIF to determine + * the interface upon which this message was received. + * Replace 0.0.0.0 with the subnet address if told to do so. + */ + if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) { + if (ia != NULL) + ip->ip_src.s_addr = htonl(ia->ia_subnet); + } + + CTR3(KTR_IGMPV3, "process v2 report %s on ifp %p(%s)", + inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); + + /* + * IGMPv2 report suppression. + * If we are a member of this group, and our membership should be + * reported, and our group timer is pending or about to be reset, + * stop our group timer by transitioning to the 'lazy' state. + */ + IN_MULTI_LOCK(); + inm = inm_lookup(ifp, igmp->igmp_group); + if (inm != NULL) { + struct igmp_ifinfo *igi; + + igi = inm->inm_igi; + KASSERT(igi != NULL, ("%s: no igi for ifp %p", __func__, ifp)); + + ++V_igmpstat.igps_rcv_ourreports; + + /* + * If we are in IGMPv3 host mode, do not allow the + * other host's IGMPv1 report to suppress our reports + * unless explicitly configured to do so. 
+ */ + if (igi->igi_version == IGMP_VERSION_3) { + if (V_igmp_legacysupp) + igmp_v3_suppress_group_record(inm); + goto out_locked; + } + + inm->inm_timer = 0; + + switch (inm->inm_state) { + case IGMP_NOT_MEMBER: + case IGMP_SILENT_MEMBER: + case IGMP_SLEEPING_MEMBER: + break; + case IGMP_REPORTING_MEMBER: + case IGMP_IDLE_MEMBER: + case IGMP_AWAKENING_MEMBER: + CTR3(KTR_IGMPV3, + "report suppressed for %s on ifp %p(%s)", + inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); + case IGMP_LAZY_MEMBER: + inm->inm_state = IGMP_LAZY_MEMBER; + break; + case IGMP_G_QUERY_PENDING_MEMBER: + case IGMP_SG_QUERY_PENDING_MEMBER: + case IGMP_LEAVING_MEMBER: + break; + } + } + +out_locked: + IN_MULTI_UNLOCK(); + + return (0); } void -igmp_input(register struct mbuf *m, int off) -{ - register int iphlen = off; - register struct igmp *igmp; - register struct ip *ip; - register int igmplen; - register struct ifnet *ifp = m->m_pkthdr.rcvif; - register int minlen; - register struct in_multi *inm; - register struct in_ifaddr *ia; - struct in_multistep step; - struct router_info *rti; - int timer; /** timer value in the igmp query header **/ +igmp_input(struct mbuf *m, int off) +{ + int iphlen; + struct ifnet *ifp; + struct igmp *igmp; + struct ip *ip; + int igmplen; + int minlen; + int queryver; + + CTR3(KTR_IGMPV3, "%s: called w/mbuf (%p,%d)", __func__, m, off); + + ifp = m->m_pkthdr.rcvif; INIT_VNET_INET(ifp->if_vnet); ++V_igmpstat.igps_rcv_total; ip = mtod(m, struct ip *); + iphlen = off; igmplen = ip->ip_len; /* @@ -207,12 +1438,28 @@ igmp_input(register struct mbuf *m, int off) m_freem(m); return; } - minlen = iphlen + IGMP_MINLEN; + + /* + * Always pullup to the minimum size for v1/v2 or v3 + * to amortize calls to m_pullup(). 
+ */ + minlen = iphlen; + if (igmplen >= IGMP_V3_QUERY_MINLEN) + minlen += IGMP_V3_QUERY_MINLEN; + else + minlen += IGMP_MINLEN; if ((m->m_flags & M_EXT || m->m_len < minlen) && (m = m_pullup(m, minlen)) == 0) { ++V_igmpstat.igps_rcv_tooshort; return; } + ip = mtod(m, struct ip *); + + if (ip->ip_ttl != 1) { + ++V_igmpstat.igps_rcv_badttl; + m_freem(m); + return; + } /* * Validate checksum. @@ -228,147 +1475,112 @@ igmp_input(register struct mbuf *m, int off) m->m_data -= iphlen; m->m_len += iphlen; - ip = mtod(m, struct ip *); - timer = igmp->igmp_code * PR_FASTHZ / IGMP_TIMER_SCALE; - if (timer == 0) - timer = 1; - - /* - * In the IGMPv2 specification, there are 3 states and a flag. - * - * In Non-Member state, we simply don't have a membership record. - * In Delaying Member state, our timer is running (inm->inm_timer). - * In Idle Member state, our timer is not running (inm->inm_timer==0). - * - * The flag is inm->inm_state, it is set to IGMP_OTHERMEMBER if we - * have heard a report from another member, or IGMP_IREPORTEDLAST if - * I sent the last report. - */ switch (igmp->igmp_type) { - case IGMP_MEMBERSHIP_QUERY: - ++V_igmpstat.igps_rcv_queries; - - if (ifp->if_flags & IFF_LOOPBACK) - break; - - if (igmp->igmp_code == 0) { - /* - * Old router. Remember that the querier on this - * interface is old, and set the timer to the value - * in RFC 1112. 
- */ + case IGMP_HOST_MEMBERSHIP_QUERY: + if (igmplen == IGMP_MINLEN) { + if (igmp->igmp_code == 0) + queryver = IGMP_VERSION_1; + else + queryver = IGMP_VERSION_2; + } else if (igmplen >= IGMP_V3_QUERY_MINLEN) { + queryver = IGMP_VERSION_3; + } else { + ++V_igmpstat.igps_rcv_tooshort; + m_freem(m); + return; + } - mtx_lock(&igmp_mtx); - rti = find_rti(ifp); - if (rti == NULL) { - mtx_unlock(&igmp_mtx); + switch (queryver) { + case IGMP_VERSION_1: + ++V_igmpstat.igps_rcv_v1v2_queries; + if (!V_igmp_v1enable) + break; + if (igmp_input_v1_query(ifp, ip) != 0) { m_freem(m); return; } - rti->rti_type = IGMP_V1_ROUTER; - rti->rti_time = 0; - mtx_unlock(&igmp_mtx); - - timer = IGMP_MAX_HOST_REPORT_DELAY * PR_FASTHZ; + break; - if (ip->ip_dst.s_addr != igmp_all_hosts_group || - igmp->igmp_group.s_addr != 0) { - ++V_igmpstat.igps_rcv_badqueries; + case IGMP_VERSION_2: + ++V_igmpstat.igps_rcv_v1v2_queries; + if (!V_igmp_v2enable) + break; + if (igmp_input_v2_query(ifp, ip, igmp) != 0) { m_freem(m); return; } - } else { - /* - * New router. Simply do the new validity check. - */ - - if (igmp->igmp_group.s_addr != 0 && - !IN_MULTICAST(ntohl(igmp->igmp_group.s_addr))) { - ++V_igmpstat.igps_rcv_badqueries; - m_freem(m); - return; - } - } + break; - /* - * - Start the timers in all of our membership records that - * the query applies to for the interface on which the - * query arrived excl. those that belong to the "all-hosts" - * group (224.0.0.1). - * - Restart any timer that is already running but has a - * value longer than the requested timeout. - * - Use the value specified in the query message as the - * maximum timeout. 
- */ - IN_MULTI_LOCK(); - IN_FIRST_MULTI(step, inm); - while (inm != NULL) { - if (inm->inm_ifp == ifp && - inm->inm_addr.s_addr != igmp_all_hosts_group && - (igmp->igmp_group.s_addr == 0 || - igmp->igmp_group.s_addr == inm->inm_addr.s_addr)) { - if (inm->inm_timer == 0 || - inm->inm_timer > timer) { - inm->inm_timer = - IGMP_RANDOM_DELAY(timer); - igmp_timers_are_running = 1; + case IGMP_VERSION_3: { + struct igmpv3 *igmpv3; + uint16_t igmpv3len; + uint16_t srclen; + int nsrc; + + ++V_igmpstat.igps_rcv_v3_queries; + igmpv3 = (struct igmpv3 *)igmp; + /* + * Validate length based on source count. + */ + nsrc = ntohs(igmpv3->igmp_numsrc); + srclen = sizeof(struct in_addr) * nsrc; + if (nsrc * sizeof(in_addr_t) > srclen) { + ++V_igmpstat.igps_rcv_tooshort; + return; + } + /* + * m_pullup() may modify m, so pullup in + * this scope. + */ + igmpv3len = iphlen + IGMP_V3_QUERY_MINLEN + + srclen; + if ((m->m_flags & M_EXT || + m->m_len < igmpv3len) && + (m = m_pullup(m, igmpv3len)) == NULL) { + ++V_igmpstat.igps_rcv_tooshort; + return; + } + igmpv3 = (struct igmpv3 *)(mtod(m, uint8_t *) + + iphlen); + if (igmp_input_v3_query(ifp, ip, igmpv3) != 0) { + m_freem(m); + return; } } - IN_NEXT_MULTI(step, inm); + break; } - IN_MULTI_UNLOCK(); break; - case IGMP_V1_MEMBERSHIP_REPORT: - case IGMP_V2_MEMBERSHIP_REPORT: - /* - * For fast leave to work, we have to know that we are the - * last person to send a report for this group. Reports can - * potentially get looped back if we are a multicast router, - * so discard reports sourced by me. 
- */ - IFP_TO_IA(ifp, ia); - if (ia != NULL && - ip->ip_src.s_addr == IA_SIN(ia)->sin_addr.s_addr) + case IGMP_v1_HOST_MEMBERSHIP_REPORT: + if (!V_igmp_v1enable) break; + if (igmp_input_v1_report(ifp, ip, igmp) != 0) { + m_freem(m); + return; + } + break; - ++V_igmpstat.igps_rcv_reports; - - if (ifp->if_flags & IFF_LOOPBACK) + case IGMP_v2_HOST_MEMBERSHIP_REPORT: + if (!V_igmp_v2enable) break; - - if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr))) { - ++V_igmpstat.igps_rcv_badreports; + if (!ip_checkrouteralert(m)) + ++V_igmpstat.igps_rcv_nora; + if (igmp_input_v2_report(ifp, ip, igmp) != 0) { m_freem(m); return; } + break; + case IGMP_v3_HOST_MEMBERSHIP_REPORT: /* - * KLUDGE: if the IP source address of the report has an - * unspecified (i.e., zero) subnet number, as is allowed for - * a booting host, replace it with the correct subnet number - * so that a process-level multicast routing daemon can - * determine which subnet it arrived from. This is necessary - * to compensate for the lack of any way for a process to - * determine the arrival interface of an incoming packet. + * Hosts do not need to process IGMPv3 membership reports, + * as report suppression is no longer required. */ - if ((ntohl(ip->ip_src.s_addr) & IN_CLASSA_NET) == 0) { - if (ia != NULL) - ip->ip_src.s_addr = htonl(ia->ia_subnet); - } + if (!ip_checkrouteralert(m)) + ++V_igmpstat.igps_rcv_nora; + break; - /* - * If we belong to the group being reported, stop our timer - * for that group. - */ - IN_MULTI_LOCK(); - IN_LOOKUP_MULTI(igmp->igmp_group, ifp, inm); - if (inm != NULL) { - inm->inm_timer = 0; - ++V_igmpstat.igps_rcv_ourreports; - inm->inm_state = IGMP_OTHERMEMBER; - } - IN_MULTI_UNLOCK(); + default: break; } @@ -379,163 +1591,2102 @@ igmp_input(register struct mbuf *m, int off) rip_input(m, off); } + +/* + * Fast timeout handler (global). + * VIMAGE: Timeout handlers are expected to service all vimages. 
+ */ void -igmp_joingroup(struct in_multi *inm) +igmp_fasttimo(void) +{ +#ifdef VIMAGE + VNET_ITERATOR_DECL(vnet_iter); + + VNET_LIST_RLOCK(); + VNET_FOREACH(vnet_iter) { + CURVNET_SET(vnet_iter); + INIT_VNET_INET(vnet_iter); + igmp_fasttimo_vnet(); + CURVNET_RESTORE(); + } + VNET_LIST_RUNLOCK(); +#else /* !VIMAGE */ + + igmp_fasttimo_vnet(); +#endif /* VIMAGE */ +} + +/* + * Fast timeout handler (per-vnet). + * Sends are shuffled off to a netisr to deal with Giant. + * + * VIMAGE: Assume caller has set up our curvnet. + */ +static void +igmp_fasttimo_vnet(void) +{ + struct ifqueue scq; /* State-change packets */ + struct ifqueue qrq; /* Query response packets */ + struct ifnet *ifp; + struct igmp_ifinfo *igi; + struct ifmultiaddr *ifma, *tifma; + struct in_multi *inm; + int loop, uri_fasthz; + + loop = 0; + uri_fasthz = 0; + + /* + * Quick check to see if any work needs to be done, in order to + * minimize the overhead of fasttimo processing. + * SMPng: XXX Unlocked reads. + */ + if (!V_current_state_timers_running && + !V_interface_timers_running && + !V_state_change_timers_running) + return; + + if (!mpsafe_igmp) + mtx_lock(&Giant); + + IN_MULTI_LOCK(); + IGMP_LOCK(); + + /* + * IGMPv3 General Query response timer processing. + */ + if (V_interface_timers_running) { + CTR1(KTR_IGMPV3, "%s: interface timers running", __func__); + + V_interface_timers_running = 0; + LIST_FOREACH(igi, &V_igi_head, igi_link) { + if (igi->igi_v3_timer == 0) { + /* Do nothing. */ + } else if (--igi->igi_v3_timer == 0) { + igmp_v3_dispatch_general_query(igi); + } else { + V_interface_timers_running = 1; + } + } + } + + if (!V_current_state_timers_running && + !V_state_change_timers_running) + goto out_locked; + + V_current_state_timers_running = 0; + V_state_change_timers_running = 0; + + CTR1(KTR_IGMPV3, "%s: state change timers running", __func__); + + /* + * IGMPv1/v2/v3 host report and state-change timer processing. + * Note: Processing a v3 group timer may remove a node. 
+ */ + LIST_FOREACH(igi, &V_igi_head, igi_link) { + ifp = igi->igi_ifp; + + if (igi->igi_version == IGMP_VERSION_3) { + loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0; + uri_fasthz = IGMP_RANDOM_DELAY(igi->igi_uri * + PR_FASTHZ); + + memset(&qrq, 0, sizeof(struct ifqueue)); + IFQ_SET_MAXLEN(&qrq, IGMP_MAX_G_GS_PACKETS); + + memset(&scq, 0, sizeof(struct ifqueue)); + IFQ_SET_MAXLEN(&scq, IGMP_MAX_STATE_CHANGE_PACKETS); + } + + IF_ADDR_LOCK(ifp); + TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, + tifma) { + if (ifma->ifma_addr->sa_family != AF_INET) + continue; + inm = (struct in_multi *)ifma->ifma_protospec; + switch (igi->igi_version) { + case IGMP_VERSION_1: + case IGMP_VERSION_2: + igmp_v1v2_process_group_timer(inm, + igi->igi_version); + break; + case IGMP_VERSION_3: + igmp_v3_process_group_timers(igi, &qrq, + &scq, inm, uri_fasthz); + break; + } + } + IF_ADDR_UNLOCK(ifp); + + if (igi->igi_version == IGMP_VERSION_3) { + struct in_multi *tinm; + + igmp_dispatch_queue(&qrq, 0, loop); + igmp_dispatch_queue(&scq, 0, loop); + + /* + * Free the in_multi reference(s) for this + * IGMP lifecycle. + */ + SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, + inm_nrele, tinm) { + SLIST_REMOVE_HEAD(&igi->igi_relinmhead, + inm_nrele); + inm_release_locked(inm); + } + } + } + +out_locked: + IGMP_UNLOCK(); + IN_MULTI_UNLOCK(); + if (!mpsafe_igmp) + mtx_unlock(&Giant); +} + +/* + * Update host report group timer for IGMPv1/v2. + * Will update the global pending timer flags. 
+ */ +static void +igmp_v1v2_process_group_timer(struct in_multi *inm, const int version) { + int report_timer_expired; IN_MULTI_LOCK_ASSERT(); + IGMP_LOCK_ASSERT(); - if (inm->inm_addr.s_addr == igmp_all_hosts_group - || inm->inm_ifp->if_flags & IFF_LOOPBACK) { - inm->inm_timer = 0; - inm->inm_state = IGMP_OTHERMEMBER; + if (inm->inm_timer == 0) { + report_timer_expired = 0; + } else if (--inm->inm_timer == 0) { + report_timer_expired = 1; } else { - mtx_lock(&igmp_mtx); - inm->inm_rti = find_rti(inm->inm_ifp); - mtx_unlock(&igmp_mtx); - if (inm->inm_rti != NULL) { - igmp_sendpkt(inm, inm->inm_rti->rti_type, 0); - inm->inm_timer = IGMP_RANDOM_DELAY( - IGMP_MAX_HOST_REPORT_DELAY*PR_FASTHZ); - inm->inm_state = IGMP_IREPORTEDLAST; - igmp_timers_are_running = 1; + V_current_state_timers_running = 1; + return; + } + + switch (inm->inm_state) { + case IGMP_NOT_MEMBER: + case IGMP_SILENT_MEMBER: + case IGMP_IDLE_MEMBER: + case IGMP_LAZY_MEMBER: + case IGMP_SLEEPING_MEMBER: + case IGMP_AWAKENING_MEMBER: + break; + case IGMP_REPORTING_MEMBER: + if (report_timer_expired) { + inm->inm_state = IGMP_IDLE_MEMBER; + (void)igmp_v1v2_queue_report(inm, + (version == IGMP_VERSION_2) ? + IGMP_v2_HOST_MEMBERSHIP_REPORT : + IGMP_v1_HOST_MEMBERSHIP_REPORT); } - /* XXX handling of failure case? */ + break; + case IGMP_G_QUERY_PENDING_MEMBER: + case IGMP_SG_QUERY_PENDING_MEMBER: + case IGMP_LEAVING_MEMBER: + break; } } -void -igmp_leavegroup(struct in_multi *inm) +/* + * Update a group's timers for IGMPv3. + * Will update the global pending timer flags. + * Note: Unlocked read from igi. 
+ */ +static void +igmp_v3_process_group_timers(struct igmp_ifinfo *igi, + struct ifqueue *qrq, struct ifqueue *scq, + struct in_multi *inm, const int uri_fasthz) +{ + int query_response_timer_expired; + int state_change_retransmit_timer_expired; + + IN_MULTI_LOCK_ASSERT(); + IGMP_LOCK_ASSERT(); + + query_response_timer_expired = 0; + state_change_retransmit_timer_expired = 0; + + /* + * During a transition from v1/v2 compatibility mode back to v3, + * a group record in REPORTING state may still have its group + * timer active. This is a no-op in this function; it is easier + * to deal with it here than to complicate the slow-timeout path. + */ + if (inm->inm_timer == 0) { + query_response_timer_expired = 0; + } else if (--inm->inm_timer == 0) { + query_response_timer_expired = 1; + } else { + V_current_state_timers_running = 1; + } + + if (inm->inm_sctimer == 0) { + state_change_retransmit_timer_expired = 0; + } else if (--inm->inm_sctimer == 0) { + state_change_retransmit_timer_expired = 1; + } else { + V_state_change_timers_running = 1; + } + + /* We are in fasttimo, so be quick about it. */ + if (!state_change_retransmit_timer_expired && + !query_response_timer_expired) + return; + + switch (inm->inm_state) { + case IGMP_NOT_MEMBER: + case IGMP_SILENT_MEMBER: + case IGMP_SLEEPING_MEMBER: + case IGMP_LAZY_MEMBER: + case IGMP_AWAKENING_MEMBER: + case IGMP_IDLE_MEMBER: + break; + case IGMP_G_QUERY_PENDING_MEMBER: + case IGMP_SG_QUERY_PENDING_MEMBER: + /* + * Respond to a previously pending Group-Specific + * or Group-and-Source-Specific query by enqueueing + * the appropriate Current-State report for + * immediate transmission. + */ + if (query_response_timer_expired) { + int retval; + + retval = igmp_v3_enqueue_group_record(qrq, inm, 0, 1, + (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER)); + CTR2(KTR_IGMPV3, "%s: enqueue record = %d", + __func__, retval); + inm->inm_state = IGMP_REPORTING_MEMBER; + /* XXX Clear recorded sources for next time. 
*/ + inm_clear_recorded(inm); + } + /* FALLTHROUGH */ + case IGMP_REPORTING_MEMBER: + case IGMP_LEAVING_MEMBER: + if (state_change_retransmit_timer_expired) { + /* + * State-change retransmission timer fired. + * If there are any further pending retransmissions, + * set the global pending state-change flag, and + * reset the timer. + */ + if (--inm->inm_scrv > 0) { + inm->inm_sctimer = uri_fasthz; + V_state_change_timers_running = 1; + } + /* + * Retransmit the previously computed state-change + * report. If there are no further pending + * retransmissions, the mbuf queue will be consumed. + * Update T0 state to T1 as we have now sent + * a state-change. + */ + (void)igmp_v3_merge_state_changes(inm, scq); + + inm_commit(inm); + CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__, + inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); + + /* + * If we are leaving the group for good, make sure + * we release IGMP's reference to it. + * This release must be deferred using a SLIST, + * as we are called from a loop which traverses + * the in_ifmultiaddr TAILQ. + */ + if (inm->inm_state == IGMP_LEAVING_MEMBER && + inm->inm_scrv == 0) { + inm->inm_state = IGMP_NOT_MEMBER; + SLIST_INSERT_HEAD(&igi->igi_relinmhead, + inm, inm_nrele); + } + } + break; + } +} + + +/* + * Suppress a group's pending response to a group or source/group query. + * + * Do NOT suppress state changes. This leads to IGMPv3 inconsistency. + * Do NOT update ST1/ST0 as this operation merely suppresses + * the currently pending group record. + * Do NOT suppress the response to a general query. It is possible but + * it would require adding another state or flag. 
+ */ +static void +igmp_v3_suppress_group_record(struct in_multi *inm) { IN_MULTI_LOCK_ASSERT(); - if (inm->inm_state == IGMP_IREPORTEDLAST && - inm->inm_addr.s_addr != igmp_all_hosts_group && - !(inm->inm_ifp->if_flags & IFF_LOOPBACK) && - inm->inm_rti->rti_type != IGMP_V1_ROUTER) - igmp_sendpkt(inm, IGMP_V2_LEAVE_GROUP, igmp_all_rtrs_group); + KASSERT(inm->inm_igi->igi_version == IGMP_VERSION_3, + ("%s: not IGMPv3 mode on link", __func__)); + + if (inm->inm_state != IGMP_G_QUERY_PENDING_MEMBER || + inm->inm_state != IGMP_SG_QUERY_PENDING_MEMBER) + return; + + if (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) + inm_clear_recorded(inm); + + inm->inm_timer = 0; + inm->inm_state = IGMP_REPORTING_MEMBER; } -void -igmp_fasttimo(void) +/* + * Switch to a different IGMP version on the given interface, + * as per Section 7.2.1. + */ +static void +igmp_set_version(struct igmp_ifinfo *igi, const int version) { - VNET_ITERATOR_DECL(vnet_iter); - register struct in_multi *inm; - struct in_multistep step; + + IGMP_LOCK_ASSERT(); + + CTR4(KTR_IGMPV3, "%s: switching to v%d on ifp %p(%s)", __func__, + version, igi->igi_ifp, igi->igi_ifp->if_xname); + + if (version == IGMP_VERSION_1 || version == IGMP_VERSION_2) { + int old_version_timer; + /* + * Compute the "Older Version Querier Present" timer as per + * Section 8.12. 
+ */ + old_version_timer = igi->igi_rv * igi->igi_qi + igi->igi_qri; + old_version_timer *= PR_SLOWHZ; + + if (version == IGMP_VERSION_1) { + igi->igi_v1_timer = old_version_timer; + igi->igi_v2_timer = 0; + } else if (version == IGMP_VERSION_2) { + igi->igi_v1_timer = 0; + igi->igi_v2_timer = old_version_timer; + } + } + + if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) { + if (igi->igi_version != IGMP_VERSION_2) { + igi->igi_version = IGMP_VERSION_2; + igmp_v3_cancel_link_timers(igi); + } + } else if (igi->igi_v1_timer > 0) { + if (igi->igi_version != IGMP_VERSION_1) { + igi->igi_version = IGMP_VERSION_1; + igmp_v3_cancel_link_timers(igi); + } + } +} + +/* + * Cancel pending IGMPv3 timers for the given link and all groups + * joined on it; state-change, general-query, and group-query timers. + */ +static void +igmp_v3_cancel_link_timers(struct igmp_ifinfo *igi) +{ + struct ifmultiaddr *ifma; + struct ifnet *ifp; + struct in_multi *inm; + + CTR3(KTR_IGMPV3, "%s: cancel v3 timers on ifp %p(%s)", __func__, + igi->igi_ifp, igi->igi_ifp->if_xname); + + IN_MULTI_LOCK_ASSERT(); + IGMP_LOCK_ASSERT(); /* - * Quick check to see if any work needs to be done, in order to - * minimize the overhead of fasttimo processing. + * Fast-track this potentially expensive operation + * by checking all the global 'timer pending' flags. 
*/ - - if (!igmp_timers_are_running) + if (!V_interface_timers_running && + !V_state_change_timers_running && + !V_current_state_timers_running) return; - IN_MULTI_LOCK(); - igmp_timers_are_running = 0; - VNET_LIST_RLOCK(); - VNET_FOREACH(vnet_iter) { - CURVNET_SET(vnet_iter); - INIT_VNET_INET(vnet_iter); - IN_FIRST_MULTI(step, inm); - while (inm != NULL) { - if (inm->inm_timer == 0) { - /* do nothing */ - } else if (--inm->inm_timer == 0) { - igmp_sendpkt(inm, inm->inm_rti->rti_type, 0); - inm->inm_state = IGMP_IREPORTEDLAST; - } else { - igmp_timers_are_running = 1; + igi->igi_v3_timer = 0; + + ifp = igi->igi_ifp; + + IF_ADDR_LOCK(ifp); + TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { + if (ifma->ifma_addr->sa_family != AF_INET) + continue; + inm = (struct in_multi *)ifma->ifma_protospec; + switch (inm->inm_state) { + case IGMP_NOT_MEMBER: + case IGMP_SILENT_MEMBER: + case IGMP_IDLE_MEMBER: + case IGMP_LAZY_MEMBER: + case IGMP_SLEEPING_MEMBER: + case IGMP_AWAKENING_MEMBER: + break; + case IGMP_LEAVING_MEMBER: + /* + * If we are leaving the group and switching + * IGMP version, we need to release the final + * reference held for issuing the INCLUDE {}. + * + * SMPNG: Must drop and re-acquire IF_ADDR_LOCK + * around inm_release_locked(), as it is not + * a recursive mutex. + */ + IF_ADDR_UNLOCK(ifp); + inm_release_locked(inm); + IF_ADDR_LOCK(ifp); + /* FALLTHROUGH */ + case IGMP_G_QUERY_PENDING_MEMBER: + case IGMP_SG_QUERY_PENDING_MEMBER: + inm_clear_recorded(inm); + /* FALLTHROUGH */ + case IGMP_REPORTING_MEMBER: + inm->inm_sctimer = 0; + inm->inm_timer = 0; + inm->inm_state = IGMP_REPORTING_MEMBER; + /* + * Free any pending IGMPv3 state-change records. + */ + _IF_DRAIN(&inm->inm_scq); + break; + } + } + IF_ADDR_UNLOCK(ifp); +} + +/* + * Update the Older Version Querier Present timers for a link. + * See Section 7.2.1 of RFC 3376. 
+ */ +static void +igmp_v1v2_process_querier_timers(struct igmp_ifinfo *igi) +{ + + IGMP_LOCK_ASSERT(); + + if (igi->igi_v1_timer == 0 && igi->igi_v2_timer == 0) { + /* + * IGMPv1 and IGMPv2 Querier Present timers expired. + * + * Revert to IGMPv3. + */ + if (igi->igi_version != IGMP_VERSION_3) { + CTR5(KTR_IGMPV3, + "%s: transition from v%d -> v%d on %p(%s)", + __func__, igi->igi_version, IGMP_VERSION_3, + igi->igi_ifp, igi->igi_ifp->if_xname); + igi->igi_version = IGMP_VERSION_3; + } + } else if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) { + /* + * IGMPv1 Querier Present timer expired, + * IGMPv2 Querier Present timer running. + * If IGMPv2 was disabled since last timeout, + * revert to IGMPv3. + * If IGMPv2 is enabled, revert to IGMPv2. + */ + if (!V_igmp_v2enable) { + CTR5(KTR_IGMPV3, + "%s: transition from v%d -> v%d on %p(%s)", + __func__, igi->igi_version, IGMP_VERSION_3, + igi->igi_ifp, igi->igi_ifp->if_xname); + igi->igi_v2_timer = 0; + igi->igi_version = IGMP_VERSION_3; + } else { + --igi->igi_v2_timer; + if (igi->igi_version != IGMP_VERSION_2) { + CTR5(KTR_IGMPV3, + "%s: transition from v%d -> v%d on %p(%s)", + __func__, igi->igi_version, IGMP_VERSION_2, + igi->igi_ifp, igi->igi_ifp->if_xname); + igi->igi_version = IGMP_VERSION_2; } - IN_NEXT_MULTI(step, inm); } - CURVNET_RESTORE(); + } else if (igi->igi_v1_timer > 0) { + /* + * IGMPv1 Querier Present timer running. + * Stop IGMPv2 timer if running. + * + * If IGMPv1 was disabled since last timeout, + * revert to IGMPv3. + * If IGMPv1 is enabled, reset IGMPv2 timer if running. 
+ */ + if (!V_igmp_v1enable) { + CTR5(KTR_IGMPV3, + "%s: transition from v%d -> v%d on %p(%s)", + __func__, igi->igi_version, IGMP_VERSION_3, + igi->igi_ifp, igi->igi_ifp->if_xname); + igi->igi_v1_timer = 0; + igi->igi_version = IGMP_VERSION_3; + } else { + --igi->igi_v1_timer; + } + if (igi->igi_v2_timer > 0) { + CTR3(KTR_IGMPV3, + "%s: cancel v2 timer on %p(%s)", + __func__, igi->igi_ifp, igi->igi_ifp->if_xname); + igi->igi_v2_timer = 0; + } } - VNET_LIST_RUNLOCK(); - IN_MULTI_UNLOCK(); } +/* + * Global slowtimo handler. + * VIMAGE: Timeout handlers are expected to service all vimages. + */ void igmp_slowtimo(void) { +#ifdef VIMAGE VNET_ITERATOR_DECL(vnet_iter); - struct router_info *rti; - IGMP_PRINTF("[igmp.c,_slowtimo] -- > entering \n"); - mtx_lock(&igmp_mtx); VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); INIT_VNET_INET(vnet_iter); - SLIST_FOREACH(rti, &V_router_info_head, rti_list) { - if (rti->rti_type == IGMP_V1_ROUTER) { - rti->rti_time++; - if (rti->rti_time >= IGMP_AGE_THRESHOLD) - rti->rti_type = IGMP_V2_ROUTER; - } - } + igmp_slowtimo_vnet(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); - mtx_unlock(&igmp_mtx); - IGMP_PRINTF("[igmp.c,_slowtimo] -- > exiting \n"); +#else /* !VIMAGE */ + igmp_slowtimo_vnet(); +#endif /* VIMAGE */ } +/* + * Per-vnet slowtimo handler. + */ static void -igmp_sendpkt(struct in_multi *inm, int type, unsigned long addr) +igmp_slowtimo_vnet(void) { - INIT_VNET_NET(curvnet); - INIT_VNET_INET(curvnet); - struct mbuf *m; - struct igmp *igmp; - struct ip *ip; - struct ip_moptions imo; + struct igmp_ifinfo *igi; + + IGMP_LOCK(); + + LIST_FOREACH(igi, &V_igi_head, igi_link) { + igmp_v1v2_process_querier_timers(igi); + } + + IGMP_UNLOCK(); +} + +/* + * Dispatch an IGMPv1/v2 host report or leave message. + * These are always small enough to fit inside a single mbuf. 
+ */ +static int +igmp_v1v2_queue_report(struct in_multi *inm, const int type) +{ + struct ifnet *ifp; + struct igmp *igmp; + struct ip *ip; + struct mbuf *m; IN_MULTI_LOCK_ASSERT(); + IGMP_LOCK_ASSERT(); + + ifp = inm->inm_ifp; + /* XXX are these needed ? */ + INIT_VNET_NET(ifp->if_vnet); + INIT_VNET_INET(ifp->if_vnet); MGETHDR(m, M_DONTWAIT, MT_DATA); if (m == NULL) - return; + return (ENOMEM); + MH_ALIGN(m, sizeof(struct ip) + sizeof(struct igmp)); + + m->m_pkthdr.len = sizeof(struct ip) + sizeof(struct igmp); - m->m_pkthdr.rcvif = V_loif; -#ifdef MAC - mac_netinet_igmp_send(inm->inm_ifp, m); -#endif - m->m_pkthdr.len = sizeof(struct ip) + IGMP_MINLEN; - MH_ALIGN(m, IGMP_MINLEN + sizeof(struct ip)); m->m_data += sizeof(struct ip); - m->m_len = IGMP_MINLEN; + m->m_len = sizeof(struct igmp); + igmp = mtod(m, struct igmp *); igmp->igmp_type = type; igmp->igmp_code = 0; igmp->igmp_group = inm->inm_addr; igmp->igmp_cksum = 0; - igmp->igmp_cksum = in_cksum(m, IGMP_MINLEN); + igmp->igmp_cksum = in_cksum(m, sizeof(struct igmp)); m->m_data -= sizeof(struct ip); m->m_len += sizeof(struct ip); + ip = mtod(m, struct ip *); ip->ip_tos = 0; - ip->ip_len = sizeof(struct ip) + IGMP_MINLEN; + ip->ip_len = sizeof(struct ip) + sizeof(struct igmp); ip->ip_off = 0; ip->ip_p = IPPROTO_IGMP; ip->ip_src.s_addr = INADDR_ANY; - ip->ip_dst.s_addr = addr ? addr : igmp->igmp_group.s_addr; - imo.imo_multicast_ifp = inm->inm_ifp; - imo.imo_multicast_ttl = 1; - imo.imo_multicast_vif = -1; + if (type == IGMP_HOST_LEAVE_MESSAGE) + ip->ip_dst.s_addr = htonl(INADDR_ALLRTRS_GROUP); + else + ip->ip_dst = inm->inm_addr; + + igmp_save_context(m, ifp); + + m->m_flags |= M_IGMPV2; + if (inm->inm_igi->igi_flags & IGIF_LOOPBACK) + m->m_flags |= M_IGMP_LOOP; + + CTR2(KTR_IGMPV3, "%s: netisr_dispatch(NETISR_IGMP, %p)", __func__, m); + netisr_dispatch(NETISR_IGMP, m); + + return (0); +} + +/* + * Process a state change from the upper layer for the given IPv4 group. 
+ * + * Each socket holds a reference on the in_multi in its own ip_moptions. + * The socket layer will have made the necessary updates to the group + * state; it is now up to IGMP to issue a state change report if there + * has been any change between T0 (when the last state-change was issued) + * and T1 (now). + * + * We use the IGMPv3 state machine at group level. The IGMP module + * however makes the decision as to which IGMP protocol version to speak. + * A state change *from* INCLUDE {} always means an initial join. + * A state change *to* INCLUDE {} always means a final leave. + * + * FUTURE: If IGIF_V3LITE is enabled for this interface, then we can + * save ourselves a bunch of work; any exclusive mode groups need not + * compute source filter lists. + * + * VIMAGE: curvnet should have been set by caller, as this routine + * is called from the socket option handlers. + */ +int +igmp_change_state(struct in_multi *inm) +{ + struct igmp_ifinfo *igi; + struct ifnet *ifp; + int error; + + IN_MULTI_LOCK_ASSERT(); + + error = 0; + + /* + * Try to detect if the upper layer just asked us to change state + * for an interface which has now gone away. + */ + KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__)); + ifp = inm->inm_ifma->ifma_ifp; + if (ifp != NULL) { + /* + * Sanity check that netinet's notion of ifp is the + * same as net's. + */ + KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__)); + } + + IGMP_LOCK(); + + igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; + KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp)); + + /* + * If we detect a state transition to or from MCAST_UNDEFINED + * for this group, then we are starting or finishing an IGMP + * life cycle for this group. 
+ */ + if (inm->inm_st[1].iss_fmode != inm->inm_st[0].iss_fmode) { + CTR3(KTR_IGMPV3, "%s: inm transition %d -> %d", __func__, + inm->inm_st[0].iss_fmode, inm->inm_st[1].iss_fmode); + if (inm->inm_st[0].iss_fmode == MCAST_UNDEFINED) { + CTR1(KTR_IGMPV3, "%s: initial join", __func__); + error = igmp_initial_join(inm, igi); + goto out_locked; + } else if (inm->inm_st[1].iss_fmode == MCAST_UNDEFINED) { + CTR1(KTR_IGMPV3, "%s: final leave", __func__); + igmp_final_leave(inm, igi); + goto out_locked; + } + } else { + CTR1(KTR_IGMPV3, "%s: filter set change", __func__); + } + + error = igmp_handle_state_change(inm, igi); + +out_locked: + IGMP_UNLOCK(); + return (error); +} + +/* + * Perform the initial join for an IGMP group. + * + * When joining a group: + * If the group should have its IGMP traffic suppressed, do nothing. + * IGMPv1 starts sending IGMPv1 host membership reports. + * IGMPv2 starts sending IGMPv2 host membership reports. + * IGMPv3 will schedule an IGMPv3 state-change report containing the + * initial state of the membership. + */ +static int +igmp_initial_join(struct in_multi *inm, struct igmp_ifinfo *igi) +{ + struct ifnet *ifp; + struct ifqueue *ifq; + int error, retval, syncstates; + + CTR4(KTR_IGMPV3, "%s: initial join %s on ifp %p(%s)", + __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp, + inm->inm_ifp->if_xname); + + error = 0; + syncstates = 1; + + ifp = inm->inm_ifp; + + IN_MULTI_LOCK_ASSERT(); + IGMP_LOCK_ASSERT(); + + KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__)); + + /* + * Groups joined on loopback or marked as 'not reported', + * e.g. 224.0.0.1, enter the IGMP_SILENT_MEMBER state and + * are never reported in any IGMP protocol exchanges. + * All other groups enter the appropriate IGMP state machine + * for the version in use on this link. + * A link marked as IGIF_SILENT causes IGMP to be completely + * disabled for the link. 
+ */ + if ((ifp->if_flags & IFF_LOOPBACK) || + (igi->igi_flags & IGIF_SILENT) || + !igmp_isgroupreported(inm->inm_addr)) { + CTR1(KTR_IGMPV3, +"%s: not kicking state machine for silent group", __func__); + inm->inm_state = IGMP_SILENT_MEMBER; + inm->inm_timer = 0; + } else { + /* + * Deal with overlapping in_multi lifecycle. + * If this group was LEAVING, then make sure + * we drop the reference we picked up to keep the + * group around for the final INCLUDE {} enqueue. + */ + if (igi->igi_version == IGMP_VERSION_3 && + inm->inm_state == IGMP_LEAVING_MEMBER) + inm_release_locked(inm); + + inm->inm_state = IGMP_REPORTING_MEMBER; + + switch (igi->igi_version) { + case IGMP_VERSION_1: + case IGMP_VERSION_2: + inm->inm_state = IGMP_IDLE_MEMBER; + error = igmp_v1v2_queue_report(inm, + (igi->igi_version == IGMP_VERSION_2) ? + IGMP_v2_HOST_MEMBERSHIP_REPORT : + IGMP_v1_HOST_MEMBERSHIP_REPORT); + if (error == 0) { + inm->inm_timer = IGMP_RANDOM_DELAY( + IGMP_V1V2_MAX_RI * PR_FASTHZ); + V_current_state_timers_running = 1; + } + break; + + case IGMP_VERSION_3: + /* + * Defer update of T0 to T1, until the first copy + * of the state change has been transmitted. + */ + syncstates = 0; + + /* + * Immediately enqueue a State-Change Report for + * this interface, freeing any previous reports. + * Don't kick the timers if there is nothing to do, + * or if an error occurred. + */ + ifq = &inm->inm_scq; + _IF_DRAIN(ifq); + retval = igmp_v3_enqueue_group_record(ifq, inm, 1, + 0, 0); + CTR2(KTR_IGMPV3, "%s: enqueue record = %d", + __func__, retval); + if (retval <= 0) { + error = retval * -1; + break; + } + + /* + * Schedule transmission of pending state-change + * report up to RV times for this link. The timer + * will fire at the next igmp_fasttimo (~200ms), + * giving us an opportunity to merge the reports. 
+ */ + if (igi->igi_flags & IGIF_LOOPBACK) { + inm->inm_scrv = 1; + } else { + KASSERT(igi->igi_rv > 1, + ("%s: invalid robustness %d", __func__, + igi->igi_rv)); + inm->inm_scrv = igi->igi_rv; + } + inm->inm_sctimer = 1; + V_state_change_timers_running = 1; + + error = 0; + break; + } + } + + /* + * Only update the T0 state if state change is atomic, + * i.e. we don't need to wait for a timer to fire before we + * can consider the state change to have been communicated. + */ + if (syncstates) { + inm_commit(inm); + CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__, + inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); + } + + return (error); +} + +/* + * Issue an intermediate state change during the IGMP life-cycle. + */ +static int +igmp_handle_state_change(struct in_multi *inm, struct igmp_ifinfo *igi) +{ + struct ifnet *ifp; + int retval; + + CTR4(KTR_IGMPV3, "%s: state change for %s on ifp %p(%s)", + __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp, + inm->inm_ifp->if_xname); + + ifp = inm->inm_ifp; + + IN_MULTI_LOCK_ASSERT(); + IGMP_LOCK_ASSERT(); + + KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__)); + + if ((ifp->if_flags & IFF_LOOPBACK) || + (igi->igi_flags & IGIF_SILENT) || + !igmp_isgroupreported(inm->inm_addr) || + (igi->igi_version != IGMP_VERSION_3)) { + if (!igmp_isgroupreported(inm->inm_addr)) { + CTR1(KTR_IGMPV3, +"%s: not kicking state machine for silent group", __func__); + } + CTR1(KTR_IGMPV3, "%s: nothing to do", __func__); + inm_commit(inm); + CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__, + inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); + return (0); + } + + _IF_DRAIN(&inm->inm_scq); + + retval = igmp_v3_enqueue_group_record(&inm->inm_scq, inm, 1, 0, 0); + CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval); + if (retval <= 0) + return (-retval); + + /* + * If record(s) were enqueued, start the state-change + * report timer for this group. + */ + inm->inm_scrv = ((igi->igi_flags & IGIF_LOOPBACK) ? 
1 : igi->igi_rv); + inm->inm_sctimer = 1; + V_state_change_timers_running = 1; + + return (0); +} + +/* + * Perform the final leave for an IGMP group. + * + * When leaving a group: + * IGMPv1 does nothing. + * IGMPv2 sends a host leave message, if and only if we are the reporter. + * IGMPv3 enqueues a state-change report containing a transition + * to INCLUDE {} for immediate transmission. + */ +static void +igmp_final_leave(struct in_multi *inm, struct igmp_ifinfo *igi) +{ + int syncstates; + + syncstates = 1; + + CTR4(KTR_IGMPV3, "%s: final leave %s on ifp %p(%s)", + __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp, + inm->inm_ifp->if_xname); + + IN_MULTI_LOCK_ASSERT(); + IGMP_LOCK_ASSERT(); + + switch (inm->inm_state) { + case IGMP_NOT_MEMBER: + case IGMP_SILENT_MEMBER: + case IGMP_LEAVING_MEMBER: + /* Already leaving or left; do nothing. */ + CTR1(KTR_IGMPV3, +"%s: not kicking state machine for silent group", __func__); + break; + case IGMP_REPORTING_MEMBER: + case IGMP_IDLE_MEMBER: + case IGMP_G_QUERY_PENDING_MEMBER: + case IGMP_SG_QUERY_PENDING_MEMBER: + if (igi->igi_version == IGMP_VERSION_2) { +#ifdef INVARIANTS + if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER || + inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) + panic("%s: IGMPv3 state reached, not IGMPv3 mode", + __func__); +#endif + igmp_v1v2_queue_report(inm, IGMP_HOST_LEAVE_MESSAGE); + inm->inm_state = IGMP_NOT_MEMBER; + } else if (igi->igi_version == IGMP_VERSION_3) { + /* + * Stop group timer and all pending reports. + * Immediately enqueue a state-change report + * TO_IN {} to be sent on the next fast timeout, + * giving us an opportunity to merge reports. 
+ */ + _IF_DRAIN(&inm->inm_scq); + inm->inm_timer = 0; + if (igi->igi_flags & IGIF_LOOPBACK) { + inm->inm_scrv = 1; + } else { + inm->inm_scrv = igi->igi_rv; + } + CTR4(KTR_IGMPV3, "%s: Leaving %s/%s with %d " + "pending retransmissions.", __func__, + inet_ntoa(inm->inm_addr), + inm->inm_ifp->if_xname, inm->inm_scrv); + if (inm->inm_scrv == 0) { + inm->inm_state = IGMP_NOT_MEMBER; + inm->inm_sctimer = 0; + } else { + int retval; + + inm_acquire_locked(inm); + + retval = igmp_v3_enqueue_group_record( + &inm->inm_scq, inm, 1, 0, 0); + KASSERT(retval != 0, + ("%s: enqueue record = %d", __func__, + retval)); + + inm->inm_state = IGMP_LEAVING_MEMBER; + inm->inm_sctimer = 1; + V_state_change_timers_running = 1; + syncstates = 0; + } + break; + } + break; + case IGMP_LAZY_MEMBER: + case IGMP_SLEEPING_MEMBER: + case IGMP_AWAKENING_MEMBER: + /* Our reports are suppressed; do nothing. */ + break; + } + + if (syncstates) { + inm_commit(inm); + CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__, + inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); + inm->inm_st[1].iss_fmode = MCAST_UNDEFINED; + CTR3(KTR_IGMPV3, "%s: T1 now MCAST_UNDEFINED for %s/%s", + __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); + } +} + +/* + * Enqueue an IGMPv3 group record to the given output queue. + * + * XXX This function could do with having the allocation code + * split out, and the multiple-tree-walks coalesced into a single + * routine as has been done in igmp_v3_enqueue_filter_change(). + * + * If is_state_change is zero, a current-state record is appended. + * If is_state_change is non-zero, a state-change report is appended. + * + * If is_group_query is non-zero, an mbuf packet chain is allocated. + * If is_group_query is zero, and if there is a packet with free space + * at the tail of the queue, it will be appended to providing there + * is enough free space. + * Otherwise a new mbuf packet chain is allocated. 
+ * + * If is_source_query is non-zero, each source is checked to see if + * it was recorded for a Group-Source query, and will be omitted if + * it is not both in-mode and recorded. + * + * The function will attempt to allocate leading space in the packet + * for the IP/IGMP header to be prepended without fragmenting the chain. + * + * If successful the size of all data appended to the queue is returned, + * otherwise an error code less than zero is returned, or zero if + * no record(s) were appended. + */ +static int +igmp_v3_enqueue_group_record(struct ifqueue *ifq, struct in_multi *inm, + const int is_state_change, const int is_group_query, + const int is_source_query) +{ + struct igmp_grouprec ig; + struct igmp_grouprec *pig; + struct ifnet *ifp; + struct ip_msource *ims, *nims; + struct mbuf *m0, *m, *md; + int error, is_filter_list_change; + int minrec0len, m0srcs, msrcs, nbytes, off; + int record_has_sources; + int now; + int type; + in_addr_t naddr; + uint8_t mode; + + IN_MULTI_LOCK_ASSERT(); + + error = 0; + ifp = inm->inm_ifp; + is_filter_list_change = 0; + m = NULL; + m0 = NULL; + m0srcs = 0; + msrcs = 0; + nbytes = 0; + nims = NULL; + record_has_sources = 1; + pig = NULL; + type = IGMP_DO_NOTHING; + mode = inm->inm_st[1].iss_fmode; + + /* + * If we did not transition out of ASM mode during t0->t1, + * and there are no source nodes to process, we can skip + * the generation of source records. + */ + if (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0 && + inm->inm_nsrc == 0) + record_has_sources = 0; + + if (is_state_change) { + /* + * Queue a state change record. + * If the mode did not change, and there are non-ASM + * listeners or source filters present, + * we potentially need to issue two records for the group. + * If we are transitioning to MCAST_UNDEFINED, we need + * not send any sources. + * If there are ASM listeners, and there was no filter + * mode transition of any kind, do nothing. 
+ */ + if (mode != inm->inm_st[0].iss_fmode) { + if (mode == MCAST_EXCLUDE) { + CTR1(KTR_IGMPV3, "%s: change to EXCLUDE", + __func__); + type = IGMP_CHANGE_TO_EXCLUDE_MODE; + } else { + CTR1(KTR_IGMPV3, "%s: change to INCLUDE", + __func__); + type = IGMP_CHANGE_TO_INCLUDE_MODE; + if (mode == MCAST_UNDEFINED) + record_has_sources = 0; + } + } else { + if (record_has_sources) { + is_filter_list_change = 1; + } else { + type = IGMP_DO_NOTHING; + } + } + } else { + /* + * Queue a current state record. + */ + if (mode == MCAST_EXCLUDE) { + type = IGMP_MODE_IS_EXCLUDE; + } else if (mode == MCAST_INCLUDE) { + type = IGMP_MODE_IS_INCLUDE; + KASSERT(inm->inm_st[1].iss_asm == 0, + ("%s: inm %p is INCLUDE but ASM count is %d", + __func__, inm, inm->inm_st[1].iss_asm)); + } + } + /* - * Request loopback of the report if we are acting as a multicast - * router, so that the process-level routing daemon can hear it. + * Generate the filter list changes using a separate function. */ + if (is_filter_list_change) + return (igmp_v3_enqueue_filter_change(ifq, inm)); + + if (type == IGMP_DO_NOTHING) { + CTR3(KTR_IGMPV3, "%s: nothing to do for %s/%s", + __func__, inet_ntoa(inm->inm_addr), + inm->inm_ifp->if_xname); + return (0); + } + + /* + * If any sources are present, we must be able to fit at least + * one in the trailing space of the tail packet's mbuf, + * ideally more. + */ + minrec0len = sizeof(struct igmp_grouprec); + if (record_has_sources) + minrec0len += sizeof(in_addr_t); + + CTR4(KTR_IGMPV3, "%s: queueing %s for %s/%s", __func__, + igmp_rec_type_to_str(type), inet_ntoa(inm->inm_addr), + inm->inm_ifp->if_xname); + + /* + * Check if we have a packet in the tail of the queue for this + * group into which the first group record for this group will fit. + * Otherwise allocate a new packet. + * Always allocate leading space for IP+RA_OPT+IGMP+REPORT. + * Note: Group records for G/GSR query responses MUST be sent + * in their own packet. 
+ */ + m0 = ifq->ifq_tail; + if (!is_group_query && + m0 != NULL && + (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= IGMP_V3_REPORT_MAXRECS) && + (m0->m_pkthdr.len + minrec0len) < + (ifp->if_mtu - IGMP_LEADINGSPACE)) { + m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - + sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); + m = m0; + CTR1(KTR_IGMPV3, "%s: use existing packet", __func__); + } else { + if (_IF_QFULL(ifq)) { + CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__); + return (-ENOMEM); + } + m = NULL; + m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE - + sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); + if (!is_state_change && !is_group_query) + m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR); + if (m == NULL) { + m = m_gethdr(M_DONTWAIT, MT_DATA); + if (m) + MH_ALIGN(m, IGMP_LEADINGSPACE); + } + if (m == NULL) + return (-ENOMEM); + m->m_data += IGMP_LEADINGSPACE; + + igmp_save_context(m, ifp); + + CTR1(KTR_IGMPV3, "%s: allocated first packet", __func__); + } + + /* + * Append group record. + * If we have sources, we don't know how many yet. + */ + ig.ig_type = type; + ig.ig_datalen = 0; + ig.ig_numsrc = 0; + ig.ig_group = inm->inm_addr; + if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) { + if (m != m0) + m_freem(m); + CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__); + return (-ENOMEM); + } + nbytes += sizeof(struct igmp_grouprec); + + /* + * Append as many sources as will fit in the first packet. + * If we are appending to a new packet, the chain allocation + * may potentially use clusters; use m_getptr() in this case. + * If we are appending to an existing packet, we need to obtain + * a pointer to the group record after m_append(), in case a new + * mbuf was allocated. + * Only append sources which are in-mode at t1. If we are + * transitioning to MCAST_UNDEFINED state on the group, do not + * include source entries. + * Only report recorded sources in our filter set when responding + * to a group-source query. 
+ */ + if (record_has_sources) { + if (m == m0) { + md = m_last(m); + pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + + md->m_len - nbytes); + } else { + md = m_getptr(m, 0, &off); + pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + + off); + } + msrcs = 0; + RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, nims) { + CTR2(KTR_IGMPV3, "%s: visit node %s", __func__, + inet_ntoa_haddr(ims->ims_haddr)); + now = ims_get_mode(inm, ims, 1); + CTR2(KTR_IGMPV3, "%s: node is %d", __func__, now); + if ((now != mode) || + (now == mode && mode == MCAST_UNDEFINED)) { + CTR1(KTR_IGMPV3, "%s: skip node", __func__); + continue; + } + if (is_source_query && ims->ims_stp == 0) { + CTR1(KTR_IGMPV3, "%s: skip unrecorded node", + __func__); + continue; + } + CTR1(KTR_IGMPV3, "%s: append node", __func__); + naddr = htonl(ims->ims_haddr); + if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) { + if (m != m0) + m_freem(m); + CTR1(KTR_IGMPV3, "%s: m_append() failed.", + __func__); + return (-ENOMEM); + } + ++msrcs; + if (msrcs == m0srcs) + break; + } + CTR2(KTR_IGMPV3, "%s: msrcs is %d this packet", __func__, + msrcs); + pig->ig_numsrc = htons(msrcs); + /* Count the appended source addresses exactly once. */ + nbytes += (msrcs * sizeof(in_addr_t)); + } + + if (is_source_query && msrcs == 0) { + CTR1(KTR_IGMPV3, "%s: no recorded sources to report", __func__); + if (m != m0) + m_freem(m); + return (0); + } + + /* + * We are good to go with first packet. + */ + if (m != m0) { + CTR1(KTR_IGMPV3, "%s: enqueueing first packet", __func__); + m->m_pkthdr.PH_vt.vt_nrecs = 1; + _IF_ENQUEUE(ifq, m); + } else + m->m_pkthdr.PH_vt.vt_nrecs++; + + /* + * No further work needed if no source list in packet(s). + */ + if (!record_has_sources) + return (nbytes); + + /* + * Whilst sources remain to be announced, we need to allocate + * a new packet and fill out as many sources as will fit. + * Always try for a cluster first. 
+ */ + while (nims != NULL) { + if (_IF_QFULL(ifq)) { + CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__); + return (-ENOMEM); + } + m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR); + if (m == NULL) { + m = m_gethdr(M_DONTWAIT, MT_DATA); + if (m) + MH_ALIGN(m, IGMP_LEADINGSPACE); + } + if (m == NULL) + return (-ENOMEM); + igmp_save_context(m, ifp); + m->m_data += IGMP_LEADINGSPACE; + md = m_getptr(m, 0, &off); + pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off); + CTR1(KTR_IGMPV3, "%s: allocated next packet", __func__); + + if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) { + if (m != m0) + m_freem(m); + CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__); + return (-ENOMEM); + } + m->m_pkthdr.PH_vt.vt_nrecs = 1; + nbytes += sizeof(struct igmp_grouprec); + + m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE - + sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); + + msrcs = 0; + RB_FOREACH_FROM(ims, ip_msource_tree, nims) { + CTR2(KTR_IGMPV3, "%s: visit node %s", __func__, + inet_ntoa_haddr(ims->ims_haddr)); + now = ims_get_mode(inm, ims, 1); + if ((now != mode) || + (now == mode && mode == MCAST_UNDEFINED)) { + CTR1(KTR_IGMPV3, "%s: skip node", __func__); + continue; + } + if (is_source_query && ims->ims_stp == 0) { + CTR1(KTR_IGMPV3, "%s: skip unrecorded node", + __func__); + continue; + } + CTR1(KTR_IGMPV3, "%s: append node", __func__); + naddr = htonl(ims->ims_haddr); + if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) { + if (m != m0) + m_freem(m); + CTR1(KTR_IGMPV3, "%s: m_append() failed.", + __func__); + return (-ENOMEM); + } + ++msrcs; + if (msrcs == m0srcs) + break; + } + pig->ig_numsrc = htons(msrcs); + nbytes += (msrcs * sizeof(in_addr_t)); + + CTR1(KTR_IGMPV3, "%s: enqueueing next packet", __func__); + _IF_ENQUEUE(ifq, m); + } + + return (nbytes); +} + +/* + * Type used to mark record pass completion. + * We exploit the fact we can cast to this easily from the + * current filter modes on each ip_msource node. 
+ */ +typedef enum { + REC_NONE = 0x00, /* MCAST_UNDEFINED */ + REC_ALLOW = 0x01, /* MCAST_INCLUDE */ + REC_BLOCK = 0x02, /* MCAST_EXCLUDE */ + REC_FULL = REC_ALLOW | REC_BLOCK +} rectype_t; + +/* + * Enqueue an IGMPv3 filter list change to the given output queue. + * + * Source list filter state is held in an RB-tree. When the filter list + * for a group is changed without changing its mode, we need to compute + * the deltas between T0 and T1 for each source in the filter set, + * and enqueue the appropriate ALLOW_NEW/BLOCK_OLD records. + * + * As we may potentially queue two record types, and the entire R-B tree + * needs to be walked at once, we break this out into its own function + * so we can generate a tightly packed queue of packets. + * + * XXX This could be written to only use one tree walk, although that makes + * serializing into the mbuf chains a bit harder. For now we do two walks + * which makes things easier on us, and it may or may not be harder on + * the L2 cache. + * + * If successful the size of all data appended to the queue is returned, + * otherwise an error code less than zero is returned, or zero if + * no record(s) were appended. 
+ */ +static int +igmp_v3_enqueue_filter_change(struct ifqueue *ifq, struct in_multi *inm) +{ + static const int MINRECLEN = + sizeof(struct igmp_grouprec) + sizeof(in_addr_t); + struct ifnet *ifp; + struct igmp_grouprec ig; + struct igmp_grouprec *pig; + struct ip_msource *ims, *nims; + struct mbuf *m, *m0, *md; + in_addr_t naddr; + int m0srcs, nbytes, off, rsrcs, schanged; + int nallow, nblock; + uint8_t mode, now, then; + rectype_t crt, drt, nrt; + + IN_MULTI_LOCK_ASSERT(); + + if (inm->inm_nsrc == 0 || + (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0)) + return (0); + + ifp = inm->inm_ifp; /* interface */ + mode = inm->inm_st[1].iss_fmode; /* filter mode at t1 */ + crt = REC_NONE; /* current group record type */ + drt = REC_NONE; /* mask of completed group record types */ + nrt = REC_NONE; /* record type for current node */ + m0srcs = 0; /* # source which will fit in current mbuf chain */ + nbytes = 0; /* # of bytes appended to group's state-change queue */ + rsrcs = 0; /* # sources encoded in current record */ + schanged = 0; /* # nodes encoded in overall filter change */ + nallow = 0; /* # of source entries in ALLOW_NEW */ + nblock = 0; /* # of source entries in BLOCK_OLD */ + nims = NULL; /* next tree node pointer */ + + /* + * For each possible filter record mode. + * The first kind of source we encounter tells us which + * is the first kind of record we start appending. + * If a node transitioned to UNDEFINED at t1, its mode is treated + * as the inverse of the group's filter mode. 
+ */ + while (drt != REC_FULL) { + do { + m0 = ifq->ifq_tail; + if (m0 != NULL && + (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= + IGMP_V3_REPORT_MAXRECS) && + (m0->m_pkthdr.len + MINRECLEN) < + (ifp->if_mtu - IGMP_LEADINGSPACE)) { + m = m0; + m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - + sizeof(struct igmp_grouprec)) / + sizeof(in_addr_t); + CTR1(KTR_IGMPV3, + "%s: use previous packet", __func__); + } else { + m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR); + if (m == NULL) { + m = m_gethdr(M_DONTWAIT, MT_DATA); + if (m) + MH_ALIGN(m, IGMP_LEADINGSPACE); + } + if (m == NULL) { + CTR1(KTR_IGMPV3, + "%s: m_get*() failed", __func__); + return (-ENOMEM); + } + m->m_pkthdr.PH_vt.vt_nrecs = 0; + igmp_save_context(m, ifp); + m->m_data += IGMP_LEADINGSPACE; + m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE - + sizeof(struct igmp_grouprec)) / + sizeof(in_addr_t); + CTR1(KTR_IGMPV3, + "%s: allocated new packet", __func__); + } + /* + * Append the IGMP group record header to the + * current packet's data area. + * Recalculate pointer to free space for next + * group record, in case m_append() allocated + * a new mbuf or cluster. + */ + memset(&ig, 0, sizeof(ig)); + ig.ig_group = inm->inm_addr; + if (!m_append(m, sizeof(ig), (void *)&ig)) { + if (m != m0) + m_freem(m); + CTR1(KTR_IGMPV3, + "%s: m_append() failed", __func__); + return (-ENOMEM); + } + nbytes += sizeof(struct igmp_grouprec); + if (m == m0) { + md = m_last(m); + pig = (struct igmp_grouprec *)(mtod(md, + uint8_t *) + md->m_len - nbytes); + } else { + md = m_getptr(m, 0, &off); + pig = (struct igmp_grouprec *)(mtod(md, + uint8_t *) + off); + } + /* + * Begin walking the tree for this record type + * pass, or continue from where we left off + * previously if we had to allocate a new packet. + * Only report deltas in-mode at t1. + * We need not report included sources as allowed + * if we are in inclusive mode on the group, + * however the converse is not true. 
+ */ + rsrcs = 0; + if (nims == NULL) + nims = RB_MIN(ip_msource_tree, &inm->inm_srcs); + RB_FOREACH_FROM(ims, ip_msource_tree, nims) { + CTR2(KTR_IGMPV3, "%s: visit node %s", + __func__, inet_ntoa_haddr(ims->ims_haddr)); + now = ims_get_mode(inm, ims, 1); + then = ims_get_mode(inm, ims, 0); + CTR3(KTR_IGMPV3, "%s: mode: t0 %d, t1 %d", + __func__, then, now); + if (now == then) { + CTR1(KTR_IGMPV3, + "%s: skip unchanged", __func__); + continue; + } + if (mode == MCAST_EXCLUDE && + now == MCAST_INCLUDE) { + CTR1(KTR_IGMPV3, + "%s: skip IN src on EX group", + __func__); + continue; + } + nrt = (rectype_t)now; + if (nrt == REC_NONE) + nrt = (rectype_t)(~mode & REC_FULL); + if (schanged++ == 0) { + crt = nrt; + } else if (crt != nrt) + continue; + naddr = htonl(ims->ims_haddr); + if (!m_append(m, sizeof(in_addr_t), + (void *)&naddr)) { + if (m != m0) + m_freem(m); + CTR1(KTR_IGMPV3, + "%s: m_append() failed", __func__); + return (-ENOMEM); + } + nallow += !!(crt == REC_ALLOW); + nblock += !!(crt == REC_BLOCK); + if (++rsrcs == m0srcs) + break; + } + /* + * If we did not append any tree nodes on this + * pass, back out of allocations. + */ + if (rsrcs == 0) { + nbytes -= sizeof(struct igmp_grouprec); + if (m != m0) { + CTR1(KTR_IGMPV3, + "%s: m_free(m)", __func__); + m_freem(m); + } else { + CTR1(KTR_IGMPV3, + "%s: m_adj(m, -ig)", __func__); + m_adj(m, -((int)sizeof( + struct igmp_grouprec))); + } + continue; + } + nbytes += (rsrcs * sizeof(in_addr_t)); + if (crt == REC_ALLOW) + pig->ig_type = IGMP_ALLOW_NEW_SOURCES; + else if (crt == REC_BLOCK) + pig->ig_type = IGMP_BLOCK_OLD_SOURCES; + pig->ig_numsrc = htons(rsrcs); + /* + * Count the new group record, and enqueue this + * packet if it wasn't already queued. 
+ */ + m->m_pkthdr.PH_vt.vt_nrecs++; + if (m != m0) + _IF_ENQUEUE(ifq, m); + } while (nims != NULL); + drt |= crt; + crt = (~crt & REC_FULL); + } + + CTR3(KTR_IGMPV3, "%s: queued %d ALLOW_NEW, %d BLOCK_OLD", __func__, + nallow, nblock); + + return (nbytes); +} + +static int +igmp_v3_merge_state_changes(struct in_multi *inm, struct ifqueue *ifscq) +{ + struct ifqueue *gq; + struct mbuf *m; /* pending state-change */ + struct mbuf *m0; /* copy of pending state-change */ + struct mbuf *mt; /* last state-change in packet */ + int docopy, domerge; + u_int recslen; + + docopy = 0; + domerge = 0; + recslen = 0; + + IN_MULTI_LOCK_ASSERT(); + IGMP_LOCK_ASSERT(); + + /* + * If there are further pending retransmissions, make a writable + * copy of each queued state-change message before merging. + */ + if (inm->inm_scrv > 0) + docopy = 1; + + gq = &inm->inm_scq; +#ifdef KTR + if (gq->ifq_head == NULL) { + CTR2(KTR_IGMPV3, "%s: WARNING: queue for inm %p is empty", + __func__, inm); + } +#endif + + m = gq->ifq_head; + while (m != NULL) { + /* + * Only merge the report into the current packet if + * there is sufficient space to do so; an IGMPv3 report + * packet may only contain 65,535 group records. + * Always use a simple mbuf chain concatentation to do this, + * as large state changes for single groups may have + * allocated clusters. 
+ */ + domerge = 0; + mt = ifscq->ifq_tail; + if (mt != NULL) { + recslen = m_length(m, NULL); + + if ((mt->m_pkthdr.PH_vt.vt_nrecs + + m->m_pkthdr.PH_vt.vt_nrecs <= + IGMP_V3_REPORT_MAXRECS) && + (mt->m_pkthdr.len + recslen <= + (inm->inm_ifp->if_mtu - IGMP_LEADINGSPACE))) + domerge = 1; + } + + if (!domerge && _IF_QFULL(gq)) { + CTR2(KTR_IGMPV3, + "%s: outbound queue full, skipping whole packet %p", + __func__, m); + mt = m->m_nextpkt; + if (!docopy) + m_freem(m); + m = mt; + continue; + } + + if (!docopy) { + CTR2(KTR_IGMPV3, "%s: dequeueing %p", __func__, m); + _IF_DEQUEUE(gq, m0); + m = m0->m_nextpkt; + } else { + CTR2(KTR_IGMPV3, "%s: copying %p", __func__, m); + m0 = m_dup(m, M_NOWAIT); + if (m0 == NULL) + return (ENOMEM); + m0->m_nextpkt = NULL; + m = m->m_nextpkt; + } + + if (!domerge) { + CTR3(KTR_IGMPV3, "%s: queueing %p to ifscq %p)", + __func__, m0, ifscq); + _IF_ENQUEUE(ifscq, m0); + } else { + struct mbuf *mtl; /* last mbuf of packet mt */ + + CTR3(KTR_IGMPV3, "%s: merging %p with ifscq tail %p)", + __func__, m0, mt); + + mtl = m_last(mt); + m0->m_flags &= ~M_PKTHDR; + mt->m_pkthdr.len += recslen; + mt->m_pkthdr.PH_vt.vt_nrecs += + m0->m_pkthdr.PH_vt.vt_nrecs; + + mtl->m_next = m0; + } + } + + return (0); +} + +/* + * Respond to a pending IGMPv3 General Query. 
+ */ +static void +igmp_v3_dispatch_general_query(struct igmp_ifinfo *igi) +{ + struct ifmultiaddr *ifma, *tifma; + struct ifnet *ifp; + struct in_multi *inm; + int retval, loop; + + IN_MULTI_LOCK_ASSERT(); + IGMP_LOCK_ASSERT(); + + KASSERT(igi->igi_version == IGMP_VERSION_3, + ("%s: called when version %d", __func__, igi->igi_version)); + + ifp = igi->igi_ifp; + + IF_ADDR_LOCK(ifp); + TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, tifma) { + if (ifma->ifma_addr->sa_family != AF_INET) + continue; + + inm = (struct in_multi *)ifma->ifma_protospec; + KASSERT(ifp == inm->inm_ifp, + ("%s: inconsistent ifp", __func__)); + + switch (inm->inm_state) { + case IGMP_NOT_MEMBER: + case IGMP_SILENT_MEMBER: + break; + case IGMP_REPORTING_MEMBER: + case IGMP_IDLE_MEMBER: + case IGMP_LAZY_MEMBER: + case IGMP_SLEEPING_MEMBER: + case IGMP_AWAKENING_MEMBER: + inm->inm_state = IGMP_REPORTING_MEMBER; + retval = igmp_v3_enqueue_group_record(&igi->igi_gq, + inm, 0, 0, 0); + CTR2(KTR_IGMPV3, "%s: enqueue record = %d", + __func__, retval); + break; + case IGMP_G_QUERY_PENDING_MEMBER: + case IGMP_SG_QUERY_PENDING_MEMBER: + case IGMP_LEAVING_MEMBER: + break; + } + } + IF_ADDR_UNLOCK(ifp); + + loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0; + igmp_dispatch_queue(&igi->igi_gq, IGMP_MAX_RESPONSE_BURST, loop); + + /* + * Slew transmission of bursts over 500ms intervals. + */ + if (igi->igi_gq.ifq_head != NULL) { + igi->igi_v3_timer = 1 + IGMP_RANDOM_DELAY( + IGMP_RESPONSE_BURST_INTERVAL); + V_interface_timers_running = 1; + } +} + +/* + * Transmit the next pending IGMP message in the output queue. + * + * We get called from netisr_processqueue(). A mutex private to igmpoq + * will be acquired and released around this routine. + * + * VIMAGE: Needs to store/restore vnet pointer on a per-mbuf-chain basis. + * MRT: Nothing needs to be done, as IGMP traffic is always local to + * a link and uses a link-scope multicast address. 
+ */ +static void +igmp_intr(struct mbuf *m) +{ + struct ip_moptions imo; + struct ifnet *ifp; + struct mbuf *ipopts, *m0; + int error; + uint32_t ifindex; + + CTR2(KTR_IGMPV3, "%s: transmit %p", __func__, m); + + /* + * Restore VNET image pointer from enqueued mbuf chain + * before doing anything else. Whilst we use interface + * indexes to guard against interface detach, they are + * unique to each VIMAGE and must be retrieved. + */ + CURVNET_SET(m->m_pkthdr.header); + ifindex = igmp_restore_context(m); + + /* + * Check if the ifnet still exists. This limits the scope of + * any race in the absence of a global ifp lock for low cost + * (an array lookup). + */ + ifp = ifnet_byindex(ifindex); + if (ifp == NULL) { + CTR3(KTR_IGMPV3, "%s: dropped %p as ifindex %u went away.", + __func__, m, ifindex); + m_freem(m); + V_ipstat.ips_noroute++; + goto out; + } + + ipopts = V_igmp_sendra ? m_raopt : NULL; + + imo.imo_multicast_ttl = 1; + imo.imo_multicast_vif = -1; imo.imo_multicast_loop = (V_ip_mrouter != NULL); /* - * XXX: Do we have to worry about reentrancy here? Don't think so. + * If the user requested that IGMP traffic be explicitly + * redirected to the loopback interface (e.g. they are running a + * MANET interface and the routing protocol needs to see the + * updates), handle this now. 
*/ - ip_output(m, router_alert, &igmprt, 0, &imo, NULL); + if (m->m_flags & M_IGMP_LOOP) + imo.imo_multicast_ifp = V_loif; + else + imo.imo_multicast_ifp = ifp; + + if (m->m_flags & M_IGMPV2) { + m0 = m; + } else { + m0 = igmp_v3_encap_report(ifp, m); + if (m0 == NULL) { + CTR2(KTR_IGMPV3, "%s: dropped %p", __func__, m); + m_freem(m); + V_ipstat.ips_odropped++; + goto out; + } + } + + igmp_scrub_context(m0); + m->m_flags &= ~(M_PROTOFLAGS); + m0->m_pkthdr.rcvif = V_loif; +#ifdef MAC + mac_netinet_igmp_send(ifp, m0); +#endif + error = ip_output(m0, ipopts, NULL, 0, &imo, NULL); + if (error) { + CTR3(KTR_IGMPV3, "%s: ip_output(%p) = %d", __func__, m0, error); + m_freem(m0); + goto out; + } ++V_igmpstat.igps_snd_reports; + +out: + /* + * We must restore the existing vnet pointer before + * continuing as we are run from netisr context. + */ + CURVNET_RESTORE(); +} + +/* + * Encapsulate an IGMPv3 report. + * + * The internal mbuf flag M_IGMPV3_HDR is used to indicate that the mbuf + * chain has already had its IP/IGMPv3 header prepended. In this case + * the function will not attempt to prepend; the lengths and checksums + * will however be re-computed. + * + * Returns a pointer to the new mbuf chain head, or NULL if the + * allocation failed. 
+ */ +static struct mbuf * +igmp_v3_encap_report(struct ifnet *ifp, struct mbuf *m) +{ + INIT_VNET_NET(curvnet); + INIT_VNET_INET(curvnet); + struct igmp_report *igmp; + struct ip *ip; + int hdrlen, igmpreclen; + + KASSERT((m->m_flags & M_PKTHDR), + ("%s: mbuf chain %p is !M_PKTHDR", __func__, m)); + + igmpreclen = m_length(m, NULL); + hdrlen = sizeof(struct ip) + sizeof(struct igmp_report); + + if (m->m_flags & M_IGMPV3_HDR) { + igmpreclen -= hdrlen; + } else { + M_PREPEND(m, hdrlen, M_DONTWAIT); + if (m == NULL) + return (NULL); + m->m_flags |= M_IGMPV3_HDR; + } + + CTR2(KTR_IGMPV3, "%s: igmpreclen is %d", __func__, igmpreclen); + + m->m_data += sizeof(struct ip); + m->m_len -= sizeof(struct ip); + + igmp = mtod(m, struct igmp_report *); + igmp->ir_type = IGMP_v3_HOST_MEMBERSHIP_REPORT; + igmp->ir_rsv1 = 0; + igmp->ir_rsv2 = 0; + igmp->ir_numgrps = htons(m->m_pkthdr.PH_vt.vt_nrecs); + igmp->ir_cksum = 0; + igmp->ir_cksum = in_cksum(m, sizeof(struct igmp_report) + igmpreclen); + m->m_pkthdr.PH_vt.vt_nrecs = 0; + + m->m_data -= sizeof(struct ip); + m->m_len += sizeof(struct ip); + + ip = mtod(m, struct ip *); + ip->ip_tos = IPTOS_PREC_INTERNETCONTROL; + ip->ip_len = hdrlen + igmpreclen; + ip->ip_off = IP_DF; + ip->ip_p = IPPROTO_IGMP; + ip->ip_sum = 0; + + ip->ip_src.s_addr = INADDR_ANY; + + if (m->m_flags & M_IGMP_LOOP) { + struct in_ifaddr *ia; + + IFP_TO_IA(ifp, ia); + if (ia != NULL) + ip->ip_src = ia->ia_addr.sin_addr; + } + + ip->ip_dst.s_addr = htonl(INADDR_ALLRPTS_GROUP); + + return (m); +} + +#ifdef KTR +static char * +igmp_rec_type_to_str(const int type) +{ + + switch (type) { + case IGMP_CHANGE_TO_EXCLUDE_MODE: + return "TO_EX"; + break; + case IGMP_CHANGE_TO_INCLUDE_MODE: + return "TO_IN"; + break; + case IGMP_MODE_IS_EXCLUDE: + return "MODE_EX"; + break; + case IGMP_MODE_IS_INCLUDE: + return "MODE_IN"; + break; + case IGMP_ALLOW_NEW_SOURCES: + return "ALLOW_NEW"; + break; + case IGMP_BLOCK_OLD_SOURCES: + return "BLOCK_OLD"; + break; + default: + break; 
+ } + return "unknown"; +} +#endif + +static void +igmp_sysinit(void) +{ + + CTR1(KTR_IGMPV3, "%s: initializing", __func__); + + IGMP_LOCK_INIT(); + TUNABLE_INT_FETCH("debug.mpsafeigmp", &mpsafe_igmp); + + mtx_init(&igmpoq.ifq_mtx, "igmpoq_mtx", NULL, MTX_DEF); + IFQ_SET_MAXLEN(&igmpoq, IFQ_MAXLEN); + + m_raopt = igmp_ra_alloc(); + +#if __FreeBSD_version < 800000 + netisr_register(NETISR_IGMP, igmp_intr, &igmpoq, + mpsafe_igmp ? NETISR_MPSAFE : 0); +#else + netisr_register(NETISR_IGMP, igmp_intr, &igmpoq, + mpsafe_igmp ? 0 : NETISR_FORCEQUEUE); +#endif +} + +static void +igmp_sysuninit(void) +{ + + CTR1(KTR_IGMPV3, "%s: tearing down", __func__); + + netisr_unregister(NETISR_IGMP); + mtx_destroy(&igmpoq.ifq_mtx); + + m_free(m_raopt); + m_raopt = NULL; + + IGMP_LOCK_DESTROY(); +} + +/* + * Initialize an IGMPv3 instance. + * VIMAGE: Assumes curvnet set by caller and called per vimage. + */ +static int +vnet_igmp_iattach(const void *unused __unused) +{ + INIT_VNET_INET(curvnet); + + CTR1(KTR_IGMPV3, "%s: initializing", __func__); + + LIST_INIT(&V_igi_head); + + V_current_state_timers_running = 0; + V_state_change_timers_running = 0; + V_interface_timers_running = 0; + + /* + * Initialize sysctls to default values. 
+ */ + V_igmp_recvifkludge = 1; + V_igmp_sendra = 1; + V_igmp_sendlocal = 1; + V_igmp_v1enable = 1; + V_igmp_v2enable = 1; + V_igmp_legacysupp = 0; + V_igmp_default_version = IGMP_VERSION_3; + V_igmp_gsrdelay.tv_sec = 10; + V_igmp_gsrdelay.tv_usec = 0; + + memset(&V_igmpstat, 0, sizeof(struct igmpstat)); + V_igmpstat.igps_version = IGPS_VERSION_3; + V_igmpstat.igps_len = sizeof(struct igmpstat); + + return (0); +} + +static int +vnet_igmp_idetach(const void *unused __unused) +{ + INIT_VNET_INET(curvnet); + + CTR1(KTR_IGMPV3, "%s: tearing down", __func__); + + KASSERT(LIST_EMPTY(&V_igi_head), + ("%s: igi list not empty; ifnets not detached?", __func__)); + + return (0); +} + +#ifdef VIMAGE +static struct vnet_symmap vnet_igmp_symmap[] = { + VNET_SYMMAP(igmp, igi_head), + VNET_SYMMAP(igmp, igmpstat), + VNET_SYMMAP_END +}; +VNET_MOD_DECLARE(IGMP, igmp, vnet_igmp_iattach, vnet_igmp_idetach, + vnet_igmp_symmap); +#endif /* VIMAGE */ + +static int +igmp_modevent(module_t mod, int type, void *unused __unused) +{ + + switch (type) { + case MOD_LOAD: + igmp_sysinit(); +#ifdef VIMAGE + vnet_mod_register(&vnet_igmp_modinfo); +#else + (void)vnet_igmp_iattach(NULL); +#endif /* VIMAGE */ + break; + case MOD_UNLOAD: +#ifdef VIMAGE + /* + * TODO: Allow module unload if any VIMAGE instances + * are using this module. + */ + return (EBUSY); +#else + (void)vnet_igmp_idetach(NULL); +#endif /* VIMAGE */ + igmp_sysuninit(); + break; + default: + return (EOPNOTSUPP); + } + return (0); } + +static moduledata_t igmp_mod = { + "igmp", + igmp_modevent, + 0 +}; +DECLARE_MODULE(igmp, igmp_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); diff --git a/sys/netinet/igmp_var.h b/sys/netinet/igmp_var.h index 11c3769..9c9c7d4 100644 --- a/sys/netinet/igmp_var.h +++ b/sys/netinet/igmp_var.h @@ -46,105 +46,166 @@ * MULTICAST Revision: 3.5.1.3 */ +#ifndef BURN_BRIDGES +/* + * Pre-IGMPV3 igmpstat structure. 
+ */ +struct oigmpstat { + u_int igps_rcv_total; /* total IGMP messages received */ + u_int igps_rcv_tooshort; /* received with too few bytes */ + u_int igps_rcv_badsum; /* received with bad checksum */ + u_int igps_rcv_queries; /* received membership queries */ + u_int igps_rcv_badqueries; /* received invalid queries */ + u_int igps_rcv_reports; /* received membership reports */ + u_int igps_rcv_badreports; /* received invalid reports */ + u_int igps_rcv_ourreports; /* received reports for our groups */ + u_int igps_snd_reports; /* sent membership reports */ + u_int igps_rcv_toolong; /* received with too many bytes */ +}; +#endif + +/* + * IGMPv3 protocol statistics. + */ struct igmpstat { - u_int igps_rcv_total; /* total IGMP messages received */ - u_int igps_rcv_tooshort; /* received with too few bytes */ - u_int igps_rcv_badsum; /* received with bad checksum */ - u_int igps_rcv_queries; /* received membership queries */ - u_int igps_rcv_badqueries; /* received invalid queries */ - u_int igps_rcv_reports; /* received membership reports */ - u_int igps_rcv_badreports; /* received invalid reports */ - u_int igps_rcv_ourreports; /* received reports for our groups */ - u_int igps_snd_reports; /* sent membership reports */ - u_int igps_rcv_toolong; /* received with too many bytes */ + /* + * Structure header (to insulate ABI changes). + */ + uint32_t igps_version; /* version of this structure */ + uint32_t igps_len; /* length of this structure */ + /* + * Message statistics. + */ + uint64_t igps_rcv_total; /* total IGMP messages received */ + uint64_t igps_rcv_tooshort; /* received with too few bytes */ + uint64_t igps_rcv_badttl; /* received with ttl other than 1 */ + uint64_t igps_rcv_badsum; /* received with bad checksum */ + /* + * Query statistics. 
+ */ + uint64_t igps_rcv_v1v2_queries; /* received IGMPv1/IGMPv2 queries */ + uint64_t igps_rcv_v3_queries; /* received IGMPv3 queries */ + uint64_t igps_rcv_badqueries; /* received invalid queries */ + uint64_t igps_rcv_gen_queries; /* received general queries */ + uint64_t igps_rcv_group_queries;/* received group queries */ + uint64_t igps_rcv_gsr_queries; /* received group-source queries */ + uint64_t igps_drop_gsr_queries; /* dropped group-source queries */ + /* + * Report statistics. + */ + uint64_t igps_rcv_reports; /* received membership reports */ + uint64_t igps_rcv_badreports; /* received invalid reports */ + uint64_t igps_rcv_ourreports; /* received reports for our groups */ + uint64_t igps_rcv_nora; /* received w/o Router Alert option */ + uint64_t igps_snd_reports; /* sent membership reports */ + /* + * Padding for future additions. + */ + uint64_t __igps_pad[4]; }; +#define IGPS_VERSION_3 3 /* as of FreeBSD 8.x */ +#define IGPS_VERSION3_LEN 168 + +#ifdef CTASSERT +CTASSERT(sizeof(struct igmpstat) == 168); +#endif #ifdef _KERNEL #define IGMP_RANDOM_DELAY(X) (random() % (X) + 1) -/* - * States for IGMPv2's leave processing - */ -#define IGMP_OTHERMEMBER 0 -#define IGMP_IREPORTEDLAST 1 +#define IGMP_MAX_STATE_CHANGES 24 /* Max pending changes per group */ /* - * State masks for IGMPv3 + * IGMP per-group states. 
*/ -#define IGMP_V3_NONEXISTENT 0x01 -#define IGMP_V3_OTHERMEMBER 0x02 -#define IGMP_V3_IREPORTEDLAST 0x04 +#define IGMP_NOT_MEMBER 0 /* Can garbage collect in_multi */ +#define IGMP_SILENT_MEMBER 1 /* Do not perform IGMP for group */ +#define IGMP_REPORTING_MEMBER 2 /* IGMPv1/2/3 we are reporter */ +#define IGMP_IDLE_MEMBER 3 /* IGMPv1/2 we reported last */ +#define IGMP_LAZY_MEMBER 4 /* IGMPv1/2 other member reporting */ +#define IGMP_SLEEPING_MEMBER 5 /* IGMPv1/2 start query response */ +#define IGMP_AWAKENING_MEMBER 6 /* IGMPv1/2 group timer will start */ +#define IGMP_G_QUERY_PENDING_MEMBER 7 /* IGMPv3 group query pending */ +#define IGMP_SG_QUERY_PENDING_MEMBER 8 /* IGMPv3 source query pending */ +#define IGMP_LEAVING_MEMBER 9 /* IGMPv3 dying gasp (pending last */ + /* retransmission of INCLUDE {}) */ /* - * We must remember what version the subnet's querier is. - * We conveniently use the IGMP message type for the proper - * membership report to keep this state. + * IGMP version tag. */ -#define IGMP_V1_ROUTER IGMP_V1_MEMBERSHIP_REPORT -#define IGMP_V2_ROUTER IGMP_V2_MEMBERSHIP_REPORT -#define IGMP_V3_ROUTER IGMP_V3_MEMBERSHIP_REPORT +#define IGMP_VERSION_NONE 0 /* Invalid */ +#define IGMP_VERSION_1 1 +#define IGMP_VERSION_2 2 +#define IGMP_VERSION_3 3 /* Default */ /* - * Revert to new router if we haven't heard from an old router in - * this amount of time. + * IGMPv3 protocol control variables. 
*/ -#define IGMP_AGE_THRESHOLD 540 +#define IGMP_RV_INIT 2 /* Robustness Variable */ +#define IGMP_RV_MIN 1 +#define IGMP_RV_MAX 7 -/* - * IGMPv3 protocol defaults - */ -#define IGMP_INIT_ROBVAR 2 /* Robustness */ -#define IGMP_MAX_ROBVAR 7 -#define IGMP_INIT_QRYINT 125 /* Querier's Query interval */ -#define IGMP_MAX_QRYINT 255 -#define IGMP_INIT_QRYRSP 10 /* Query Response interval */ -#define IGMP_DEF_QRYMRT 10 -#define IGMP_UNSOL_INT 1 /* Unsolicited Report interval */ +#define IGMP_QI_INIT 125 /* Query Interval (s) */ +#define IGMP_QI_MIN 1 +#define IGMP_QI_MAX 255 + +#define IGMP_QRI_INIT 10 /* Query Response Interval (s) */ +#define IGMP_QRI_MIN 1 +#define IGMP_QRI_MAX 255 + +#define IGMP_URI_INIT 3 /* Unsolicited Report Interval (s) */ +#define IGMP_URI_MIN 0 +#define IGMP_URI_MAX 10 + +#define IGMP_MAX_G_GS_PACKETS 8 /* # of packets to answer G/GS */ +#define IGMP_MAX_STATE_CHANGE_PACKETS 8 /* # of packets per state change */ +#define IGMP_MAX_RESPONSE_PACKETS 16 /* # of packets for general query */ +#define IGMP_MAX_RESPONSE_BURST 4 /* # of responses to send at once */ +#define IGMP_RESPONSE_BURST_INTERVAL (PR_FASTHZ / 2) /* 500ms */ /* - * IGMPv3 report types + * IGMP-specific mbuf flags. */ -#define IGMP_REPORT_MODE_IN 1 /* mode-is-include */ -#define IGMP_REPORT_MODE_EX 2 /* mode-is-exclude */ -#define IGMP_REPORT_TO_IN 3 /* change-to-include */ -#define IGMP_REPORT_TO_EX 4 /* change-to-exclude */ -#define IGMP_REPORT_ALLOW_NEW 5 /* allow-new-sources */ -#define IGMP_REPORT_BLOCK_OLD 6 /* block-old-sources */ +#define M_IGMPV2 M_PROTO1 /* Packet is IGMPv2 */ +#define M_IGMPV3_HDR M_PROTO2 /* Packet has IGMPv3 headers */ +#define M_GROUPREC M_PROTO3 /* mbuf chain is a group record */ +#define M_IGMP_LOOP M_PROTO4 /* transmit on loif, not real ifp */ /* - * Report types + * Default amount of leading space for IGMPv3 to allocate at the + * beginning of its mbuf packet chains, to avoid fragmentation and + * unnecessary allocation of leading mbufs. 
*/ -#define IGMP_MASK_CUR_STATE 0x01 /* Report current-state */ -#define IGMP_MASK_ALLOW_NEW 0x02 /* Report source as allow-new */ -#define IGMP_MASK_BLOCK_OLD 0x04 /* Report source as block-old */ -#define IGMP_MASK_TO_IN 0x08 /* Report source as to_in */ -#define IGMP_MASK_TO_EX 0x10 /* Report source as to_ex */ -#define IGMP_MASK_STATE_T1 0x20 /* State at T1 */ -#define IGMP_MASK_STATE_T2 0x40 /* State at T2 */ -#define IGMP_MASK_IF_STATE 0x80 /* Report current-state per interface */ - -#define IGMP_MASK_STATE_TX (IGMP_MASK_STATE_T1 | IGMP_MASK_STATE_T2) -#define IGMP_MASK_PENDING (IGMP_MASK_CUR_STATE | \ - IGMP_MASK_ALLOW_NEW | \ - IGMP_MASK_BLOCK_OLD) +#define RAOPT_LEN 4 /* Length of IP Router Alert option */ +#define IGMP_LEADINGSPACE \ + (sizeof(struct ip) + RAOPT_LEN + sizeof(struct igmp_report)) /* - * List identifiers + * Subsystem lock macros. + * The IGMP lock is only taken with IGMP. Currently it is system-wide. + * VIMAGE: The lock could be pushed to per-VIMAGE granularity in future. 
*/ -#define IGMP_EXCLUDE_LIST 1 /* exclude list used to tag report */ -#define IGMP_INCLUDE_LIST 2 /* include list used to tag report */ -#define IGMP_RECORDED_LIST 3 /* recorded list used to tag report */ +#define IGMP_LOCK_INIT() mtx_init(&igmp_mtx, "igmp_mtx", NULL, MTX_DEF) +#define IGMP_LOCK_DESTROY() mtx_destroy(&igmp_mtx) +#define IGMP_LOCK() mtx_lock(&igmp_mtx) +#define IGMP_LOCK_ASSERT() mtx_assert(&igmp_mtx, MA_OWNED) +#define IGMP_UNLOCK() mtx_unlock(&igmp_mtx) +#define IGMP_UNLOCK_ASSERT() mtx_assert(&igmp_mtx, MA_NOTOWNED) -void igmp_init(void); -void igmp_input(struct mbuf *, int); -void igmp_joingroup(struct in_multi *); -void igmp_leavegroup(struct in_multi *); +struct igmp_ifinfo; + +int igmp_change_state(struct in_multi *); void igmp_fasttimo(void); +struct igmp_ifinfo * + igmp_domifattach(struct ifnet *); +void igmp_domifdetach(struct ifnet *); +void igmp_ifdetach(struct ifnet *); +void igmp_input(struct mbuf *, int); void igmp_slowtimo(void); SYSCTL_DECL(_net_inet_igmp); -#endif +#endif /* _KERNEL */ /* * Names for IGMP sysctl objects diff --git a/sys/netinet/in.c b/sys/netinet/in.c index bf1ebae..7c890b1 100644 --- a/sys/netinet/in.c +++ b/sys/netinet/in.c @@ -57,6 +57,7 @@ __FBSDID("$FreeBSD$"); #include <netinet/in_pcb.h> #include <netinet/ip_var.h> #include <netinet/vinet.h> +#include <netinet/igmp_var.h> static int in_mask2len(struct in_addr *); static void in_len2mask(struct in_addr *, int); @@ -215,12 +216,14 @@ in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct in_addr allhosts_addr; struct in_addr dst; struct in_ifaddr *oia; + struct in_ifinfo *ii; struct in_aliasreq *ifra = (struct in_aliasreq *)data; struct sockaddr_in oldaddr; int error, hostIsNew, iaIsNew, maskIsNew, s; int iaIsFirst; ia = NULL; + ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]); iaIsFirst = 0; iaIsNew = 0; allhosts_addr.s_addr = htonl(INADDR_ALLHOSTS_GROUP); @@ -425,8 +428,11 @@ in_control(struct socket *so, u_long cmd, caddr_t 
data, struct ifnet *ifp, if (error != 0 && iaIsNew) break; if (error == 0) { - if (iaIsFirst && (ifp->if_flags & IFF_MULTICAST) != 0) - in_addmulti(&allhosts_addr, ifp); + if (iaIsFirst && + (ifp->if_flags & IFF_MULTICAST) != 0) { + error = in_joingroup(ifp, &allhosts_addr, + NULL, &ii->ii_allhosts); + } EVENTHANDLER_INVOKE(ifaddr_event, ifp); } return (0); @@ -472,8 +478,11 @@ in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, (ifra->ifra_broadaddr.sin_family == AF_INET)) ia->ia_broadaddr = ifra->ifra_broadaddr; if (error == 0) { - if (iaIsFirst && (ifp->if_flags & IFF_MULTICAST) != 0) - in_addmulti(&allhosts_addr, ifp); + if (iaIsFirst && + (ifp->if_flags & IFF_MULTICAST) != 0) { + error = in_joingroup(ifp, &allhosts_addr, + NULL, &ii->ii_allhosts); + } EVENTHANDLER_INVOKE(ifaddr_event, ifp); } return (error); @@ -515,18 +524,18 @@ in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, /* * If this is the last IPv4 address configured on this * interface, leave the all-hosts group. - * XXX: This is quite ugly because of locking and structure. + * No state-change report need be transmitted. */ oia = NULL; IFP_TO_IA(ifp, oia); if (oia == NULL) { - struct in_multi *inm; - IFF_LOCKGIANT(ifp); IN_MULTI_LOCK(); - IN_LOOKUP_MULTI(allhosts_addr, ifp, inm); - if (inm != NULL) - in_delmulti_locked(inm); + if (ii->ii_allhosts) { + (void)in_leavegroup_locked(ii->ii_allhosts, + NULL); + ii->ii_allhosts = NULL; + } IN_MULTI_UNLOCK(); IFF_UNLOCKGIANT(ifp); } @@ -993,37 +1002,56 @@ in_broadcast(struct in_addr in, struct ifnet *ifp) } /* + * On interface removal, clean up IPv4 data structures hung off of the ifnet. + */ +void +in_ifdetach(struct ifnet *ifp) +{ + INIT_VNET_INET(ifp->if_vnet); + + in_pcbpurgeif0(&V_ripcbinfo, ifp); + in_pcbpurgeif0(&V_udbinfo, ifp); + in_purgemaddrs(ifp); +} + +/* * Delete all IPv4 multicast address records, and associated link-layer * multicast address records, associated with ifp. 
+ * XXX It looks like domifdetach runs AFTER the link layer cleanup. */ static void in_purgemaddrs(struct ifnet *ifp) { INIT_VNET_INET(ifp->if_vnet); - struct in_multi *inm; - struct in_multi *oinm; + LIST_HEAD(,in_multi) purgeinms; + struct in_multi *inm, *tinm; + struct ifmultiaddr *ifma; - IFF_LOCKGIANT(ifp); + LIST_INIT(&purgeinms); IN_MULTI_LOCK(); - LIST_FOREACH_SAFE(inm, &V_in_multihead, inm_link, oinm) { - if (inm->inm_ifp == ifp) - in_delmulti_locked(inm); + + /* + * Extract list of in_multi associated with the detaching ifp + * which the PF_INET layer is about to release. + * We need to do this as IF_ADDR_LOCK() may be re-acquired + * by code further down. + */ + IF_ADDR_LOCK(ifp); + TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { + if (ifma->ifma_addr->sa_family != AF_INET) + continue; + inm = (struct in_multi *)ifma->ifma_protospec; + LIST_INSERT_HEAD(&purgeinms, inm, inm_link); } - IN_MULTI_UNLOCK(); - IFF_UNLOCKGIANT(ifp); -} + IF_ADDR_UNLOCK(ifp); -/* - * On interface removal, clean up IPv4 data structures hung off of the ifnet. 
- */ -void -in_ifdetach(struct ifnet *ifp) -{ - INIT_VNET_INET(ifp->if_vnet); + LIST_FOREACH_SAFE(inm, &purgeinms, inm_link, tinm) { + inm_release_locked(inm); + LIST_REMOVE(inm, inm_link); + } + igmp_ifdetach(ifp); - in_pcbpurgeif0(&V_ripcbinfo, ifp); - in_pcbpurgeif0(&V_udbinfo, ifp); - in_purgemaddrs(ifp); + IN_MULTI_UNLOCK(); } #include <sys/syslog.h> @@ -1250,9 +1278,13 @@ in_lltable_dump(struct lltable *llt, struct sysctl_req *wr) void * in_domifattach(struct ifnet *ifp) -{ - struct lltable *llt = lltable_init(ifp, AF_INET); - +{ + struct in_ifinfo *ii; + struct lltable *llt; + + ii = malloc(sizeof(struct in_ifinfo), M_IFADDR, M_WAITOK|M_ZERO); + + llt = lltable_init(ifp, AF_INET); if (llt != NULL) { llt->llt_new = in_lltable_new; llt->llt_free = in_lltable_free; @@ -1260,13 +1292,19 @@ in_domifattach(struct ifnet *ifp) llt->llt_lookup = in_lltable_lookup; llt->llt_dump = in_lltable_dump; } - return (llt); + ii->ii_llt = llt; + + ii->ii_igmp = igmp_domifattach(ifp); + + return ii; } void -in_domifdetach(struct ifnet *ifp __unused, void *aux) +in_domifdetach(struct ifnet *ifp, void *aux) { - struct lltable *llt = (struct lltable *)aux; + struct in_ifinfo *ii = (struct in_ifinfo *)aux; - lltable_free(llt); + igmp_domifdetach(ifp); + lltable_free(ii->ii_llt); + free(ii, M_IFADDR); } diff --git a/sys/netinet/in.h b/sys/netinet/in.h index ada44cd..4b2c827 100644 --- a/sys/netinet/in.h +++ b/sys/netinet/in.h @@ -509,6 +509,7 @@ __END_DECLS */ #define IP_MAX_GROUP_SRC_FILTER 512 /* sources per group */ #define IP_MAX_SOCK_SRC_FILTER 128 /* sources per socket/group */ +#define IP_MAX_SOCK_MUTE_FILTER 128 /* XXX no longer used */ /* * Argument structure for IP_ADD_MEMBERSHIP and IP_DROP_MEMBERSHIP. diff --git a/sys/netinet/in_mcast.c b/sys/netinet/in_mcast.c index 7d9aecb..4ffabbd 100644 --- a/sys/netinet/in_mcast.c +++ b/sys/netinet/in_mcast.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2007 Bruce M. Simpson. + * Copyright (c) 2007-2009 Bruce Simpson. 
* Copyright (c) 2005 Robert N. M. Watson. * All rights reserved. * @@ -30,10 +30,6 @@ /* * IPv4 multicast socket, group, and socket option processing module. - * Until further notice, this file requires INET to compile. - * TODO: Make this infrastructure independent of address family. - * TODO: Teach netinet6 to use this code. - * TODO: Hook up SSM logic to IGMPv3/MLDv2. */ #include <sys/cdefs.h> @@ -49,8 +45,11 @@ __FBSDID("$FreeBSD$"); #include <sys/protosw.h> #include <sys/socket.h> #include <sys/socketvar.h> +#include <sys/protosw.h> #include <sys/sysctl.h> #include <sys/vimage.h> +#include <sys/ktr.h> +#include <sys/tree.h> #include <net/if.h> #include <net/if_dl.h> @@ -65,69 +64,164 @@ __FBSDID("$FreeBSD$"); #include <netinet/igmp_var.h> #include <netinet/vinet.h> +#ifndef KTR_IGMPV3 +#define KTR_IGMPV3 KTR_SUBSYS +#endif + #ifndef __SOCKUNION_DECLARED union sockunion { struct sockaddr_storage ss; struct sockaddr sa; struct sockaddr_dl sdl; struct sockaddr_in sin; -#ifdef INET6 - struct sockaddr_in6 sin6; -#endif }; typedef union sockunion sockunion_t; #define __SOCKUNION_DECLARED #endif /* __SOCKUNION_DECLARED */ +static MALLOC_DEFINE(M_INMFILTER, "in_mfilter", + "IPv4 multicast PCB-layer source filter"); static MALLOC_DEFINE(M_IPMADDR, "in_multi", "IPv4 multicast group"); static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "IPv4 multicast options"); -static MALLOC_DEFINE(M_IPMSOURCE, "in_msource", "IPv4 multicast source filter"); +static MALLOC_DEFINE(M_IPMSOURCE, "ip_msource", + "IPv4 multicast IGMP-layer source filter"); -/* - * The IPv4 multicast list (in_multihead and associated structures) are - * protected by the global in_multi_mtx. See in_var.h for more details. For - * now, in_multi_mtx is marked as recursible due to IGMP's calling back into - * ip_output() to send IGMP packets while holding the lock; this probably is - * not quite desirable. 
- */ #ifdef VIMAGE_GLOBALS -struct in_multihead in_multihead; /* XXX BSS initialization */ +struct in_multihead in_multihead; /* XXX now unused; retain for ABI */ #endif + +/* + * Locking: + * - Lock order is: Giant, INP_WLOCK, IN_MULTI_LOCK, IGMP_LOCK, IF_ADDR_LOCK. + * - The IF_ADDR_LOCK is implicitly taken by inm_lookup() earlier, however + * it can be taken by code in net/if.c also. + * - ip_moptions and in_mfilter are covered by the INP_WLOCK. + * + * struct in_multi is covered by IN_MULTI_LOCK. There isn't strictly + * any need for in_multi itself to be virtualized -- it is bound to an ifp + * anyway no matter what happens. + */ struct mtx in_multi_mtx; -MTX_SYSINIT(in_multi_mtx, &in_multi_mtx, "in_multi_mtx", MTX_DEF | MTX_RECURSE); +MTX_SYSINIT(in_multi_mtx, &in_multi_mtx, "in_multi_mtx", MTX_DEF); /* * Functions with non-static linkage defined in this file should be * declared in in_var.h: - * imo_match_group() - * imo_match_source() + * imo_multi_filter() * in_addmulti() * in_delmulti() - * in_delmulti_locked() + * in_joingroup() + * in_joingroup_locked() + * in_leavegroup() + * in_leavegroup_locked() * and ip_var.h: * inp_freemoptions() * inp_getmoptions() * inp_setmoptions() + * + * XXX: Both carp and pf need to use the legacy (*,G) KPIs in_addmulti() + * and in_delmulti(). 
*/ +static void imf_commit(struct in_mfilter *); +static int imf_get_source(struct in_mfilter *imf, + const struct sockaddr_in *psin, + struct in_msource **); +static struct in_msource * + imf_graft(struct in_mfilter *, const uint8_t, + const struct sockaddr_in *); +static void imf_leave(struct in_mfilter *); +static int imf_prune(struct in_mfilter *, const struct sockaddr_in *); +static void imf_purge(struct in_mfilter *); +static void imf_rollback(struct in_mfilter *); +static void imf_reap(struct in_mfilter *); static int imo_grow(struct ip_moptions *); -static int imo_join_source(struct ip_moptions *, size_t, sockunion_t *); -static int imo_leave_source(struct ip_moptions *, size_t, sockunion_t *); -static int inp_change_source_filter(struct inpcb *, struct sockopt *); +static size_t imo_match_group(const struct ip_moptions *, + const struct ifnet *, const struct sockaddr *); +static struct in_msource * + imo_match_source(const struct ip_moptions *, const size_t, + const struct sockaddr *); +static void ims_merge(struct ip_msource *ims, + const struct in_msource *lims, const int rollback); +static int in_getmulti(struct ifnet *, const struct in_addr *, + struct in_multi **); +static int inm_get_source(struct in_multi *inm, const in_addr_t haddr, + const int noalloc, struct ip_msource **pims); +static int inm_is_ifp_detached(const struct in_multi *); +static int inm_merge(struct in_multi *, /*const*/ struct in_mfilter *); +static void inm_purge(struct in_multi *); +static void inm_reap(struct in_multi *); static struct ip_moptions * inp_findmoptions(struct inpcb *); static int inp_get_source_filters(struct inpcb *, struct sockopt *); static int inp_join_group(struct inpcb *, struct sockopt *); static int inp_leave_group(struct inpcb *, struct sockopt *); +static struct ifnet * + inp_lookup_mcast_ifp(const struct inpcb *, + const struct sockaddr_in *, const struct in_addr); +static int inp_block_unblock_source(struct inpcb *, struct sockopt *); static int 
inp_set_multicast_if(struct inpcb *, struct sockopt *); static int inp_set_source_filters(struct inpcb *, struct sockopt *); +static int sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS); SYSCTL_NODE(_net_inet_ip, OID_AUTO, mcast, CTLFLAG_RW, 0, "IPv4 multicast"); +static u_long in_mcast_maxgrpsrc = IP_MAX_GROUP_SRC_FILTER; +SYSCTL_ULONG(_net_inet_ip_mcast, OID_AUTO, maxgrpsrc, + CTLFLAG_RW | CTLFLAG_TUN, &in_mcast_maxgrpsrc, 0, + "Max source filters per group"); +TUNABLE_ULONG("net.inet.ip.mcast.maxgrpsrc", &in_mcast_maxgrpsrc); + +static u_long in_mcast_maxsocksrc = IP_MAX_SOCK_SRC_FILTER; +SYSCTL_ULONG(_net_inet_ip_mcast, OID_AUTO, maxsocksrc, + CTLFLAG_RW | CTLFLAG_TUN, &in_mcast_maxsocksrc, 0, + "Max source filters per socket"); +TUNABLE_ULONG("net.inet.ip.mcast.maxsocksrc", &in_mcast_maxsocksrc); + int in_mcast_loop = IP_DEFAULT_MULTICAST_LOOP; SYSCTL_INT(_net_inet_ip_mcast, OID_AUTO, loop, CTLFLAG_RW | CTLFLAG_TUN, &in_mcast_loop, 0, "Loopback multicast datagrams by default"); TUNABLE_INT("net.inet.ip.mcast.loop", &in_mcast_loop); +SYSCTL_NODE(_net_inet_ip_mcast, OID_AUTO, filters, + CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_ip_mcast_filters, + "Per-interface stack-wide source filters"); + +/* + * Inline function which wraps assertions for a valid ifp. + * The ifnet layer will set the ifma's ifp pointer to NULL if the ifp + * is detached. + */ +static int __inline +inm_is_ifp_detached(const struct in_multi *inm) +{ + struct ifnet *ifp; + + KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__)); + ifp = inm->inm_ifma->ifma_ifp; + if (ifp != NULL) { + /* + * Sanity check that netinet's notion of ifp is the + * same as net's. + */ + KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__)); + } + + return (ifp == NULL); +} + +/* + * Initialize an in_mfilter structure to a known state at t0, t1 + * with an empty source filter list. 
+ */ +static __inline void +imf_init(struct in_mfilter *imf, const int st0, const int st1) +{ + memset(imf, 0, sizeof(struct in_mfilter)); + RB_INIT(&imf->imf_sources); + imf->imf_st[0] = st0; + imf->imf_st[1] = st1; +} + /* * Resize the ip_moptions vector to the next power-of-two minus 1. * May be called with locks held; do not sleep. @@ -154,13 +248,12 @@ imo_grow(struct ip_moptions *imo) nmships = (struct in_multi **)realloc(omships, sizeof(struct in_multi *) * newmax, M_IPMOPTS, M_NOWAIT); nmfilters = (struct in_mfilter *)realloc(omfilters, - sizeof(struct in_mfilter) * newmax, M_IPMSOURCE, M_NOWAIT); + sizeof(struct in_mfilter) * newmax, M_INMFILTER, M_NOWAIT); if (nmships != NULL && nmfilters != NULL) { /* Initialize newly allocated source filter heads. */ for (idx = oldmax; idx < newmax; idx++) { - nmfilters[idx].imf_fmode = MCAST_EXCLUDE; - nmfilters[idx].imf_nsources = 0; - TAILQ_INIT(&nmfilters[idx].imf_sources); + imf_init(&nmfilters[idx], MCAST_UNDEFINED, + MCAST_EXCLUDE); } imo->imo_max_memberships = newmax; imo->imo_membership = nmships; @@ -172,7 +265,7 @@ imo_grow(struct ip_moptions *imo) if (nmships != NULL) free(nmships, M_IPMOPTS); if (nmfilters != NULL) - free(nmfilters, M_IPMSOURCE); + free(nmfilters, M_INMFILTER); return (ETOOMANYREFS); } @@ -180,80 +273,20 @@ imo_grow(struct ip_moptions *imo) } /* - * Add a source to a multicast filter list. - * Assumes the associated inpcb is locked. - */ -static int -imo_join_source(struct ip_moptions *imo, size_t gidx, sockunion_t *src) -{ - struct in_msource *ims, *nims; - struct in_mfilter *imf; - - KASSERT(src->ss.ss_family == AF_INET, ("%s: !AF_INET", __func__)); - KASSERT(imo->imo_mfilters != NULL, - ("%s: imo_mfilters vector not allocated", __func__)); - - imf = &imo->imo_mfilters[gidx]; - if (imf->imf_nsources == IP_MAX_SOURCE_FILTER) - return (ENOBUFS); - - ims = imo_match_source(imo, gidx, &src->sa); - if (ims != NULL) - return (EADDRNOTAVAIL); - - /* Do not sleep with inp lock held. 
*/ - nims = malloc(sizeof(struct in_msource), - M_IPMSOURCE, M_NOWAIT | M_ZERO); - if (nims == NULL) - return (ENOBUFS); - - nims->ims_addr = src->ss; - TAILQ_INSERT_TAIL(&imf->imf_sources, nims, ims_next); - imf->imf_nsources++; - - return (0); -} - -static int -imo_leave_source(struct ip_moptions *imo, size_t gidx, sockunion_t *src) -{ - struct in_msource *ims; - struct in_mfilter *imf; - - KASSERT(src->ss.ss_family == AF_INET, ("%s: !AF_INET", __func__)); - KASSERT(imo->imo_mfilters != NULL, - ("%s: imo_mfilters vector not allocated", __func__)); - - imf = &imo->imo_mfilters[gidx]; - if (imf->imf_nsources == IP_MAX_SOURCE_FILTER) - return (ENOBUFS); - - ims = imo_match_source(imo, gidx, &src->sa); - if (ims == NULL) - return (EADDRNOTAVAIL); - - TAILQ_REMOVE(&imf->imf_sources, ims, ims_next); - free(ims, M_IPMSOURCE); - imf->imf_nsources--; - - return (0); -} - -/* * Find an IPv4 multicast group entry for this ip_moptions instance * which matches the specified group, and optionally an interface. * Return its index into the array, or -1 if not found. */ -size_t -imo_match_group(struct ip_moptions *imo, struct ifnet *ifp, - struct sockaddr *group) +static size_t +imo_match_group(const struct ip_moptions *imo, const struct ifnet *ifp, + const struct sockaddr *group) { - sockunion_t *gsa; + const struct sockaddr_in *gsin; struct in_multi **pinm; int idx; int nmships; - gsa = (sockunion_t *)group; + gsin = (const struct sockaddr_in *)group; /* The imo_membership array may be lazy allocated. 
*/ if (imo->imo_membership == NULL || imo->imo_num_memberships == 0) @@ -264,14 +297,8 @@ imo_match_group(struct ip_moptions *imo, struct ifnet *ifp, for (idx = 0; idx < nmships; idx++, pinm++) { if (*pinm == NULL) continue; -#if 0 - printf("%s: trying ifp = %p, inaddr = %s ", __func__, - ifp, inet_ntoa(gsa->sin.sin_addr)); - printf("against %p, %s\n", - (*pinm)->inm_ifp, inet_ntoa((*pinm)->inm_addr)); -#endif if ((ifp == NULL || ((*pinm)->inm_ifp == ifp)) && - (*pinm)->inm_addr.s_addr == gsa->sin.sin_addr.s_addr) { + in_hosteq((*pinm)->inm_addr, gsin->sin_addr)) { break; } } @@ -282,14 +309,20 @@ imo_match_group(struct ip_moptions *imo, struct ifnet *ifp, } /* - * Find a multicast source entry for this imo which matches + * Find an IPv4 multicast source entry for this imo which matches * the given group index for this socket, and source address. + * + * NOTE: This does not check if the entry is in-mode, merely if + * it exists, which may not be the desired behaviour. */ -struct in_msource * -imo_match_source(struct ip_moptions *imo, size_t gidx, struct sockaddr *src) +static struct in_msource * +imo_match_source(const struct ip_moptions *imo, const size_t gidx, + const struct sockaddr *src) { + struct ip_msource find; struct in_mfilter *imf; - struct in_msource *ims, *pims; + struct ip_msource *ims; + const sockunion_t *psa; KASSERT(src->sa_family == AF_INET, ("%s: !AF_INET", __func__)); KASSERT(gidx != -1 && gidx < imo->imo_num_memberships, @@ -298,41 +331,82 @@ imo_match_source(struct ip_moptions *imo, size_t gidx, struct sockaddr *src) /* The imo_mfilters array may be lazy allocated. */ if (imo->imo_mfilters == NULL) return (NULL); - - pims = NULL; imf = &imo->imo_mfilters[gidx]; - TAILQ_FOREACH(ims, &imf->imf_sources, ims_next) { - /* - * Perform bitwise comparison of two IPv4 addresses. - * TODO: Do the same for IPv6. - * Do not use sa_equal() for this as it is not aware of - * deeper structure in sockaddr_in or sockaddr_in6. 
- */ - if (((struct sockaddr_in *)&ims->ims_addr)->sin_addr.s_addr == - ((struct sockaddr_in *)src)->sin_addr.s_addr) { - pims = ims; - break; - } - } - return (pims); + /* Source trees are keyed in host byte order. */ + psa = (const sockunion_t *)src; + find.ims_haddr = ntohl(psa->sin.sin_addr.s_addr); + ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find); + + return ((struct in_msource *)ims); } /* - * Join an IPv4 multicast group. + * Perform filtering for multicast datagrams on a socket by group and source. + * + * Returns 0 if a datagram should be allowed through, or various error codes + * if the socket was not a member of the group, or the source was muted, etc. */ -struct in_multi * -in_addmulti(struct in_addr *ap, struct ifnet *ifp) +int +imo_multi_filter(const struct ip_moptions *imo, const struct ifnet *ifp, + const struct sockaddr *group, const struct sockaddr *src) { - INIT_VNET_INET(ifp->if_vnet); - struct in_multi *inm; + size_t gidx; + struct in_msource *ims; + int mode; - inm = NULL; + KASSERT(ifp != NULL, ("%s: null ifp", __func__)); - IFF_LOCKGIANT(ifp); - IN_MULTI_LOCK(); + gidx = imo_match_group(imo, ifp, group); + if (gidx == -1) + return (MCAST_NOTGMEMBER); + + /* + * Check if the source was included in an (S,G) join. + * Allow reception on exclusive memberships by default, + * reject reception on inclusive memberships by default. + * Exclude source only if an in-mode exclude filter exists. + * Include source only if an in-mode include filter exists. + * NOTE: We are comparing group state here at IGMP t1 (now) + * with socket-layer t0 (since last downcall). + */ + mode = imo->imo_mfilters[gidx].imf_st[1]; + ims = imo_match_source(imo, gidx, src); + + if ((ims == NULL && mode == MCAST_INCLUDE) || + (ims != NULL && ims->imsl_st[0] != mode)) + return (MCAST_NOTSMEMBER); + + return (MCAST_PASS); +} + +/* + * Find and return a reference to an in_multi record for (ifp, group), + * and bump its reference count. 
+ * If one does not exist, try to allocate it, and update link-layer multicast + * filters on ifp to listen for group. + * Assumes the IN_MULTI lock is held across the call. + * Return 0 if successful, otherwise return an appropriate error code. + */ +static int +in_getmulti(struct ifnet *ifp, const struct in_addr *group, + struct in_multi **pinm) +{ + INIT_VNET_INET(ifp->if_vnet); + struct sockaddr_in gsin; + struct ifmultiaddr *ifma; + struct in_ifinfo *ii; + struct in_multi *inm; + int error; + +#if defined(INVARIANTS) && defined(IFF_ASSERTGIANT) + IFF_ASSERTGIANT(ifp); +#endif + IN_MULTI_LOCK_ASSERT(); - IN_LOOKUP_MULTI(*ap, ifp, inm); + ii = (struct in_ifinfo *)ifp->if_afdata[AF_INET]; + + inm = inm_lookup(ifp, *group); if (inm != NULL) { /* * If we already joined this group, just bump the @@ -341,141 +415,900 @@ in_addmulti(struct in_addr *ap, struct ifnet *ifp) KASSERT(inm->inm_refcount >= 1, ("%s: bad refcount %d", __func__, inm->inm_refcount)); ++inm->inm_refcount; - } else do { - sockunion_t gsa; - struct ifmultiaddr *ifma; - struct in_multi *ninm; - int error; + *pinm = inm; + return (0); + } - memset(&gsa, 0, sizeof(gsa)); - gsa.sin.sin_family = AF_INET; - gsa.sin.sin_len = sizeof(struct sockaddr_in); - gsa.sin.sin_addr = *ap; + memset(&gsin, 0, sizeof(gsin)); + gsin.sin_family = AF_INET; + gsin.sin_len = sizeof(struct sockaddr_in); + gsin.sin_addr = *group; - /* - * Check if a link-layer group is already associated - * with this network-layer group on the given ifnet. - * If so, bump the refcount on the existing network-layer - * group association and return it. - */ - error = if_addmulti(ifp, &gsa.sa, &ifma); - if (error) - break; - if (ifma->ifma_protospec != NULL) { - inm = (struct in_multi *)ifma->ifma_protospec; + /* + * Check if a link-layer group is already associated + * with this network-layer group on the given ifnet. 
+ */ + error = if_addmulti(ifp, (struct sockaddr *)&gsin, &ifma); + if (error != 0) + return (error); + + /* + * If something other than netinet is occupying the link-layer + * group, print a meaningful error message and back out of + * the allocation. + * Otherwise, bump the refcount on the existing network-layer + * group association and return it. + */ + if (ifma->ifma_protospec != NULL) { + inm = (struct in_multi *)ifma->ifma_protospec; #ifdef INVARIANTS - if (inm->inm_ifma != ifma || inm->inm_ifp != ifp || - inm->inm_addr.s_addr != ap->s_addr) - panic("%s: ifma is inconsistent", __func__); + KASSERT(ifma->ifma_addr != NULL, ("%s: no ifma_addr", + __func__)); + KASSERT(ifma->ifma_addr->sa_family == AF_INET, + ("%s: ifma not AF_INET", __func__)); + KASSERT(inm != NULL, ("%s: no ifma_protospec", __func__)); + if (inm->inm_ifma != ifma || inm->inm_ifp != ifp || + !in_hosteq(inm->inm_addr, *group)) + panic("%s: ifma %p is inconsistent with %p (%s)", + __func__, ifma, inm, inet_ntoa(*group)); #endif - ++inm->inm_refcount; - break; + ++inm->inm_refcount; + *pinm = inm; + return (0); + } + + /* + * A new in_multi record is needed; allocate and initialize it. + * We DO NOT perform an IGMP join as the in_ layer may need to + * push an initial source list down to IGMP to support SSM. + * + * The initial source filter state is INCLUDE, {} as per the RFC. + */ + inm = malloc(sizeof(*inm), M_IPMADDR, M_NOWAIT | M_ZERO); + if (inm == NULL) { + if_delmulti_ifma(ifma); + return (ENOMEM); + } + inm->inm_addr = *group; + inm->inm_ifp = ifp; + inm->inm_igi = ii->ii_igmp; + inm->inm_ifma = ifma; + inm->inm_refcount = 1; + inm->inm_state = IGMP_NOT_MEMBER; + + /* + * Pending state-changes per group are subject to a bounds check. 
+ */ + IFQ_SET_MAXLEN(&inm->inm_scq, IGMP_MAX_STATE_CHANGES); + + inm->inm_st[0].iss_fmode = MCAST_UNDEFINED; + inm->inm_st[1].iss_fmode = MCAST_UNDEFINED; + RB_INIT(&inm->inm_srcs); + + ifma->ifma_protospec = inm; + + *pinm = inm; + + return (0); +} + +/* + * Drop a reference to an in_multi record. + * + * If the refcount drops to 0, free the in_multi record and + * delete the underlying link-layer membership. + */ +void +inm_release_locked(struct in_multi *inm) +{ + struct ifmultiaddr *ifma; + +#if defined(INVARIANTS) && defined(IFF_ASSERTGIANT) + if (!inm_is_ifp_detached(inm)) + IFF_ASSERTGIANT(ifp); +#endif + + IN_MULTI_LOCK_ASSERT(); + + CTR2(KTR_IGMPV3, "%s: refcount is %d", __func__, inm->inm_refcount); + + if (--inm->inm_refcount > 0) { + CTR2(KTR_IGMPV3, "%s: refcount is now %d", __func__, + inm->inm_refcount); + return; + } + + CTR2(KTR_IGMPV3, "%s: freeing inm %p", __func__, inm); + + ifma = inm->inm_ifma; + + CTR2(KTR_IGMPV3, "%s: purging ifma %p", __func__, ifma); + KASSERT(ifma->ifma_protospec == inm, + ("%s: ifma_protospec != inm", __func__)); + ifma->ifma_protospec = NULL; + + inm_purge(inm); + + free(inm, M_IPMADDR); + + if_delmulti_ifma(ifma); +} + +/* + * Clear recorded source entries for a group. + * Used by the IGMP code. Caller must hold the IN_MULTI lock. + * FIXME: Should reap. + */ +void +inm_clear_recorded(struct in_multi *inm) +{ + struct ip_msource *ims; + + IN_MULTI_LOCK_ASSERT(); + + RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) { + if (ims->ims_stp) { + ims->ims_stp = 0; + --inm->inm_st[1].iss_rec; } + } + KASSERT(inm->inm_st[1].iss_rec == 0, + ("%s: iss_rec %d not 0", __func__, inm->inm_st[1].iss_rec)); +} - /* - * A new membership is needed; construct it and - * perform the IGMP join. - */ - ninm = malloc(sizeof(*ninm), M_IPMADDR, M_NOWAIT | M_ZERO); - if (ninm == NULL) { - if_delmulti_ifma(ifma); +/* + * Record a source as pending for a Source-Group IGMPv3 query. + * This lives here as it modifies the shared tree. 
+ * + * inm is the group descriptor. + * naddr is the address of the source to record in network-byte order. + * + * If the net.inet.igmp.sgalloc sysctl is non-zero, we will + * lazy-allocate a source node in response to an SG query. + * Otherwise, no allocation is performed. This saves some memory + * with the trade-off that the source will not be reported to the + * router if joined in the window between the query response and + * the group actually being joined on the local host. + * + * VIMAGE: XXX: Currently the igmp_sgalloc feature has been removed. + * This turns off the allocation of a recorded source entry if + * the group has not been joined. + * + * Return 0 if the source didn't exist or was already marked as recorded. + * Return 1 if the source was marked as recorded by this function. + * Return <0 if any error occurred (negated errno code). + */ +int +inm_record_source(struct in_multi *inm, const in_addr_t naddr) +{ + struct ip_msource find; + struct ip_msource *ims, *nims; + + IN_MULTI_LOCK_ASSERT(); + + find.ims_haddr = ntohl(naddr); + ims = RB_FIND(ip_msource_tree, &inm->inm_srcs, &find); + if (ims && ims->ims_stp) + return (0); + if (ims == NULL) { + if (inm->inm_nsrc == in_mcast_maxgrpsrc) + return (-ENOSPC); + nims = malloc(sizeof(struct ip_msource), M_IPMSOURCE, + M_NOWAIT | M_ZERO); + if (nims == NULL) + return (-ENOMEM); + nims->ims_haddr = find.ims_haddr; + RB_INSERT(ip_msource_tree, &inm->inm_srcs, nims); + ++inm->inm_nsrc; + ims = nims; + } + + /* + * Mark the source as recorded and update the recorded + * source count. + */ + ++ims->ims_stp; + ++inm->inm_st[1].iss_rec; + + return (1); +} + +/* + * Return a pointer to an in_msource owned by an in_mfilter, + * given its source address. + * Lazy-allocate if needed. If this is a new entry its filter state is + * undefined at t0. + * + * imf is the filter set being modified. + * psin is the source address; the lookup key is its *host* byte-order form. + * + * SMPng: May be called with locks held; malloc must not block.
+ */ +static int +imf_get_source(struct in_mfilter *imf, const struct sockaddr_in *psin, + struct in_msource **plims) +{ + struct ip_msource find; + struct ip_msource *ims, *nims; + struct in_msource *lims; + int error; + + error = 0; + ims = NULL; + lims = NULL; + + /* key is host byte order */ + find.ims_haddr = ntohl(psin->sin_addr.s_addr); + ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find); + lims = (struct in_msource *)ims; + if (lims == NULL) { + if (imf->imf_nsrc == in_mcast_maxsocksrc) + return (ENOSPC); + nims = malloc(sizeof(struct in_msource), M_INMFILTER, + M_NOWAIT | M_ZERO); + if (nims == NULL) + return (ENOMEM); + lims = (struct in_msource *)nims; + lims->ims_haddr = find.ims_haddr; + lims->imsl_st[0] = MCAST_UNDEFINED; + RB_INSERT(ip_msource_tree, &imf->imf_sources, nims); + ++imf->imf_nsrc; + } + + *plims = lims; + + return (error); +} + +/* + * Graft a source entry into an existing socket-layer filter set, + * maintaining any required invariants and checking allocations. + * + * The source is marked as being in the new filter mode at t1. + * + * Return the pointer to the new node, otherwise return NULL. + */ +static struct in_msource * +imf_graft(struct in_mfilter *imf, const uint8_t st1, + const struct sockaddr_in *psin) +{ + struct ip_msource *nims; + struct in_msource *lims; + + nims = malloc(sizeof(struct in_msource), M_INMFILTER, + M_NOWAIT | M_ZERO); + if (nims == NULL) + return (NULL); + lims = (struct in_msource *)nims; + lims->ims_haddr = ntohl(psin->sin_addr.s_addr); + lims->imsl_st[0] = MCAST_UNDEFINED; + lims->imsl_st[1] = st1; + RB_INSERT(ip_msource_tree, &imf->imf_sources, nims); + ++imf->imf_nsrc; + + return (lims); +} + +/* + * Prune a source entry from an existing socket-layer filter set, + * maintaining any required invariants and checking allocations. + * + * The source is marked as being left at t1, it is not freed. + * + * Return 0 if no error occurred, otherwise return an errno value. 
+ */ +static int +imf_prune(struct in_mfilter *imf, const struct sockaddr_in *psin) +{ + struct ip_msource find; + struct ip_msource *ims; + struct in_msource *lims; + + /* key is host byte order */ + find.ims_haddr = ntohl(psin->sin_addr.s_addr); + ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find); + if (ims == NULL) + return (ENOENT); + lims = (struct in_msource *)ims; + lims->imsl_st[1] = MCAST_UNDEFINED; + return (0); +} + +/* + * Revert socket-layer filter set deltas at t1 to t0 state. + */ +static void +imf_rollback(struct in_mfilter *imf) +{ + struct ip_msource *ims, *tims; + struct in_msource *lims; + + RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) { + lims = (struct in_msource *)ims; + if (lims->imsl_st[0] == lims->imsl_st[1]) { + /* no change at t1 */ + continue; + } else if (lims->imsl_st[0] != MCAST_UNDEFINED) { + /* revert change to existing source at t1 */ + lims->imsl_st[1] = lims->imsl_st[0]; + } else { + /* revert source added t1 */ + CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims); + RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims); + free(ims, M_INMFILTER); + imf->imf_nsrc--; + } + } + imf->imf_st[1] = imf->imf_st[0]; +} + +/* + * Mark socket-layer filter set as INCLUDE {} at t1. + */ +static void +imf_leave(struct in_mfilter *imf) +{ + struct ip_msource *ims; + struct in_msource *lims; + + RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { + lims = (struct in_msource *)ims; + lims->imsl_st[1] = MCAST_UNDEFINED; + } + imf->imf_st[1] = MCAST_INCLUDE; +} + +/* + * Mark socket-layer filter set deltas as committed. + */ +static void +imf_commit(struct in_mfilter *imf) +{ + struct ip_msource *ims; + struct in_msource *lims; + + RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { + lims = (struct in_msource *)ims; + lims->imsl_st[0] = lims->imsl_st[1]; + } + imf->imf_st[0] = imf->imf_st[1]; +} + +/* + * Reap unreferenced sources from socket-layer filter set. 
+ */ +static void +imf_reap(struct in_mfilter *imf) +{ + struct ip_msource *ims, *tims; + struct in_msource *lims; + + RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) { + lims = (struct in_msource *)ims; + if ((lims->imsl_st[0] == MCAST_UNDEFINED) && + (lims->imsl_st[1] == MCAST_UNDEFINED)) { + CTR2(KTR_IGMPV3, "%s: free lims %p", __func__, ims); + RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims); + free(ims, M_INMFILTER); + imf->imf_nsrc--; + } + } +} + +/* + * Purge socket-layer filter set. + */ +static void +imf_purge(struct in_mfilter *imf) +{ + struct ip_msource *ims, *tims; + + RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) { + CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims); + RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims); + free(ims, M_INMFILTER); + imf->imf_nsrc--; + } + imf->imf_st[0] = imf->imf_st[1] = MCAST_UNDEFINED; + KASSERT(RB_EMPTY(&imf->imf_sources), + ("%s: imf_sources not empty", __func__)); +} + +/* + * Look up a source filter entry for a multicast group. + * + * inm is the group descriptor to work with. + * haddr is the host-byte-order IPv4 address to look up. + * noalloc may be non-zero to suppress allocation of sources. + * *pims will be set to the address of the retrieved or allocated source. + * + * SMPng: NOTE: may be called with locks held. + * Return 0 if successful, otherwise return a non-zero error code. 
+ */ +static int +inm_get_source(struct in_multi *inm, const in_addr_t haddr, + const int noalloc, struct ip_msource **pims) +{ + struct ip_msource find; + struct ip_msource *ims, *nims; +#ifdef KTR + struct in_addr ia; +#endif + + find.ims_haddr = haddr; + ims = RB_FIND(ip_msource_tree, &inm->inm_srcs, &find); + if (ims == NULL && !noalloc) { + if (inm->inm_nsrc == in_mcast_maxgrpsrc) + return (ENOSPC); + nims = malloc(sizeof(struct ip_msource), M_IPMSOURCE, + M_NOWAIT | M_ZERO); + if (nims == NULL) + return (ENOMEM); + nims->ims_haddr = haddr; + RB_INSERT(ip_msource_tree, &inm->inm_srcs, nims); + ++inm->inm_nsrc; + ims = nims; +#ifdef KTR + ia.s_addr = htonl(haddr); + CTR3(KTR_IGMPV3, "%s: allocated %s as %p", __func__, + inet_ntoa(ia), ims); +#endif + } + + *pims = ims; + return (0); +} + +/* + * Merge socket-layer source into IGMP-layer source. + * If rollback is non-zero, perform the inverse of the merge. + */ +static void +ims_merge(struct ip_msource *ims, const struct in_msource *lims, + const int rollback) +{ + int n = rollback ? -1 : 1; +#ifdef KTR + struct in_addr ia; + + ia.s_addr = htonl(ims->ims_haddr); +#endif + + if (lims->imsl_st[0] == MCAST_EXCLUDE) { + CTR3(KTR_IGMPV3, "%s: t1 ex -= %d on %s", + __func__, n, inet_ntoa(ia)); + ims->ims_st[1].ex -= n; + } else if (lims->imsl_st[0] == MCAST_INCLUDE) { + CTR3(KTR_IGMPV3, "%s: t1 in -= %d on %s", + __func__, n, inet_ntoa(ia)); + ims->ims_st[1].in -= n; + } + + if (lims->imsl_st[1] == MCAST_EXCLUDE) { + CTR3(KTR_IGMPV3, "%s: t1 ex += %d on %s", + __func__, n, inet_ntoa(ia)); + ims->ims_st[1].ex += n; + } else if (lims->imsl_st[1] == MCAST_INCLUDE) { + CTR3(KTR_IGMPV3, "%s: t1 in += %d on %s", + __func__, n, inet_ntoa(ia)); + ims->ims_st[1].in += n; + } +} + +/* + * Atomically update the global in_multi state, when a membership's + * filter list is being updated in any way. + * + * imf is the per-inpcb-membership group filter pointer. + * A fake imf may be passed for in-kernel consumers. 
+ * + * XXX This is a candidate for a set-symmetric-difference style loop + * which would eliminate the repeated lookup from root of ims nodes, + * as they share the same key space. + * + * If any error occurred this function will back out of refcounts + * and return a non-zero value. + */ +static int +inm_merge(struct in_multi *inm, /*const*/ struct in_mfilter *imf) +{ + struct ip_msource *ims, *nims; + struct in_msource *lims; + int schanged, error; + int nsrc0, nsrc1; + + schanged = 0; + error = 0; + nsrc1 = nsrc0 = 0; + + /* + * Update the source filters first, as this may fail. + * Maintain count of in-mode filters at t0, t1. These are + * used to work out if we transition into ASM mode or not. + * Maintain a count of source filters whose state was + * actually modified by this operation. + */ + RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { + lims = (struct in_msource *)ims; + if (lims->imsl_st[0] == imf->imf_st[0]) nsrc0++; + if (lims->imsl_st[1] == imf->imf_st[1]) nsrc1++; + if (lims->imsl_st[0] == lims->imsl_st[1]) continue; + error = inm_get_source(inm, lims->ims_haddr, 0, &nims); + ++schanged; + if (error) break; + ims_merge(nims, lims, 0); + } + if (error) { + struct ip_msource *bims; + + RB_FOREACH_REVERSE_FROM(ims, ip_msource_tree, nims) { + lims = (struct in_msource *)ims; + if (lims->imsl_st[0] == lims->imsl_st[1]) + continue; + (void)inm_get_source(inm, lims->ims_haddr, 1, &bims); + if (bims == NULL) + continue; + ims_merge(bims, lims, 1); + } + goto out_reap; + } + + CTR3(KTR_IGMPV3, "%s: imf filters in-mode: %d at t0, %d at t1", + __func__, nsrc0, nsrc1); + + /* Handle transition between INCLUDE {n} and INCLUDE {} on socket. 
*/ + if (imf->imf_st[0] == imf->imf_st[1] && + imf->imf_st[1] == MCAST_INCLUDE) { + if (nsrc1 == 0) { + CTR1(KTR_IGMPV3, "%s: --in on inm at t1", __func__); + --inm->inm_st[1].iss_in; } - ninm->inm_addr = *ap; - ninm->inm_ifp = ifp; - ninm->inm_ifma = ifma; - ninm->inm_refcount = 1; - ifma->ifma_protospec = ninm; - LIST_INSERT_HEAD(&V_in_multihead, ninm, inm_link); + } + + /* Handle filter mode transition on socket. */ + if (imf->imf_st[0] != imf->imf_st[1]) { + CTR3(KTR_IGMPV3, "%s: imf transition %d to %d", + __func__, imf->imf_st[0], imf->imf_st[1]); + + if (imf->imf_st[0] == MCAST_EXCLUDE) { + CTR1(KTR_IGMPV3, "%s: --ex on inm at t1", __func__); + --inm->inm_st[1].iss_ex; + } else if (imf->imf_st[0] == MCAST_INCLUDE) { + CTR1(KTR_IGMPV3, "%s: --in on inm at t1", __func__); + --inm->inm_st[1].iss_in; + } + + if (imf->imf_st[1] == MCAST_EXCLUDE) { + CTR1(KTR_IGMPV3, "%s: ex++ on inm at t1", __func__); + inm->inm_st[1].iss_ex++; + } else if (imf->imf_st[1] == MCAST_INCLUDE && nsrc1 > 0) { + CTR1(KTR_IGMPV3, "%s: in++ on inm at t1", __func__); + inm->inm_st[1].iss_in++; + } + } + + /* + * Track inm filter state in terms of listener counts. + * If there are any exclusive listeners, stack-wide + * membership is exclusive. + * Otherwise, if only inclusive listeners, stack-wide is inclusive. + * If no listeners remain, state is undefined at t1, + * and the IGMP lifecycle for this group should finish. + */ + if (inm->inm_st[1].iss_ex > 0) { + CTR1(KTR_IGMPV3, "%s: transition to EX", __func__); + inm->inm_st[1].iss_fmode = MCAST_EXCLUDE; + } else if (inm->inm_st[1].iss_in > 0) { + CTR1(KTR_IGMPV3, "%s: transition to IN", __func__); + inm->inm_st[1].iss_fmode = MCAST_INCLUDE; + } else { + CTR1(KTR_IGMPV3, "%s: transition to UNDEF", __func__); + inm->inm_st[1].iss_fmode = MCAST_UNDEFINED; + } + + /* Decrement ASM listener count on transition out of ASM mode. 
*/ + if (imf->imf_st[0] == MCAST_EXCLUDE && nsrc0 == 0) { + if ((imf->imf_st[1] != MCAST_EXCLUDE) || + (imf->imf_st[1] == MCAST_EXCLUDE && nsrc1 > 0)) + CTR1(KTR_IGMPV3, "%s: --asm on inm at t1", __func__); + --inm->inm_st[1].iss_asm; + } + + /* Increment ASM listener count on transition to ASM mode. */ + if (imf->imf_st[1] == MCAST_EXCLUDE && nsrc1 == 0) { + CTR1(KTR_IGMPV3, "%s: asm++ on inm at t1", __func__); + inm->inm_st[1].iss_asm++; + } + + CTR3(KTR_IGMPV3, "%s: merged imf %p to inm %p", __func__, imf, inm); + inm_print(inm); + +out_reap: + if (schanged > 0) { + CTR1(KTR_IGMPV3, "%s: sources changed; reaping", __func__); + inm_reap(inm); + } + return (error); +} + +/* + * Mark an in_multi's filter set deltas as committed. + * Called by IGMP after a state change has been enqueued. + */ +void +inm_commit(struct in_multi *inm) +{ + struct ip_msource *ims; - igmp_joingroup(ninm); + CTR2(KTR_IGMPV3, "%s: commit inm %p", __func__, inm); + CTR1(KTR_IGMPV3, "%s: pre commit:", __func__); + inm_print(inm); - inm = ninm; - } while (0); + RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) { + ims->ims_st[0] = ims->ims_st[1]; + } + inm->inm_st[0] = inm->inm_st[1]; +} +/* + * Reap unreferenced nodes from an in_multi's filter set. + */ +static void +inm_reap(struct in_multi *inm) +{ + struct ip_msource *ims, *tims; + + RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, tims) { + if (ims->ims_st[0].ex > 0 || ims->ims_st[0].in > 0 || + ims->ims_st[1].ex > 0 || ims->ims_st[1].in > 0 || + ims->ims_stp != 0) + continue; + CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims); + RB_REMOVE(ip_msource_tree, &inm->inm_srcs, ims); + free(ims, M_IPMSOURCE); + inm->inm_nsrc--; + } +} + +/* + * Purge all source nodes from an in_multi's filter set. 
+ */ +static void +inm_purge(struct in_multi *inm) +{ + struct ip_msource *ims, *tims; + + RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, tims) { + CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims); + RB_REMOVE(ip_msource_tree, &inm->inm_srcs, ims); + free(ims, M_IPMSOURCE); + inm->inm_nsrc--; + } +} + +/* + * Join a multicast group; unlocked entry point. + * + * SMPng: XXX: in_joingroup() is called from in_control() when Giant + * is not held. Fortunately, ifp is unlikely to have been detached + * at this point, so we assume it's OK to recurse. + */ +int +in_joingroup(struct ifnet *ifp, const struct in_addr *gina, + /*const*/ struct in_mfilter *imf, struct in_multi **pinm) +{ + int error; + + IFF_LOCKGIANT(ifp); + IN_MULTI_LOCK(); + error = in_joingroup_locked(ifp, gina, imf, pinm); IN_MULTI_UNLOCK(); IFF_UNLOCKGIANT(ifp); - return (inm); + return (error); } /* - * Leave an IPv4 multicast group. - * It is OK to call this routine if the underlying ifnet went away. + * Join a multicast group; real entry point. * - * XXX: To deal with the ifp going away, we cheat; the link-layer code in net - * will set ifma_ifp to NULL when the associated ifnet instance is detached - * from the system. + * Only preserves atomicity at inm level. + * NOTE: imf argument cannot be const due to sys/tree.h limitations. * - * The only reason we need to violate layers and check ifma_ifp here at all - * is because certain hardware drivers still require Giant to be held, - * and it must always be taken before other locks. + * If the IGMP downcall fails, the group is not joined, and an error + * code is returned. 
*/ -void -in_delmulti(struct in_multi *inm) +int +in_joingroup_locked(struct ifnet *ifp, const struct in_addr *gina, + /*const*/ struct in_mfilter *imf, struct in_multi **pinm) { - struct ifnet *ifp; + struct in_mfilter timf; + struct in_multi *inm; + int error; - KASSERT(inm != NULL, ("%s: inm is NULL", __func__)); - KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__)); - ifp = inm->inm_ifma->ifma_ifp; + IN_MULTI_LOCK_ASSERT(); - if (ifp != NULL) { - /* - * Sanity check that netinet's notion of ifp is the - * same as net's. - */ - KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__)); - IFF_LOCKGIANT(ifp); + CTR4(KTR_IGMPV3, "%s: join %s on %p(%s))", __func__, + inet_ntoa(*gina), ifp, ifp->if_xname); + + error = 0; + inm = NULL; + + /* + * If no imf was specified (i.e. kernel consumer), + * fake one up and assume it is an ASM join. + */ + if (imf == NULL) { + imf_init(&timf, MCAST_UNDEFINED, MCAST_EXCLUDE); + imf = &timf; + } + + error = in_getmulti(ifp, gina, &inm); + if (error) { + CTR1(KTR_IGMPV3, "%s: in_getmulti() failure", __func__); + return (error); + } + + CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); + error = inm_merge(inm, imf); + if (error) { + CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); + goto out_inm_release; + } + + CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); + error = igmp_change_state(inm); + if (error) { + CTR1(KTR_IGMPV3, "%s: failed to update source", __func__); + goto out_inm_release; + } + +out_inm_release: + if (error) { + CTR2(KTR_IGMPV3, "%s: dropping ref on %p", __func__, inm); + inm_release_locked(inm); + } else { + *pinm = inm; } + return (error); +} + +/* + * Leave a multicast group; unlocked entry point. 
+ */ +int +in_leavegroup(struct in_multi *inm, /*const*/ struct in_mfilter *imf) +{ + struct ifnet *ifp; + int detached, error; + + detached = inm_is_ifp_detached(inm); + ifp = inm->inm_ifp; + if (!detached) + IFF_LOCKGIANT(ifp); + IN_MULTI_LOCK(); - in_delmulti_locked(inm); + error = in_leavegroup_locked(inm, imf); IN_MULTI_UNLOCK(); - if (ifp != NULL) + if (!detached) IFF_UNLOCKGIANT(ifp); + + return (error); } /* - * Delete a multicast address record, with locks held. + * Leave a multicast group; real entry point. + * All source filters will be expunged. + * + * Only preserves atomicity at inm level. * - * It is OK to call this routine if the ifp went away. - * Assumes that caller holds the IN_MULTI lock, and that - * Giant was taken before other locks if required by the hardware. + * Holding the write lock for the INP which contains imf + * is highly advisable. We can't assert for it as imf does not + * contain a back-pointer to the owning inp. + * + * Note: This is not the same as inm_release(*) as this function also + * makes a state change downcall into IGMP. 
*/ -void -in_delmulti_locked(struct in_multi *inm) +int +in_leavegroup_locked(struct in_multi *inm, /*const*/ struct in_mfilter *imf) { - struct ifmultiaddr *ifma; - - IN_MULTI_LOCK_ASSERT(); - KASSERT(inm->inm_refcount >= 1, ("%s: freeing freed inm", __func__)); + struct in_mfilter timf; + int error; - if (--inm->inm_refcount == 0) { - igmp_leavegroup(inm); + error = 0; - ifma = inm->inm_ifma; -#ifdef DIAGNOSTIC - if (bootverbose) - printf("%s: purging ifma %p\n", __func__, ifma); +#if defined(INVARIANTS) && defined(IFF_ASSERTGIANT) + if (!inm_is_ifp_detached(inm)) + IFF_ASSERTGIANT(inm->inm_ifp); #endif - KASSERT(ifma->ifma_protospec == inm, - ("%s: ifma_protospec != inm", __func__)); - ifma->ifma_protospec = NULL; - LIST_REMOVE(inm, inm_link); - free(inm, M_IPMADDR); + IN_MULTI_LOCK_ASSERT(); - if_delmulti_ifma(ifma); + CTR5(KTR_IGMPV3, "%s: leave inm %p, %s/%s, imf %p", __func__, + inm, inet_ntoa(inm->inm_addr), + (inm_is_ifp_detached(inm) ? "null" : inm->inm_ifp->if_xname), + imf); + + /* + * If no imf was specified (i.e. kernel consumer), + * fake one up and assume it is an ASM leave. + */ + if (imf == NULL) { + imf_init(&timf, MCAST_EXCLUDE, MCAST_UNDEFINED); + imf = &timf; } + + /* + * Begin state merge transaction at IGMP layer. + * + * As this particular invocation should not cause any memory + * to be allocated, and there is no opportunity to roll back + * the transaction, it MUST NOT fail. + */ + CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); + error = inm_merge(inm, imf); + KASSERT(error == 0, ("%s: failed to merge inm state", __func__)); + + CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); + error = igmp_change_state(inm); + if (error) + CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); + + CTR2(KTR_IGMPV3, "%s: dropping ref on %p", __func__, inm); + inm_release_locked(inm); + + return (error); +} + +/*#ifndef BURN_BRIDGES*/ +/* + * Join an IPv4 multicast group in (*,G) exclusive mode. 
+ * The group must be a 224.0.0.0/24 link-scope group. + * This KPI is for legacy kernel consumers only. + */ +struct in_multi * +in_addmulti(struct in_addr *ap, struct ifnet *ifp) +{ + struct in_multi *pinm; + int error; + + KASSERT(IN_LOCAL_GROUP(ntohl(ap->s_addr)), + ("%s: %s not in 224.0.0.0/24", __func__, inet_ntoa(*ap))); + + error = in_joingroup(ifp, ap, NULL, &pinm); + if (error != 0) + pinm = NULL; + + return (pinm); } /* - * Block or unblock an ASM/SSM multicast source on an inpcb. + * Leave an IPv4 multicast group, assumed to be in exclusive (*,G) mode. + * This KPI is for legacy kernel consumers only. + */ +void +in_delmulti(struct in_multi *inm) +{ + + (void)in_leavegroup(inm, NULL); +} +/*#endif*/ + +/* + * Block or unblock an ASM multicast source on an inpcb. + * This implements the delta-based API described in RFC 3678. + * + * The delta-based API applies only to exclusive-mode memberships. + * An IGMP downcall will be performed. + * + * SMPng: NOTE: Must take Giant as a join may create a new ifma. + * + * Return 0 if successful, otherwise return an appropriate error code. 
*/ static int -inp_change_source_filter(struct inpcb *inp, struct sockopt *sopt) +inp_block_unblock_source(struct inpcb *inp, struct sockopt *sopt) { INIT_VNET_NET(curvnet); INIT_VNET_INET(curvnet); @@ -485,13 +1318,14 @@ inp_change_source_filter(struct inpcb *inp, struct sockopt *sopt) struct in_mfilter *imf; struct ip_moptions *imo; struct in_msource *ims; + struct in_multi *inm; size_t idx; - int error; - int block; + uint16_t fmode; + int error, doblock; ifp = NULL; error = 0; - block = 0; + doblock = 0; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; @@ -516,18 +1350,14 @@ inp_change_source_filter(struct inpcb *inp, struct sockopt *sopt) ssa->sin.sin_len = sizeof(struct sockaddr_in); ssa->sin.sin_addr = mreqs.imr_sourceaddr; - if (mreqs.imr_interface.s_addr != INADDR_ANY) + if (!in_nullhost(mreqs.imr_interface)) INADDR_TO_IFP(mreqs.imr_interface, ifp); if (sopt->sopt_name == IP_BLOCK_SOURCE) - block = 1; + doblock = 1; -#ifdef DIAGNOSTIC - if (bootverbose) { - printf("%s: imr_interface = %s, ifp = %p\n", - __func__, inet_ntoa(mreqs.imr_interface), ifp); - } -#endif + CTR3(KTR_IGMPV3, "%s: imr_interface = %s, ifp = %p", + __func__, inet_ntoa(mreqs.imr_interface), ifp); break; } @@ -553,24 +1383,21 @@ inp_change_source_filter(struct inpcb *inp, struct sockopt *sopt) ifp = ifnet_byindex(gsr.gsr_interface); if (sopt->sopt_name == MCAST_BLOCK_SOURCE) - block = 1; + doblock = 1; break; default: -#ifdef DIAGNOSTIC - if (bootverbose) { - printf("%s: unknown sopt_name %d\n", __func__, - sopt->sopt_name); - } -#endif + CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d", + __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } - /* XXX INET6 */ if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); + IFF_LOCKGIANT(ifp); + /* * Check if we are actually a member of this group. 
*/ @@ -578,103 +1405,97 @@ inp_change_source_filter(struct inpcb *inp, struct sockopt *sopt) idx = imo_match_group(imo, ifp, &gsa->sa); if (idx == -1 || imo->imo_mfilters == NULL) { error = EADDRNOTAVAIL; - goto out_locked; + goto out_inp_locked; } KASSERT(imo->imo_mfilters != NULL, ("%s: imo_mfilters not allocated", __func__)); imf = &imo->imo_mfilters[idx]; + inm = imo->imo_membership[idx]; /* - * SSM multicast truth table for block/unblock operations. - * - * Operation Filter Mode Entry exists? Action - * - * block exclude no add source to filter - * unblock include no add source to filter - * block include no EINVAL - * unblock exclude no EINVAL - * block exclude yes EADDRNOTAVAIL - * unblock include yes EADDRNOTAVAIL - * block include yes remove source from filter - * unblock exclude yes remove source from filter - * - * FreeBSD does not explicitly distinguish between ASM and SSM - * mode sockets; all sockets are assumed to have a filter list. + * Attempting to use the delta-based API on a + * non-exclusive-mode membership is an error. */ -#ifdef DIAGNOSTIC - if (bootverbose) { - printf("%s: imf_fmode is %s\n", __func__, - imf->imf_fmode == MCAST_INCLUDE ? "include" : "exclude"); + fmode = imf->imf_st[0]; + if (fmode != MCAST_EXCLUDE) { + error = EINVAL; + goto out_inp_locked; } -#endif + + /* + * Deal with error cases up-front: + * Asked to block, but already blocked; or + * Asked to unblock, but nothing to unblock. + * If adding a new block entry, allocate it. 
+ */ ims = imo_match_source(imo, idx, &ssa->sa); - if (ims == NULL) { - if ((block == 1 && imf->imf_fmode == MCAST_EXCLUDE) || - (block == 0 && imf->imf_fmode == MCAST_INCLUDE)) { -#ifdef DIAGNOSTIC - if (bootverbose) { - printf("%s: adding %s to filter list\n", - __func__, inet_ntoa(ssa->sin.sin_addr)); - } -#endif - error = imo_join_source(imo, idx, ssa); - } - if ((block == 1 && imf->imf_fmode == MCAST_INCLUDE) || - (block == 0 && imf->imf_fmode == MCAST_EXCLUDE)) { - /* - * If the socket is in inclusive mode: - * the source is already blocked as it has no entry. - * If the socket is in exclusive mode: - * the source is already unblocked as it has no entry. - */ -#ifdef DIAGNOSTIC - if (bootverbose) { - printf("%s: ims %p; %s already [un]blocked\n", - __func__, ims, - inet_ntoa(ssa->sin.sin_addr)); - } -#endif - error = EINVAL; - } + if ((ims != NULL && doblock) || (ims == NULL && !doblock)) { + CTR3(KTR_IGMPV3, "%s: source %s %spresent", __func__, + inet_ntoa(ssa->sin.sin_addr), doblock ? "" : "not "); + error = EADDRNOTAVAIL; + goto out_inp_locked; + } + + INP_WLOCK_ASSERT(inp); + + /* + * Begin state merge transaction at socket layer. + */ + if (doblock) { + CTR2(KTR_IGMPV3, "%s: %s source", __func__, "block"); + ims = imf_graft(imf, fmode, &ssa->sin); + if (ims == NULL) + error = ENOMEM; } else { - if ((block == 1 && imf->imf_fmode == MCAST_EXCLUDE) || - (block == 0 && imf->imf_fmode == MCAST_INCLUDE)) { - /* - * If the socket is in exclusive mode: - * the source is already blocked as it has an entry. - * If the socket is in inclusive mode: - * the source is already unblocked as it has an entry. 
- */ -#ifdef DIAGNOSTIC - if (bootverbose) { - printf("%s: ims %p; %s already [un]blocked\n", - __func__, ims, - inet_ntoa(ssa->sin.sin_addr)); - } -#endif - error = EADDRNOTAVAIL; - } - if ((block == 1 && imf->imf_fmode == MCAST_INCLUDE) || - (block == 0 && imf->imf_fmode == MCAST_EXCLUDE)) { -#ifdef DIAGNOSTIC - if (bootverbose) { - printf("%s: removing %s from filter list\n", - __func__, inet_ntoa(ssa->sin.sin_addr)); - } -#endif - error = imo_leave_source(imo, idx, ssa); - } + CTR2(KTR_IGMPV3, "%s: %s source", __func__, "allow"); + error = imf_prune(imf, &ssa->sin); + } + + if (error) { + CTR1(KTR_IGMPV3, "%s: merge imf state failed", __func__); + goto out_imf_rollback; + } + + /* + * Begin state merge transaction at IGMP layer. + */ + IN_MULTI_LOCK(); + + CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); + error = inm_merge(inm, imf); + if (error) { + CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); + goto out_imf_rollback; } -out_locked: + CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); + error = igmp_change_state(inm); + if (error) + CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); + + IN_MULTI_UNLOCK(); + +out_imf_rollback: + if (error) + imf_rollback(imf); + else + imf_commit(imf); + + imf_reap(imf); + +out_inp_locked: INP_WUNLOCK(inp); + IFF_UNLOCKGIANT(ifp); return (error); } /* * Given an inpcb, return its multicast options structure pointer. Accepts * an unlocked inpcb pointer, but will return it locked. May sleep. + * + * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held. + * SMPng: NOTE: Returns with the INP write lock held. 
*/ static struct ip_moptions * inp_findmoptions(struct inpcb *inp) @@ -690,13 +1511,11 @@ inp_findmoptions(struct inpcb *inp) INP_WUNLOCK(inp); - imo = (struct ip_moptions *)malloc(sizeof(*imo), M_IPMOPTS, - M_WAITOK); - immp = (struct in_multi **)malloc(sizeof(*immp) * IP_MIN_MEMBERSHIPS, - M_IPMOPTS, M_WAITOK | M_ZERO); - imfp = (struct in_mfilter *)malloc( - sizeof(struct in_mfilter) * IP_MIN_MEMBERSHIPS, - M_IPMSOURCE, M_WAITOK); + imo = malloc(sizeof(*imo), M_IPMOPTS, M_WAITOK); + immp = malloc(sizeof(*immp) * IP_MIN_MEMBERSHIPS, M_IPMOPTS, + M_WAITOK | M_ZERO); + imfp = malloc(sizeof(struct in_mfilter) * IP_MIN_MEMBERSHIPS, + M_INMFILTER, M_WAITOK); imo->imo_multicast_ifp = NULL; imo->imo_multicast_addr.s_addr = INADDR_ANY; @@ -708,16 +1527,13 @@ inp_findmoptions(struct inpcb *inp) imo->imo_membership = immp; /* Initialize per-group source filters. */ - for (idx = 0; idx < IP_MIN_MEMBERSHIPS; idx++) { - imfp[idx].imf_fmode = MCAST_EXCLUDE; - imfp[idx].imf_nsources = 0; - TAILQ_INIT(&imfp[idx].imf_sources); - } + for (idx = 0; idx < IP_MIN_MEMBERSHIPS; idx++) + imf_init(&imfp[idx], MCAST_UNDEFINED, MCAST_EXCLUDE); imo->imo_mfilters = imfp; INP_WLOCK(inp); if (inp->inp_moptions != NULL) { - free(imfp, M_IPMSOURCE); + free(imfp, M_INMFILTER); free(immp, M_IPMOPTS); free(imo, M_IPMOPTS); return (inp->inp_moptions); @@ -728,35 +1544,29 @@ inp_findmoptions(struct inpcb *inp) /* * Discard the IP multicast options (and source filters). + * + * SMPng: NOTE: assumes INP write lock is held. 
*/ void inp_freemoptions(struct ip_moptions *imo) { struct in_mfilter *imf; - struct in_msource *ims, *tims; size_t idx, nmships; KASSERT(imo != NULL, ("%s: ip_moptions is NULL", __func__)); nmships = imo->imo_num_memberships; for (idx = 0; idx < nmships; ++idx) { - in_delmulti(imo->imo_membership[idx]); - - if (imo->imo_mfilters != NULL) { - imf = &imo->imo_mfilters[idx]; - TAILQ_FOREACH_SAFE(ims, &imf->imf_sources, - ims_next, tims) { - TAILQ_REMOVE(&imf->imf_sources, ims, ims_next); - free(ims, M_IPMSOURCE); - imf->imf_nsources--; - } - KASSERT(imf->imf_nsources == 0, - ("%s: did not free all imf_nsources", __func__)); - } + imf = imo->imo_mfilters ? &imo->imo_mfilters[idx] : NULL; + if (imf) + imf_leave(imf); + (void)in_leavegroup(imo->imo_membership[idx], imf); + if (imf) + imf_purge(imf); } - if (imo->imo_mfilters != NULL) - free(imo->imo_mfilters, M_IPMSOURCE); + if (imo->imo_mfilters) + free(imo->imo_mfilters, M_INMFILTER); free(imo->imo_membership, M_IPMOPTS); free(imo, M_IPMOPTS); } @@ -774,11 +1584,13 @@ inp_get_source_filters(struct inpcb *inp, struct sockopt *sopt) struct ifnet *ifp; struct ip_moptions *imo; struct in_mfilter *imf; - struct in_msource *ims; + struct ip_msource *ims; + struct in_msource *lims; + struct sockaddr_in *psin; struct sockaddr_storage *ptss; struct sockaddr_storage *tss; int error; - size_t idx; + size_t idx, nsrcs, ncsrcs; INP_WLOCK_ASSERT(inp); @@ -810,36 +1622,52 @@ inp_get_source_filters(struct inpcb *inp, struct sockopt *sopt) INP_WUNLOCK(inp); return (EADDRNOTAVAIL); } - imf = &imo->imo_mfilters[idx]; - msfr.msfr_fmode = imf->imf_fmode; - msfr.msfr_nsrcs = imf->imf_nsources; + + /* + * Ignore memberships which are in limbo. + */ + if (imf->imf_st[1] == MCAST_UNDEFINED) { + INP_WUNLOCK(inp); + return (EAGAIN); + } + msfr.msfr_fmode = imf->imf_st[1]; /* * If the user specified a buffer, copy out the source filter * entries to userland gracefully. 
- * msfr.msfr_nsrcs is always set to the total number of filter - * entries which the kernel currently has for this group. + * We only copy out the number of entries which userland + * has asked for, but we always tell userland how big the + * buffer really needs to be. */ tss = NULL; if (msfr.msfr_srcs != NULL && msfr.msfr_nsrcs > 0) { - /* - * Make a copy of the source vector so that we do not - * thrash the inpcb lock whilst copying it out. - * We only copy out the number of entries which userland - * has asked for, but we always tell userland how big the - * buffer really needs to be. - */ tss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, - M_TEMP, M_NOWAIT); + M_TEMP, M_NOWAIT | M_ZERO); if (tss == NULL) { - error = ENOBUFS; - } else { - ptss = tss; - TAILQ_FOREACH(ims, &imf->imf_sources, ims_next) { - memcpy(ptss++, &ims->ims_addr, - sizeof(struct sockaddr_storage)); - } + INP_WUNLOCK(inp); + return (ENOBUFS); + } + } + + /* + * Count number of sources in-mode at t0. + * If buffer space exists and remains, copy out source entries. 
+ */ + nsrcs = msfr.msfr_nsrcs; + ncsrcs = 0; + ptss = tss; + RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { + lims = (struct in_msource *)ims; + if (lims->imsl_st[0] == MCAST_UNDEFINED || + lims->imsl_st[0] != imf->imf_st[0]) + continue; + ++ncsrcs; + if (tss != NULL && nsrcs-- > 0) { + psin = (struct sockaddr_in *)ptss++; + psin->sin_family = AF_INET; + psin->sin_len = sizeof(struct sockaddr_in); + psin->sin_addr.s_addr = htonl(lims->ims_haddr); } } @@ -849,11 +1677,11 @@ inp_get_source_filters(struct inpcb *inp, struct sockopt *sopt) error = copyout(tss, msfr.msfr_srcs, sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs); free(tss, M_TEMP); + if (error) + return (error); } - if (error) - return (error); - + msfr.msfr_nsrcs = ncsrcs; error = sooptcopyout(sopt, &msfr, sizeof(struct __msfilterreq)); return (error); @@ -901,7 +1729,7 @@ inp_getmoptions(struct inpcb *inp, struct sockopt *sopt) memset(&mreqn, 0, sizeof(struct ip_mreqn)); if (imo != NULL) { ifp = imo->imo_multicast_ifp; - if (imo->imo_multicast_addr.s_addr != INADDR_ANY) { + if (!in_nullhost(imo->imo_multicast_addr)) { mreqn.imr_address = imo->imo_multicast_addr; } else if (ifp != NULL) { mreqn.imr_ifindex = ifp->if_index; @@ -967,6 +1795,73 @@ inp_getmoptions(struct inpcb *inp, struct sockopt *sopt) } /* + * Look up the ifnet to use for a multicast group membership, + * given the IPv4 address of an interface, and the IPv4 group address. + * + * This routine exists to support legacy multicast applications + * which do not understand that multicast memberships are scoped to + * specific physical links in the networking stack, or which need + * to join link-scope groups before IPv4 addresses are configured. + * + * If inp is non-NULL, use this socket's current FIB number for any + * required FIB lookup. + * If ina is INADDR_ANY, look up the group address in the unicast FIB, + * and use its ifp; usually, this points to the default next-hop. 
+ * + * If the FIB lookup fails, attempt to use the first non-loopback + * interface with multicast capability in the system as a + * last resort. The legacy IPv4 ASM API requires that we do + * this in order to allow groups to be joined when the routing + * table has not yet been populated during boot. + * + * Returns NULL if no ifp could be found. + * + * SMPng: TODO: Acquire the appropriate locks for INADDR_TO_IFP. + * FUTURE: Implement IPv4 source-address selection. + */ +static struct ifnet * +inp_lookup_mcast_ifp(const struct inpcb *inp, + const struct sockaddr_in *gsin, const struct in_addr ina) +{ + struct ifnet *ifp; + + KASSERT(gsin->sin_family == AF_INET, ("%s: not AF_INET", __func__)); + KASSERT(IN_MULTICAST(ntohl(gsin->sin_addr.s_addr)), + ("%s: not multicast", __func__)); + + ifp = NULL; + if (!in_nullhost(ina)) { + INADDR_TO_IFP(ina, ifp); + } else { + struct route ro; + + ro.ro_rt = NULL; + memcpy(&ro.ro_dst, gsin, sizeof(struct sockaddr_in)); + in_rtalloc_ign(&ro, 0, inp ? inp->inp_inc.inc_fibnum : 0); + if (ro.ro_rt != NULL) { + ifp = ro.ro_rt->rt_ifp; + KASSERT(ifp != NULL, ("%s: null ifp", __func__)); + RTFREE(ro.ro_rt); + } else { + struct in_ifaddr *ia; + struct ifnet *mifp; + + mifp = NULL; + TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { + mifp = ia->ia_ifp; + if (!(mifp->if_flags & IFF_LOOPBACK) && + (mifp->if_flags & IFF_MULTICAST)) { + ifp = mifp; + break; + } + } + } + } + + return (ifp); +} + +/* * Join an IPv4 multicast group, possibly with a source. 
*/ static int @@ -980,11 +1875,14 @@ inp_join_group(struct inpcb *inp, struct sockopt *sopt) struct in_mfilter *imf; struct ip_moptions *imo; struct in_multi *inm; + struct in_msource *lims; size_t idx; - int error; + int error, is_new; ifp = NULL; + imf = NULL; error = 0; + is_new = 0; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; @@ -1025,52 +1923,10 @@ inp_join_group(struct inpcb *inp, struct sockopt *sopt) ssa->sin.sin_addr = mreqs.imr_sourceaddr; } - /* - * Obtain ifp. If no interface address was provided, - * use the interface of the route in the unicast FIB for - * the given multicast destination; usually, this is the - * default route. - * If this lookup fails, attempt to use the first non-loopback - * interface with multicast capability in the system as a - * last resort. The legacy IPv4 ASM API requires that we do - * this in order to allow groups to be joined when the routing - * table has not yet been populated during boot. - * If all of these conditions fail, return EADDRNOTAVAIL, and - * reject the IPv4 multicast join. 
- */ - if (mreqs.imr_interface.s_addr != INADDR_ANY) { - INADDR_TO_IFP(mreqs.imr_interface, ifp); - } else { - struct route ro; - - ro.ro_rt = NULL; - *(struct sockaddr_in *)&ro.ro_dst = gsa->sin; - in_rtalloc_ign(&ro, 0, - inp->inp_inc.inc_fibnum); - if (ro.ro_rt != NULL) { - ifp = ro.ro_rt->rt_ifp; - KASSERT(ifp != NULL, ("%s: null ifp", - __func__)); - RTFREE(ro.ro_rt); - } else { - struct in_ifaddr *ia; - struct ifnet *mfp = NULL; - TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { - mfp = ia->ia_ifp; - if (!(mfp->if_flags & IFF_LOOPBACK) && - (mfp->if_flags & IFF_MULTICAST)) { - ifp = mfp; - break; - } - } - } - } -#ifdef DIAGNOSTIC - if (bootverbose) { - printf("%s: imr_interface = %s, ifp = %p\n", - __func__, inet_ntoa(mreqs.imr_interface), ifp); - } -#endif + ifp = inp_lookup_mcast_ifp(inp, &gsa->sin, + mreqs.imr_interface); + CTR3(KTR_IGMPV3, "%s: imr_interface = %s, ifp = %p", + __func__, inet_ntoa(mreqs.imr_interface), ifp); break; } @@ -1095,7 +1951,6 @@ inp_join_group(struct inpcb *inp, struct sockopt *sopt) /* * Overwrite the port field if present, as the sockaddr * being copied in may be matched with a binary comparison. - * XXX INET6 */ gsa->sin.sin_port = 0; if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) { @@ -1105,22 +1960,14 @@ inp_join_group(struct inpcb *inp, struct sockopt *sopt) ssa->sin.sin_port = 0; } - /* - * Obtain the ifp. 
- */ if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); - break; default: -#ifdef DIAGNOSTIC - if (bootverbose) { - printf("%s: unknown sopt_name %d\n", __func__, - sopt->sopt_name); - } -#endif + CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d", + __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } @@ -1131,96 +1978,131 @@ inp_join_group(struct inpcb *inp, struct sockopt *sopt) if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) return (EADDRNOTAVAIL); + IFF_LOCKGIANT(ifp); + /* - * Check if we already hold membership of this group for this inpcb. - * If so, we do not need to perform the initial join. + * MCAST_JOIN_SOURCE on an exclusive membership is an error. + * On an existing inclusive membership, it just adds the + * source to the filter list. */ imo = inp_findmoptions(inp); idx = imo_match_group(imo, ifp, &gsa->sa); - if (idx != -1) { - if (ssa->ss.ss_family != AF_UNSPEC) { - /* - * Attempting to join an ASM group (when already - * an ASM or SSM member) is an error. - */ + if (idx == -1) { + is_new = 1; + } else { + inm = imo->imo_membership[idx]; + imf = &imo->imo_mfilters[idx]; + if (ssa->ss.ss_family != AF_UNSPEC && + imf->imf_st[1] != MCAST_INCLUDE) { + error = EINVAL; + goto out_inp_locked; + } + lims = imo_match_source(imo, idx, &ssa->sa); + if (lims != NULL) { error = EADDRNOTAVAIL; - } else { - imf = &imo->imo_mfilters[idx]; - if (imf->imf_nsources == 0) { - /* - * Attempting to join an SSM group (when - * already an ASM member) is an error. - */ - error = EINVAL; - } else { - /* - * Attempting to join an SSM group (when - * already an SSM member) means "add this - * source to the inclusive filter list". - */ - error = imo_join_source(imo, idx, ssa); - } + goto out_inp_locked; } - goto out_locked; } /* - * Call imo_grow() to reallocate the membership and source filter - * vectors if they are full. 
If the size would exceed the hard limit, - * then we know we've really run out of entries. We keep the INP - * lock held to avoid introducing a race condition. + * Begin state merge transaction at socket layer. */ - if (imo->imo_num_memberships == imo->imo_max_memberships) { - error = imo_grow(imo); - if (error) - goto out_locked; + INP_WLOCK_ASSERT(inp); + + if (is_new) { + if (imo->imo_num_memberships == imo->imo_max_memberships) { + error = imo_grow(imo); + if (error) + goto out_inp_locked; + } + /* + * Allocate the new slot upfront so we can deal with + * grafting the new source filter in same code path + * as for join-source on existing membership. + */ + idx = imo->imo_num_memberships; + imo->imo_membership[idx] = NULL; + imo->imo_num_memberships++; + KASSERT(imo->imo_mfilters != NULL, + ("%s: imf_mfilters vector was not allocated", __func__)); + imf = &imo->imo_mfilters[idx]; + KASSERT(RB_EMPTY(&imf->imf_sources), + ("%s: imf_sources not empty", __func__)); } /* - * So far, so good: perform the layer 3 join, layer 2 join, - * and make an IGMP announcement if needed. + * Graft new source into filter list for this inpcb's + * membership of the group. The in_multi may not have + * been allocated yet if this is a new membership. 
*/ - inm = in_addmulti(&gsa->sin.sin_addr, ifp); - if (inm == NULL) { - error = ENOBUFS; - goto out_locked; + if (ssa->ss.ss_family != AF_UNSPEC) { + /* Membership starts in IN mode */ + if (is_new) { + CTR1(KTR_IGMPV3, "%s: new join w/source", __func__); + imf_init(imf, MCAST_UNDEFINED, MCAST_INCLUDE); + } else { + CTR2(KTR_IGMPV3, "%s: %s source", __func__, "allow"); + } + lims = imf_graft(imf, MCAST_INCLUDE, &ssa->sin); + if (lims == NULL) { + CTR1(KTR_IGMPV3, "%s: merge imf state failed", + __func__); + error = ENOMEM; + goto out_imo_free; + } } - idx = imo->imo_num_memberships; - imo->imo_membership[idx] = inm; - imo->imo_num_memberships++; - - KASSERT(imo->imo_mfilters != NULL, - ("%s: imf_mfilters vector was not allocated", __func__)); - imf = &imo->imo_mfilters[idx]; - KASSERT(TAILQ_EMPTY(&imf->imf_sources), - ("%s: imf_sources not empty", __func__)); /* - * If this is a new SSM group join (i.e. a source was specified - * with this group), add this source to the filter list. + * Begin state merge transaction at IGMP layer. */ - if (ssa->ss.ss_family != AF_UNSPEC) { - /* - * An initial SSM join implies that this socket's membership - * of the multicast group is now in inclusive mode. - */ - imf->imf_fmode = MCAST_INCLUDE; + IN_MULTI_LOCK(); - error = imo_join_source(imo, idx, ssa); + if (is_new) { + error = in_joingroup_locked(ifp, &gsa->sin.sin_addr, imf, + &inm); + if (error) + goto out_imo_free; + imo->imo_membership[idx] = inm; + } else { + CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); + error = inm_merge(inm, imf); if (error) { - /* - * Drop inp lock before calling in_delmulti(), - * to prevent a lock order reversal. 
- */ - --imo->imo_num_memberships; - INP_WUNLOCK(inp); - in_delmulti(inm); - return (error); + CTR1(KTR_IGMPV3, "%s: failed to merge inm state", + __func__); + goto out_imf_rollback; + } + CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); + error = igmp_change_state(inm); + if (error) { + CTR1(KTR_IGMPV3, "%s: failed igmp downcall", + __func__); + goto out_imf_rollback; } } -out_locked: + IN_MULTI_UNLOCK(); + +out_imf_rollback: + INP_WLOCK_ASSERT(inp); + if (error) { + imf_rollback(imf); + if (is_new) + imf_purge(imf); + else + imf_reap(imf); + } else { + imf_commit(imf); + } + +out_imo_free: + if (error && is_new) { + imo->imo_membership[idx] = NULL; + --imo->imo_num_memberships; + } + +out_inp_locked: INP_WUNLOCK(inp); + IFF_UNLOCKGIANT(ifp); return (error); } @@ -1238,13 +2120,14 @@ inp_leave_group(struct inpcb *inp, struct sockopt *sopt) struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; - struct in_msource *ims, *tims; + struct in_msource *ims; struct in_multi *inm; size_t idx; - int error; + int error, is_final; ifp = NULL; error = 0; + is_final = 1; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; @@ -1284,15 +2167,12 @@ inp_leave_group(struct inpcb *inp, struct sockopt *sopt) ssa->sin.sin_addr = mreqs.imr_sourceaddr; } - if (gsa->sin.sin_addr.s_addr != INADDR_ANY) + if (!in_nullhost(gsa->sin.sin_addr)) INADDR_TO_IFP(mreqs.imr_interface, ifp); -#ifdef DIAGNOSTIC - if (bootverbose) { - printf("%s: imr_interface = %s, ifp = %p\n", - __func__, inet_ntoa(mreqs.imr_interface), ifp); - } -#endif + CTR3(KTR_IGMPV3, "%s: imr_interface = %s, ifp = %p", + __func__, inet_ntoa(mreqs.imr_interface), ifp); + break; case MCAST_LEAVE_GROUP: @@ -1326,12 +2206,8 @@ inp_leave_group(struct inpcb *inp, struct sockopt *sopt) break; default: -#ifdef DIAGNOSTIC - if (bootverbose) { - printf("%s: unknown sopt_name %d\n", __func__, - sopt->sopt_name); - } -#endif + CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d", + __func__, 
sopt->sopt_name); return (EOPNOTSUPP); break; } @@ -1339,6 +2215,9 @@ inp_leave_group(struct inpcb *inp, struct sockopt *sopt) if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); + if (ifp) + IFF_LOCKGIANT(ifp); + /* * Find the membership in the membership array. */ @@ -1346,66 +2225,95 @@ inp_leave_group(struct inpcb *inp, struct sockopt *sopt) idx = imo_match_group(imo, ifp, &gsa->sa); if (idx == -1) { error = EADDRNOTAVAIL; - goto out_locked; + goto out_inp_locked; } + inm = imo->imo_membership[idx]; imf = &imo->imo_mfilters[idx]; + if (ssa->ss.ss_family != AF_UNSPEC) + is_final = 0; + + /* + * Begin state merge transaction at socket layer. + */ + INP_WLOCK_ASSERT(inp); + /* * If we were instructed only to leave a given source, do so. + * MCAST_LEAVE_SOURCE_GROUP is only valid for inclusive memberships. */ - if (ssa->ss.ss_family != AF_UNSPEC) { - if (imf->imf_nsources == 0 || - imf->imf_fmode == MCAST_EXCLUDE) { - /* - * Attempting to SSM leave an ASM group - * is an error; should use *_BLOCK_SOURCE instead. - * Attempting to SSM leave a source in a group when - * the socket is in 'exclude mode' is also an error. - */ - error = EINVAL; - } else { - error = imo_leave_source(imo, idx, ssa); + if (is_final) { + imf_leave(imf); + } else { + if (imf->imf_st[0] == MCAST_EXCLUDE) { + error = EADDRNOTAVAIL; + goto out_inp_locked; + } + ims = imo_match_source(imo, idx, &ssa->sa); + if (ims == NULL) { + CTR3(KTR_IGMPV3, "%s: source %s %spresent", __func__, + inet_ntoa(ssa->sin.sin_addr), "not "); + error = EADDRNOTAVAIL; + goto out_inp_locked; + } + CTR2(KTR_IGMPV3, "%s: %s source", __func__, "block"); + error = imf_prune(imf, &ssa->sin); + if (error) { + CTR1(KTR_IGMPV3, "%s: merge imf state failed", + __func__); + goto out_inp_locked; } - /* - * If an error occurred, or this source is not the last - * source in the group, do not leave the whole group. 
- */ - if (error || imf->imf_nsources > 0) - goto out_locked; } /* - * Give up the multicast address record to which the membership points. + * Begin state merge transaction at IGMP layer. */ - inm = imo->imo_membership[idx]; - in_delmulti(inm); + IN_MULTI_LOCK(); - /* - * Free any source filters for this group if they exist. - * Revert inpcb to the default MCAST_EXCLUDE state. - */ - if (imo->imo_mfilters != NULL) { - TAILQ_FOREACH_SAFE(ims, &imf->imf_sources, ims_next, tims) { - TAILQ_REMOVE(&imf->imf_sources, ims, ims_next); - free(ims, M_IPMSOURCE); - imf->imf_nsources--; + if (is_final) { + /* + * Give up the multicast address record to which + * the membership points. + */ + (void)in_leavegroup_locked(inm, imf); + } else { + CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); + error = inm_merge(inm, imf); + if (error) { + CTR1(KTR_IGMPV3, "%s: failed to merge inm state", + __func__); + goto out_imf_rollback; + } + + CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); + error = igmp_change_state(inm); + if (error) { + CTR1(KTR_IGMPV3, "%s: failed igmp downcall", + __func__); } - KASSERT(imf->imf_nsources == 0, - ("%s: imf_nsources not 0", __func__)); - KASSERT(TAILQ_EMPTY(&imf->imf_sources), - ("%s: imf_sources not empty", __func__)); - imf->imf_fmode = MCAST_EXCLUDE; } - /* - * Remove the gap in the membership array. - */ - for (++idx; idx < imo->imo_num_memberships; ++idx) - imo->imo_membership[idx-1] = imo->imo_membership[idx]; - imo->imo_num_memberships--; + IN_MULTI_UNLOCK(); + +out_imf_rollback: + if (error) + imf_rollback(imf); + else + imf_commit(imf); + + imf_reap(imf); -out_locked: + if (is_final) { + /* Remove the gap in the membership array. 
*/ + for (++idx; idx < imo->imo_num_memberships; ++idx) + imo->imo_membership[idx-1] = imo->imo_membership[idx]; + imo->imo_num_memberships--; + } + +out_inp_locked: INP_WUNLOCK(inp); + if (ifp) + IFF_UNLOCKGIANT(ifp); return (error); } @@ -1456,19 +2364,15 @@ inp_set_multicast_if(struct inpcb *inp, struct sockopt *sopt) sizeof(struct in_addr)); if (error) return (error); - if (addr.s_addr == INADDR_ANY) { + if (in_nullhost(addr)) { ifp = NULL; } else { INADDR_TO_IFP(addr, ifp); if (ifp == NULL) return (EADDRNOTAVAIL); } -#ifdef DIAGNOSTIC - if (bootverbose) { - printf("%s: ifp = %p, addr = %s\n", - __func__, ifp, inet_ntoa(addr)); /* XXX INET6 */ - } -#endif + CTR3(KTR_IGMPV3, "%s: ifp = %p, addr = %s", __func__, ifp, + inet_ntoa(addr)); } /* Reject interfaces which do not support multicast. */ @@ -1485,6 +2389,8 @@ inp_set_multicast_if(struct inpcb *inp, struct sockopt *sopt) /* * Atomically set source filters on a socket for an IPv4 multicast group. + * + * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held. */ static int inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt) @@ -1495,7 +2401,7 @@ inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt) struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; - struct in_msource *ims, *tims; + struct in_multi *inm; size_t idx; int error; @@ -1504,7 +2410,7 @@ inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt) if (error) return (error); - if (msfr.msfr_nsrcs > IP_MAX_SOURCE_FILTER || + if (msfr.msfr_nsrcs > in_mcast_maxsocksrc || (msfr.msfr_fmode != MCAST_EXCLUDE && msfr.msfr_fmode != MCAST_INCLUDE)) return (EINVAL); @@ -1526,62 +2432,44 @@ inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt) if (ifp == NULL) return (EADDRNOTAVAIL); + IFF_LOCKGIANT(ifp); + /* - * Take the INP lock. + * Take the INP write lock. * Check if this socket is a member of this group. 
*/ imo = inp_findmoptions(inp); idx = imo_match_group(imo, ifp, &gsa->sa); if (idx == -1 || imo->imo_mfilters == NULL) { error = EADDRNOTAVAIL; - goto out_locked; + goto out_inp_locked; } + inm = imo->imo_membership[idx]; imf = &imo->imo_mfilters[idx]; -#ifdef DIAGNOSTIC - if (bootverbose) - printf("%s: clearing source list\n", __func__); -#endif - /* - * Remove any existing source filters. + * Begin state merge transaction at socket layer. */ - TAILQ_FOREACH_SAFE(ims, &imf->imf_sources, ims_next, tims) { - TAILQ_REMOVE(&imf->imf_sources, ims, ims_next); - free(ims, M_IPMSOURCE); - imf->imf_nsources--; - } - KASSERT(imf->imf_nsources == 0, - ("%s: source list not cleared", __func__)); + INP_WLOCK_ASSERT(inp); + + imf->imf_st[1] = msfr.msfr_fmode; /* * Apply any new source filters, if present. + * Make a copy of the user-space source vector so + * that we may copy them with a single copyin. This + * allows us to deal with page faults up-front. */ if (msfr.msfr_nsrcs > 0) { - struct in_msource **pnims; - struct in_msource *nims; - struct sockaddr_storage *kss; - struct sockaddr_storage *pkss; - sockunion_t *psu; - int i, j; + struct in_msource *lims; + struct sockaddr_in *psin; + struct sockaddr_storage *kss, *pkss; + int i; - /* - * Drop the inp lock so we may sleep if we need to - * in order to satisfy a malloc request. - * We will re-take it before changing socket state. - */ INP_WUNLOCK(inp); -#ifdef DIAGNOSTIC - if (bootverbose) { - printf("%s: loading %lu source list entries\n", - __func__, (unsigned long)msfr.msfr_nsrcs); - } -#endif - /* - * Make a copy of the user-space source vector so - * that we may copy them with a single copyin. This - * allows us to deal with page faults up-front. 
- */ + + CTR2(KTR_IGMPV3, "%s: loading %lu source list entries", + __func__, (unsigned long)msfr.msfr_nsrcs); kss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, M_TEMP, M_WAITOK); error = copyin(msfr.msfr_srcs, kss, @@ -1591,103 +2479,79 @@ inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt) return (error); } - /* - * Perform argument checking on every sockaddr_storage - * structure in the vector provided to us. Overwrite - * fields which should not apply to source entries. - * TODO: Check for duplicate sources on this pass. - */ - psu = (sockunion_t *)kss; - for (i = 0; i < msfr.msfr_nsrcs; i++, psu++) { - switch (psu->ss.ss_family) { - case AF_INET: - if (psu->sin.sin_len != - sizeof(struct sockaddr_in)) { - error = EINVAL; - } else { - psu->sin.sin_port = 0; - } - break; -#ifdef notyet - case AF_INET6; - if (psu->sin6.sin6_len != - sizeof(struct sockaddr_in6)) { - error = EINVAL; - } else { - psu->sin6.sin6_port = 0; - psu->sin6.sin6_flowinfo = 0; - } - break; -#endif - default: - error = EAFNOSUPPORT; - break; - } - if (error) - break; - } - if (error) { - free(kss, M_TEMP); - return (error); - } + INP_WLOCK(inp); /* - * Allocate a block to track all the in_msource - * entries we are about to allocate, in case we - * abruptly need to free them. + * Mark all source filters as UNDEFINED at t1. + * Restore new group filter mode, as imf_leave() + * will set it to INCLUDE. */ - pnims = malloc(sizeof(struct in_msource *) * msfr.msfr_nsrcs, - M_TEMP, M_WAITOK | M_ZERO); + imf_leave(imf); + imf->imf_st[1] = msfr.msfr_fmode; /* - * Allocate up to nsrcs individual chunks. - * If we encounter an error, backtrack out of - * all allocations cleanly; updates must be atomic. + * Update socket layer filters at t1, lazy-allocating + * new entries. This saves a bunch of memory at the + * cost of one RB_FIND() per source entry; duplicate + * entries in the msfr_nsrcs vector are ignored. + * If we encounter an error, rollback transaction. 
+ * + * XXX This too could be replaced with a set-symmetric + * difference like loop to avoid walking from root + * every time, as the key space is common. */ - pkss = kss; - nims = NULL; - for (i = 0; i < msfr.msfr_nsrcs; i++, pkss++) { - nims = malloc(sizeof(struct in_msource) * - msfr.msfr_nsrcs, M_IPMSOURCE, M_WAITOK | M_ZERO); - pnims[i] = nims; - } - if (i < msfr.msfr_nsrcs) { - for (j = 0; j < i; j++) { - if (pnims[j] != NULL) - free(pnims[j], M_IPMSOURCE); + for (i = 0, pkss = kss; i < msfr.msfr_nsrcs; i++, pkss++) { + psin = (struct sockaddr_in *)pkss; + if (psin->sin_family != AF_INET) { + error = EAFNOSUPPORT; + break; } - free(pnims, M_TEMP); - free(kss, M_TEMP); - return (ENOBUFS); - } - - INP_UNLOCK_ASSERT(inp); - - /* - * Finally, apply the filters to the socket. - * Re-take the inp lock; we are changing socket state. - */ - pkss = kss; - INP_WLOCK(inp); - for (i = 0; i < msfr.msfr_nsrcs; i++, pkss++) { - memcpy(&(pnims[i]->ims_addr), pkss, - sizeof(struct sockaddr_storage)); - TAILQ_INSERT_TAIL(&imf->imf_sources, pnims[i], - ims_next); - imf->imf_nsources++; + if (psin->sin_len != sizeof(struct sockaddr_in)) { + error = EINVAL; + break; + } + error = imf_get_source(imf, psin, &lims); + if (error) + break; + lims->imsl_st[1] = imf->imf_st[1]; } - free(pnims, M_TEMP); free(kss, M_TEMP); } + if (error) + goto out_imf_rollback; + + INP_WLOCK_ASSERT(inp); + IN_MULTI_LOCK(); + /* - * Update the filter mode on the socket before releasing the inpcb. + * Begin state merge transaction at IGMP layer. 
*/ - INP_WLOCK_ASSERT(inp); - imf->imf_fmode = msfr.msfr_fmode; + CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); + error = inm_merge(inm, imf); + if (error) { + CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); + goto out_imf_rollback; + } + + CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); + error = igmp_change_state(inm); + if (error) + CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); + + IN_MULTI_UNLOCK(); + +out_imf_rollback: + if (error) + imf_rollback(imf); + else + imf_commit(imf); -out_locked: + imf_reap(imf); + +out_inp_locked: INP_WUNLOCK(inp); + IFF_UNLOCKGIANT(ifp); return (error); } @@ -1699,6 +2563,10 @@ out_locked: * it is not possible to merge the duplicate code, because the idempotence * of the IPv4 multicast part of the BSD Sockets API must be preserved; * the effects of these options must be treated as separate and distinct. + * + * SMPng: XXX: Unlocked read of inp_socket believed OK. + * FUTURE: The IP_MULTICAST_VIF option may be eliminated if MROUTING + * is refactored to no longer use vifs. */ int inp_setmoptions(struct inpcb *inp, struct sockopt *sopt) @@ -1711,11 +2579,10 @@ inp_setmoptions(struct inpcb *inp, struct sockopt *sopt) /* * If socket is neither of type SOCK_RAW or SOCK_DGRAM, * or is a divert socket, reject it. - * XXX Unlocked read of inp_socket believed OK. 
*/ if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT || (inp->inp_socket->so_proto->pr_type != SOCK_RAW && - inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) + inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) return (EOPNOTSUPP); switch (sopt->sopt_name) { @@ -1826,7 +2693,7 @@ inp_setmoptions(struct inpcb *inp, struct sockopt *sopt) case IP_UNBLOCK_SOURCE: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: - error = inp_change_source_filter(inp, sopt); + error = inp_block_unblock_source(inp, sopt); break; case IP_MSFILTER: @@ -1842,3 +2709,183 @@ inp_setmoptions(struct inpcb *inp, struct sockopt *sopt) return (error); } + +/* + * Expose IGMP's multicast filter mode and source list(s) to userland, + * keyed by (ifindex, group). + * The filter mode is written out as a uint32_t, followed by + * 0..n of struct in_addr. + * For use by ifmcstat(8). + * SMPng: NOTE: unlocked read of ifindex space. + */ +static int +sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS) +{ + INIT_VNET_NET(curvnet); + struct in_addr src, group; + struct ifnet *ifp; + struct ifmultiaddr *ifma; + struct in_multi *inm; + struct ip_msource *ims; + int *name; + int retval; + u_int namelen; + uint32_t fmode, ifindex; + + name = (int *)arg1; + namelen = arg2; + + if (req->newptr != NULL) + return (EPERM); + + if (namelen != 2) + return (EINVAL); + + ifindex = name[0]; + if (ifindex <= 0 || ifindex > V_if_index) { + CTR2(KTR_IGMPV3, "%s: ifindex %u out of range", + __func__, ifindex); + return (ENOENT); + } + + group.s_addr = name[1]; + if (!IN_MULTICAST(ntohl(group.s_addr))) { + CTR2(KTR_IGMPV3, "%s: group %s is not multicast", + __func__, inet_ntoa(group)); + return (EINVAL); + } + + ifp = ifnet_byindex(ifindex); + if (ifp == NULL) { + CTR2(KTR_IGMPV3, "%s: no ifp for ifindex %u", + __func__, ifindex); + return (ENOENT); + } + + retval = sysctl_wire_old_buffer(req, + sizeof(uint32_t) + (in_mcast_maxgrpsrc * sizeof(struct in_addr))); + if (retval) + return (retval); + + IN_MULTI_LOCK(); + + 
IF_ADDR_LOCK(ifp); + TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { + if (ifma->ifma_addr->sa_family != AF_INET || + ifma->ifma_protospec == NULL) + continue; + inm = (struct in_multi *)ifma->ifma_protospec; + if (!in_hosteq(inm->inm_addr, group)) + continue; + fmode = inm->inm_st[1].iss_fmode; + retval = SYSCTL_OUT(req, &fmode, sizeof(uint32_t)); + if (retval != 0) + break; + RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) { +#ifdef KTR + struct in_addr ina; + ina.s_addr = htonl(ims->ims_haddr); + CTR2(KTR_IGMPV3, "%s: visit node %s", __func__, + inet_ntoa(ina)); +#endif + /* + * Only copy-out sources which are in-mode. + */ + if (fmode != ims_get_mode(inm, ims, 1)) { + CTR1(KTR_IGMPV3, "%s: skip non-in-mode", + __func__); + continue; + } + src.s_addr = htonl(ims->ims_haddr); + retval = SYSCTL_OUT(req, &src, sizeof(struct in_addr)); + if (retval != 0) + break; + } + } + IF_ADDR_UNLOCK(ifp); + + IN_MULTI_UNLOCK(); + + return (retval); +} + +#ifdef KTR + +static const char *inm_modestrs[] = { "un", "in", "ex" }; + +static const char * +inm_mode_str(const int mode) +{ + + if (mode >= MCAST_UNDEFINED && mode <= MCAST_EXCLUDE) + return (inm_modestrs[mode]); + return ("??"); +} + +static const char *inm_statestrs[] = { + "not-member", + "silent", + "idle", + "lazy", + "sleeping", + "awakening", + "query-pending", + "sg-query-pending", + "leaving" +}; + +static const char * +inm_state_str(const int state) +{ + + if (state >= IGMP_NOT_MEMBER && state <= IGMP_LEAVING_MEMBER) + return (inm_statestrs[state]); + return ("??"); +} + +/* + * Dump an in_multi structure to the console. 
+ */ +void +inm_print(const struct in_multi *inm) +{ + int t; + + printf("%s: --- begin inm %p ---\n", __func__, inm); + printf("addr %s ifp %p(%s) ifma %p\n", + inet_ntoa(inm->inm_addr), + inm->inm_ifp, + inm->inm_ifp->if_xname, + inm->inm_ifma); + printf("timer %u state %s refcount %u scq.len %u\n", + inm->inm_timer, + inm_state_str(inm->inm_state), + inm->inm_refcount, + inm->inm_scq.ifq_len); + printf("igi %p nsrc %lu sctimer %u scrv %u\n", + inm->inm_igi, + inm->inm_nsrc, + inm->inm_sctimer, + inm->inm_scrv); + for (t = 0; t < 2; t++) { + printf("t%d: fmode %s asm %u ex %u in %u rec %u\n", t, + inm_mode_str(inm->inm_st[t].iss_fmode), + inm->inm_st[t].iss_asm, + inm->inm_st[t].iss_ex, + inm->inm_st[t].iss_in, + inm->inm_st[t].iss_rec); + } + printf("%s: --- end inm %p ---\n", __func__, inm); +} + +#else /* !KTR */ + +void +inm_print(const struct in_multi *inm) +{ + +} + +#endif /* KTR */ + +RB_GENERATE(ip_msource_tree, ip_msource, ims_link, ip_msource_cmp); diff --git a/sys/netinet/in_proto.c b/sys/netinet/in_proto.c index 8957c3d..c25f6eb 100644 --- a/sys/netinet/in_proto.c +++ b/sys/netinet/in_proto.c @@ -206,7 +206,6 @@ struct protosw inetsw[] = { .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = igmp_input, .pr_ctloutput = rip_ctloutput, - .pr_init = igmp_init, .pr_fasttimo = igmp_fasttimo, .pr_slowtimo = igmp_slowtimo, .pr_usrreqs = &rip_usrreqs diff --git a/sys/netinet/in_var.h b/sys/netinet/in_var.h index 73868ad..39964cf 100644 --- a/sys/netinet/in_var.h +++ b/sys/netinet/in_var.h @@ -35,6 +35,20 @@ #include <sys/queue.h> #include <sys/fnv_hash.h> +#include <sys/tree.h> + +struct igmp_ifinfo; +struct in_multi; +struct lltable; + +/* + * IPv4 per-interface state. + */ +struct in_ifinfo { + struct lltable *ii_llt; /* ARP state */ + struct igmp_ifinfo *ii_igmp; /* IGMP state */ + struct in_multi *ii_allhosts; /* 224.0.0.1 membership */ +}; /* * Interface address, Internet version. 
One of these structures @@ -151,77 +165,163 @@ do { \ (((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK) /* - * This information should be part of the ifnet structure but we don't wish - * to change that - as it might break a number of things + * Legacy IPv4 IGMP per-link structure. */ - struct router_info { struct ifnet *rti_ifp; int rti_type; /* type of router which is querier on this interface */ int rti_time; /* # of slow timeouts since last old query */ SLIST_ENTRY(router_info) rti_list; -#ifdef notyet - int rti_timev1; /* IGMPv1 querier present */ - int rti_timev2; /* IGMPv2 querier present */ - int rti_timer; /* report to general query */ - int rti_qrv; /* querier robustness */ -#endif }; /* - * Internet multicast address structure. There is one of these for each IP - * multicast group to which this host belongs on a given network interface. - * For every entry on the interface's if_multiaddrs list which represents - * an IP multicast group, there is one of these structures. They are also - * kept on a system-wide list to make it easier to keep our legacy IGMP code - * compatible with the rest of the world (see IN_FIRST_MULTI et al, below). + * Per-interface IGMP router version information. 
+ */ +struct igmp_ifinfo { + LIST_ENTRY(igmp_ifinfo) igi_link; + struct ifnet *igi_ifp; /* interface this instance belongs to */ + uint32_t igi_version; /* IGMPv3 Host Compatibility Mode */ + uint32_t igi_v1_timer; /* IGMPv1 Querier Present timer (s) */ + uint32_t igi_v2_timer; /* IGMPv2 Querier Present timer (s) */ + uint32_t igi_v3_timer; /* IGMPv3 General Query (interface) timer (s)*/ + uint32_t igi_flags; /* IGMP per-interface flags */ + uint32_t igi_rv; /* IGMPv3 Robustness Variable */ + uint32_t igi_qi; /* IGMPv3 Query Interval (s) */ + uint32_t igi_qri; /* IGMPv3 Query Response Interval (s) */ + uint32_t igi_uri; /* IGMPv3 Unsolicited Report Interval (s) */ + SLIST_HEAD(,in_multi) igi_relinmhead; /* released groups */ + struct ifqueue igi_gq; /* queue of general query responses */ +}; + +#define IGIF_SILENT 0x00000001 /* Do not use IGMP on this ifp */ +#define IGIF_LOOPBACK 0x00000002 /* Send IGMP reports to loopback */ + +/* + * IPv4 multicast IGMP-layer source entry. + */ +struct ip_msource { + RB_ENTRY(ip_msource) ims_link; /* RB tree links */ + in_addr_t ims_haddr; /* host byte order */ + struct ims_st { + uint16_t ex; /* # of exclusive members */ + uint16_t in; /* # of inclusive members */ + } ims_st[2]; /* state at t0, t1 */ + uint8_t ims_stp; /* pending query */ +}; + +/* + * IPv4 multicast PCB-layer source entry. + */ +struct in_msource { + RB_ENTRY(ip_msource) ims_link; /* RB tree links */ + in_addr_t ims_haddr; /* host byte order */ + uint8_t imsl_st[2]; /* state before/at commit */ +}; + +RB_HEAD(ip_msource_tree, ip_msource); /* define struct ip_msource_tree */ + +static __inline int +ip_msource_cmp(const struct ip_msource *a, const struct ip_msource *b) +{ + + if (a->ims_haddr < b->ims_haddr) + return (-1); + if (a->ims_haddr == b->ims_haddr) + return (0); + return (1); +} +RB_PROTOTYPE(ip_msource_tree, ip_msource, ims_link, ip_msource_cmp); + +/* + * IPv4 multicast PCB-layer group filter descriptor. 
+ */ +struct in_mfilter { + struct ip_msource_tree imf_sources; /* source list for (S,G) */ + u_long imf_nsrc; /* # of source entries */ + uint8_t imf_st[2]; /* state before/at commit */ +}; + +/* + * IPv4 group descriptor. + * + * For every entry on an ifnet's if_multiaddrs list which represents + * an IP multicast group, there is one of these structures. + * + * If any source filters are present, then a node will exist in the RB-tree + * to permit fast lookup by source whenever an operation takes place. + * This permits pre-order traversal when we issue reports. + * Source filter trees are kept separately from the socket layer to + * greatly simplify locking. + * + * When IGMPv3 is active, inm_timer is the response to group query timer. + * The state-change timer inm_sctimer is separate; whenever state changes + * for the group the state change record is generated and transmitted, + * and kept if retransmissions are necessary. + * + * FUTURE: inm_link is now only used when groups are being purged + * on a detaching ifnet. It could be demoted to a SLIST_ENTRY, but + * because it is at the very start of the struct, we can't do this + * w/o breaking the ABI for ifmcstat. 
*/ struct in_multi { - LIST_ENTRY(in_multi) inm_link; /* queue macro glue */ + LIST_ENTRY(in_multi) inm_link; /* to-be-released by in_ifdetach */ struct in_addr inm_addr; /* IP multicast address, convenience */ struct ifnet *inm_ifp; /* back pointer to ifnet */ struct ifmultiaddr *inm_ifma; /* back pointer to ifmultiaddr */ - u_int inm_timer; /* IGMP membership report timer */ - u_int inm_state; /* state of the membership */ - struct router_info *inm_rti; /* router info*/ + u_int inm_timer; /* IGMPv1/v2 group / v3 query timer */ + u_int inm_state; /* state of the membership */ + void *inm_rti; /* unused, legacy field */ u_int inm_refcount; /* reference count */ -#ifdef notyet /* IGMPv3 source-specific multicast fields */ - TAILQ_HEAD(, in_msfentry) inm_msf; /* all active source filters */ - TAILQ_HEAD(, in_msfentry) inm_msf_record; /* recorded sources */ - TAILQ_HEAD(, in_msfentry) inm_msf_exclude; /* exclude sources */ - TAILQ_HEAD(, in_msfentry) inm_msf_include; /* include sources */ - /* XXX: should this lot go to the router_info structure? */ - /* XXX: can/should these be callouts? */ - /* IGMP protocol timers */ - int32_t inm_ti_curstate; /* current state timer */ - int32_t inm_ti_statechg; /* state change timer */ - /* IGMP report timers */ - uint16_t inm_rpt_statechg; /* state change report timer */ - uint16_t inm_rpt_toxx; /* fmode change report timer */ - /* IGMP protocol state */ - uint16_t inm_fmode; /* filter mode */ - uint32_t inm_recsrc_count; /* # of recorded sources */ - uint16_t inm_exclude_sock_count; /* # of exclude-mode sockets */ - uint16_t inm_gass_count; /* # of g-a-s queries */ -#endif + + /* New fields for IGMPv3 follow. 
*/ + struct igmp_ifinfo *inm_igi; /* IGMP info */ + SLIST_ENTRY(in_multi) inm_nrele; /* to-be-released by IGMP */ + struct ip_msource_tree inm_srcs; /* tree of sources */ + u_long inm_nsrc; /* # of tree entries */ + + struct ifqueue inm_scq; /* queue of pending + * state-change packets */ + struct timeval inm_lastgsrtv; /* Time of last G-S-R query */ + uint16_t inm_sctimer; /* state-change timer */ + uint16_t inm_scrv; /* state-change rexmit count */ + + /* + * SSM state counters which track state at T0 (the time the last + * state-change report's RV timer went to zero) and T1 + * (time of pending report, i.e. now). + * Used for computing IGMPv3 state-change reports. Several refcounts + * are maintained here to optimize for common use-cases. + */ + struct inm_st { + uint16_t iss_fmode; /* IGMP filter mode */ + uint16_t iss_asm; /* # of ASM listeners */ + uint16_t iss_ex; /* # of exclusive members */ + uint16_t iss_in; /* # of inclusive members */ + uint16_t iss_rec; /* # of recorded sources */ + } inm_st[2]; /* state at t0, t1 */ }; -#ifdef notyet /* - * Internet multicast source filter list. This list is used to store - * IP multicast source addresses for each membership on an interface. - * TODO: Allocate these structures using UMA. - * TODO: Find an easier way of linking the struct into two lists at once. + * Helper function to derive the filter mode on a source entry + * from its internal counters. Predicates are: + * A source is only excluded if all listeners exclude it. + * A source is only included if no listeners exclude it, + * and at least one listener includes it. + * May be used by ifmcstat(8). 
*/ -struct in_msfentry { - TAILQ_ENTRY(in_msfentry) isf_link; /* next filter in all-list */ - TAILQ_ENTRY(in_msfentry) isf_next; /* next filter in queue */ - struct in_addr isf_addr; /* the address of this source */ - uint16_t isf_refcount; /* reference count */ - uint16_t isf_reporttag; /* what to report to the IGMP router */ - uint16_t isf_rexmit; /* retransmission state/count */ -}; -#endif +static __inline uint8_t +ims_get_mode(const struct in_multi *inm, const struct ip_msource *ims, + uint8_t t) +{ + + t = !!t; + if (inm->inm_st[t].iss_ex > 0 && + inm->inm_st[t].iss_ex == ims->ims_st[t].ex) + return (MCAST_EXCLUDE); + else if (ims->ims_st[t].in > 0 && ims->ims_st[t].ex == 0) + return (MCAST_INCLUDE); + return (MCAST_UNDEFINED); +} #ifdef _KERNEL @@ -231,10 +331,10 @@ SYSCTL_DECL(_net_inet_ip); SYSCTL_DECL(_net_inet_raw); #endif -LIST_HEAD(in_multihead, in_multi); +LIST_HEAD(in_multihead, in_multi); /* XXX unused */ #ifdef VIMAGE_GLOBALS extern struct in_multihead in_multihead; -#endif +#endif /* BURN_BRIDGES */ /* * Lock macros for IPv4 layer multicast address lists. IPv4 lock goes @@ -246,74 +346,90 @@ extern struct mtx in_multi_mtx; #define IN_MULTI_LOCK() mtx_lock(&in_multi_mtx) #define IN_MULTI_UNLOCK() mtx_unlock(&in_multi_mtx) #define IN_MULTI_LOCK_ASSERT() mtx_assert(&in_multi_mtx, MA_OWNED) +#define IN_MULTI_UNLOCK_ASSERT() mtx_assert(&in_multi_mtx, MA_NOTOWNED) /* - * Structure used by macros below to remember position when stepping through - * all of the in_multi records. + * Function for looking up an in_multi record for an IPv4 multicast address + * on a given interface. ifp must be valid. If no record found, return NULL. + * The IN_MULTI_LOCK and IF_ADDR_LOCK on ifp must be held. 
*/ -struct in_multistep { - struct in_multi *i_inm; -}; +static __inline struct in_multi * +inm_lookup_locked(struct ifnet *ifp, const struct in_addr ina) +{ + struct ifmultiaddr *ifma; + struct in_multi *inm; + + IN_MULTI_LOCK_ASSERT(); + IF_ADDR_LOCK_ASSERT(ifp); + + inm = NULL; + TAILQ_FOREACH(ifma, &((ifp)->if_multiaddrs), ifma_link) { + if (ifma->ifma_addr->sa_family == AF_INET) { + inm = (struct in_multi *)ifma->ifma_protospec; + if (inm->inm_addr.s_addr == ina.s_addr) + break; + inm = NULL; + } + } + return (inm); +} /* - * Macro for looking up the in_multi record for a given IP multicast address - * on a given interface. If no matching record is found, "inm" is set null. + * Wrapper for inm_lookup_locked(). + * The IF_ADDR_LOCK will be taken on ifp and released on return. */ -#define IN_LOOKUP_MULTI(addr, ifp, inm) \ - /* struct in_addr addr; */ \ - /* struct ifnet *ifp; */ \ - /* struct in_multi *inm; */ \ -do { \ - struct ifmultiaddr *ifma; \ -\ - IN_MULTI_LOCK_ASSERT(); \ - IF_ADDR_LOCK(ifp); \ - TAILQ_FOREACH(ifma, &((ifp)->if_multiaddrs), ifma_link) { \ - if (ifma->ifma_addr->sa_family == AF_INET \ - && ((struct sockaddr_in *)ifma->ifma_addr)->sin_addr.s_addr == \ - (addr).s_addr) \ - break; \ - } \ - (inm) = ifma ? ifma->ifma_protospec : 0; \ - IF_ADDR_UNLOCK(ifp); \ -} while(0) +static __inline struct in_multi * +inm_lookup(struct ifnet *ifp, const struct in_addr ina) +{ + struct in_multi *inm; + + IN_MULTI_LOCK_ASSERT(); + IF_ADDR_LOCK(ifp); + inm = inm_lookup_locked(ifp, ina); + IF_ADDR_UNLOCK(ifp); + + return (inm); +} + +/* Acquire an in_multi record. */ +static __inline void +inm_acquire_locked(struct in_multi *inm) +{ + + IN_MULTI_LOCK_ASSERT(); + ++inm->inm_refcount; +} /* - * Macro to step through all of the in_multi records, one at a time. - * The current position is remembered in "step", which the caller must - * provide. IN_FIRST_MULTI(), below, must be called to initialize "step" - * and get the first record. 
Both macros return a NULL "inm" when there - * are no remaining records. + * Return values for imo_multi_filter(). */ -#define IN_NEXT_MULTI(step, inm) \ - /* struct in_multistep step; */ \ - /* struct in_multi *inm; */ \ -do { \ - IN_MULTI_LOCK_ASSERT(); \ - if (((inm) = (step).i_inm) != NULL) \ - (step).i_inm = LIST_NEXT((step).i_inm, inm_link); \ -} while(0) - -#define IN_FIRST_MULTI(step, inm) \ - /* struct in_multistep step; */ \ - /* struct in_multi *inm; */ \ -do { \ - IN_MULTI_LOCK_ASSERT(); \ - (step).i_inm = LIST_FIRST(&V_in_multihead); \ - IN_NEXT_MULTI((step), (inm)); \ -} while(0) +#define MCAST_PASS 0 /* Pass */ +#define MCAST_NOTGMEMBER 1 /* This host not a member of group */ +#define MCAST_NOTSMEMBER 2 /* This host excluded source */ +#define MCAST_MUTED 3 /* [deprecated] */ struct rtentry; struct route; struct ip_moptions; -size_t imo_match_group(struct ip_moptions *, struct ifnet *, - struct sockaddr *); -struct in_msource *imo_match_source(struct ip_moptions *, size_t, - struct sockaddr *); -struct in_multi *in_addmulti(struct in_addr *, struct ifnet *); +int imo_multi_filter(const struct ip_moptions *, const struct ifnet *, + const struct sockaddr *, const struct sockaddr *); +void inm_commit(struct in_multi *); +void inm_clear_recorded(struct in_multi *); +void inm_print(const struct in_multi *); +int inm_record_source(struct in_multi *inm, const in_addr_t); +void inm_release(struct in_multi *); +void inm_release_locked(struct in_multi *); +struct in_multi * + in_addmulti(struct in_addr *, struct ifnet *); void in_delmulti(struct in_multi *); -void in_delmulti_locked(struct in_multi *); +int in_joingroup(struct ifnet *, const struct in_addr *, + /*const*/ struct in_mfilter *, struct in_multi **); +int in_joingroup_locked(struct ifnet *, const struct in_addr *, + /*const*/ struct in_mfilter *, struct in_multi **); +int in_leavegroup(struct in_multi *, /*const*/ struct in_mfilter *); +int in_leavegroup_locked(struct in_multi *, + /*const*/ struct 
in_mfilter *); int in_control(struct socket *, u_long, caddr_t, struct ifnet *, struct thread *); void in_rtqdrain(void); diff --git a/sys/netinet/ip_input.c b/sys/netinet/ip_input.c index dcf2825..a75ee72 100644 --- a/sys/netinet/ip_input.c +++ b/sys/netinet/ip_input.c @@ -592,7 +592,6 @@ passin: return; } if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { - struct in_multi *inm; if (V_ip_mrouter) { /* * If we are acting as a multicast router, all @@ -619,17 +618,10 @@ passin: V_ipstat.ips_forward++; } /* - * See if we belong to the destination multicast group on the - * arrival interface. + * Assume the packet is for us, to avoid prematurely taking + * a lock on the in_multi hash. Protocols must perform + * their own filtering and update statistics accordingly. */ - IN_MULTI_LOCK(); - IN_LOOKUP_MULTI(ip->ip_dst, m->m_pkthdr.rcvif, inm); - IN_MULTI_UNLOCK(); - if (inm == NULL) { - V_ipstat.ips_notmember++; - m_freem(m); - return; - } goto ours; } if (ip->ip_dst.s_addr == (u_long)INADDR_BROADCAST) diff --git a/sys/netinet/ip_var.h b/sys/netinet/ip_var.h index f023051..b68f481 100644 --- a/sys/netinet/ip_var.h +++ b/sys/netinet/ip_var.h @@ -82,25 +82,6 @@ struct ipoption { }; /* - * Multicast source list entry. - */ -struct in_msource { - TAILQ_ENTRY(in_msource) ims_next; /* next source */ - struct sockaddr_storage ims_addr; /* address of this source */ -}; - -/* - * Multicast filter descriptor; there is one instance per group membership - * on a socket, allocated as an expandable vector hung off ip_moptions. - * struct in_multi contains separate IPv4-stack-wide state for IGMPv3. - */ -struct in_mfilter { - uint16_t imf_fmode; /* filter mode for this socket/group */ - uint16_t imf_nsources; /* # of sources for this socket/group */ - TAILQ_HEAD(, in_msource) imf_sources; /* source list */ -}; - -/* * Structure attached to inpcb.ip_moptions and * passed to ip_output when IP multicast options are in use. * This structure is lazy-allocated. 
diff --git a/sys/netinet/raw_ip.c b/sys/netinet/raw_ip.c index 8eb20cc..b536eb7 100644 --- a/sys/netinet/raw_ip.c +++ b/sys/netinet/raw_ip.c @@ -251,6 +251,7 @@ void rip_input(struct mbuf *m, int off) { INIT_VNET_INET(curvnet); + struct ifnet *ifp; struct ip *ip = mtod(m, struct ip *); int proto = ip->ip_p; struct inpcb *inp, *last; @@ -262,6 +263,9 @@ rip_input(struct mbuf *m, int off) ripsrc.sin_family = AF_INET; ripsrc.sin_addr = ip->ip_src; last = NULL; + + ifp = m->m_pkthdr.rcvif; + hash = INP_PCBHASH_RAW(proto, ip->ip_src.s_addr, ip->ip_dst.s_addr, V_ripcbinfo.ipi_hashmask); INP_INFO_RLOCK(&V_ripcbinfo); @@ -277,8 +281,14 @@ rip_input(struct mbuf *m, int off) continue; if (inp->inp_faddr.s_addr != ip->ip_src.s_addr) continue; - if (prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0) - continue; + if (jailed(inp->inp_cred)) { + /* + * XXX: If faddr was bound to multicast group, + * jailed raw socket will drop datagram. + */ + if (prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0) + continue; + } if (last != NULL) { struct mbuf *n; @@ -299,14 +309,46 @@ rip_input(struct mbuf *m, int off) if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif - if (inp->inp_laddr.s_addr && - inp->inp_laddr.s_addr != ip->ip_dst.s_addr) - continue; - if (inp->inp_faddr.s_addr && - inp->inp_faddr.s_addr != ip->ip_src.s_addr) + if (!in_nullhost(inp->inp_laddr) && + !in_hosteq(inp->inp_laddr, ip->ip_dst)) continue; - if (prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0) + if (!in_nullhost(inp->inp_faddr) && + !in_hosteq(inp->inp_faddr, ip->ip_src)) continue; + if (jailed(inp->inp_cred)) { + /* + * Allow raw socket in jail to receive multicast; + * assume process had PRIV_NETINET_RAW at attach, + * and fall through into normal filter path if so. 
+ */ + if (!IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && + prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0) + continue; + } + /* + * If this raw socket has multicast state, and we + * have received a multicast, check if this socket + * should receive it, as multicast filtering is now + * the responsibility of the transport layer. + */ + if (inp->inp_moptions != NULL && + IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { + struct sockaddr_in group; + int blocked; + + bzero(&group, sizeof(struct sockaddr_in)); + group.sin_len = sizeof(struct sockaddr_in); + group.sin_family = AF_INET; + group.sin_addr = ip->ip_dst; + + blocked = imo_multi_filter(inp->inp_moptions, ifp, + (struct sockaddr *)&group, + (struct sockaddr *)&ripsrc); + if (blocked != MCAST_PASS) { + V_ipstat.ips_notmember++; + continue; + } + } if (last != NULL) { struct mbuf *n; diff --git a/sys/netinet/udp_usrreq.c b/sys/netinet/udp_usrreq.c index 804f5fe..33df73e 100644 --- a/sys/netinet/udp_usrreq.c +++ b/sys/netinet/udp_usrreq.c @@ -413,12 +413,6 @@ udp_input(struct mbuf *m, int off) if (inp->inp_faddr.s_addr != INADDR_ANY && inp->inp_faddr.s_addr != ip->ip_src.s_addr) continue; - /* - * XXX: Do not check source port of incoming datagram - * unless inp_connect() has been called to bind the - * fport part of the 4-tuple; the source could be - * trying to talk to us with an ephemeral port. - */ if (inp->inp_fport != 0 && inp->inp_fport != uh->uh_sport) continue; @@ -432,54 +426,23 @@ udp_input(struct mbuf *m, int off) imo = inp->inp_moptions; if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && imo != NULL) { - struct sockaddr_in sin; - struct in_msource *ims; - int blocked, mode; - size_t idx; - - bzero(&sin, sizeof(struct sockaddr_in)); - sin.sin_len = sizeof(struct sockaddr_in); - sin.sin_family = AF_INET; - sin.sin_addr = ip->ip_dst; - - blocked = 0; - idx = imo_match_group(imo, ifp, - (struct sockaddr *)&sin); - if (idx == -1) { - /* - * No group membership for this socket. 
- * Do not bump udps_noportbcast, as - * this will happen further down. - */ - blocked++; - } else { - /* - * Check for a multicast source filter - * entry on this socket for this group. - * MCAST_EXCLUDE is the default - * behaviour. It means default accept; - * entries, if present, denote sources - * to be excluded from delivery. - */ - ims = imo_match_source(imo, idx, - (struct sockaddr *)&udp_in); - mode = imo->imo_mfilters[idx].imf_fmode; - if ((ims != NULL && - mode == MCAST_EXCLUDE) || - (ims == NULL && - mode == MCAST_INCLUDE)) { -#ifdef DIAGNOSTIC - if (bootverbose) { - printf("%s: blocked by" - " source filter\n", - __func__); - } -#endif + struct sockaddr_in group; + int blocked; + + bzero(&group, sizeof(struct sockaddr_in)); + group.sin_len = sizeof(struct sockaddr_in); + group.sin_family = AF_INET; + group.sin_addr = ip->ip_dst; + + blocked = imo_multi_filter(imo, ifp, + (struct sockaddr *)&group, + (struct sockaddr *)&udp_in); + if (blocked != MCAST_PASS) { + if (blocked == MCAST_NOTGMEMBER) + V_ipstat.ips_notmember++; + if (blocked == MCAST_NOTSMEMBER || + blocked == MCAST_MUTED) V_udpstat.udps_filtermcast++; - blocked++; - } - } - if (blocked != 0) { INP_RUNLOCK(inp); continue; } diff --git a/sys/netinet/vinet.h b/sys/netinet/vinet.h index 73cd3b9..e5b0bab 100644 --- a/sys/netinet/vinet.h +++ b/sys/netinet/vinet.h @@ -54,7 +54,7 @@ struct vnet_inet { struct in_ifaddrhashhead *_in_ifaddrhashtbl; struct in_ifaddrhead _in_ifaddrhead; u_long _in_ifaddrhmask; - struct in_multihead _in_multihead; + struct in_multihead _in_multihead; /* XXX unused */ int _arpt_keep; int _arp_maxtries; @@ -157,9 +157,21 @@ struct vnet_inet { struct icmpstat _icmpstat; struct ipstat _ipstat; - struct igmpstat _igmpstat; - SLIST_HEAD(, router_info) _router_info_head; + LIST_HEAD(, igmp_ifinfo) _igi_head; + struct igmpstat _igmpstat; + int _interface_timers_running; + int _state_change_timers_running; + int _current_state_timers_running; + int _igmp_recvifkludge; + int 
_igmp_sendra; + int _igmp_sendlocal; + int _igmp_v1enable; + int _igmp_v2enable; + int _igmp_legacysupp; + int _igmp_sgalloc; + int _igmp_default_version; + struct timeval _igmp_gsrdelay; int _rtq_timeout; int _rtq_reallyold; @@ -231,7 +243,23 @@ extern struct vnet_inet vnet_inet_0; #define V_icmpmaskfake VNET_INET(icmpmaskfake) #define V_icmpmaskrepl VNET_INET(icmpmaskrepl) #define V_icmpstat VNET_INET(icmpstat) +#define V_igi_head VNET_INET(igi_head) #define V_igmpstat VNET_INET(igmpstat) +#define V_interface_timers_running \ + VNET_INET(interface_timers_running) +#define V_state_change_timers_running \ + VNET_INET(state_change_timers_running) +#define V_current_state_timers_running \ + VNET_INET(current_state_timers_running) +#define V_igmp_recvifkludge VNET_INET(igmp_recvifkludge) +#define V_igmp_sendra VNET_INET(igmp_sendra) +#define V_igmp_sendlocal VNET_INET(igmp_sendlocal) +#define V_igmp_v1enable VNET_INET(igmp_v1enable) +#define V_igmp_v2enable VNET_INET(igmp_v2enable) +#define V_igmp_legacysupp VNET_INET(igmp_legacysupp) +#define V_igmp_sgalloc VNET_INET(igmp_sgalloc) +#define V_igmp_default_version VNET_INET(igmp_default_version) +#define V_igmp_gsrdelay VNET_INET(igmp_gsrdelay) #define V_in_ifaddrhashtbl VNET_INET(in_ifaddrhashtbl) #define V_in_ifaddrhead VNET_INET(in_ifaddrhead) #define V_in_ifaddrhmask VNET_INET(in_ifaddrhmask) diff --git a/sys/sys/param.h b/sys/sys/param.h index 21e5433..e8b26c7 100644 --- a/sys/sys/param.h +++ b/sys/sys/param.h @@ -57,7 +57,7 @@ * is created, otherwise 1. 
*/ #undef __FreeBSD_version -#define __FreeBSD_version 800069 /* Master, propagated to newvers */ +#define __FreeBSD_version 800070 /* Master, propagated to newvers */ #ifndef LOCORE #include <sys/types.h> diff --git a/sys/sys/vimage.h b/sys/sys/vimage.h index 40d537e..ea51e49 100644 --- a/sys/sys/vimage.h +++ b/sys/sys/vimage.h @@ -120,49 +120,49 @@ void vnet_mod_register(const struct vnet_modinfo *); #ifdef __amd64__ #define SIZEOF_vnet_net 464 #define SIZEOF_vnet_net_LINT 5144 -#define SIZEOF_vnet_inet 4160 +#define SIZEOF_vnet_inet 4352 #define SIZEOF_vnet_inet6 8800 #define SIZEOF_vnet_ipsec 31160 #endif #ifdef __arm__ #define SIZEOF_vnet_net 236 #define SIZEOF_vnet_net_LINT 1 /* No LINT kernel yet. */ -#define SIZEOF_vnet_inet 2396 +#define SIZEOF_vnet_inet 2580 #define SIZEOF_vnet_inet6 8536 #define SIZEOF_vnet_ipsec 1 #endif #ifdef __i386__ /* incl. pc98 */ #define SIZEOF_vnet_net 236 #define SIZEOF_vnet_net_LINT 2576 -#define SIZEOF_vnet_inet 2396 +#define SIZEOF_vnet_inet 2576 #define SIZEOF_vnet_inet6 8528 #define SIZEOF_vnet_ipsec 31016 #endif #ifdef __ia64__ #define SIZEOF_vnet_net 464 #define SIZEOF_vnet_net_LINT 5144 -#define SIZEOF_vnet_inet 4160 +#define SIZEOF_vnet_inet 4352 #define SIZEOF_vnet_inet6 8800 #define SIZEOF_vnet_ipsec 31160 #endif #ifdef __mips__ #define SIZEOF_vnet_net 236 #define SIZEOF_vnet_net_LINT 1 /* No LINT kernel yet. */ -#define SIZEOF_vnet_inet 2432 +#define SIZEOF_vnet_inet 2624 #define SIZEOF_vnet_inet6 8552 #define SIZEOF_vnet_ipsec 1 #endif #ifdef __powerpc__ #define SIZEOF_vnet_net 236 #define SIZEOF_vnet_net_LINT 2576 -#define SIZEOF_vnet_inet 2432 +#define SIZEOF_vnet_inet 2616 #define SIZEOF_vnet_inet6 8536 #define SIZEOF_vnet_ipsec 31048 #endif #ifdef __sparc64__ /* incl. 
sun4v */ #define SIZEOF_vnet_net 464 #define SIZEOF_vnet_net_LINT 5144 -#define SIZEOF_vnet_inet 4160 +#define SIZEOF_vnet_inet 4352 #define SIZEOF_vnet_inet6 8800 #define SIZEOF_vnet_ipsec 31160 #endif diff --git a/usr.bin/netstat/inet.c b/usr.bin/netstat/inet.c index fd95a88..9d82ef3 100644 --- a/usr.bin/netstat/inet.c +++ b/usr.bin/netstat/inet.c @@ -997,32 +997,30 @@ icmp_stats(u_long off, const char *name, int af1 __unused, int proto __unused) } } +#ifndef BURN_BRIDGES /* - * Dump IGMP statistics structure. + * Dump IGMP statistics structure (pre 8.x kernel). */ -void -igmp_stats(u_long off, const char *name, int af1 __unused, int proto __unused) +static void +igmp_stats_live_old(u_long off, const char *name) { - struct igmpstat igmpstat, zerostat; - size_t len = sizeof igmpstat; - - if (live) { - if (zflag) - memset(&zerostat, 0, len); - if (sysctlbyname("net.inet.igmp.stats", &igmpstat, &len, - zflag ? &zerostat : NULL, zflag ? len : 0) < 0) { - warn("sysctl: net.inet.igmp.stats"); - return; - } - } else - kread(off, &igmpstat, len); + struct oigmpstat oigmpstat, zerostat; + size_t len = sizeof(oigmpstat); + + if (zflag) + memset(&zerostat, 0, len); + if (sysctlbyname("net.inet.igmp.stats", &oigmpstat, &len, + zflag ? &zerostat : NULL, zflag ? len : 0) < 0) { + warn("sysctl: net.inet.igmp.stats"); + return; + } printf("%s:\n", name); -#define p(f, m) if (igmpstat.f || sflag <= 1) \ - printf(m, igmpstat.f, plural(igmpstat.f)) -#define py(f, m) if (igmpstat.f || sflag <= 1) \ - printf(m, igmpstat.f, igmpstat.f != 1 ? "ies" : "y") +#define p(f, m) if (oigmpstat.f || sflag <= 1) \ + printf(m, oigmpstat.f, plural(oigmpstat.f)) +#define py(f, m) if (oigmpstat.f || sflag <= 1) \ + printf(m, oigmpstat.f, oigmpstat.f != 1 ? 
"ies" : "y") p(igps_rcv_total, "\t%u message%s received\n"); p(igps_rcv_tooshort, "\t%u message%s received with too few bytes\n"); p(igps_rcv_badsum, "\t%u message%s received with bad checksum\n"); @@ -1038,6 +1036,89 @@ igmp_stats(u_long off, const char *name, int af1 __unused, int proto __unused) #undef p #undef py } +#endif /* !BURN_BRIDGES */ + +/* + * Dump IGMP statistics structure. + */ +void +igmp_stats(u_long off, const char *name, int af1 __unused, int proto __unused) +{ + struct igmpstat igmpstat, zerostat; + size_t len; + +#ifndef BURN_BRIDGES + if (live) { + /* + * Detect if we are being run against a pre-IGMPv3 kernel. + * We cannot do this for a core file as the legacy + * struct igmpstat has no size field, nor does it + * export it in any readily-available symbols. + */ + len = 0; + if (sysctlbyname("net.inet.igmp.stats", NULL, &len, NULL, + 0) < 0) { + warn("sysctl: net.inet.igmp.stats"); + return; + } + if (len < sizeof(igmpstat)) { + igmp_stats_live_old(off, name); + return; + } + } +#endif /* !BURN_BRIDGES */ + + len = sizeof(igmpstat); + if (live) { + if (zflag) + memset(&zerostat, 0, len); + if (sysctlbyname("net.inet.igmp.stats", &igmpstat, &len, + zflag ? &zerostat : NULL, zflag ? 
len : 0) < 0) { + warn("sysctl: net.inet.igmp.stats"); + return; + } + } else { + len = sizeof(igmpstat); + kread(off, &igmpstat, len); + } + + if (igmpstat.igps_version != IGPS_VERSION_3) { + warnx("%s: version mismatch (%d != %d)", __func__, + igmpstat.igps_version, IGPS_VERSION_3); + } + if (igmpstat.igps_len != IGPS_VERSION3_LEN) { + warnx("%s: size mismatch (%d != %d)", __func__, + igmpstat.igps_len, IGPS_VERSION3_LEN); + } + + printf("%s:\n", name); + +#define p64(f, m) if (igmpstat.f || sflag <= 1) \ + printf(m, (uintmax_t) igmpstat.f, plural(igmpstat.f)) +#define py64(f, m) if (igmpstat.f || sflag <= 1) \ + printf(m, (uintmax_t) igmpstat.f, pluralies(igmpstat.f)) + p64(igps_rcv_total, "\t%ju message%s received\n"); + p64(igps_rcv_tooshort, "\t%ju message%s received with too few bytes\n"); + p64(igps_rcv_badttl, "\t%ju message%s received with wrong TTL\n"); + p64(igps_rcv_badsum, "\t%ju message%s received with bad checksum\n"); + py64(igps_rcv_v1v2_queries, "\t%ju V1/V2 membership quer%s received\n"); + py64(igps_rcv_v3_queries, "\t%ju V3 membership quer%s received\n"); + py64(igps_rcv_badqueries, + "\t%ju membership quer%s received with invalid field(s)\n"); + py64(igps_rcv_gen_queries, "\t%ju general quer%s received\n"); + py64(igps_rcv_group_queries, "\t%ju group quer%s received\n"); + py64(igps_rcv_gsr_queries, "\t%ju group-source quer%s received\n"); + py64(igps_drop_gsr_queries, "\t%ju group-source quer%s dropped\n"); + p64(igps_rcv_reports, "\t%ju membership report%s received\n"); + p64(igps_rcv_badreports, + "\t%ju membership report%s received with invalid field(s)\n"); + p64(igps_rcv_ourreports, +"\t%ju membership report%s received for groups to which we belong\n"); + p64(igps_rcv_nora, "\t%ju V3 report%s received without Router Alert\n"); + p64(igps_snd_reports, "\t%ju membership report%s sent\n"); +#undef p64 +#undef py64 +} /* * Dump PIM statistics structure. 
diff --git a/usr.sbin/ifmcstat/Makefile b/usr.sbin/ifmcstat/Makefile index 62de0b9..fab7ea6 100644 --- a/usr.sbin/ifmcstat/Makefile +++ b/usr.sbin/ifmcstat/Makefile @@ -4,8 +4,10 @@ .include <bsd.own.mk> PROG= ifmcstat +SRCS= ifmcstat.c printb.c + MAN= ifmcstat.8 -BINMODE= 550 +BINMODE= 555 WARNS?= 2 diff --git a/usr.sbin/ifmcstat/ifmcstat.8 b/usr.sbin/ifmcstat/ifmcstat.8 index 78eb39c..5805183 100644 --- a/usr.sbin/ifmcstat/ifmcstat.8 +++ b/usr.sbin/ifmcstat/ifmcstat.8 @@ -30,7 +30,7 @@ .\" .\" $FreeBSD$ .\" -.Dd February 15, 2009 +.Dd February 28, 2009 .Dt IFMCSTAT 8 .Os .Sh NAME @@ -41,6 +41,7 @@ .Op Fl i Ar interface .Op Fl f Ar address-family .Op Fl v +.Op Fl K .Op Fl M Ar core .Op Fl N Ar system .\" @@ -66,6 +67,13 @@ specifies that link-layer memberships should be printed; they are suppressed by default. It may not be specified for .Fl f Ar link . +Source lists for each group will also be printed. +.Pp +If specified twice, and +.Xr kvm 3 +is in use, the IGMP timers for each interface +and the IGMP source list counters for each group +will also be printed. .El .Pp The following options are only available if @@ -73,6 +81,10 @@ The following options are only available if has been built with support for .Xr kvm 3 : .Bl -tag -width Fl +.It Fl K +forces the use of +.Xr kvm 3 +to be disabled. .It Fl M Ar core extracts values associated with the name list from the specified core, instead of the default @@ -106,10 +118,14 @@ support, the information displayed by is more limited. This support is recommended for debugging purposes. It requires super-user privilege if used to inspect a running kernel. +.Pp +The .Xr kvm 3 -will be used by default if +back-end will be used by default if .Nm -is run with super-user privileges. +is run with super-user privileges, unless the +.Fl K +option is specified. 
.Sh SEE ALSO .Xr getifaddrs 3 , .Xr getifmaddrs 3 , diff --git a/usr.sbin/ifmcstat/ifmcstat.c b/usr.sbin/ifmcstat/ifmcstat.c index c8f57d4..34068ea 100644 --- a/usr.sbin/ifmcstat/ifmcstat.c +++ b/usr.sbin/ifmcstat/ifmcstat.c @@ -35,8 +35,10 @@ __FBSDID("$FreeBSD$"); #include <sys/types.h> #include <sys/param.h> +#include <sys/sysctl.h> #include <sys/socket.h> #include <sys/queue.h> +#include <sys/tree.h> #include <net/if.h> #include <net/if_var.h> @@ -49,15 +51,13 @@ __FBSDID("$FreeBSD$"); #include <netinet/in_systm.h> #include <netinet/ip.h> #include <netinet/igmp.h> -#ifdef HAVE_IGMPV3 -# include <netinet/in_msf.h> -#endif #define KERNEL # include <netinet/if_ether.h> #undef KERNEL #define _KERNEL -# include <sys/sysctl.h> +#define SYSCTL_DECL(x) # include <netinet/igmp_var.h> +#undef SYSCTL_DECL #undef _KERNEL #ifdef INET6 @@ -80,6 +80,7 @@ __FBSDID("$FreeBSD$"); #include <ctype.h> #include <err.h> +#include <errno.h> #include <fcntl.h> #include <kvm.h> #include <limits.h> @@ -93,6 +94,8 @@ __FBSDID("$FreeBSD$"); #define INET #endif +extern void printb(const char *, unsigned int, const char *); + union sockunion { struct sockaddr_storage ss; struct sockaddr sa; @@ -108,6 +111,9 @@ typedef union sockunion sockunion_t; uint32_t ifindex = 0; int af = AF_UNSPEC; +#ifdef WITH_KVM +int Kflag = 0; +#endif int vflag = 0; #define sa_equal(a1, a2) \ @@ -130,9 +136,6 @@ int vflag = 0; static void if_addrlist(struct ifaddr *); static struct in_multi * in_multientry(struct in_multi *); -#ifdef HAVE_IGMPV3 -static void in_addr_slistentry(struct in_addr_slist *, char *); -#endif #endif /* INET */ #ifdef INET6 @@ -159,6 +162,10 @@ struct nlist nl[] = { #endif /* WITH_KVM */ static int ifmcstat_getifmaddrs(void); +#ifdef INET +static void in_ifinfo(struct igmp_ifinfo *); +static const char * inm_mode(u_int mode); +#endif #ifdef INET6 static const char * inet6_n2a(struct in6_addr *); #endif @@ -172,12 +179,18 @@ usage() "usage: ifmcstat [-i interface] [-f address family]" " [-v]" 
#ifdef WITH_KVM - " [-M core] [-N system]" + " [-K] [-M core] [-N system]" #endif "\n"); exit(EX_USAGE); } +static const char *options = "i:f:vM:N:" +#ifdef WITH_KVM + "K" +#endif + ; + int main(int argc, char **argv) { @@ -187,7 +200,7 @@ main(int argc, char **argv) const char *core = NULL; #endif - while ((c = getopt(argc, argv, "i:f:vM:N:")) != -1) { + while ((c = getopt(argc, argv, options)) != -1) { switch (c) { case 'i': if ((ifindex = if_nametoindex(optarg)) == 0) { @@ -219,8 +232,14 @@ main(int argc, char **argv) /*NOTREACHED*/ break; +#ifdef WITH_KVM + case 'K': + ++Kflag; + break; +#endif + case 'v': - vflag = 1; + ++vflag; break; #ifdef WITH_KVM @@ -244,12 +263,13 @@ main(int argc, char **argv) usage(); #ifdef WITH_KVM - error = ifmcstat_kvm(kernel, core); + if (!Kflag) + error = ifmcstat_kvm(kernel, core); /* * If KVM failed, and user did not explicitly specify a core file, - * try the sysctl backend. + * or force KVM backend to be disabled, try the sysctl backend. */ - if (error != 0 && (core == NULL && kernel == NULL)) + if (Kflag || (error != 0 && (core == NULL && kernel == NULL))) #endif error = ifmcstat_getifmaddrs(); if (error != 0) @@ -259,6 +279,52 @@ main(int argc, char **argv) /*NOTREACHED*/ } +#ifdef INET + +static void +in_ifinfo(struct igmp_ifinfo *igi) +{ + + printf("\t"); + switch (igi->igi_version) { + case IGMP_VERSION_1: + case IGMP_VERSION_2: + case IGMP_VERSION_3: + printf("igmpv%d", igi->igi_version); + break; + default: + printf("igmpv?(%d)", igi->igi_version); + break; + } + printb(" flags", igi->igi_flags, "\020\1SILENT\2LOOPBACK"); + if (igi->igi_version == IGMP_VERSION_3) { + printf(" rv %u qi %u qri %u uri %u", + igi->igi_rv, igi->igi_qi, igi->igi_qri, igi->igi_uri); + } + if (vflag >= 2) { + printf(" v1timer %u v2timer %u v3timer %u", + igi->igi_v1_timer, igi->igi_v2_timer, igi->igi_v3_timer); + } + printf("\n"); +} + +static const char *inm_modes[] = { + "undefined", + "include", + "exclude", +}; + +static const char * 
+inm_mode(u_int mode) +{ + + if (mode >= MCAST_UNDEFINED && mode <= MCAST_EXCLUDE) + return (inm_modes[mode]); + return (NULL); +} + +#endif /* INET */ + #ifdef WITH_KVM static int @@ -447,6 +513,7 @@ static void if_addrlist(struct ifaddr *ifap) { struct ifaddr ifa; + struct ifnet ifnet; struct sockaddr sa; struct in_ifaddr ia; struct ifaddr *ifap0; @@ -463,11 +530,24 @@ if_addrlist(struct ifaddr *ifap) goto nextifap; KREAD(ifap, &ia, struct in_ifaddr); printf("\tinet %s\n", inet_ntoa(ia.ia_addr.sin_addr)); + /* + * Print per-link IGMP information, if available. + */ + if (ifa.ifa_ifp != NULL) { + struct in_ifinfo ii; + struct igmp_ifinfo igi; + + KREAD(ifa.ifa_ifp, &ifnet, struct ifnet); + KREAD(ifnet.if_afdata[AF_INET], &ii, struct in_ifinfo); + if (ii.ii_igmp != NULL) { + KREAD(ii.ii_igmp, &igi, struct igmp_ifinfo); + in_ifinfo(&igi); + } + } nextifap: ifap = ifa.ifa_link.tqe_next; } if (ifap0) { - struct ifnet ifnet; struct ifmultiaddr ifm, *ifmp = 0; struct sockaddr_dl sdl; @@ -496,96 +576,145 @@ if_addrlist(struct ifaddr *ifap) } } -static struct in_multi * -in_multientry(struct in_multi *mc) +static const char *inm_states[] = { + "not-member", + "silent", + "idle", + "lazy", + "sleeping", + "awakening", + "query-pending", + "sg-query-pending", + "leaving" +}; + +static const char * +inm_state(u_int state) { - struct in_multi multi; - struct router_info rti; -#ifdef HAVE_IGMPV3 - struct in_multi_source src; -#endif - KREAD(mc, &multi, struct in_multi); - printf("\t\tgroup %s\n", inet_ntoa(multi.inm_addr)); + if (state >= IGMP_NOT_MEMBER && state <= IGMP_LEAVING_MEMBER) + return (inm_states[state]); + return (NULL); +} - if (multi.inm_rti != NULL) { - KREAD(multi.inm_rti, &rti, struct router_info); - printf("\t\t\t"); - switch (rti.rti_type) { - case IGMP_V1_ROUTER: - printf("igmpv1"); - break; - case IGMP_V2_ROUTER: - printf("igmpv2"); - break; -#ifdef HAVE_IGMPV3 - case IGMP_V3_ROUTER: - printf("igmpv3"); - break; -#endif - default: - printf("igmpv?(%d)", 
rti.rti_type); - break; - } +#if 0 +static struct ip_msource * +ims_min_kvm(struct in_multi *pinm) +{ + struct ip_msource ims0; + struct ip_msource *tmp, *parent; + + parent = NULL; + tmp = RB_ROOT(&pinm->inm_srcs); + while (tmp) { + parent = tmp; + KREAD(tmp, &ims0, struct ip_msource); + tmp = RB_LEFT(&ims0, ims_link); + } + return (parent); /* kva */ +} -#ifdef HAVE_IGMPV3 - if (multi.inm_source == NULL) { - printf("\n"); - return (multi.inm_list.le_next); +/* XXX This routine is buggy. See RB_NEXT in sys/tree.h. */ +static struct ip_msource * +ims_next_kvm(struct ip_msource *ims) +{ + struct ip_msource ims0, ims1; + struct ip_msource *tmp; + + KREAD(ims, &ims0, struct ip_msource); + if (RB_RIGHT(&ims0, ims_link)) { + ims = RB_RIGHT(&ims0, ims_link); + KREAD(ims, &ims1, struct ip_msource); + while ((tmp = RB_LEFT(&ims1, ims_link))) { + KREAD(tmp, &ims0, struct ip_msource); + ims = RB_LEFT(&ims0, ims_link); + } + } else { + tmp = RB_PARENT(&ims0, ims_link); + if (tmp) { + KREAD(tmp, &ims1, struct ip_msource); + if (ims == RB_LEFT(&ims1, ims_link)) + ims = tmp; + } else { + while ((tmp = RB_PARENT(&ims0, ims_link))) { + KREAD(tmp, &ims1, struct ip_msource); + if (ims == RB_RIGHT(&ims1, ims_link)) { + ims = tmp; + KREAD(ims, &ims0, struct ip_msource); + } else + break; + } + ims = RB_PARENT(&ims0, ims_link); } - - KREAD(multi.inm_source, &src, struct in_multi_source); - printf(" mode=%s grpjoin=%d\n", - src.ims_mode == MCAST_INCLUDE ? "include" : - src.ims_mode == MCAST_EXCLUDE ? 
"exclude" : - "???", - src.ims_grpjoin); - in_addr_slistentry(src.ims_cur, "current"); - in_addr_slistentry(src.ims_rec, "recorded"); - in_addr_slistentry(src.ims_in, "included"); - in_addr_slistentry(src.ims_ex, "excluded"); - in_addr_slistentry(src.ims_alw, "allowed"); - in_addr_slistentry(src.ims_blk, "blocked"); - in_addr_slistentry(src.ims_toin, "to-include"); - in_addr_slistentry(src.ims_ex, "to-exclude"); -#else - printf("\n"); -#endif } - - return (NULL); + return (ims); /* kva */ } -#ifdef HAVE_IGMPV3 static void -in_addr_slistentry(struct in_addr_slist *ias, char *heading) +inm_print_sources_kvm(struct in_multi *pinm) { - struct in_addr_slist slist; - struct ias_head head; - struct in_addr_source src; - - if (ias == NULL) { - printf("\t\t\t\t%s (none)\n", heading); - return; - } - memset(&slist, 0, sizeof(slist)); - KREAD(ias, &slist, struct in_addr_source); - printf("\t\t\t\t%s (entry num=%d)\n", heading, slist.numsrc); - if (slist.numsrc == 0) { + struct ip_msource ims0; + struct ip_msource *ims; + struct in_addr src; + int cnt; + uint8_t fmode; + + cnt = 0; + fmode = pinm->inm_st[1].iss_fmode; + if (fmode == MCAST_UNDEFINED) return; + for (ims = ims_min_kvm(pinm); ims != NULL; ims = ims_next_kvm(ims)) { + if (cnt == 0) + printf(" srcs "); + KREAD(ims, &ims0, struct ip_msource); + /* Only print sources in-mode at t1. */ + if (fmode != ims_get_mode(pinm, ims, 1)) + continue; + src.s_addr = htonl(ims0.ims_haddr); + printf("%s%s", (cnt++ == 0 ? 
"" : ","), inet_ntoa(src)); } - KREAD(slist.head, &head, struct ias_head); +} +#endif - KREAD(head.lh_first, &src, struct in_addr_source); - while (1) { - printf("\t\t\t\t\tsource %s (ref=%d)\n", - inet_ntoa(src.ias_addr.sin_addr), src.ias_refcount); - if (src.ias_list.le_next == NULL) - break; - KREAD(src.ias_list.le_next, &src, struct in_addr_source); +static struct in_multi * +in_multientry(struct in_multi *pinm) +{ + struct in_multi inm; + const char *state, *mode; + + KREAD(pinm, &inm, struct in_multi); + printf("\t\tgroup %s", inet_ntoa(inm.inm_addr)); + printf(" refcnt %u", inm.inm_refcount); + + state = inm_state(inm.inm_state); + if (state) + printf(" state %s", state); + else + printf(" state (%d)", inm.inm_state); + + mode = inm_mode(inm.inm_st[1].iss_fmode); + if (mode) + printf(" mode %s", mode); + else + printf(" mode (%d)", inm.inm_st[1].iss_fmode); + + if (vflag >= 2) { + printf(" asm %u ex %u in %u rec %u", + (u_int)inm.inm_st[1].iss_asm, + (u_int)inm.inm_st[1].iss_ex, + (u_int)inm.inm_st[1].iss_in, + (u_int)inm.inm_st[1].iss_rec); } + +#if 0 + /* Buggy. */ + if (vflag) + inm_print_sources_kvm(&inm); +#endif + + printf("\n"); + return (NULL); } -#endif /* HAVE_IGMPV3 */ #endif /* INET */ @@ -622,6 +751,97 @@ inet6_n2a(struct in6_addr *p) } #endif /* INET6 */ +#ifdef INET +/* + * Retrieve per-group source filter mode and lists via sysctl. 
+ */ +static void +inm_print_sources_sysctl(uint32_t ifindex, struct in_addr gina) +{ +#define MAX_SYSCTL_TRY 5 + int mib[7]; + int ntry = 0; + size_t mibsize; + size_t len; + size_t needed; + size_t cnt; + int i; + char *buf; + struct in_addr *pina; + uint32_t *p; + uint32_t fmode; + const char *modestr; + + mibsize = sizeof(mib) / sizeof(mib[0]); + if (sysctlnametomib("net.inet.ip.mcast.filters", mib, &mibsize) == -1) { + perror("sysctlnametomib"); + return; + } + + needed = 0; + mib[5] = ifindex; + mib[6] = gina.s_addr; /* 32 bits wide */ + mibsize = sizeof(mib) / sizeof(mib[0]); + do { + if (sysctl(mib, mibsize, NULL, &needed, NULL, 0) == -1) { + perror("sysctl net.inet.ip.mcast.filters"); + return; + } + if ((buf = malloc(needed)) == NULL) { + perror("malloc"); + return; + } + if (sysctl(mib, mibsize, buf, &needed, NULL, 0) == -1) { + if (errno != ENOMEM || ++ntry >= MAX_SYSCTL_TRY) { + perror("sysctl"); + goto out_free; + } + free(buf); + buf = NULL; + } + } while (buf == NULL); + + len = needed; + if (len < sizeof(uint32_t)) { + perror("sysctl"); + goto out_free; + } + + p = (uint32_t *)buf; + fmode = *p++; + len -= sizeof(uint32_t); + + modestr = inm_mode(fmode); + if (modestr) + printf(" mode %s", modestr); + else + printf(" mode (%u)", fmode); + + if (vflag == 0) + goto out_free; + + cnt = len / sizeof(struct in_addr); + pina = (struct in_addr *)p; + + for (i = 0; i < cnt; i++) { + if (i == 0) + printf(" srcs "); + fprintf(stdout, "%s%s", (i == 0 ? "" : ","), + inet_ntoa(*pina++)); + len -= sizeof(struct in_addr); + } + if (len > 0) { + fprintf(stderr, "warning: %u trailing bytes from %s\n", + (unsigned int)len, "net.inet.ip.mcast.filters"); + } + +out_free: + free(buf); +#undef MAX_SYSCTL_TRY +} + +#endif /* INET */ + static int ifmcstat_getifmaddrs(void) { @@ -771,6 +991,32 @@ ifmcstat_getifmaddrs(void) } fprintf(stdout, "\t%s %s\n", pafname, addrbuf); +#ifdef INET + /* + * Print per-link IGMP information, if available. 
+ */ + if (pifasa->sa.sa_family == AF_INET) { + struct igmp_ifinfo igi; + size_t mibsize, len; + int mib[5]; + + mibsize = sizeof(mib) / sizeof(mib[0]); + if (sysctlnametomib("net.inet.igmp.ifinfo", + mib, &mibsize) == -1) { + perror("sysctlnametomib"); + goto next_ifnet; + } + mib[mibsize] = thisifindex; + len = sizeof(struct igmp_ifinfo); + if (sysctl(mib, mibsize + 1, &igi, &len, NULL, + 0) == -1) { + perror("sysctl net.inet.igmp.ifinfo"); + goto next_ifnet; + } + in_ifinfo(&igi); + } +next_ifnet: +#endif lastifasa = *pifasa; } @@ -788,7 +1034,14 @@ ifmcstat_getifmaddrs(void) perror("getnameinfo"); } - fprintf(stdout, "\t\tgroup %s\n", addrbuf); + fprintf(stdout, "\t\tgroup %s", addrbuf); +#ifdef INET + if (pgsa->sa.sa_family == AF_INET) { + inm_print_sources_sysctl(thisifindex, + pgsa->sin.sin_addr); + } +#endif + fprintf(stdout, "\n"); /* Link-layer mapping, if present. */ pllsa = (sockunion_t *)ifma->ifma_lladdr; |