Initial get-the-easy-case-working upgrade of the multicast code

to something more recent than the ancient 1.2 release contained in 4.4.

This code has the following advantages as compared to previous versions
(culled from the README file for the SunOS release):

- True multicast delivery
- Configurable rate-limiting of forwarded multicast traffic on each
  physical interface or tunnel, using a token-bucket limiter.
- Simplistic classification of packets for prioritized dropping.
- Administrative scoping of multicast address ranges.
- Faster detection of hosts leaving groups.
- Support for multicast traceroute (code not yet available).
- Support for RSVP, the Resource Reservation Protocol.

What still needs to be done:

- The multicast forwarder needs testing.
- The multicast routing daemon needs to be ported.
- Network interface drivers need to have the `#ifdef MULTICAST' goop
  ripped out of them.
- The IGMP code should probably be bogon-tested.

Some notes about the porting process:

In some cases, the Berkeley people decided to incorporate functionality
from later releases of the multicast code, but then had to do things
differently.  As a result, if you look at Deering's patches, and then
look at our code, it is not always obvious whether the patch even
applies.  Let the reader beware.

I ran ip_mroute.c through several passes of `unifdef' to get rid of
useless grot, and to permanently enable the RSVP support, which we will
include as standard.

Ported by: Garrett Wollman
Submitted by: Steve Deering and Ajit Thyagarajan (among others)
author: wollman <wollman@FreeBSD.org> 1994-09-06 22:42:31 +0000
committer: wollman <wollman@FreeBSD.org> 1994-09-06 22:42:31 +0000
commit: 75ad508fd126c679edba9b67bd09d74a1fff3aba (patch)
tree: da36f83faafbd2141b041ae182b2406dfee02756 /sys/netinet/ip_mroute.c
parent: f624d4a80eef8e47182201473e55257609525b41 (diff)
download: FreeBSD-src-75ad508fd126c679edba9b67bd09d74a1fff3aba.zip
FreeBSD-src-75ad508fd126c679edba9b67bd09d74a1fff3aba.tar.gz
1 files changed, 1603 insertions, 657 deletions
diff --git a/sys/netinet/ip_mroute.c b/sys/netinet/ip_mroute.c
index b07d919..b14951d 100644
--- a/sys/netinet/ip_mroute.c
+++ b/sys/netinet/ip_mroute.c
@@ -1,616 +1,857 @@
 /*
- * Copyright (c) 1989 Stephen Deering
- * Copyright (c) 1992, 1993
- *	The Regents of the University of California.  All rights reserved.
- *
- * This code is derived from software contributed to Berkeley by
- * Stephen Deering of Stanford University.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- *    must display the following acknowledgement:
- *	This product includes software developed by the University of
- *	California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- *	@(#)ip_mroute.c	8.2 (Berkeley) 11/15/93
- * $Id$
- */
-
-/*
- * Procedures for the kernel part of DVMRP,
- * a Distance-Vector Multicast Routing Protocol.
- * (See RFC-1075.)
+ * IP multicast forwarding procedures
  *
  * Written by David Waitzman, BBN Labs, August 1988.
  * Modified by Steve Deering, Stanford, February 1989.
+ * Modified by Mark J. Steiglitz, Stanford, May, 1991
+ * Modified by Van Jacobson, LBL, January 1993
+ * Modified by Ajit Thyagarajan, PARC, August 1993
  *
- * MROUTING 1.1
+ * MROUTING 1.8
  */
 
-#ifndef MROUTING
-int	ip_mrtproto;				/* for netstat only */
-#else
 
 #include <sys/param.h>
 #include <sys/systm.h>
-#include <sys/errno.h>
-#include <sys/ioctl.h>
-#include <sys/malloc.h>
 #include <sys/mbuf.h>
-#include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
+#include <sys/protosw.h>
+#include <sys/errno.h>
 #include <sys/time.h>
-
+#include <sys/ioctl.h>
+#include <sys/syslog.h>
 #include <net/if.h>
 #include <net/route.h>
 #include <net/raw_cb.h>
-
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
+#include <netinet/ip_var.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_var.h>
-#include <netinet/ip_var.h>
-
 #include <netinet/igmp.h>
 #include <netinet/igmp_var.h>
 #include <netinet/ip_mroute.h>
 
-/* Static forwards */
-static	int ip_mrouter_init __P((struct socket *));
-static	int add_vif __P((struct vifctl *));
-static	int del_vif __P((vifi_t *vifip));
-static	int add_lgrp __P((struct lgrplctl *));
-static	int del_lgrp __P((struct lgrplctl *));
-static	int grplst_member __P((struct vif *, struct in_addr));
-static	u_long nethash __P((struct in_addr in));
-static	int add_mrt __P((struct mrtctl *));
-static	int del_mrt __P((struct in_addr *));
-static	struct mrt *mrtfind __P((struct in_addr));
-static	void phyint_send __P((struct mbuf *, struct vif *));
-static	void tunnel_send __P((struct mbuf *, struct vif *));
-
-#define INSIZ sizeof(struct in_addr)
-#define	same(a1, a2) (bcmp((caddr_t)(a1), (caddr_t)(a2), INSIZ) == 0)
-#define	satosin(sa)	((struct sockaddr_in *)(sa))
+#ifndef NTOHL
+#if BYTE_ORDER != BIG_ENDIAN
+#define NTOHL(d) ((d) = ntohl((d)))
+#define NTOHS(d) ((d) = ntohs((u_short)(d)))
+#define HTONL(d) ((d) = htonl((d)))
+#define HTONS(d) ((d) = htons((u_short)(d)))
+#else
+#define NTOHL(d)
+#define NTOHS(d)
+#define HTONL(d)
+#define HTONS(d)
+#endif
+#endif
 
+#ifndef MROUTING
 /*
- * Globals.  All but ip_mrouter and ip_mrtproto could be static,
- * except for netstat or debugging purposes.
+ * Dummy routines and globals used when multicast routing is not compiled in.
  */
-struct	socket *ip_mrouter = NULL;
-int	ip_mrtproto = IGMP_DVMRP;		/* for netstat only */
 
-struct	mrt *mrttable[MRTHASHSIZ];
-struct	vif viftable[MAXVIFS];
-struct	mrtstat	mrtstat;
+struct socket  *ip_mrouter  = NULL;
+u_int		ip_mrtproto = 0;
 
-/*
- * Private variables.
- */
-static	vifi_t numvifs = 0;
-static	struct mrt *cached_mrt = NULL;
-static	u_long cached_origin;
-static	u_long cached_originmask;
-
-/*
- * Handle DVMRP setsockopt commands to modify the multicast routing tables.
- */
 int
 ip_mrouter_cmd(cmd, so, m)
-	register int cmd;
-	register struct socket *so;
-	register struct mbuf *m;
+	int cmd;
+	struct socket *so;
+	struct mbuf *m;
 {
-	register int error = 0;
+	return(EOPNOTSUPP);
+}
 
-	if (cmd != DVMRP_INIT && so != ip_mrouter)
-		error = EACCES;
-	else switch (cmd) {
+int
+ip_mrouter_done()
+{
+	return(0);
+}
 
-	case DVMRP_INIT:
-		error = ip_mrouter_init(so);
-		break;
+int
+ip_mforward(ip, ifp, m)
+	struct ip *ip;
+	struct ifnet *ifp;
+	struct mbuf *m;
+{
+	return(0);
+}
+#else
 
-	case DVMRP_DONE:
-		error = ip_mrouter_done();
-		break;
+#define INSIZ		sizeof(struct in_addr)
+#define	same(a1, a2) \
+	(bcmp((caddr_t)(a1), (caddr_t)(a2), INSIZ) == 0)
 
-	case DVMRP_ADD_VIF:
-		if (m == NULL || m->m_len < sizeof(struct vifctl))
-			error = EINVAL;
-		else
-			error = add_vif(mtod(m, struct vifctl *));
-		break;
+#define MT_MRTABLE MT_RTABLE	/* since nothing else uses it */
 
-	case DVMRP_DEL_VIF:
-		if (m == NULL || m->m_len < sizeof(short))
-			error = EINVAL;
-		else
-			error = del_vif(mtod(m, vifi_t *));
-		break;
+/*
+ * Globals.  All but ip_mrouter and ip_mrtproto could be static,
+ * except for netstat or debugging purposes.
+ */
+struct socket  *ip_mrouter  = NULL;
+int		ip_mrtproto = IGMP_DVMRP;    /* for netstat only */
 
-	case DVMRP_ADD_LGRP:
-		if (m == NULL || m->m_len < sizeof(struct lgrplctl))
-			error = EINVAL;
-		else
-			error = add_lgrp(mtod(m, struct lgrplctl *));
-		break;
+#define NO_RTE_FOUND 	0x1
+#define RTE_FOUND	0x2
 
-	case DVMRP_DEL_LGRP:
-		if (m == NULL || m->m_len < sizeof(struct lgrplctl))
-			error = EINVAL;
-		else
-			error = del_lgrp(mtod(m, struct lgrplctl *));
-		break;
+struct mbuf    *mfctable[MFCTBLSIZ];
+struct vif	viftable[MAXVIFS];
+struct mrtstat	mrtstat;
+u_int		mrtdebug = 0;	  /* debug level 	*/
+u_int       	tbfdebug = 0;     /* tbf debug level 	*/
 
-	case DVMRP_ADD_MRT:
-		if (m == NULL || m->m_len < sizeof(struct mrtctl))
-			error = EINVAL;
-		else
-			error = add_mrt(mtod(m, struct mrtctl *));
-		break;
+u_long timeout_val = 0;			/* count of outstanding upcalls */
 
-	case DVMRP_DEL_MRT:
-		if (m == NULL || m->m_len < sizeof(struct in_addr))
-			error = EINVAL;
-		else
-			error = del_mrt(mtod(m, struct in_addr *));
-		break;
+/*
+ * Define the token bucket filter structures
+ * tbftable -> each vif has one of these for storing info 
+ * qtable   -> each interface has an associated queue of pkts 
+ */
 
-	default:
-		error = EOPNOTSUPP;
-		break;
-	}
-	return (error);
-}
+struct tbf tbftable[MAXVIFS];
+struct pkt_queue qtable[MAXVIFS][MAXQSIZE];
 
 /*
- * Enable multicast routing
+ * 'Interfaces' associated with decapsulator (so we can tell
+ * packets that went through it from ones that get reflected
+ * by a broken gateway).  These interfaces are never linked into
+ * the system ifnet list & no routes point to them.  I.e., packets
+ * can't be sent this way.  They only exist as a placeholder for
+ * multicast source verification.
  */
-static int
-ip_mrouter_init(so)
-	register struct socket *so;
-{
-	if (so->so_type != SOCK_RAW ||
-	    so->so_proto->pr_protocol != IPPROTO_IGMP)
-		return (EOPNOTSUPP);
+struct ifnet multicast_decap_if[MAXVIFS];
 
-	if (ip_mrouter != NULL)
-		return (EADDRINUSE);
+#define ENCAP_TTL 64
+#define ENCAP_PROTO 4
 
-	ip_mrouter = so;
-
-	return (0);
-}
+/* prototype IP hdr for encapsulated packets */
+struct ip multicast_encap_iphdr = {
+#if defined(ultrix) || defined(i386)
+	sizeof(struct ip) >> 2, IPVERSION,
+#else
+	IPVERSION, sizeof(struct ip) >> 2,
+#endif
+	0,				/* tos */
+	sizeof(struct ip),		/* total length */
+	0,				/* id */
+	0,				/* frag offset */
+	ENCAP_TTL, ENCAP_PROTO,	
+	0,				/* checksum */
+};
 
 /*
- * Disable multicast routing
+ * Private variables.
  */
-int
-ip_mrouter_done()
-{
-	register vifi_t vifi;
-	register int i;
-	register struct ifnet *ifp;
-	register int s;
-	struct ifreq ifr;
+static vifi_t	   numvifs = 0;
 
-	s = splnet();
+/*
+ * one-back cache used by multiencap_decap to locate a tunnel's vif
+ * given a datagram's src ip address.
+ */
+static u_long last_encap_src;
+static struct vif *last_encap_vif;
+
+static u_long nethash_fc(u_long, u_long);
+static struct mfc *mfcfind(u_long, u_long);
+int get_sg_cnt(struct sioc_sg_req *);
+int get_vif_cnt(struct sioc_vif_req *);
+int get_vifs(caddr_t);
+static int add_vif(struct vifctl *);
+static int del_vif(vifi_t *);
+static int add_mfc(struct mfcctl *);
+static int del_mfc(struct delmfcctl *);
+static void cleanup_cache(void *);
+static int ip_mdq(struct mbuf *, struct ifnet *, u_long, struct mfc *,
+		  struct ip_moptions *);
+int legal_vif_num(int);
+static void phyint_send(struct ip *, struct vif *, struct mbuf *);
+static void srcrt_send(struct ip *, struct vif *, struct mbuf *);
+static void encap_send(struct ip *, struct vif *, struct mbuf *);
+void tbf_control(struct vif *, struct mbuf *, struct ip *, u_long,
+		 struct ip_moptions *);
+void tbf_queue(struct vif *, struct mbuf *, struct ip *, struct ip_moptions *);
+void tbf_process_q(struct vif *);
+void tbf_dequeue(struct vif *, int);
+void tbf_reprocess_q(void *);
+int tbf_dq_sel(struct vif *, struct ip *);
+void tbf_send_packet(struct vif *, struct mbuf *, struct ip_moptions *);
+void tbf_update_tokens(struct vif *);
+static int priority(struct vif *, struct ip *);
+static int ip_mrouter_init(struct socket *);
 
-	/*
-	 * For each phyint in use, free its local group list and
-	 * disable promiscuous reception of all IP multicasts.
-	 */
-	for (vifi = 0; vifi < numvifs; vifi++) {
-		if (viftable[vifi].v_lcl_addr.s_addr != 0 &&
-		    !(viftable[vifi].v_flags & VIFF_TUNNEL)) {
-			if (viftable[vifi].v_lcl_grps)
-				free(viftable[vifi].v_lcl_grps, M_MRTABLE);
-			satosin(&ifr.ifr_addr)->sin_family = AF_INET;
-			satosin(&ifr.ifr_addr)->sin_addr.s_addr = INADDR_ANY;
-			ifp = viftable[vifi].v_ifp;
-			(*ifp->if_ioctl)(ifp, SIOCDELMULTI, (caddr_t)&ifr);
-		}
-	}
-	bzero((caddr_t)viftable, sizeof(viftable));
-	numvifs = 0;
+/*
+ * A simple hash function: returns MFCHASHMOD of the low-order octet of
+ * the argument's network or subnet number and the multicast group assoc.
+ */ 
+static u_long
+nethash_fc(m,n)
+    register u_long m;
+    register u_long n;
+{
+    struct in_addr in1;
+    struct in_addr in2;
 
-	/*
-	 * Free any multicast route entries.
-	 */
-	for (i = 0; i < MRTHASHSIZ; i++)
-		if (mrttable[i])
-			free(mrttable[i], M_MRTABLE);
-	bzero((caddr_t)mrttable, sizeof(mrttable));
-	cached_mrt = NULL;
+    in1.s_addr = m;
+    m = in_netof(in1);
+    while ((m & 0xff) == 0) m >>= 8;
 
-	ip_mrouter = NULL;
+    in2.s_addr = n;
+    n = in_netof(in2);
+    while ((n & 0xff) == 0) n >>= 8;
 
-	splx(s);
-	return (0);
+    return (MFCHASHMOD(m) ^ MFCHASHMOD(n));
 }
 
 /*
- * Add a vif to the vif table
+ * This is a direct-mapped cache used to speed the mapping from a
+ * datagram source address to the associated multicast route.  Note
+ * that unlike mfctable, the hash is on IP address, not IP net number.
  */
-static int
-add_vif(vifcp)
-	register struct vifctl *vifcp;
-{
-	register struct vif *vifp = viftable + vifcp->vifc_vifi;
-	register struct ifaddr *ifa;
-	register struct ifnet *ifp;
-	struct ifreq ifr;
-	register int error, s;
-	static struct sockaddr_in sin = { sizeof(sin), AF_INET };
-
-	if (vifcp->vifc_vifi >= MAXVIFS)
-		return (EINVAL);
-	if (vifp->v_lcl_addr.s_addr != 0)
-		return (EADDRINUSE);
-
-	/* Find the interface with an address in AF_INET family */
-	sin.sin_addr = vifcp->vifc_lcl_addr;
-	ifa = ifa_ifwithaddr((struct sockaddr *)&sin);
-	if (ifa == 0)
-		return (EADDRNOTAVAIL);
-
-	s = splnet();
-
-	if (vifcp->vifc_flags & VIFF_TUNNEL)
-		vifp->v_rmt_addr = vifcp->vifc_rmt_addr;
-	else {
-		/* Make sure the interface supports multicast */
-		ifp = ifa->ifa_ifp;
-		if ((ifp->if_flags & IFF_MULTICAST) == 0) {
-			splx(s);
-			return (EOPNOTSUPP);
-		}
-		/*
-		 * Enable promiscuous reception of all IP multicasts
-		 * from the interface.
-		 */
-		satosin(&ifr.ifr_addr)->sin_family = AF_INET;
-		satosin(&ifr.ifr_addr)->sin_addr.s_addr = INADDR_ANY;
-		error = (*ifp->if_ioctl)(ifp, SIOCADDMULTI, (caddr_t)&ifr);
-		if (error) {
-			splx(s);
-			return (error);
-		}
-	}
-
-	vifp->v_flags = vifcp->vifc_flags;
-	vifp->v_threshold = vifcp->vifc_threshold;
-	vifp->v_lcl_addr = vifcp->vifc_lcl_addr;
-	vifp->v_ifp = ifa->ifa_ifp;
+#define MFCHASHSIZ 1024
+#define MFCHASH(a, g) ((((a) >> 20) ^ ((a) >> 10) ^ (a) ^ \
+			((g) >> 20) ^ ((g) >> 10) ^ (g)) & (MFCHASHSIZ-1))
+struct mfc *mfchash[MFCHASHSIZ];
 
-	/* Adjust numvifs up if the vifi is higher than numvifs */
-	if (numvifs <= vifcp->vifc_vifi)
-		numvifs = vifcp->vifc_vifi + 1;
-
-	splx(s);
-	return (0);
+/*
+ * Find a route for a given origin IP address and Multicast group address
+ * Type of service parameter to be added in the future!!!
+ */
+#define MFCFIND(o, g, rt) { \
+	register u_int _mrhasho = o; \
+	register u_int _mrhashg = g; \
+	_mrhasho = MFCHASH(_mrhasho, _mrhashg); \
+	++mrtstat.mrts_mfc_lookups; \
+	rt = mfchash[_mrhasho]; \
+	if ((rt == NULL) || \
+	    ((o & rt->mfc_originmask.s_addr) != rt->mfc_origin.s_addr) || \
+	     (g != rt->mfc_mcastgrp.s_addr)) \
+	     if ((rt = mfcfind(o, g)) != NULL) \
+		mfchash[_mrhasho] = rt; \
 }
 
 /*
- * Delete a vif from the vif table
+ * Find route by examining hash table entries
  */
-static int
-del_vif(vifip)
-	register vifi_t *vifip;
+static struct mfc *
+mfcfind(origin, mcastgrp)
+    u_long origin; 
+    u_long mcastgrp;
 {
-	register struct vif *vifp = viftable + *vifip;
-	register struct ifnet *ifp;
-	register int i, s;
-	struct ifreq ifr;
-
-	if (*vifip >= numvifs)
-		return (EINVAL);
-	if (vifp->v_lcl_addr.s_addr == 0)
-		return (EADDRNOTAVAIL);
+    register struct mbuf *mb_rt;
+    register struct mfc *rt;
+    register u_long hash;
+
+    hash = nethash_fc(origin, mcastgrp);
+    for (mb_rt = mfctable[hash]; mb_rt; mb_rt = mb_rt->m_next) {
+	rt = mtod(mb_rt, struct mfc *);
+	if (((origin & rt->mfc_originmask.s_addr) == rt->mfc_origin.s_addr) &&
+	    (mcastgrp == rt->mfc_mcastgrp.s_addr) &&
+	    (mb_rt->m_act == NULL))
+	    return (rt);
+    }
+    mrtstat.mrts_mfc_misses++;
+    return NULL;
+}
 
-	s = splnet();
+/*
+ * Macros to compute elapsed time efficiently
+ * Borrowed from Van Jacobson's scheduling code
+ */
+#define TV_DELTA(a, b, delta) { \
+	    register int xxs; \
+		\
+	    delta = (a).tv_usec - (b).tv_usec; \
+	    if ((xxs = (a).tv_sec - (b).tv_sec)) { \
+	       switch (xxs) { \
+		      case 2: \
+			  delta += 1000000; \
+			      /* fall through */ \
+		      case 1: \
+			  delta += 1000000; \
+			  break; \
+		      default: \
+			  delta += (1000000 * xxs); \
+	       } \
+	    } \
+}
 
-	if (!(vifp->v_flags & VIFF_TUNNEL)) {
-		if (vifp->v_lcl_grps)
-			free(vifp->v_lcl_grps, M_MRTABLE);
-		satosin(&ifr.ifr_addr)->sin_family = AF_INET;
-		satosin(&ifr.ifr_addr)->sin_addr.s_addr = INADDR_ANY;
-		ifp = vifp->v_ifp;
-		(*ifp->if_ioctl)(ifp, SIOCDELMULTI, (caddr_t)&ifr);
-	}
+#define TV_LT(a, b) (((a).tv_usec < (b).tv_usec && \
+	      (a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec)
 
-	bzero((caddr_t)vifp, sizeof (*vifp));
+/*
+ * Handle DVMRP setsockopt commands to modify the multicast routing tables.
+ */
+int
+ip_mrouter_cmd(cmd, so, m)
+    int cmd;
+    struct socket *so;
+    struct mbuf *m;
+{
+   if (cmd != DVMRP_INIT && so != ip_mrouter) return EACCES;
+
+    switch (cmd) {
+	case DVMRP_INIT:     return ip_mrouter_init(so);
+	case DVMRP_DONE:     return ip_mrouter_done();
+	case DVMRP_ADD_VIF:  return add_vif (mtod(m, struct vifctl *));
+	case DVMRP_DEL_VIF:  return del_vif (mtod(m, vifi_t *));
+	case DVMRP_ADD_MFC:  return add_mfc (mtod(m, struct mfcctl *));
+	case DVMRP_DEL_MFC:  return del_mfc (mtod(m, struct delmfcctl *));
+	default:             return EOPNOTSUPP;
+    }
+}
 
-	/* Adjust numvifs down */
-	for (i = numvifs - 1; i >= 0; i--)
-		if (viftable[i].v_lcl_addr.s_addr != 0)
-			break;
-	numvifs = i + 1;
 
-	splx(s);
-	return (0);
+/*
+ * Handle ioctl commands to obtain information from the cache
+ */
+int
+mrt_ioctl(cmd, data)
+    int cmd;
+    caddr_t data;
+{
+    int error = 0;
+
+    switch (cmd) {
+      case (SIOCGETVIFINF):		/* Read Virtual Interface (m/cast) */
+	  return (get_vifs(data));
+	  break;
+      case (SIOCGETVIFCNT):
+	  return (get_vif_cnt((struct sioc_vif_req *)data));
+	  break;
+      case (SIOCGETSGCNT):
+	  return (get_sg_cnt((struct sioc_sg_req *)data));
+	  break;
+	default:
+	  return (EINVAL);
+	  break;
+    }
+    return error;
 }
 
 /*
- * Add the multicast group in the lgrpctl to the list of local multicast
- * group memberships associated with the vif indexed by gcp->lgc_vifi.
+ * returns the packet count for the source group provided
  */
-static int
-add_lgrp(gcp)
-	register struct lgrplctl *gcp;
+int
+get_sg_cnt(req)
+    register struct sioc_sg_req *req;
 {
-	register struct vif *vifp;
-	register int s;
-
-	if (gcp->lgc_vifi >= numvifs)
-		return (EINVAL);
-
-	vifp = viftable + gcp->lgc_vifi;
-	if (vifp->v_lcl_addr.s_addr == 0 || (vifp->v_flags & VIFF_TUNNEL))
-		return (EADDRNOTAVAIL);
+    register struct mfc *rt;
+    int s;
+
+    s = splnet();
+    MFCFIND(req->src.s_addr, req->grp.s_addr, rt);
+    splx(s);
+    if (rt != NULL)
+	req->count = rt->mfc_pkt_cnt;
+    else
+	req->count = 0xffffffff;
+
+    return 0;
+}
 
-	/* If not enough space in existing list, allocate a larger one */
-	s = splnet();
-	if (vifp->v_lcl_grps_n + 1 >= vifp->v_lcl_grps_max) {
-		register int num;
-		register struct in_addr *ip;
-
-		num = vifp->v_lcl_grps_max;
-		if (num <= 0)
-			num = 32;	/* initial number */
-		else
-			num += num;	/* double last number */
-		ip = (struct in_addr *)malloc(num * sizeof(*ip),
-		    M_MRTABLE, M_NOWAIT);
-		if (ip == NULL) {
-			splx(s);
-			return (ENOBUFS);
-		}
+/*
+ * returns the input and output packet counts on the interface provided
+ */
+int
+get_vif_cnt(req)
+    register struct sioc_vif_req *req;
+{
+    register vifi_t vifi = req->vifi;
 
-		bzero((caddr_t)ip, num * sizeof(*ip));	/* XXX paranoid */
-		bcopy((caddr_t)vifp->v_lcl_grps, (caddr_t)ip,
-		    vifp->v_lcl_grps_n * sizeof(*ip));
+    req->icount = viftable[vifi].v_pkt_in;
+    req->ocount = viftable[vifi].v_pkt_out;
 
-		vifp->v_lcl_grps_max = num;
-		if (vifp->v_lcl_grps)
-			free(vifp->v_lcl_grps, M_MRTABLE);
-		vifp->v_lcl_grps = ip;
+    return 0;
+}
 
+int
+get_vifs(data)
+    char *data;
+{
+    struct vif_conf *vifc = (struct vif_conf *)data;
+    struct vif_req *vifrp, vifr;
+    int space, error=0;
+
+    vifi_t vifi;
+    int s;
+
+    space = vifc->vifc_len;
+    vifrp  = vifc->vifc_req;
+
+    s = splnet();
+    vifc->vifc_num=numvifs;
+
+    for (vifi = 0; vifi <  numvifs; vifi++, vifrp++) {
+	if (viftable[vifi].v_lcl_addr.s_addr != 0) {
+	    vifr.v_flags=viftable[vifi].v_flags;
+	    vifr.v_threshold=viftable[vifi].v_threshold;
+	    vifr.v_lcl_addr=viftable[vifi].v_lcl_addr;
+	    vifr.v_rmt_addr=viftable[vifi].v_rmt_addr;
+	    strncpy(vifr.v_if_name,viftable[vifi].v_ifp->if_name,IFNAMSIZ);
+	    if ((space -= sizeof(vifr)) < 0) {
+		splx(s);
+		return(ENOSPC);
+	    }
+	    error = copyout((caddr_t)&vifr,(caddr_t)vifrp,(u_int)(sizeof vifr));
+	    if (error) {
 		splx(s);
+		return(error);
+	    }
 	}
-
-	vifp->v_lcl_grps[vifp->v_lcl_grps_n++] = gcp->lgc_gaddr;
-
-	if (gcp->lgc_gaddr.s_addr == vifp->v_cached_group)
-		vifp->v_cached_result = 1;
-
-	splx(s);
-	return (0);
+    }
+    splx(s);
+    return 0;
 }
-
 /*
- * Delete the the local multicast group associated with the vif
- * indexed by gcp->lgc_vifi.
+ * Enable multicast routing
  */
-
 static int
-del_lgrp(gcp)
-	register struct lgrplctl *gcp;
+ip_mrouter_init(so)
+	struct socket *so;
 {
-	register struct vif *vifp;
-	register int i, error, s;
+    if (so->so_type != SOCK_RAW ||
+	so->so_proto->pr_protocol != IPPROTO_IGMP) return EOPNOTSUPP;
 
-	if (gcp->lgc_vifi >= numvifs)
-		return (EINVAL);
-	vifp = viftable + gcp->lgc_vifi;
-	if (vifp->v_lcl_addr.s_addr == 0 || (vifp->v_flags & VIFF_TUNNEL))
-		return (EADDRNOTAVAIL);
+    if (ip_mrouter != NULL) return EADDRINUSE;
 
-	s = splnet();
+    ip_mrouter = so;
 
-	if (gcp->lgc_gaddr.s_addr == vifp->v_cached_group)
-		vifp->v_cached_result = 0;
-
-	error = EADDRNOTAVAIL;
-	for (i = 0; i < vifp->v_lcl_grps_n; ++i)
-		if (same(&gcp->lgc_gaddr, &vifp->v_lcl_grps[i])) {
-			error = 0;
-			vifp->v_lcl_grps_n--;
-			bcopy((caddr_t)&vifp->v_lcl_grps[i + 1],
-			    (caddr_t)&vifp->v_lcl_grps[i],
-			    (vifp->v_lcl_grps_n - i) * sizeof(struct in_addr));
-			error = 0;
-			break;
-		}
+    if (mrtdebug)
+	log(LOG_DEBUG, "ip_mrouter_init");
 
-	splx(s);
-	return (error);
+    return 0;
 }
 
 /*
- * Return 1 if gaddr is a member of the local group list for vifp.
+ * Disable multicast routing
  */
-static int
-grplst_member(vifp, gaddr)
-	register struct vif *vifp;
-	struct in_addr gaddr;
+int
+ip_mrouter_done()
 {
-	register int i, s;
-	register u_long addr;
+    vifi_t vifi;
+    int i;
+    struct ifnet *ifp;
+    struct ifreq ifr;
+    struct mbuf *mb_rt;
+    struct mbuf *m;
+    struct rtdetq *rte;
+    int s;
+
+    s = splnet();
+
+    /*
+     * For each phyint in use, disable promiscuous reception of all IP
+     * multicasts.
+     */
+    for (vifi = 0; vifi < numvifs; vifi++) {
+	if (viftable[vifi].v_lcl_addr.s_addr != 0 &&
+	    !(viftable[vifi].v_flags & VIFF_TUNNEL)) {
+	    ((struct sockaddr_in *)&(ifr.ifr_addr))->sin_family = AF_INET;
+	    ((struct sockaddr_in *)&(ifr.ifr_addr))->sin_addr.s_addr
+								= INADDR_ANY;
+	    ifp = viftable[vifi].v_ifp;
+	    (*ifp->if_ioctl)(ifp, SIOCDELMULTI, (caddr_t)&ifr);
+	}
+    }
+    bzero((caddr_t)qtable, sizeof(qtable));
+    bzero((caddr_t)tbftable, sizeof(tbftable));
+    bzero((caddr_t)viftable, sizeof(viftable));
+    numvifs = 0;
+
+    /*
+     * Check if any outstanding timeouts remain
+     */
+    if (timeout_val != 0)
+	for (i = 0; i < MFCTBLSIZ; i++) {
+	    mb_rt = mfctable[i];
+	    while (mb_rt) {
+		if ( mb_rt->m_act != NULL) {
+		    untimeout(cleanup_cache, (caddr_t)mb_rt);
+		    while (m = mb_rt->m_act) {
+			mb_rt->m_act = m->m_act;
+			rte = mtod(m, struct rtdetq *);
+			m_freem(rte->m);
+			m_free(m);
+		    }
+		    timeout_val--;
+		}
+	    mb_rt = mb_rt->m_next;
+	    }
+	    if (timeout_val == 0)
+		break;
+	}
 
-	mrtstat.mrts_grp_lookups++;
+    /*
+     * Free all multicast forwarding cache entries.
+     */
+    for (i = 0; i < MFCTBLSIZ; i++)
+	m_freem(mfctable[i]);
 
-	addr = gaddr.s_addr;
-	if (addr == vifp->v_cached_group)
-		return (vifp->v_cached_result);
+    bzero((caddr_t)mfctable, sizeof(mfctable));
+    bzero((caddr_t)mfchash, sizeof(mfchash));
 
-	mrtstat.mrts_grp_misses++;
+    /*
+     * Reset de-encapsulation cache
+     */
+    last_encap_src = NULL;
+    last_encap_vif = NULL;
+ 
+    ip_mrouter = NULL;
 
-	for (i = 0; i < vifp->v_lcl_grps_n; ++i)
-		if (addr == vifp->v_lcl_grps[i].s_addr) {
-			s = splnet();
-			vifp->v_cached_group = addr;
-			vifp->v_cached_result = 1;
-			splx(s);
-			return (1);
-		}
-	s = splnet();
-	vifp->v_cached_group = addr;
-	vifp->v_cached_result = 0;
-	splx(s);
-	return (0);
+    splx(s);
+
+    if (mrtdebug)
+	log(LOG_DEBUG, "ip_mrouter_done");
+
+    return 0;
 }
 
 /*
- * A simple hash function: returns MRTHASHMOD of the low-order octet of
- * the argument's network or subnet number.
+ * Add a vif to the vif table
  */
-static u_long
-nethash(in)
-	struct in_addr in;
+static int
+add_vif(vifcp)
+    register struct vifctl *vifcp;
 {
-	register u_long n;
-
-	n = in_netof(in);
-	while ((n & 0xff) == 0)
-		n >>= 8;
-	return (MRTHASHMOD(n));
+    register struct vif *vifp = viftable + vifcp->vifc_vifi;
+    static struct sockaddr_in sin = {AF_INET};
+    struct ifaddr *ifa;
+    struct ifnet *ifp;
+    struct ifreq ifr;
+    int error, s;
+    struct tbf *v_tbf = tbftable + vifcp->vifc_vifi;
+
+    if (vifcp->vifc_vifi >= MAXVIFS)  return EINVAL;
+    if (vifp->v_lcl_addr.s_addr != 0) return EADDRINUSE;
+
+    /* Find the interface with an address in AF_INET family */
+    sin.sin_addr = vifcp->vifc_lcl_addr;
+    ifa = ifa_ifwithaddr((struct sockaddr *)&sin);
+    if (ifa == 0) return EADDRNOTAVAIL;
+    ifp = ifa->ifa_ifp;
+
+    if (vifcp->vifc_flags & VIFF_TUNNEL) {
+	if ((vifcp->vifc_flags & VIFF_SRCRT) == 0) {
+	    static int inited = 0;
+	    if(!inited) {
+		for (s = 0; s < MAXVIFS; ++s) {
+		    multicast_decap_if[s].if_name = "mdecap";
+		    multicast_decap_if[s].if_unit = s;
+		}
+		inited = 1;
+	    }
+	    ifp = &multicast_decap_if[vifcp->vifc_vifi];
+	} else {
+	    ifp = 0;
+	}
+    } else {
+	/* Make sure the interface supports multicast */
+	if ((ifp->if_flags & IFF_MULTICAST) == 0)
+	    return EOPNOTSUPP;
+
+	/* Enable promiscuous reception of all IP multicasts from the if */
+	((struct sockaddr_in *)&(ifr.ifr_addr))->sin_family = AF_INET;
+	((struct sockaddr_in *)&(ifr.ifr_addr))->sin_addr.s_addr = INADDR_ANY;
+	s = splnet();
+	error = (*ifp->if_ioctl)(ifp, SIOCADDMULTI, (caddr_t)&ifr);
+	splx(s);
+	if (error)
+	    return error;
+    }
+
+    s = splnet();
+    /* define parameters for the tbf structure */
+    vifp->v_tbf = v_tbf;
+    vifp->v_tbf->q_len = 0;
+    vifp->v_tbf->n_tok = 0;
+    vifp->v_tbf->last_pkt_t = 0;
+
+    vifp->v_flags     = vifcp->vifc_flags;
+    vifp->v_threshold = vifcp->vifc_threshold;
+    vifp->v_lcl_addr  = vifcp->vifc_lcl_addr;
+    vifp->v_rmt_addr  = vifcp->vifc_rmt_addr;
+    vifp->v_ifp       = ifp;
+    vifp->v_rate_limit= vifcp->vifc_rate_limit;
+    /* initialize per vif pkt counters */
+    vifp->v_pkt_in    = 0;
+    vifp->v_pkt_out   = 0;
+    splx(s);
+
+    /* Adjust numvifs up if the vifi is higher than numvifs */
+    if (numvifs <= vifcp->vifc_vifi) numvifs = vifcp->vifc_vifi + 1;
+
+    if (mrtdebug)
+	log(LOG_DEBUG, "add_vif #%d, lcladdr %x, %s %x, thresh %x, rate %d",
+	    vifcp->vifc_vifi, 
+	    ntohl(vifcp->vifc_lcl_addr.s_addr),
+	    (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask",
+	    ntohl(vifcp->vifc_rmt_addr.s_addr),
+	    vifcp->vifc_threshold,
+	    vifcp->vifc_rate_limit);    
+
+    return 0;
 }
 
 /*
- * Add an mrt entry
+ * Delete a vif from the vif table
  */
 static int
-add_mrt(mrtcp)
-	register struct mrtctl *mrtcp;
-{
-	struct mrt *rt;
-	u_long hash;
-	int s;
-
-	if (rt = mrtfind(mrtcp->mrtc_origin)) {
-		/* Just update the route */
-		s = splnet();
-		rt->mrt_parent = mrtcp->mrtc_parent;
-		VIFM_COPY(mrtcp->mrtc_children, rt->mrt_children);
-		VIFM_COPY(mrtcp->mrtc_leaves, rt->mrt_leaves);
-		splx(s);
-		return (0);
-	}
+del_vif(vifip)
+    vifi_t *vifip;
+{
+    register struct vif *vifp = viftable + *vifip;
+    register vifi_t vifi;
+    struct ifnet *ifp;
+    struct ifreq ifr;
+    int s;
 
-	s = splnet();
+    if (*vifip >= numvifs) return EINVAL;
+    if (vifp->v_lcl_addr.s_addr == 0) return EADDRNOTAVAIL;
 
-	rt = (struct mrt *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
-	if (rt == NULL) {
-		splx(s);
-		return (ENOBUFS);
-	}
+    s = splnet();
 
-	/*
-	 * insert new entry at head of hash chain
-	 */
-	rt->mrt_origin = mrtcp->mrtc_origin;
-	rt->mrt_originmask = mrtcp->mrtc_originmask;
-	rt->mrt_parent = mrtcp->mrtc_parent;
-	VIFM_COPY(mrtcp->mrtc_children, rt->mrt_children);
-	VIFM_COPY(mrtcp->mrtc_leaves, rt->mrt_leaves);
-	/* link into table */
-	hash = nethash(mrtcp->mrtc_origin);
-	rt->mrt_next = mrttable[hash];
-	mrttable[hash] = rt;
+    if (!(vifp->v_flags & VIFF_TUNNEL)) {
+	((struct sockaddr_in *)&(ifr.ifr_addr))->sin_family = AF_INET;
+	((struct sockaddr_in *)&(ifr.ifr_addr))->sin_addr.s_addr = INADDR_ANY;
+	ifp = vifp->v_ifp;
+	(*ifp->if_ioctl)(ifp, SIOCDELMULTI, (caddr_t)&ifr);
+    }
 
-	splx(s);
-	return (0);
+    if (vifp == last_encap_vif) {
+	last_encap_vif = 0;
+	last_encap_src = 0;
+    }
+
+    bzero((caddr_t)qtable[*vifip],
+	  sizeof(qtable[*vifip]));
+    bzero((caddr_t)vifp->v_tbf, sizeof(*(vifp->v_tbf)));
+    bzero((caddr_t)vifp, sizeof (*vifp));
+
+    /* Adjust numvifs down */
+    for (vifi = numvifs; vifi > 0; vifi--)
+	if (viftable[vifi-1].v_lcl_addr.s_addr != 0) break;
+    numvifs = vifi;
+
+    splx(s);
+
+    if (mrtdebug)
+      log(LOG_DEBUG, "del_vif %d, numvifs %d", *vifip, numvifs);
+
+    return 0;
 }
 
 /*
- * Delete an mrt entry
+ * Add an mfc entry
  */
 static int
-del_mrt(origin)
-	register struct in_addr *origin;
+add_mfc(mfccp)
+    struct mfcctl *mfccp;
 {
-	register struct mrt *rt, *prev_rt;
-	register u_long hash = nethash(*origin);
-	register int s;
-
-	for (prev_rt = rt = mrttable[hash]; rt; prev_rt = rt, rt = rt->mrt_next)
-		if (origin->s_addr == rt->mrt_origin.s_addr)
-			break;
-	if (!rt)
-		return (ESRCH);
+    struct mfc *rt;
+    struct mfc *rt1;
+    register struct mbuf *mb_rt;
+    struct mbuf *prev_mb_rt;
+    u_long hash;
+    struct mbuf *mb_ntry;
+    struct rtdetq *rte;
+    register u_short nstl;
+    int s;
+    int i;
+
+    rt = mfcfind(mfccp->mfcc_origin.s_addr, mfccp->mfcc_mcastgrp.s_addr);
+
+    /* If an entry already exists, just update the fields */
+    if (rt) {
+	if (mrtdebug)
+	    log(LOG_DEBUG,"add_mfc update o %x g %x m %x p %x",
+		ntohl(mfccp->mfcc_origin.s_addr),
+		ntohl(mfccp->mfcc_mcastgrp.s_addr),
+		ntohl(mfccp->mfcc_originmask.s_addr),
+		mfccp->mfcc_parent);
 
 	s = splnet();
-
-	if (rt == cached_mrt)
-		cached_mrt = NULL;
-
-	if (prev_rt == rt)
-		mrttable[hash] = rt->mrt_next;
-	else
-		prev_rt->mrt_next = rt->mrt_next;
-	free(rt, M_MRTABLE);
-
+	rt->mfc_parent = mfccp->mfcc_parent;
+	for (i = 0; i < numvifs; i++)
+	    VIFM_COPY(mfccp->mfcc_ttls[i], rt->mfc_ttls[i]);
 	splx(s);
-	return (0);
+	return 0;
+    }
+
+    /* 
+     * Find the entry for which the upcall was made and update
+     */
+    s = splnet();
+    hash = nethash_fc(mfccp->mfcc_origin.s_addr, mfccp->mfcc_mcastgrp.s_addr);
+    for (prev_mb_rt = mb_rt = mfctable[hash], nstl = 0; 
+	 mb_rt; prev_mb_rt = mb_rt, mb_rt = mb_rt->m_next) {
+
+	rt = mtod(mb_rt, struct mfc *);
+	if (((rt->mfc_origin.s_addr & mfccp->mfcc_originmask.s_addr) 
+	     == mfccp->mfcc_origin.s_addr) &&
+	    (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) &&
+	    (mb_rt->m_act != NULL)) {
+
+	    if (!nstl++) {
+		if (mrtdebug)
+		    log(LOG_DEBUG,"add_mfc o %x g %x m %x p %x dbg %x",
+			ntohl(mfccp->mfcc_origin.s_addr),
+			ntohl(mfccp->mfcc_mcastgrp.s_addr),
+			ntohl(mfccp->mfcc_originmask.s_addr),
+			mfccp->mfcc_parent, mb_rt->m_act);
+
+		rt->mfc_origin     = mfccp->mfcc_origin;
+		rt->mfc_originmask = mfccp->mfcc_originmask;
+		rt->mfc_mcastgrp   = mfccp->mfcc_mcastgrp;
+		rt->mfc_parent     = mfccp->mfcc_parent;
+		for (i = 0; i < numvifs; i++)
+		    VIFM_COPY(mfccp->mfcc_ttls[i], rt->mfc_ttls[i]);
+		/* initialize pkt counters per src-grp */
+		rt->mfc_pkt_cnt    = 0;
+		rt1 = rt;
+	    }
+
+	    /* prevent cleanup of cache entry */
+	    untimeout(cleanup_cache, (caddr_t)mb_rt);
+	    timeout_val--;
+
+	    /* forward, then free, packets queued at the end of this entry */
+	    while (mb_rt->m_act) {
+		mb_ntry = mb_rt->m_act;
+		rte = mtod(mb_ntry, struct rtdetq *);
+		ip_mdq(rte->m, rte->ifp, rte->tunnel_src, 
+		       rt1, rte->imo);
+		mb_rt->m_act = mb_ntry->m_act;
+		m_freem(rte->m);
+		m_free(mb_ntry);
+	    }
+
+	    /* 
+	     * If more than one entry was created for a single upcall
+	     * delete that entry
+	     */
+	    if (nstl > 1) {
+		MFREE(mb_rt, prev_mb_rt->m_next);
+		mb_rt = prev_mb_rt;
+	    }
+	}
+    }
+
+    /*
+     * It is possible that an entry is being inserted without an upcall
+     */
+    if (nstl == 0) {
+	if (mrtdebug)
+	    log(LOG_DEBUG,"add_mfc no upcall h %d o %x g %x m %x p %x",
+		hash, ntohl(mfccp->mfcc_origin.s_addr),
+		ntohl(mfccp->mfcc_mcastgrp.s_addr),
+		ntohl(mfccp->mfcc_originmask.s_addr),
+		mfccp->mfcc_parent);
+	
+	for (prev_mb_rt = mb_rt = mfctable[hash];
+	     mb_rt; prev_mb_rt = mb_rt, mb_rt = mb_rt->m_next) {
+	    
+	    rt = mtod(mb_rt, struct mfc *);
+	    if (((rt->mfc_origin.s_addr & mfccp->mfcc_originmask.s_addr) 
+		 == mfccp->mfcc_origin.s_addr) &&
+		(rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr)) {
+
+		rt->mfc_origin     = mfccp->mfcc_origin;
+		rt->mfc_originmask = mfccp->mfcc_originmask;
+		rt->mfc_mcastgrp   = mfccp->mfcc_mcastgrp;
+		rt->mfc_parent     = mfccp->mfcc_parent;
+		for (i = 0; i < numvifs; i++)
+		    VIFM_COPY(mfccp->mfcc_ttls[i], rt->mfc_ttls[i]);
+		/* initialize pkt counters per src-grp */
+		rt->mfc_pkt_cnt    = 0;
+	    }
+	}
+	if (mb_rt == NULL) {
+	    /* no upcall, so make a new entry */
+	    MGET(mb_rt, M_DONTWAIT, MT_MRTABLE);
+	    if (mb_rt == NULL) {
+		splx(s);
+		return ENOBUFS;
+	    }
+	    
+	    rt = mtod(mb_rt, struct mfc *);
+	    
+	    /* insert new entry at head of hash chain */
+	    rt->mfc_origin     = mfccp->mfcc_origin;
+	    rt->mfc_originmask = mfccp->mfcc_originmask;
+	    rt->mfc_mcastgrp   = mfccp->mfcc_mcastgrp;
+	    rt->mfc_parent     = mfccp->mfcc_parent;
+	    for (i = 0; i < numvifs; i++)
+		VIFM_COPY(mfccp->mfcc_ttls[i], rt->mfc_ttls[i]);
+	    /* initialize pkt counters per src-grp */
+	    rt->mfc_pkt_cnt    = 0;
+	    
+	    /* link into table */
+	    mb_rt->m_next  = mfctable[hash];
+	    mfctable[hash] = mb_rt;
+	    mb_rt->m_act = NULL;
+	}
+    }
+    splx(s);
+    return 0;
 }
 
 /*
- * Find a route for a given origin IP address.
- */
-static struct mrt *
-mrtfind(origin)
-	struct in_addr origin;
-{
-	register struct mrt *rt;
-	register u_int hash;
-	register int s;
-
-	mrtstat.mrts_mrt_lookups++;
-
-	if (cached_mrt != NULL &&
-	    (origin.s_addr & cached_originmask) == cached_origin)
-		return (cached_mrt);
-
-	mrtstat.mrts_mrt_misses++;
-
-	hash = nethash(origin);
-	for (rt = mrttable[hash]; rt; rt = rt->mrt_next)
-		if ((origin.s_addr & rt->mrt_originmask.s_addr) ==
-		    rt->mrt_origin.s_addr) {
-			s = splnet();
-			cached_mrt = rt;
-			cached_origin = rt->mrt_origin.s_addr;
-			cached_originmask = rt->mrt_originmask.s_addr;
-			splx(s);
-			return (rt);
-		}
-	return (NULL);
+ * Delete an mfc entry
+ */
+static int
+del_mfc(mfccp)
+    struct delmfcctl *mfccp;
+{
+    struct in_addr 	origin;
+    struct in_addr 	mcastgrp;
+    struct mfc 		*rt;
+    struct mbuf 	*mb_rt;
+    struct mbuf 	*prev_mb_rt;
+    u_long 		hash;
+    struct mfc 		**cmfc;
+    struct mfc 		**cmfcend;
+    int s, i;
+
+    origin = mfccp->mfcc_origin;
+    mcastgrp = mfccp->mfcc_mcastgrp;
+    hash = nethash_fc(origin.s_addr, mcastgrp.s_addr);
+
+    if (mrtdebug)
+	log(LOG_DEBUG,"del_mfc orig %x mcastgrp %x",
+	    ntohl(origin.s_addr), ntohl(mcastgrp.s_addr));
+
+    for (prev_mb_rt = mb_rt = mfctable[hash]
+	 ; mb_rt
+	 ; prev_mb_rt = mb_rt, mb_rt = mb_rt->m_next) {
+        rt = mtod(mb_rt, struct mfc *);
+	if (origin.s_addr == rt->mfc_origin.s_addr &&
+	    mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr &&
+	    mb_rt->m_act == NULL)
+	    break;
+    }
+    if (mb_rt == NULL) {
+	return ESRCH;
+    }
+
+    s = splnet();
+
+    cmfc = mfchash;
+    cmfcend = cmfc + MFCHASHSIZ;
+    for ( ; cmfc < cmfcend; ++cmfc)
+	if (*cmfc == rt)
+	    *cmfc = 0;
+
+    if (prev_mb_rt != mb_rt) {	/* if moved past head of list */
+	MFREE(mb_rt, prev_mb_rt->m_next);
+    } else			/* delete head of list, it is in the table */
+        mfctable[hash] = m_free(mb_rt);
+
+    splx(s);
+
+    return 0;
 }
 
 /*
@@ -628,209 +869,914 @@ mrtfind(origin)
 #define TUNNEL_LEN  12  /* # bytes of IP option for tunnel encapsulation  */
 
 int
-ip_mforward(m, ifp)
-	register struct mbuf *m;
-	register struct ifnet *ifp;
+ip_mforward(ip, ifp, m, imo)
+    struct mbuf *m;
+    register struct ip *ip;
+    struct ifnet *ifp;
+    struct ip_moptions *imo;
 {
-	register struct ip *ip = mtod(m, struct ip *);
-	register struct mrt *rt;
-	register struct vif *vifp;
-	register int vifi;
-	register u_char *ipoptions;
-	u_long tunnel_src;
-
-	if (ip->ip_hl < (IP_HDR_LEN + TUNNEL_LEN) >> 2 ||
-	    (ipoptions = (u_char *)(ip + 1))[1] != IPOPT_LSRR ) {
-		/*
-		 * Packet arrived via a physical interface.
-		 */
-		tunnel_src = 0;
-	} else {
-		/*
-		 * Packet arrived through a tunnel.
-		 *
-		 * A tunneled packet has a single NOP option and a
-		 * two-element loose-source-and-record-route (LSRR)
-		 * option immediately following the fixed-size part of
-		 * the IP header.  At this point in processing, the IP
-		 * header should contain the following IP addresses:
-		 *
-		 * original source          - in the source address field
-		 * destination group        - in the destination address field
-		 * remote tunnel end-point  - in the first  element of LSRR
-		 * one of this host's addrs - in the second element of LSRR
-		 *
-		 * NOTE: RFC-1075 would have the original source and
-		 * remote tunnel end-point addresses swapped.  However,
-		 * that could cause delivery of ICMP error messages to
-		 * innocent applications on intermediate routing
-		 * hosts!  Therefore, we hereby change the spec.
-		 */
-
-		/*
-		 * Verify that the tunnel options are well-formed.
-		 */
-		if (ipoptions[0] != IPOPT_NOP ||
-		    ipoptions[2] != 11 ||	/* LSRR option length   */
-		    ipoptions[3] != 12 ||	/* LSRR address pointer */
-		    (tunnel_src = *(u_long *)(&ipoptions[4])) == 0) {
-			mrtstat.mrts_bad_tunnel++;
-			return (1);
-		}
-
-		/*
-		 * Delete the tunnel options from the packet.
-		 */
-		ovbcopy((caddr_t)(ipoptions + TUNNEL_LEN), (caddr_t)ipoptions,
-		    (unsigned)(m->m_len - (IP_HDR_LEN + TUNNEL_LEN)));
-		m->m_len -= TUNNEL_LEN;
-		ip->ip_len -= TUNNEL_LEN;
-		ip->ip_hl -= TUNNEL_LEN >> 2;
-	}
-
+    register struct mfc *rt;
+    register struct vif *vifp;
+    register u_char *ipoptions;
+    u_long tunnel_src;
+    static struct sockproto	k_igmpproto 	= { AF_INET, IPPROTO_IGMP };
+    static struct sockaddr_in 	k_igmpsrc	= { AF_INET };
+    static struct sockaddr_in 	k_igmpdst 	= { AF_INET };
+    register struct mbuf *mm;
+    register struct mbuf *mn;
+    register struct ip *k_data;
+    int s;
+
+    if (mrtdebug > 1)
+	log(LOG_DEBUG, "ip_mforward: src %x, dst %x, ifp %x",
+	    ntohl(ip->ip_src.s_addr), ntohl(ip->ip_dst.s_addr), ifp);
+
+    if (ip->ip_hl < (IP_HDR_LEN + TUNNEL_LEN) >> 2 ||
+	(ipoptions = (u_char *)(ip + 1))[1] != IPOPT_LSRR ) {
 	/*
-	 * Don't forward a packet with time-to-live of zero or one,
-	 * or a packet destined to a local-only group.
+	 * Packet arrived via a physical interface.
 	 */
-	if (ip->ip_ttl <= 1 ||
-	    ntohl(ip->ip_dst.s_addr) <= INADDR_MAX_LOCAL_GROUP)
-		return ((int)tunnel_src);
-
+	tunnel_src = 0;
+    } else {
 	/*
-	 * Don't forward if we don't have a route for the packet's origin.
+	 * Packet arrived through a source-route tunnel.
+	 *
+	 * A source-route tunneled packet has a single NOP option and a
+	 * two-element
+	 * loose-source-and-record-route (LSRR) option immediately following
+	 * the fixed-size part of the IP header.  At this point in processing,
+	 * the IP header should contain the following IP addresses:
+	 *
+	 *	original source          - in the source address field
+	 *	destination group        - in the destination address field
+	 *	remote tunnel end-point  - in the first  element of LSRR
+	 *	one of this host's addrs - in the second element of LSRR
+	 *
+	 * NOTE: RFC-1075 would have the original source and remote tunnel
+	 *	 end-point addresses swapped.  However, that could cause
+	 *	 delivery of ICMP error messages to innocent applications
+	 *	 on intermediate routing hosts!  Therefore, we hereby
+	 *	 change the spec.
+	 */
+	
+	/*
+	 * Verify that the tunnel options are well-formed.
 	 */
-	if (!(rt = mrtfind(ip->ip_src))) {
-		mrtstat.mrts_no_route++;
-		return ((int)tunnel_src);
+	if (ipoptions[0] != IPOPT_NOP ||
+	    ipoptions[2] != 11 ||	/* LSRR option length   */
+	    ipoptions[3] != 12 ||	/* LSRR address pointer */
+	    (tunnel_src = *(u_long *)(&ipoptions[4])) == 0) {
+	    mrtstat.mrts_bad_tunnel++;
+	    if (mrtdebug)
+		log(LOG_DEBUG,
+		    "ip_mforward: bad tunnel from %u (%x %x %x %x %x %x)",
+		    ntohl(ip->ip_src.s_addr),
+		    ipoptions[0], ipoptions[1], ipoptions[2], ipoptions[3],
+		    *(u_long *)(&ipoptions[4]), *(u_long *)(&ipoptions[8]));
+	    return 1;
 	}
 
 	/*
-	 * Don't forward if it didn't arrive from the parent vif for its origin.
+	 * Delete the tunnel options from the packet.
 	 */
-	vifi = rt->mrt_parent;
-	if (tunnel_src == 0 ) {
-		if ((viftable[vifi].v_flags & VIFF_TUNNEL) ||
-		    viftable[vifi].v_ifp != ifp )
-			return ((int)tunnel_src);
-	} else {
-		if (!(viftable[vifi].v_flags & VIFF_TUNNEL) ||
-		    viftable[vifi].v_rmt_addr.s_addr != tunnel_src )
-			return ((int)tunnel_src);
-	}
+	ovbcopy((caddr_t)(ipoptions + TUNNEL_LEN), (caddr_t)ipoptions,
+		(unsigned)(m->m_len - (IP_HDR_LEN + TUNNEL_LEN)));
+	m->m_len   -= TUNNEL_LEN;
+	ip->ip_len -= TUNNEL_LEN;
+	ip->ip_hl  -= TUNNEL_LEN >> 2;
+
+	ifp = 0;
+    }
+
+    /*
+     * Don't forward a packet with time-to-live of zero or one,
+     * or a packet destined to a local-only group.
+     */
+    if (ip->ip_ttl <= 1 ||
+	ntohl(ip->ip_dst.s_addr) <= INADDR_MAX_LOCAL_GROUP)
+	return (int)tunnel_src;
+
+    /*
+     * Determine forwarding vifs from the forwarding cache table
+     */
+    s = splnet();
+    MFCFIND(ip->ip_src.s_addr, ip->ip_dst.s_addr, rt);
+
+    /* Entry exists, so forward if necessary */
+    if (rt != NULL) {
+	splx(s);
+	return (ip_mdq(m, ifp, tunnel_src, rt, imo));
+    }
 
+    else {
 	/*
-	 * For each vif, decide if a copy of the packet should be forwarded.
-	 * Forward if:
-	 *		- the ttl exceeds the vif's threshold AND
-	 *		- the vif is a child in the origin's route AND
-	 *		- ( the vif is not a leaf in the origin's route OR
-	 *		    the destination group has members on the vif )
-	 *
-	 * (This might be speeded up with some sort of cache -- someday.)
+	 * If we don't have a route for packet's origin,
+	 * Make a copy of the packet &
+	 * send message to routing daemon
 	 */
-	for (vifp = viftable, vifi = 0; vifi < numvifs; vifp++, vifi++) {
-		if (ip->ip_ttl > vifp->v_threshold &&
-		    VIFM_ISSET(vifi, rt->mrt_children) &&
-		    (!VIFM_ISSET(vifi, rt->mrt_leaves) ||
-		    grplst_member(vifp, ip->ip_dst))) {
-			if (vifp->v_flags & VIFF_TUNNEL)
-				tunnel_send(m, vifp);
-			else
-				phyint_send(m, vifp);
-		}
+
+	register struct mbuf *mb_rt;
+	register struct mbuf *mb_ntry;
+	register struct mbuf *mb0;
+	register struct rtdetq *rte;
+	register struct mbuf *rte_m;
+	register u_long hash;
+	register struct timeval tp;
+
+	mrtstat.mrts_no_route++;
+	if (mrtdebug)
+	    log(LOG_DEBUG, "ip_mforward: no rte s %x g %x",
+		ntohl(ip->ip_src.s_addr),
+		ntohl(ip->ip_dst.s_addr));
+
+	/* is there an upcall waiting for this packet? */
+	hash = nethash_fc(ip->ip_src.s_addr, ip->ip_dst.s_addr);
+	for (mb_rt = mfctable[hash]; mb_rt; mb_rt = mb_rt->m_next) {
+	    rt = mtod(mb_rt, struct mfc *);
+	    if (((ip->ip_src.s_addr & rt->mfc_originmask.s_addr) == 
+		 rt->mfc_origin.s_addr) &&
+		(ip->ip_dst.s_addr == rt->mfc_mcastgrp.s_addr) &&
+		(mb_rt->m_act != NULL))
+		break;
+	}
+
+	if (mb_rt == NULL) {
+	    /* no upcall, so make a new entry */
+	    MGET(mb_rt, M_DONTWAIT, MT_MRTABLE);
+	    if (mb_rt == NULL) {
+		splx(s);
+		return ENOBUFS;
+	    }
+
+	    rt = mtod(mb_rt, struct mfc *);
+
+	    /* insert new entry at head of hash chain */
+	    rt->mfc_origin.s_addr     = ip->ip_src.s_addr;
+	    rt->mfc_originmask.s_addr = (u_long)0xffffffff;
+	    rt->mfc_mcastgrp.s_addr   = ip->ip_dst.s_addr;
+
+	    /* link into table */
+	    hash = nethash_fc(rt->mfc_origin.s_addr, rt->mfc_mcastgrp.s_addr);
+	    mb_rt->m_next  = mfctable[hash];
+	    mfctable[hash] = mb_rt;
+	    mb_rt->m_act = NULL;
+
+	}
+
+	/* determine if q has overflowed */
+	for (rte_m = mb_rt, hash = 0; rte_m->m_act; rte_m = rte_m->m_act)
+	    hash++;
+
+	if (hash > MAX_UPQ) {
+	    mrtstat.mrts_upq_ovflw++;
+	    splx(s);
+	    return 0;
+	}
+
+	/* add this packet and timing, ifp info to m_act */
+	MGET(mb_ntry, M_DONTWAIT, MT_DATA);
+	if (mb_ntry == NULL) {
+	    splx(s);
+	    return ENOBUFS;
+	}
+
+	mb_ntry->m_act = NULL;
+	rte = mtod(mb_ntry, struct rtdetq *);
+
+	mb0 = m_copy(m, 0, M_COPYALL);
+	if (mb0 == NULL) {
+	    splx(s);
+	    return ENOBUFS;
 	}
 
-	return ((int)tunnel_src);
+	rte->m 			= mb0;
+	rte->ifp 		= ifp;
+	rte->tunnel_src 	= tunnel_src;
+	rte->imo		= imo;
+
+	rte_m->m_act = mb_ntry;
+
+	splx(s);
+
+	if (hash == 0) {
+	    /* 
+	     * Send message to routing daemon to install 
+	     * a route into the kernel table
+	     */
+	    k_igmpsrc.sin_addr = ip->ip_src;
+	    k_igmpdst.sin_addr = ip->ip_dst;
+	    
+	    mm = m_copy(m, 0, M_COPYALL);
+	    if (mm == NULL) {
+		splx(s);
+		return ENOBUFS;
+	    }
+	    
+	    k_data = mtod(mm, struct ip *);
+	    k_data->ip_p = 0;
+	    
+	    mrtstat.mrts_upcalls++;
+
+	    raw_input(mm, &k_igmpproto,
+		      (struct sockaddr *)&k_igmpsrc,
+		      (struct sockaddr *)&k_igmpdst);
+	    
+	    /* set timer to cleanup entry if upcall is lost */
+	    timeout(cleanup_cache, (caddr_t)mb_rt, 100);
+	    timeout_val++;
+	}
+	
+	return 0;
+    }		
 }
 
+/*
+ * Clean up the cache entry if upcall is not serviced
+ */
 static void
-phyint_send(m, vifp)
-	register struct mbuf *m;
-	register struct vif *vifp;
+cleanup_cache(xmb_rt)
+	void *xmb_rt;
+{
+    struct mbuf *mb_rt = xmb_rt;
+    struct mfc *rt;
+    u_long hash;
+    struct mbuf *prev_m0;
+    struct mbuf *m0;
+    struct mbuf *m;
+    struct rtdetq *rte;
+    int s;
+
+    rt = mtod(mb_rt, struct mfc *);
+    hash = nethash_fc(rt->mfc_origin.s_addr, rt->mfc_mcastgrp.s_addr);
+
+    if (mrtdebug)
+	log(LOG_DEBUG, "ip_mforward: cleanup ipm %d h %d s %x g %x", 
+	    ip_mrouter, hash, ntohl(rt->mfc_origin.s_addr), 
+	    ntohl(rt->mfc_mcastgrp.s_addr));
+
+    mrtstat.mrts_cache_cleanups++;
+
+    /*
+     * determine entry to be cleaned up in cache table
+     */
+    s = splnet();
+    for (prev_m0 = m0 = mfctable[hash]; m0; prev_m0 = m0, m0 = m0->m_next)
+	if (m0 == mb_rt)
+	    break;
+
+    /* 
+     * drop all the packets
+     * free the mbuf with the pkt, if, timing info
+     */
+    while (mb_rt->m_act) {
+	m = mb_rt->m_act;
+	mb_rt->m_act = m->m_act;
+
+	rte = mtod(m, struct rtdetq *);
+	m_freem(rte->m);
+	m_free(m);
+    }
+
+    /* 
+     * Delete the entry from the cache
+     */
+    if (prev_m0 != m0) {	/* if moved past head of list */
+	MFREE(m0, prev_m0->m_next);
+    } else			/* delete head of list, it is in the table */
+	mfctable[hash] = m_free(m0);
+    
+    timeout_val--;
+    splx(s);
+}
+
+/*
+ * Packet forwarding routine once entry in the cache is made
+ */
+static int
+ip_mdq(m, ifp, tunnel_src, rt, imo)
+    register struct mbuf *m;
+    register struct ifnet *ifp;
+    register u_long tunnel_src;
+    register struct mfc *rt;
+    register struct ip_moptions *imo;
 {
-	register struct ip *ip = mtod(m, struct ip *);
-	register struct mbuf *mb_copy;
-	register struct ip_moptions *imo;
-	register int error;
-	struct ip_moptions simo;
+    register struct ip  *ip = mtod(m, struct ip *);
+    register vifi_t vifi;
+    register struct vif *vifp;
+
+    /*
+     * Don't forward if it didn't arrive from the parent vif for its origin.
+     * Notes: v_ifp is zero for src route tunnels, multicast_decap_if
+     * for encapsulated tunnels and a real ifnet for non-tunnels so
+     * the first part of the if catches wrong physical interface or
+     * tunnel type; v_rmt_addr is zero for non-tunneled packets so
+     * the 2nd part catches both packets that arrive via a tunnel
+     * that shouldn't and packets that arrive via the wrong tunnel.
+     */
+    vifi = rt->mfc_parent;
+    if (viftable[vifi].v_ifp != ifp ||
+	(ifp == 0 && viftable[vifi].v_rmt_addr.s_addr != tunnel_src)) {
+	/* came in the wrong interface */
+	if (mrtdebug)
+	    log(LOG_DEBUG, "wrong if: ifp %x vifi %d",
+		ifp, vifi); 
+	++mrtstat.mrts_wrong_if;
+	return (int)tunnel_src;
+    }
+
+    /* increment the interface and s-g counters */
+    viftable[vifi].v_pkt_in++;
+    rt->mfc_pkt_cnt++;
+
+    /*
+     * For each vif, decide if a copy of the packet should be forwarded.
+     * Forward if:
+     *		- the ttl exceeds the vif's threshold
+     *		- there are group members downstream on interface
+     */
+#define MC_SEND(ip,vifp,m) {                             \
+		(vifp)->v_pkt_out++;                     \
+                if ((vifp)->v_flags & VIFF_SRCRT)        \
+                    srcrt_send((ip), (vifp), (m));       \
+                else if ((vifp)->v_flags & VIFF_TUNNEL)  \
+                    encap_send((ip), (vifp), (m));       \
+                else                                     \
+                    phyint_send((ip), (vifp), (m));      \
+                }                                  
+
+/* If no options or the imo_multicast_vif option is 0, don't do this part 
+ */
+    if ((imo != NULL) && 
+       (( vifi = imo->imo_multicast_vif - 1) < numvifs) /*&& (vifi>=0)*/) 
+    {  
+        MC_SEND(ip,viftable+vifi,m);
+        return (1);        /* make sure we are done: No more physical sends */
+    }
+
+    for (vifp = viftable, vifi = 0; vifi < numvifs; vifp++, vifi++)
+	if ((rt->mfc_ttls[vifi] > 0) &&
+	    (ip->ip_ttl > rt->mfc_ttls[vifi]))
+	    MC_SEND(ip, vifp, m);
+
+    return 0;
+}
 
-	mb_copy = m_copy(m, 0, M_COPYALL);
-	if (mb_copy == NULL)
-		return;
+/* check if a vif number is legal/ok. This is used by ip_output, to export
+ * numvifs there.
+ */
+int
+legal_vif_num(vif)
+    int vif;
+{   if (vif>=0 && vif<=numvifs)
+       return(1);
+    else
+       return(0);
+}
 
-	imo = &simo;
-	imo->imo_multicast_ifp = vifp->v_ifp;
-	imo->imo_multicast_ttl = ip->ip_ttl - 1;
-	imo->imo_multicast_loop = 1;
+static void
+phyint_send(ip, vifp, m)
+    struct ip *ip;
+    struct vif *vifp;
+    struct mbuf *m;
+{
+    register struct mbuf *mb_copy;
+    register struct mbuf *mopts;
+    register struct ip_moptions *imo;
+
+    if ((mb_copy = m_copy(m, 0, M_COPYALL)) == NULL)
+	return;
+
+    MALLOC(imo, struct ip_moptions *, sizeof *imo, M_IPMOPTS, M_NOWAIT);
+    if (imo == NULL) {
+	m_freem(mb_copy);
+	return;
+    }
+
+    imo->imo_multicast_ifp  = vifp->v_ifp;
+    imo->imo_multicast_ttl  = ip->ip_ttl - 1;
+    imo->imo_multicast_loop = 1;
+
+    if (vifp->v_rate_limit <= 0)
+	tbf_send_packet(vifp, mb_copy, imo);
+    else
+	tbf_control(vifp, mb_copy, mtod(mb_copy, struct ip *), ip->ip_len,
+		    imo);
+}
 
-	error = ip_output(mb_copy, NULL, NULL, IP_FORWARDING, imo);
+static void
+srcrt_send(ip, vifp, m)
+    struct ip *ip;
+    struct vif *vifp;
+    struct mbuf *m;
+{
+    struct mbuf *mb_copy, *mb_opts;
+    register struct ip *ip_copy;
+    u_char *cp;
+
+    /*
+     * Make sure that adding the tunnel options won't exceed the
+     * maximum allowed number of option bytes.
+     */
+    if (ip->ip_hl > (60 - TUNNEL_LEN) >> 2) {
+	mrtstat.mrts_cant_tunnel++;
+	if (mrtdebug)
+	    log(LOG_DEBUG, "srcrt_send: no room for tunnel options, from %u",
+		ntohl(ip->ip_src.s_addr));
+	return;
+    }
+
+    if ((mb_copy = m_copy(m, 0, M_COPYALL)) == NULL)
+	return;
+
+    ip_copy = mtod(mb_copy, struct ip *);
+    ip_copy->ip_ttl--;
+    ip_copy->ip_dst = vifp->v_rmt_addr;	  /* remote tunnel end-point */
+    /*
+     * Adjust the ip header length to account for the tunnel options.
+     */
+    ip_copy->ip_hl  += TUNNEL_LEN >> 2;
+    ip_copy->ip_len += TUNNEL_LEN;
+    MGET(mb_opts, M_DONTWAIT, MT_HEADER);
+    if (mb_opts == NULL) {
+	m_freem(mb_copy);
+	return;
+    }
+    /*
+     * 'Delete' the base ip header from the mb_copy chain
+     */
+    mb_copy->m_len -= IP_HDR_LEN;
+    mb_copy->m_data += IP_HDR_LEN;
+    /*
+     * Make mb_opts be the new head of the packet chain.
+     * Any options of the packet were left in the old packet chain head
+     */
+    mb_opts->m_next = mb_copy;
+    mb_opts->m_data += 16;
+    mb_opts->m_len = IP_HDR_LEN + TUNNEL_LEN;
+    /*
+     * Copy the base ip header from the mb_copy chain to the new head mbuf
+     */
+    bcopy((caddr_t)ip_copy, mtod(mb_opts, caddr_t), IP_HDR_LEN);
+    /*
+     * Add the NOP and LSRR after the base ip header
+     */
+    cp = mtod(mb_opts, u_char *) + IP_HDR_LEN;
+    *cp++ = IPOPT_NOP;
+    *cp++ = IPOPT_LSRR;
+    *cp++ = 11; /* LSRR option length */
+    *cp++ = 8;  /* LSRR pointer to second element */
+    *(u_long*)cp = vifp->v_lcl_addr.s_addr;	/* local tunnel end-point */
+    cp += 4;
+    *(u_long*)cp = ip->ip_dst.s_addr;		/* destination group */
+
+    if (vifp->v_rate_limit <= 0)
+	tbf_send_packet(vifp, mb_opts, 0);
+    else
+	tbf_control(vifp, mb_opts, 
+		    mtod(mb_opts, struct ip *), ip_copy->ip_len, 0);
 }
 
 static void
-tunnel_send(m, vifp)
-	register struct mbuf *m;
-	register struct vif *vifp;
+encap_send(ip, vifp, m)
+    register struct ip *ip;
+    register struct vif *vifp;
+    register struct mbuf *m;
 {
-	register struct ip *ip = mtod(m, struct ip *);
-	register struct mbuf *mb_copy, *mb_opts;
-	register struct ip *ip_copy;
-	register int error;
-	register u_char *cp;
+    register struct mbuf *mb_copy;
+    register struct ip *ip_copy;
+    register int i, len = ip->ip_len;
+
+    /*
+     * copy the old packet & pullup its IP header into the
+     * new mbuf so we can modify it.  Try to fill the new
+     * mbuf since if we don't the ethernet driver will.
+     */
+    MGET(mb_copy, M_DONTWAIT, MT_DATA);
+    if (mb_copy == NULL)
+	return;
+    mb_copy->m_data += 16;
+    mb_copy->m_len = sizeof(multicast_encap_iphdr);
+
+    if ((mb_copy->m_next = m_copy(m, 0, M_COPYALL)) == NULL) {
+	m_freem(mb_copy);
+	return;
+    }
+    i = MHLEN - M_LEADINGSPACE(mb_copy);
+    if (i > len)
+	i = len;
+    mb_copy = m_pullup(mb_copy, i);
+    if (mb_copy == NULL)
+	return;
+
+    /*
+     * fill in the encapsulating IP header.
+     */
+    ip_copy = mtod(mb_copy, struct ip *);
+    *ip_copy = multicast_encap_iphdr;
+    ip_copy->ip_id = htons(ip_id++);
+    ip_copy->ip_len += len;
+    ip_copy->ip_src = vifp->v_lcl_addr;
+    ip_copy->ip_dst = vifp->v_rmt_addr;
+
+    /*
+     * turn the encapsulated IP header back into a valid one.
+     */
+    ip = (struct ip *)((caddr_t)ip_copy + sizeof(multicast_encap_iphdr));
+    --ip->ip_ttl;
+    HTONS(ip->ip_len);
+    HTONS(ip->ip_off);
+    ip->ip_sum = 0;
+#if defined(LBL) && !defined(ultrix)
+    ip->ip_sum = ~oc_cksum((caddr_t)ip, ip->ip_hl << 2, 0);
+#else
+    mb_copy->m_data += sizeof(multicast_encap_iphdr);
+    ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2);
+    mb_copy->m_data -= sizeof(multicast_encap_iphdr);
+#endif
+
+    if (vifp->v_rate_limit <= 0)
+	tbf_send_packet(vifp, mb_copy, 0);
+    else
+	tbf_control(vifp, mb_copy, ip, ip_copy->ip_len, 0);
+}
 
+/*
+ * De-encapsulate a packet and feed it back through ip input (this
+ * routine is called whenever IP gets a packet with proto type
+ * ENCAP_PROTO and a local destination address).
+ */
+void
+multiencap_decap(m)
+    register struct mbuf *m;
+{
+    struct ifnet *ifp = m->m_pkthdr.rcvif;
+    register struct ip *ip = mtod(m, struct ip *);
+    register int hlen = ip->ip_hl << 2;
+    register int s;
+    register struct ifqueue *ifq;
+    register struct vif *vifp;
+
+    if (ip->ip_p != ENCAP_PROTO) {
+    	rip_input(m);
+	return;
+    }
+    /*
+     * dump the packet if it's not to a multicast destination or if
+     * we don't have an encapsulating tunnel with the source.
+     * Note:  This code assumes that the remote site IP address
+     * uniquely identifies the tunnel (i.e., that this site has
+     * at most one tunnel with the remote site).
+     */
+    if (! IN_MULTICAST(ntohl(((struct ip *)((char *)ip + hlen))->ip_dst.s_addr))) {
+	++mrtstat.mrts_bad_tunnel;
+	m_freem(m);
+	return;
+    }
+    if (ip->ip_src.s_addr != last_encap_src) {
+	register struct vif *vife;
+	
+	vifp = viftable;
+	vife = vifp + numvifs;
+	last_encap_src = ip->ip_src.s_addr;
+	last_encap_vif = 0;
+	for ( ; vifp < vife; ++vifp)
+	    if (vifp->v_rmt_addr.s_addr == ip->ip_src.s_addr) {
+		if ((vifp->v_flags & (VIFF_TUNNEL|VIFF_SRCRT))
+		    == VIFF_TUNNEL)
+		    last_encap_vif = vifp;
+		break;
+	    }
+    }
+    if ((vifp = last_encap_vif) == 0) {
+	last_encap_src = 0;
+	mrtstat.mrts_cant_tunnel++; /*XXX*/
+	m_freem(m);
+	if (mrtdebug)
+	    log(LOG_DEBUG, "ip_mforward: no tunnel with %u",
+		ntohl(ip->ip_src.s_addr));
+	return;
+    }
+    ifp = vifp->v_ifp;
+    hlen -= sizeof(struct ifnet *);
+    m->m_data += hlen;
+    m->m_len -= hlen;
+    *(mtod(m, struct ifnet **)) = ifp;
+    ifq = &ipintrq;
+    s = splimp();
+    if (IF_QFULL(ifq)) {
+	IF_DROP(ifq);
+	m_freem(m);
+    } else {
+	IF_ENQUEUE(ifq, m);
 	/*
-	 * Make sure that adding the tunnel options won't exceed the
-	 * maximum allowed number of option bytes.
+	 * normally we would need a "schednetisr(NETISR_IP)"
+	 * here but we were called by ip_input and it is going
+	 * to loop back & try to dequeue the packet we just
+	 * queued as soon as we return so we avoid the
+	 * unnecessary software interrupt.
 	 */
-	if (ip->ip_hl > (60 - TUNNEL_LEN) >> 2) {
-		mrtstat.mrts_cant_tunnel++;
-		return;
+    }
+    splx(s);
+}
+
+/*
+ * Token bucket filter module
+ */
+void
+tbf_control(vifp, m, ip, p_len, imo)
+	register struct vif *vifp;
+	register struct mbuf *m;
+	register struct ip *ip;
+	register u_long p_len;
+	struct ip_moptions *imo;
+{
+    tbf_update_tokens(vifp);
+
+    /* if there are enough tokens, 
+     * and the queue is empty,
+     * send this packet out
+     */
+
+    if (vifp->v_tbf->q_len == 0) {
+	if (p_len <= vifp->v_tbf->n_tok) {
+	    vifp->v_tbf->n_tok -= p_len;
+	    tbf_send_packet(vifp, m, imo);
+	} else if (p_len > MAX_BKT_SIZE) {
+	    /* drop if packet is too large */
+	    mrtstat.mrts_pkt2large++;
+	    m_freem(m);
+	    return;
+	} else {
+	    /* queue packet and timeout till later */
+	    tbf_queue(vifp, m, ip, imo);
+	    timeout(tbf_reprocess_q, (caddr_t)vifp, 1);
 	}
+    } else if (vifp->v_tbf->q_len < MAXQSIZE) {
+	/* finite queue length, so queue pkts and process queue */
+	tbf_queue(vifp, m, ip, imo);
+	tbf_process_q(vifp);
+    } else {
+	/* queue length too much, try to dq and queue and process */
+	if (!tbf_dq_sel(vifp, ip)) {
+	    mrtstat.mrts_q_overflow++;
+	    m_freem(m);
+	    return;
+	} else {
+	    tbf_queue(vifp, m, ip, imo);
+	    tbf_process_q(vifp);
+	}
+    }
+    return;
+}
 
-	/* 
-	 * Get a private copy of the IP header so that changes to some 
-	 * of the IP fields don't damage the original header, which is
-	 * examined later in ip_input.c.
-	 */
-	mb_copy = m_copy(m, IP_HDR_LEN, M_COPYALL);
-	if (mb_copy == NULL)
-		return;
-	MGETHDR(mb_opts, M_DONTWAIT, MT_HEADER);
-	if (mb_opts == NULL) {
-		m_freem(mb_copy);
-		return;
+/* 
+ * adds a packet to the queue at the interface
+ */
+void
+tbf_queue(vifp, m, ip, imo) 
+	register struct vif *vifp;
+	register struct mbuf *m;
+	register struct ip *ip;
+	struct ip_moptions *imo;
+{
+    register u_long ql;
+    register int index = (vifp - viftable);
+    register int s = splnet();
+
+    ql = vifp->v_tbf->q_len;
+
+    qtable[index][ql].pkt_m = m;
+    qtable[index][ql].pkt_len = (mtod(m, struct ip *))->ip_len;
+    qtable[index][ql].pkt_ip = ip;
+    qtable[index][ql].pkt_imo = imo;
+
+    vifp->v_tbf->q_len++;
+    splx(s);
+}
+
+
+/* 
+ * processes the queue at the interface
+ */
+void
+tbf_process_q(vifp)
+    register struct vif *vifp;
+{
+    register struct mbuf *m;
+    register struct pkt_queue pkt_1;
+    register int index = (vifp - viftable);
+    register int s = splnet();
+
+    /* loop through the queue at the interface and send as many packets
+     * as possible
+     */
+    while (vifp->v_tbf->q_len > 0) {
+	/* locate the first packet */
+	pkt_1.pkt_len = ((qtable[index][0]).pkt_len);
+	pkt_1.pkt_m   = (qtable[index][0]).pkt_m;
+	pkt_1.pkt_ip   = (qtable[index][0]).pkt_ip;
+	pkt_1.pkt_imo = (qtable[index][0]).pkt_imo;
+
+	/* determine if the packet can be sent */
+	if (pkt_1.pkt_len <= vifp->v_tbf->n_tok) {
+	    /* if so,
+	     * reduce no of tokens, dequeue the queue,
+	     * send the packet.
+	     */
+	    vifp->v_tbf->n_tok -= pkt_1.pkt_len;
+
+	    tbf_dequeue(vifp, 0);
+
+	    tbf_send_packet(vifp, pkt_1.pkt_m, pkt_1.pkt_imo);
+
+	} else break;
+    }
+    splx(s);
+}
+
+/*
+ * removes the jth packet from the queue at the interface
+ *
+ * Entries behind slot j are moved one slot towards the head, the slot
+ * left over at the tail is cleared and the queue length is decremented.
+ * The mbuf of the removed entry is not freed here; that is left to the
+ * caller.  Nothing is done when the queue is already empty.
+ *
+ * No spl level is raised in this function.
+ */
+void
+tbf_dequeue(vifp,j) 
+    register struct vif *vifp;
+    register int j;
+{
+    register u_long index = vifp - viftable;
+    register int i;
+
+    /* empty queue: nothing to remove, and q_len must not drop below 0 */
+    if (vifp->v_tbf->q_len == 0)
+	return;
+
+    /*
+     * "i < q_len" instead of "i <= q_len - 1": the subtraction would
+     * wrap around if q_len were an unsigned zero.
+     */
+    for (i = j + 1; i < vifp->v_tbf->q_len; i++) {
+	qtable[index][i-1].pkt_m   = qtable[index][i].pkt_m;
+	qtable[index][i-1].pkt_len = qtable[index][i].pkt_len;
+	qtable[index][i-1].pkt_ip = qtable[index][i].pkt_ip;
+	qtable[index][i-1].pkt_imo = qtable[index][i].pkt_imo;
+    }
+
+    /* slot i-1 is the old tail (for a valid j); wipe it */
+    qtable[index][i-1].pkt_m = NULL;
+    qtable[index][i-1].pkt_len = 0;	/* a length, not a pointer */
+    qtable[index][i-1].pkt_ip = NULL;
+    qtable[index][i-1].pkt_imo = NULL;
+
+    vifp->v_tbf->q_len--;
+
+    if (tbfdebug > 1)
+	log(LOG_DEBUG, "tbf_dequeue: vif# %d qlen %d", (int)index, i-1);
+}
+
+/*
+ * Timeout handler that retries the token-bucket queue of a vif.
+ *
+ * xvifp is the struct vif pointer that was handed to timeout().  When
+ * ip_mrouter is NULL nothing is done and the handler is not re-armed.
+ * Otherwise the bucket is refilled and as many queued packets as the
+ * bucket allows are sent.  If packets are still queued afterwards, the
+ * handler schedules itself again through timeout() with a delay
+ * argument of 1.
+ */
+void
+tbf_reprocess_q(xvifp)
+	void *xvifp;
+{
+    register struct vif *vifp = xvifp;
+
+    if (ip_mrouter != NULL) {
+	tbf_update_tokens(vifp);
+	tbf_process_q(vifp);
+
+	/* still backlogged: try again on the next timeout */
+	if (vifp->v_tbf->q_len != 0)
+	    timeout(tbf_reprocess_q, (caddr_t)vifp, 1);
+    }
+}
+
+/* function that will selectively discard a member of the queue
+ * based on the precedence value and the priority obtained through
+ * a lookup table - not yet implemented accurately!
+ *
+ * The queue of vifp is scanned from the tail towards the head.  The
+ * first queued packet whose priority() is strictly lower than the
+ * priority() of the header `ip' is freed with m_freem() and removed
+ * with tbf_dequeue(); mrtstat.mrts_drop_sel is then incremented.
+ *
+ * Returns 1 if a queued packet was dropped, 0 if no queued packet had
+ * a lower priority.
+ *
+ * The scan runs at splnet.  Note that on the drop path the statistics
+ * counter is incremented after splx() has restored the previous level.
+ */
+int
+tbf_dq_sel(vifp, ip)
+    register struct vif *vifp;
+    register struct ip *ip;
+{
+    register int i;
+    register int s = splnet();
+    register u_int p;
+
+    p = priority(vifp, ip);
+
+    for(i=vifp->v_tbf->q_len-1;i >= 0;i--) {
+	if (p > priority(vifp, qtable[vifp-viftable][i].pkt_ip)) {
+	    m_freem(qtable[vifp-viftable][i].pkt_m);
+	    tbf_dequeue(vifp,i);
+	    splx(s);
+	    mrtstat.mrts_drop_sel++;
+	    return(1);
 	}
-	/*
-	 * Make mb_opts be the new head of the packet chain.
-	 * Any options of the packet were left in the old packet chain head
-	 */
-	mb_opts->m_next = mb_copy;
-	mb_opts->m_len = IP_HDR_LEN + TUNNEL_LEN;
-	mb_opts->m_data += MSIZE - mb_opts->m_len;
+    }
+    splx(s);
+    return(0);
+}
 
-	ip_copy = mtod(mb_opts, struct ip *);
-	/*
-	 * Copy the base ip header to the new head mbuf.
-	 */
-	*ip_copy = *ip;
-	ip_copy->ip_ttl--;
-	ip_copy->ip_dst = vifp->v_rmt_addr;	/* remote tunnel end-point */
-	/*
-	 * Adjust the ip header length to account for the tunnel options.
-	 */
-	ip_copy->ip_hl += TUNNEL_LEN >> 2;
-	ip_copy->ip_len += TUNNEL_LEN;
-	/*
-	 * Add the NOP and LSRR after the base ip header
-	 */
-	cp = (u_char *)(ip_copy + 1);
-	*cp++ = IPOPT_NOP;
-	*cp++ = IPOPT_LSRR;
-	*cp++ = 11;		/* LSRR option length */
-	*cp++ = 8;		/* LSSR pointer to second element */
-	*(u_long*)cp = vifp->v_lcl_addr.s_addr;	/* local tunnel end-point */
-	cp += 4;
-	*(u_long*)cp = ip->ip_dst.s_addr;		/* destination group */
+/*
+ * Hand a packet that has passed the token bucket to ip_output().
+ *
+ * Every kind of vif transmits through the same ip_output() call, with
+ * IP_FORWARDING and the supplied multicast options.  Only the handling
+ * after the call differs:
+ *   - VIFF_SRCRT vif: the result is logged when mrtdebug > 1;
+ *   - VIFF_TUNNEL vif: the result is ignored;
+ *   - any other vif (physical interface): imo is released with
+ *     FREE(imo, M_IPMOPTS) and the result is logged when mrtdebug > 1.
+ * Runs at splnet.
+ */
+void
+tbf_send_packet(vifp, m, imo)
+    register struct vif *vifp;
+    register struct mbuf *m;
+    struct ip_moptions *imo;
+{
+    int s = splnet();
+    int error;
+    /* classify the vif before sending; the source-route flag wins */
+    int srcrt = (vifp->v_flags & VIFF_SRCRT) != 0;
+    int tunnel = !srcrt && (vifp->v_flags & VIFF_TUNNEL) != 0;
+
+    error = ip_output(m, (struct mbuf *)0, (struct route *)0,
+		      IP_FORWARDING, imo);
+
+    if (srcrt) {
+	if (mrtdebug > 1)
+	    log(LOG_DEBUG, "srcrt_send on vif %d err %d", vifp-viftable, error);
+    } else if (!tunnel) {
+	/* physical interface: the options are released here */
+	FREE(imo, M_IPMOPTS);
+
+	if (mrtdebug > 1)
+	    log(LOG_DEBUG, "phyint_send on vif %d err %d", vifp-viftable, error);
+    }
+    splx(s);
+}
+
+/* determine the current time and then
+ * the elapsed time (between the last time and time now)
+ * in milliseconds & update the no. of tokens in the bucket
+ *
+ * The time obtained with GET_TIME() is converted to milliseconds.  The
+ * tokens credited are (milliseconds since last_pkt_t) * v_rate_limit / 8.
+ * last_pkt_t is then set to the current time and n_tok is capped at
+ * MAX_BKT_SIZE.  Runs at splnet.
+ *
+ * NOTE(review): no overflow check is made on the multiplication, and
+ * presumably v_rate_limit is in kbits/sec so that the credit comes out
+ * in bytes - confirm against the vif definition.
+ */
+void
+tbf_update_tokens(vifp)
+    register struct vif *vifp;
+{
+    struct timeval tp;
+    register u_long t;
+    register u_long elapsed;
+    register int s = splnet();
+
+    GET_TIME(tp);
+
+    t = tp.tv_sec*1000 + tp.tv_usec/1000;
+
+    elapsed = (t - vifp->v_tbf->last_pkt_t) * vifp->v_rate_limit /8;
+    vifp->v_tbf->n_tok += elapsed;
+    vifp->v_tbf->last_pkt_t = t;
 
-	error = ip_output(mb_opts, NULL, NULL, IP_FORWARDING, NULL);
+    if (vifp->v_tbf->n_tok > MAX_BKT_SIZE)
+	vifp->v_tbf->n_tok = MAX_BKT_SIZE;
+
+    splx(s);
+}
+
+/*
+ * Classify a packet for selective dropping on a vif.
+ *
+ * Returns 50 by default, or a higher fixed value when the destination
+ * address (converted to host byte order) is one of the hard-wired
+ * group addresses tested below.
+ *
+ * For VIFF_SRCRT vifs the destination is read through (ip+8) rather
+ * than ip.  NOTE(review): that offset is counted in units of struct ip,
+ * not bytes - confirm it really lands on the group address that follows
+ * the source route option.
+ */
+static int
+priority(vifp, ip)
+    register struct vif *vifp;
+    register struct ip *ip;
+{
+    /* check for source route options and add option length to get dst */
+    register struct ip *hdr = (vifp->v_flags & VIFF_SRCRT) ? (ip+8) : ip;
+    register u_long group = ntohl(hdr->ip_dst.s_addr);
+    register int prio;
+
+    /* temporary hack; will add general packet classifier some day */
+
+    if (group == 0xe0020001)
+	prio = 65;		/* MBone Audio */
+    else if (group == 0xe000010a)
+	prio = 85;		/* IETF Low Audio 1 */
+    else if (group == 0xe000010b)
+	prio = 75;		/* IETF Audio 1 */
+    else if (group == 0xe000010c)
+	prio = 60;		/* IETF Video 1 */
+    else if (group == 0xe000010d)
+	prio = 80;		/* IETF Low Audio 2 */
+    else if (group == 0xe000010e)
+	prio = 70;		/* IETF Audio 2 */
+    else if (group == 0xe000010f)
+	prio = 55;		/* IETF Video 2 */
+    else
+	prio = 50;		/* default priority */
+
+    if (tbfdebug > 1) log(LOG_DEBUG, "graddr%x prio%d", group, prio);
+
+    return prio;
 }
+
+/*
+ * End of token bucket filter modifications 
+ */
 #endif
+
+
author	wollman <wollman@FreeBSD.org>	1994-09-06 22:42:31 +0000
committer	wollman <wollman@FreeBSD.org>	1994-09-06 22:42:31 +0000
commit	75ad508fd126c679edba9b67bd09d74a1fff3aba (patch)
tree	da36f83faafbd2141b041ae182b2406dfee02756 /sys/netinet/ip_mroute.c
parent	f624d4a80eef8e47182201473e55257609525b41 (diff)
download	FreeBSD-src-75ad508fd126c679edba9b67bd09d74a1fff3aba.zip FreeBSD-src-75ad508fd126c679edba9b67bd09d74a1fff3aba.tar.gz