diff options
Diffstat (limited to 'sys/netinet')
100 files changed, 60186 insertions, 0 deletions
diff --git a/sys/netinet/accf_data.c b/sys/netinet/accf_data.c new file mode 100644 index 0000000..b66e1c7 --- /dev/null +++ b/sys/netinet/accf_data.c @@ -0,0 +1,67 @@ +/*- + * Copyright (c) 2000 Alfred Perlstein <alfred@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#define ACCEPT_FILTER_MOD + +#include <sys/param.h> +#include <sys/sysctl.h> +#include <sys/kernel.h> +#include <sys/socketvar.h> + +/* accept filter that holds a socket until data arrives */ + +static void sohasdata(struct socket *so, void *arg, int waitflag); + +static struct accept_filter accf_data_filter = { + "dataready", + sohasdata, + NULL, + NULL +}; + +static moduledata_t accf_data_mod = { + "accf_data", + accept_filt_generic_mod_event, + &accf_data_filter +}; + +DECLARE_MODULE(accf_data, accf_data_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE); + +static void +sohasdata(struct socket *so, void *arg, int waitflag) +{ + + if (!soreadable(so)) { + return; + } + + so->so_upcall = NULL; + so->so_rcv.sb_flags &= ~SB_UPCALL; + soisconnected(so); + return; +} diff --git a/sys/netinet/accf_http.c b/sys/netinet/accf_http.c new file mode 100644 index 0000000..a9a8fb0 --- /dev/null +++ b/sys/netinet/accf_http.c @@ -0,0 +1,346 @@ +/* + * Copyright (c) 2000 Paycounter, Inc. + * Author: Alfred Perlstein <alfred@paycounter.com>, <alfred@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#define ACCEPT_FILTER_MOD + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/socketvar.h> +#include <sys/mbuf.h> + +/* check for GET/HEAD */ +static void sohashttpget(struct socket *so, void *arg, int waitflag); +/* check for HTTP/1.0 or HTTP/1.1 */ +static void soparsehttpvers(struct socket *so, void *arg, int waitflag); +/* check for end of HTTP/1.x request */ +static void soishttpconnected(struct socket *so, void *arg, int waitflag); +/* strcmp on an mbuf chain */ +static int mbufstrcmp(struct mbuf *m, struct mbuf *npkt, int offset, char *cmp); +/* strncmp on an mbuf chain */ +static int mbufstrncmp(struct mbuf *m, struct mbuf *npkt, int offset, + int max, char *cmp); +/* socketbuffer is full */ +static int sbfull(struct sockbuf *sb); + +static struct accept_filter accf_http_filter = { + "httpready", + sohashttpget, + NULL, + NULL +}; + +static moduledata_t accf_http_mod = { + "accf_http", + accept_filt_generic_mod_event, + &accf_http_filter +}; + +DECLARE_MODULE(accf_http, accf_http_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE); + +static int parse_http_version = 1; + +SYSCTL_NODE(_net_inet_accf, OID_AUTO, http, CTLFLAG_RW, 0, +"HTTP accept filter"); +SYSCTL_INT(_net_inet_accf_http, OID_AUTO, parsehttpversion, CTLFLAG_RW, +&parse_http_version, 1, +"Parse http version so that non 1.x requests work"); + +#ifdef ACCF_HTTP_DEBUG +#define DPRINT(fmt, args...) \ + do { \ + printf("%s:%d: " fmt "\n", __func__, __LINE__ , ##args); \ + } while (0) +#else +#define DPRINT(fmt, args...) +#endif + +static int +sbfull(struct sockbuf *sb) +{ + + DPRINT("sbfull, cc(%ld) >= hiwat(%ld): %d, mbcnt(%ld) >= mbmax(%ld): %d", + sb->sb_cc, sb->sb_hiwat, sb->sb_cc >= sb->sb_hiwat, + sb->sb_mbcnt, sb->sb_mbmax, sb->sb_mbcnt >= sb->sb_mbmax); + return(sb->sb_cc >= sb->sb_hiwat || sb->sb_mbcnt >= sb->sb_mbmax); +} + +/* + * start at mbuf m, (must provide npkt if exists) + * starting at offset in m compare characters in mbuf chain for 'cmp' + */ +static int +mbufstrcmp(struct mbuf *m, struct mbuf *npkt, int offset, char *cmp) +{ + struct mbuf *n; + + for (;m != NULL; m = n) { + n = npkt; + if (npkt) + npkt = npkt->m_nextpkt; + for (; m; m = m->m_next) { + for (; offset < m->m_len; offset++, cmp++) { + if (*cmp == '\0') { + return (1); + } else if (*cmp != *(mtod(m, char *) + offset)) { + return (0); + } + } + offset = 0; + } + } + return (0); +} + +/* + * start at mbuf m, (must provide npkt if exists) + * starting at offset in m compare characters in mbuf chain for 'cmp' + * stop at 'max' characters + */ +static int +mbufstrncmp(struct mbuf *m, struct mbuf *npkt, int offset, int max, char *cmp) +{ + struct mbuf *n; + + for (;m != NULL; m = n) { + n = npkt; + if (npkt) + npkt = npkt->m_nextpkt; + for (; m; m = m->m_next) { + for (; offset < m->m_len; offset++, cmp++, max--) { + if (max == 0 || *cmp == '\0') { + return (1); + } else if (*cmp != *(mtod(m, char *) + offset)) { + return (0); + } + } + offset = 0; + } + } + return (0); +} + +#define STRSETUP(sptr, slen, str) \ + do { \ + sptr = str; \ + slen = sizeof(str) - 1; \ + } while(0) + +static void +sohashttpget(struct socket *so, void *arg, int waitflag) +{ + + if ((so->so_state & SS_CANTRCVMORE) == 0 && !sbfull(&so->so_rcv)) { + struct mbuf *m; + char *cmp; + int cmplen, cc; + + m = so->so_rcv.sb_mb; + cc = so->so_rcv.sb_cc - 1; + if (cc < 1) + return; + switch (*mtod(m, char *)) { + case 'G': + STRSETUP(cmp, cmplen, "ET "); + break; + case 'H': + STRSETUP(cmp, cmplen, "EAD "); + break; + default: + goto fallout; + } + if (cc < cmplen) { + if (mbufstrncmp(m, m->m_nextpkt, 1, cc, cmp) == 1) { + DPRINT("short cc (%d) but mbufstrncmp ok", cc); + return; + } else { + DPRINT("short cc (%d) mbufstrncmp failed", cc); + goto fallout; + } + } + if (mbufstrcmp(m, m->m_nextpkt, 1, cmp) == 1) { + DPRINT("mbufstrcmp ok"); + if (parse_http_version == 0) + soishttpconnected(so, arg, waitflag); + else + soparsehttpvers(so, arg, waitflag); + return; + } + DPRINT("mbufstrcmp bad"); + } + +fallout: + DPRINT("fallout"); + so->so_upcall = NULL; + so->so_rcv.sb_flags &= ~SB_UPCALL; + soisconnected(so); + return; +} + +static void +soparsehttpvers(struct socket *so, void *arg, int waitflag) +{ + struct mbuf *m, *n; + int i, cc, spaces, inspaces; + + if ((so->so_state & SS_CANTRCVMORE) != 0 || sbfull(&so->so_rcv)) + goto fallout; + + m = so->so_rcv.sb_mb; + cc = so->so_rcv.sb_cc; + inspaces = spaces = 0; + for (m = so->so_rcv.sb_mb; m; m = n) { + n = m->m_nextpkt; + for (; m; m = m->m_next) { + for (i = 0; i < m->m_len; i++, cc--) { + switch (*(mtod(m, char *) + i)) { + case ' ': + if (!inspaces) { + spaces++; + inspaces = 1; + } + break; + case '\r': + case '\n': + DPRINT("newline"); + goto fallout; + default: + if (spaces == 2) { + /* make sure we have enough data left */ + if (cc < sizeof("HTTP/1.0") - 1) { + if (mbufstrncmp(m, n, i, cc, "HTTP/1.") == 1) { + DPRINT("mbufstrncmp ok"); + goto readmore; + } else { + DPRINT("mbufstrncmp bad"); + goto fallout; + } + } else if (mbufstrcmp(m, n, i, "HTTP/1.0") == 1 || + mbufstrcmp(m, n, i, "HTTP/1.1") == 1) { + DPRINT("mbufstrcmp ok"); + soishttpconnected(so, arg, waitflag); + return; + } else { + DPRINT("mbufstrcmp bad"); + goto fallout; + } + } + inspaces = 0; + break; + } + } + } + } +readmore: + DPRINT("readmore"); + /* + * if we hit here we haven't hit something + * we don't understand or a newline, so try again + */ + so->so_upcall = soparsehttpvers; + so->so_rcv.sb_flags |= SB_UPCALL; + return; + +fallout: + DPRINT("fallout"); + so->so_upcall = NULL; + so->so_rcv.sb_flags &= ~SB_UPCALL; + soisconnected(so); + return; +} + + +#define NCHRS 3 + +static void +soishttpconnected(struct socket *so, void *arg, int waitflag) +{ + char a, b, c; + struct mbuf *m, *n; + int ccleft, copied; + + DPRINT("start"); + if ((so->so_state & SS_CANTRCVMORE) != 0 || sbfull(&so->so_rcv)) + goto gotit; + + /* + * Walk the socketbuffer and copy the last NCHRS (3) into a, b, and c + * copied - how much we've copied so far + * ccleft - how many bytes remaining in the socketbuffer + * just loop over the mbufs subtracting from 'ccleft' until we only + * have NCHRS left + */ + copied = 0; + ccleft = so->so_rcv.sb_cc; + if (ccleft < NCHRS) + goto readmore; + a = b = c = '\0'; + for (m = so->so_rcv.sb_mb; m; m = n) { + n = m->m_nextpkt; + for (; m; m = m->m_next) { + ccleft -= m->m_len; + if (ccleft <= NCHRS) { + char *src; + int tocopy; + + tocopy = (NCHRS - ccleft) - copied; + src = mtod(m, char *) + (m->m_len - tocopy); + + while (tocopy--) { + switch (copied++) { + case 0: + a = *src++; + break; + case 1: + b = *src++; + break; + case 2: + c = *src++; + break; + } + } + } + } + } + if (c == '\n' && (b == '\n' || (b == '\r' && a == '\n'))) { + /* we have all request headers */ + goto gotit; + } + +readmore: + so->so_upcall = soishttpconnected; + so->so_rcv.sb_flags |= SB_UPCALL; + return; + +gotit: + so->so_upcall = NULL; + so->so_rcv.sb_flags &= ~SB_UPCALL; + soisconnected(so); + return; +} diff --git a/sys/netinet/fil.c b/sys/netinet/fil.c new file mode 100644 index 0000000..6d4622e --- /dev/null +++ b/sys/netinet/fil.c @@ -0,0 +1,2132 @@ +/* + * Copyright (C) 1993-2000 by Darren Reed. + * + * Redistribution and use in source and binary forms are permitted + * provided that this notice is preserved and due credit is given + * to the original author and the contributors. + */ +#if !defined(lint) +static const char sccsid[] = "@(#)fil.c 1.36 6/5/96 (C) 1993-2000 Darren Reed"; +/* static const char rcsid[] = "@(#)$Id: fil.c,v 2.3.2.16 2000/01/27 08:49:37 darrenr Exp $"; */ +static const char rcsid[] = "@(#)$FreeBSD$"; +#endif + +#include <sys/errno.h> +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/file.h> +#if defined(__NetBSD__) && (NetBSD >= 199905) && !defined(IPFILTER_LKM) && \ + defined(_KERNEL) +# include "opt_ipfilter_log.h" +#endif +#if (defined(KERNEL) || defined(_KERNEL)) && defined(__FreeBSD_version) && \ + (__FreeBSD_version >= 220000) +# if (__FreeBSD_version >= 400000) +# ifndef KLD_MODULE +# include "opt_inet6.h" +# endif +# if (__FreeBSD_version == 400019) +# define CSUM_DELAY_DATA +# endif +# endif +# include <sys/filio.h> +# include <sys/fcntl.h> +#else +# include <sys/ioctl.h> +#endif +#if (defined(_KERNEL) || defined(KERNEL)) && !defined(linux) +# include <sys/systm.h> +#else +# include <stdio.h> +# include <string.h> +# include <stdlib.h> +#endif +#include <sys/uio.h> +#if !defined(__SVR4) && !defined(__svr4__) +# ifndef linux +# include <sys/mbuf.h> +# endif +#else +# include <sys/byteorder.h> +# if SOLARIS2 < 5 +# include <sys/dditypes.h> +# endif +# include <sys/stream.h> +#endif +#ifndef linux +# include <sys/protosw.h> +# include <sys/socket.h> +#endif +#include <net/if.h> +#ifdef sun +# include <net/af.h> +#endif +#include <net/route.h> +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#ifndef linux +# include <netinet/ip_var.h> +#endif +#if defined(__sgi) && defined(IFF_DRVRLOCK) /* IRIX 6 */ +# include <sys/hashing.h> +# include <netinet/in_var.h> +#endif +#include <netinet/tcp.h> +#include <netinet/udp.h> +#include <netinet/ip_icmp.h> +#include "netinet/ip_compat.h" +#ifdef USE_INET6 +# include <netinet/icmp6.h> +# if !SOLARIS && defined(_KERNEL) +# include <netinet6/in6_var.h> +# endif +#endif +#include <netinet/tcpip.h> +#include "netinet/ip_fil.h" +#include "netinet/ip_proxy.h" +#include "netinet/ip_nat.h" +#include "netinet/ip_frag.h" +#include "netinet/ip_state.h" +#include "netinet/ip_auth.h" +# if defined(__FreeBSD_version) && (__FreeBSD_version >= 300000) +# include <sys/malloc.h> +# if defined(_KERNEL) && !defined(IPFILTER_LKM) +# include "opt_ipfilter.h" +# endif +# endif +#ifndef MIN +# define MIN(a,b) (((a)<(b))?(a):(b)) +#endif +#include "netinet/ipl.h" + +#include <machine/in_cksum.h> + +#ifndef _KERNEL +# include "ipf.h" +# include "ipt.h" +extern int opts; + +# define FR_VERBOSE(verb_pr) verbose verb_pr +# define FR_DEBUG(verb_pr) debug verb_pr +# define IPLLOG(a, c, d, e) ipllog() +#else /* #ifndef _KERNEL */ +# define FR_VERBOSE(verb_pr) +# define FR_DEBUG(verb_pr) +# define IPLLOG(a, c, d, e) ipflog(a, c, d, e) +# if SOLARIS || defined(__sgi) +extern KRWLOCK_T ipf_mutex, ipf_auth, ipf_nat; +extern kmutex_t ipf_rw; +# endif +# if SOLARIS +# define FR_NEWAUTH(m, fi, ip, qif) fr_newauth((mb_t *)m, fi, \ + ip, qif) +# else /* SOLARIS */ +# define FR_NEWAUTH(m, fi, ip, qif) fr_newauth((mb_t *)m, fi, ip) +# endif /* SOLARIS || __sgi */ +#endif /* _KERNEL */ + + +struct filterstats frstats[2] = {{0,0,0,0,0},{0,0,0,0,0}}; +struct frentry *ipfilter[2][2] = { { NULL, NULL }, { NULL, NULL } }, +#ifdef USE_INET6 + *ipfilter6[2][2] = { { NULL, NULL }, { NULL, NULL } }, + *ipacct6[2][2] = { { NULL, NULL }, { NULL, NULL } }, +#endif + *ipacct[2][2] = { { NULL, NULL }, { NULL, NULL } }; +struct frgroup *ipfgroups[3][2]; +int fr_flags = IPF_LOGGING; +int fr_active = 0; +int fr_chksrc = 0; +int fr_minttl = 3; +int fr_minttllog = 1; +#if defined(IPFILTER_DEFAULT_BLOCK) +int fr_pass = FR_NOMATCH|FR_BLOCK; +#else +int fr_pass = (IPF_DEFAULT_PASS|FR_NOMATCH); +#endif +char ipfilter_version[] = IPL_VERSION; + +fr_info_t frcache[2]; + +static int frflushlist __P((int, minor_t, int *, frentry_t **)); +#ifdef _KERNEL +static void frsynclist __P((frentry_t *)); +#endif + + +/* + * bit values for identifying presence of individual IP options + */ +struct optlist ipopts[20] = { + { IPOPT_NOP, 0x000001 }, + { IPOPT_RR, 0x000002 }, + { IPOPT_ZSU, 0x000004 }, + { IPOPT_MTUP, 0x000008 }, + { IPOPT_MTUR, 0x000010 }, + { IPOPT_ENCODE, 0x000020 }, + { IPOPT_TS, 0x000040 }, + { IPOPT_TR, 0x000080 }, + { IPOPT_SECURITY, 0x000100 }, + { IPOPT_LSRR, 0x000200 }, + { IPOPT_E_SEC, 0x000400 }, + { IPOPT_CIPSO, 0x000800 }, + { IPOPT_SATID, 0x001000 }, + { IPOPT_SSRR, 0x002000 }, + { IPOPT_ADDEXT, 0x004000 }, + { IPOPT_VISA, 0x008000 }, + { IPOPT_IMITD, 0x010000 }, + { IPOPT_EIP, 0x020000 }, + { IPOPT_FINN, 0x040000 }, + { 0, 0x000000 } +}; + +/* + * bit values for identifying presence of individual IP security options + */ +struct optlist secopt[8] = { + { IPSO_CLASS_RES4, 0x01 }, + { IPSO_CLASS_TOPS, 0x02 }, + { IPSO_CLASS_SECR, 0x04 }, + { IPSO_CLASS_RES3, 0x08 }, + { IPSO_CLASS_CONF, 0x10 }, + { IPSO_CLASS_UNCL, 0x20 }, + { IPSO_CLASS_RES2, 0x40 }, + { IPSO_CLASS_RES1, 0x80 } +}; + + +/* + * compact the IP header into a structure which contains just the info. + * which is useful for comparing IP headers with. + */ +void fr_makefrip(hlen, ip, fin) +int hlen; +ip_t *ip; +fr_info_t *fin; +{ + u_short optmsk = 0, secmsk = 0, auth = 0; + int i, mv, ol, off, p, plen, v; + fr_ip_t *fi = &fin->fin_fi; + struct optlist *op; + u_char *s, opt; + tcphdr_t *tcp; + + fin->fin_rev = 0; + fin->fin_fr = NULL; + fin->fin_tcpf = 0; + fin->fin_data[0] = 0; + fin->fin_data[1] = 0; + fin->fin_rule = -1; + fin->fin_group = -1; +#ifdef _KERNEL + fin->fin_icode = ipl_unreach; +#endif + v = fin->fin_v; + fi->fi_v = v; + fin->fin_hlen = hlen; + if (v == 4) { + fin->fin_id = ip->ip_id; + fi->fi_tos = ip->ip_tos; + off = (ip->ip_off & IP_OFFMASK) << 3; + tcp = (tcphdr_t *)((char *)ip + hlen); + (*(((u_short *)fi) + 1)) = (*(((u_short *)ip) + 4)); + fi->fi_src.i6[1] = 0; + fi->fi_src.i6[2] = 0; + fi->fi_src.i6[3] = 0; + fi->fi_dst.i6[1] = 0; + fi->fi_dst.i6[2] = 0; + fi->fi_dst.i6[3] = 0; + fi->fi_saddr = ip->ip_src.s_addr; + fi->fi_daddr = ip->ip_dst.s_addr; + p = ip->ip_p; + fi->fi_fl = (hlen > sizeof(ip_t)) ? FI_OPTIONS : 0; + if (ip->ip_off & 0x3fff) + fi->fi_fl |= FI_FRAG; + plen = ip->ip_len; + fin->fin_dlen = plen - hlen; + } +#ifdef USE_INET6 + else if (v == 6) { + ip6_t *ip6 = (ip6_t *)ip; + + off = 0; + p = ip6->ip6_nxt; + fi->fi_p = p; + fi->fi_ttl = ip6->ip6_hlim; + tcp = (tcphdr_t *)(ip6 + 1); + fi->fi_src.in6 = ip6->ip6_src; + fi->fi_dst.in6 = ip6->ip6_dst; + fin->fin_id = (u_short)(ip6->ip6_flow & 0xffff); + fi->fi_tos = 0; + fi->fi_fl = 0; + plen = ntohs(ip6->ip6_plen); + fin->fin_dlen = plen; + } +#endif + else + return; + + fin->fin_off = off; + fin->fin_plen = plen; + fin->fin_dp = (void *)tcp; + + switch (p) + { +#ifdef USE_INET6 + case IPPROTO_ICMPV6 : + { + int minicmpsz = sizeof(struct icmp6_hdr); + struct icmp6_hdr *icmp6; + + if (fin->fin_dlen > 1) { + fin->fin_data[0] = *(u_short *)tcp; + + icmp6 = (struct icmp6_hdr *)tcp; + + switch (icmp6->icmp6_type) + { + case ICMP6_ECHO_REPLY : + case ICMP6_ECHO_REQUEST : + minicmpsz = ICMP6ERR_MINPKTLEN; + break; + case ICMP6_DST_UNREACH : + case ICMP6_PACKET_TOO_BIG : + case ICMP6_TIME_EXCEEDED : + case ICMP6_PARAM_PROB : + minicmpsz = ICMP6ERR_IPICMPHLEN; + break; + default : + break; + } + } + + if (!(plen >= hlen + minicmpsz)) + fi->fi_fl |= FI_SHORT; + + break; + } +#endif + case IPPROTO_ICMP : + { + int minicmpsz = sizeof(struct icmp); + icmphdr_t *icmp; + + if (!off && (fin->fin_dlen > 1)) { + fin->fin_data[0] = *(u_short *)tcp; + + icmp = (icmphdr_t *)tcp; + + if (icmp->icmp_type == ICMP_ECHOREPLY || + icmp->icmp_type == ICMP_ECHO) + minicmpsz = ICMP_MINLEN; + + /* + * type(1) + code(1) + cksum(2) + id(2) seq(2) + + * 3*timestamp(3*4) + */ + else if (icmp->icmp_type == ICMP_TSTAMP || + icmp->icmp_type == ICMP_TSTAMPREPLY) + minicmpsz = 20; + + /* + * type(1) + code(1) + cksum(2) + id(2) seq(2) + + * mask(4) + */ + else if (icmp->icmp_type == ICMP_MASKREQ || + icmp->icmp_type == ICMP_MASKREPLY) + minicmpsz = 12; + } + + if ((!(plen >= hlen + minicmpsz) && !off) || + (off && off < sizeof(struct icmp))) + fi->fi_fl |= FI_SHORT; + + break; + } + case IPPROTO_TCP : + fi->fi_fl |= FI_TCPUDP; +#ifdef USE_INET6 + if (v == 6) { + if (plen < sizeof(struct tcphdr)) + fi->fi_fl |= FI_SHORT; + } else +#endif + if (v == 4) { + if ((!IPMINLEN(ip, tcphdr) && !off) || + (off && off < sizeof(struct tcphdr))) + fi->fi_fl |= FI_SHORT; + } + if (!(fi->fi_fl & FI_SHORT) && !off) + fin->fin_tcpf = tcp->th_flags; + goto getports; + case IPPROTO_UDP : + fi->fi_fl |= FI_TCPUDP; +#ifdef USE_INET6 + if (v == 6) { + if (plen < sizeof(struct udphdr)) + fi->fi_fl |= FI_SHORT; + } else +#endif + if (v == 4) { + if ((!IPMINLEN(ip, udphdr) && !off) || + (off && off < sizeof(struct udphdr))) + fi->fi_fl |= FI_SHORT; + } +getports: + if (!off && (fin->fin_dlen > 3)) { + fin->fin_data[0] = ntohs(tcp->th_sport); + fin->fin_data[1] = ntohs(tcp->th_dport); + } + break; + default : + break; + } + +#ifdef USE_INET6 + if (v == 6) { + fi->fi_optmsk = 0; + fi->fi_secmsk = 0; + fi->fi_auth = 0; + return; + } +#endif + + for (s = (u_char *)(ip + 1), hlen -= (int)sizeof(*ip); hlen > 0; ) { + opt = *s; + if (opt == '\0') + break; + else if (opt == IPOPT_NOP) + ol = 1; + else { + if (hlen < 2) + break; + ol = (int)*(s + 1); + if (ol < 2 || ol > hlen) + break; + } + for (i = 9, mv = 4; mv >= 0; ) { + op = ipopts + i; + if (opt == (u_char)op->ol_val) { + optmsk |= op->ol_bit; + if (opt == IPOPT_SECURITY) { + struct optlist *sp; + u_char sec; + int j, m; + + sec = *(s + 2); /* classification */ + for (j = 3, m = 2; m >= 0; ) { + sp = secopt + j; + if (sec == sp->ol_val) { + secmsk |= sp->ol_bit; + auth = *(s + 3); + auth *= 256; + auth += *(s + 4); + break; + } + if (sec < sp->ol_val) + j -= m--; + else + j += m--; + } + } + break; + } + if (opt < op->ol_val) + i -= mv--; + else + i += mv--; + } + hlen -= ol; + s += ol; + } + if (auth && !(auth & 0x0100)) + auth &= 0xff00; + fi->fi_optmsk = optmsk; + fi->fi_secmsk = secmsk; + fi->fi_auth = auth; +} + + +/* + * check an IP packet for TCP/UDP characteristics such as ports and flags. + */ +int fr_tcpudpchk(ft, fin) +frtuc_t *ft; +fr_info_t *fin; +{ + register u_short po, tup; + register char i; + register int err = 1; + + /* + * Both ports should *always* be in the first fragment. + * So far, I cannot find any cases where they can not be. + * + * compare destination ports + */ + if ((i = (int)ft->ftu_dcmp)) { + po = ft->ftu_dport; + tup = fin->fin_data[1]; + /* + * Do opposite test to that required and + * continue if that succeeds. + */ + if (!--i && tup != po) /* EQUAL */ + err = 0; + else if (!--i && tup == po) /* NOTEQUAL */ + err = 0; + else if (!--i && tup >= po) /* LESSTHAN */ + err = 0; + else if (!--i && tup <= po) /* GREATERTHAN */ + err = 0; + else if (!--i && tup > po) /* LT or EQ */ + err = 0; + else if (!--i && tup < po) /* GT or EQ */ + err = 0; + else if (!--i && /* Out of range */ + (tup >= po && tup <= ft->ftu_dtop)) + err = 0; + else if (!--i && /* In range */ + (tup <= po || tup >= ft->ftu_dtop)) + err = 0; + } + /* + * compare source ports + */ + if (err && (i = (int)ft->ftu_scmp)) { + po = ft->ftu_sport; + tup = fin->fin_data[0]; + if (!--i && tup != po) + err = 0; + else if (!--i && tup == po) + err = 0; + else if (!--i && tup >= po) + err = 0; + else if (!--i && tup <= po) + err = 0; + else if (!--i && tup > po) + err = 0; + else if (!--i && tup < po) + err = 0; + else if (!--i && /* Out of range */ + (tup >= po && tup <= ft->ftu_stop)) + err = 0; + else if (!--i && /* In range */ + (tup <= po || tup >= ft->ftu_stop)) + err = 0; + } + + /* + * If we don't have all the TCP/UDP header, then how can we + * expect to do any sort of match on it ? If we were looking for + * TCP flags, then NO match. If not, then match (which should + * satisfy the "short" class too). + */ + if (err && (fin->fin_fi.fi_p == IPPROTO_TCP)) { + if (fin->fin_fi.fi_fl & FI_SHORT) + return !(ft->ftu_tcpf | ft->ftu_tcpfm); + /* + * Match the flags ? If not, abort this match. + */ + if (ft->ftu_tcpfm && + ft->ftu_tcpf != (fin->fin_tcpf & ft->ftu_tcpfm)) { + FR_DEBUG(("f. %#x & %#x != %#x\n", fin->fin_tcpf, + ft->ftu_tcpfm, ft->ftu_tcpf)); + err = 0; + } + } + return err; +} + +/* + * Check the input/output list of rules for a match and result. + * Could be per interface, but this gets real nasty when you don't have + * kernel sauce. + */ +int fr_scanlist(pass, ip, fin, m) +u_32_t pass; +ip_t *ip; +register fr_info_t *fin; +void *m; +{ + register struct frentry *fr; + register fr_ip_t *fi = &fin->fin_fi; + int rulen, portcmp = 0, off, skip = 0, logged = 0; + u_32_t passt; + + fr = fin->fin_fr; + fin->fin_fr = NULL; + fin->fin_rule = 0; + fin->fin_group = 0; + if (fin->fin_v == 4) + off = ip->ip_off & IP_OFFMASK; + else + off = 0; + pass |= (fi->fi_fl << 24); + + if ((fi->fi_fl & FI_TCPUDP) && (fin->fin_dlen > 3) && !off) + portcmp = 1; + + for (rulen = 0; fr; fr = fr->fr_next, rulen++) { + if (skip) { + skip--; + continue; + } + /* + * In all checks below, a null (zero) value in the + * filter struture is taken to mean a wildcard. + * + * check that we are working for the right interface + */ +#ifdef _KERNEL +# if BSD >= 199306 + if (fin->fin_out != 0) { + if ((fr->fr_oifa && + fr->fr_oifa != ((mb_t *)m)->m_pkthdr.rcvif) || + (fr->fr_ifa && fr->fr_ifa != fin->fin_ifp)) + continue; + } else +# endif + if (fr->fr_ifa && fr->fr_ifa != fin->fin_ifp) + continue; +#else + if (opts & (OPT_VERBOSE|OPT_DEBUG)) + printf("\n"); + FR_VERBOSE(("%c", (pass & FR_PASS) ? 'p' : + (pass & FR_AUTH) ? 'a' : 'b')); + if (fr->fr_ifa && fr->fr_ifa != fin->fin_ifp) + continue; + FR_VERBOSE((":i")); +#endif + { + register u_32_t *ld, *lm, *lip; + register int i; + + lip = (u_32_t *)fi; + lm = (u_32_t *)&fr->fr_mip; + ld = (u_32_t *)&fr->fr_ip; + i = ((*lip & *lm) != *ld); + FR_DEBUG(("0. %#08x & %#08x != %#08x\n", + *lip, *lm, *ld)); + if (i) + continue; + /* + * We now know whether the packet version and the + * rule version match, along with protocol, ttl and + * tos. + */ + lip++, lm++, ld++; + /* + * Unrolled loops (4 each, for 32 bits). + */ + i |= ((*lip & *lm) != *ld) << 19; + FR_DEBUG(("1a. %#08x & %#08x != %#08x\n", + *lip, *lm, *ld)); + if (fi->fi_v == 6) { + lip++, lm++, ld++; + i |= ((*lip & *lm) != *ld) << 19; + FR_DEBUG(("1b. %#08x & %#08x != %#08x\n", + *lip, *lm, *ld)); + lip++, lm++, ld++; + i |= ((*lip & *lm) != *ld) << 19; + FR_DEBUG(("1c. %#08x & %#08x != %#08x\n", + *lip, *lm, *ld)); + lip++, lm++, ld++; + i |= ((*lip & *lm) != *ld) << 19; + FR_DEBUG(("1d. %#08x & %#08x != %#08x\n", + *lip, *lm, *ld)); + } else { + lip += 3; + lm += 3; + ld += 3; + } + i ^= (fr->fr_flags & FR_NOTSRCIP); + if (i) + continue; + lip++, lm++, ld++; + i |= ((*lip & *lm) != *ld) << 20; + FR_DEBUG(("2a. %#08x & %#08x != %#08x\n", + *lip, *lm, *ld)); + if (fi->fi_v == 6) { + lip++, lm++, ld++; + i |= ((*lip & *lm) != *ld) << 20; + FR_DEBUG(("2b. %#08x & %#08x != %#08x\n", + *lip, *lm, *ld)); + lip++, lm++, ld++; + i |= ((*lip & *lm) != *ld) << 20; + FR_DEBUG(("2c. %#08x & %#08x != %#08x\n", + *lip, *lm, *ld)); + lip++, lm++, ld++; + i |= ((*lip & *lm) != *ld) << 20; + FR_DEBUG(("2d. %#08x & %#08x != %#08x\n", + *lip, *lm, *ld)); + } else { + lip += 3; + lm += 3; + ld += 3; + } + i ^= (fr->fr_flags & FR_NOTDSTIP); + if (i) + continue; + lip++, lm++, ld++; + i |= ((*lip & *lm) != *ld); + FR_DEBUG(("3. %#08x & %#08x != %#08x\n", + *lip, *lm, *ld)); + lip++, lm++, ld++; + i |= ((*lip & *lm) != *ld); + FR_DEBUG(("4. %#08x & %#08x != %#08x\n", + *lip, *lm, *ld)); + if (i) + continue; + } + + /* + * If a fragment, then only the first has what we're looking + * for here... + */ + if (!portcmp && (fr->fr_dcmp || fr->fr_scmp || fr->fr_tcpf || + fr->fr_tcpfm)) + continue; + if (fi->fi_fl & FI_TCPUDP) { + if (!fr_tcpudpchk(&fr->fr_tuc, fin)) + continue; + } else if (fr->fr_icmpm || fr->fr_icmp) { + if ((fi->fi_p != IPPROTO_ICMP) || off || + (fin->fin_dlen < 2)) + continue; + if ((fin->fin_data[0] & fr->fr_icmpm) != fr->fr_icmp) { + FR_DEBUG(("i. %#x & %#x != %#x\n", + fin->fin_data[0], fr->fr_icmpm, + fr->fr_icmp)); + continue; + } + } + FR_VERBOSE(("*")); + /* + * Just log this packet... + */ + passt = fr->fr_flags; +#if (BSD >= 199306) && (defined(_KERNEL) || defined(KERNEL)) + if (securelevel <= 0) +#endif + if ((passt & FR_CALLNOW) && fr->fr_func) + passt = (*fr->fr_func)(passt, ip, fin); + fin->fin_fr = fr; +#ifdef IPFILTER_LOG + if ((passt & FR_LOGMASK) == FR_LOG) { + if (!IPLLOG(passt, ip, fin, m)) { + if (passt & FR_LOGORBLOCK) + passt |= FR_BLOCK|FR_QUICK; + ATOMIC_INCL(frstats[fin->fin_out].fr_skip); + } + ATOMIC_INCL(frstats[fin->fin_out].fr_pkl); + logged = 1; + } +#endif /* IPFILTER_LOG */ + if (!(skip = fr->fr_skip) && (passt & FR_LOGMASK) != FR_LOG) + pass = passt; + FR_DEBUG(("pass %#x\n", pass)); + ATOMIC_INCL(fr->fr_hits); + if (pass & FR_ACCOUNT) + fr->fr_bytes += (U_QUAD_T)ip->ip_len; + else + fin->fin_icode = fr->fr_icode; + fin->fin_rule = rulen; + fin->fin_group = fr->fr_group; + if (fr->fr_grp) { + fin->fin_fr = fr->fr_grp; + pass = fr_scanlist(pass, ip, fin, m); + if (fin->fin_fr == NULL) { + fin->fin_rule = rulen; + fin->fin_group = fr->fr_group; + fin->fin_fr = fr; + } + if (pass & FR_DONTCACHE) + logged = 1; + } + if (pass & FR_QUICK) + break; + } + if (logged) + pass |= FR_DONTCACHE; + return pass; +} + + +/* + * frcheck - filter check + * check using source and destination addresses/ports in a packet whether + * or not to pass it on or not. + */ +int fr_check(ip, hlen, ifp, out +#if defined(_KERNEL) && SOLARIS +, qif, mp) +qif_t *qif; +#else +, mp) +#endif +mb_t **mp; +ip_t *ip; +int hlen; +void *ifp; +int out; +{ + /* + * The above really sucks, but short of writing a diff + */ + fr_info_t frinfo, *fc; + register fr_info_t *fin = &frinfo; + int changed, error = EHOSTUNREACH, v = ip->ip_v; + frentry_t *fr = NULL, *list; + u_32_t pass, apass; +#if !SOLARIS || !defined(_KERNEL) + register mb_t *m = *mp; +#endif + +#ifdef _KERNEL + int p, len, drop = 0, logit = 0; + mb_t *mc = NULL; +# if !defined(__SVR4) && !defined(__svr4__) +# ifdef __sgi + char hbuf[(0xf << 2) + sizeof(struct icmp) + sizeof(ip_t) + 8]; +# endif + int up; + +# ifdef M_CANFASTFWD + /* + * XXX For now, IP Filter and fast-forwarding of cached flows + * XXX are mutually exclusive. Eventually, IP Filter should + * XXX get a "can-fast-forward" filter rule. + */ + m->m_flags &= ~M_CANFASTFWD; +# endif /* M_CANFASTFWD */ +# ifdef CSUM_DELAY_DATA + /* + * disable delayed checksums. + */ + if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { + in_delayed_cksum(m); + m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; + } +# endif /* CSUM_DELAY_DATA */ + +# ifdef USE_INET6 + if (v == 6) { + len = ntohs(((ip6_t*)ip)->ip6_plen); + p = ((ip6_t *)ip)->ip6_nxt; + } else +# endif + { + p = ip->ip_p; + len = ip->ip_len; + } + + if ((p == IPPROTO_TCP || p == IPPROTO_UDP || p == IPPROTO_ICMP +# ifdef USE_INET6 + || (v == 6 && p == IPPROTO_ICMPV6) +# endif + )) { + int plen = 0; + + if ((v == 6) || (ip->ip_off & IP_OFFMASK) == 0) + switch(p) + { + case IPPROTO_TCP: + plen = sizeof(tcphdr_t); + break; + case IPPROTO_UDP: + plen = sizeof(udphdr_t); + break; + /* 96 - enough for complete ICMP error IP header */ + case IPPROTO_ICMP: + plen = ICMPERR_MAXPKTLEN - sizeof(ip_t); + break; +# ifdef USE_INET6 + case IPPROTO_ICMPV6 : + /* + * XXX does not take intermediate header + * into account + */ + plen = ICMP6ERR_MINPKTLEN + 8 - sizeof(ip6_t); + break; +# endif + } + up = MIN(hlen + plen, len); + + if (up > m->m_len) { +# ifdef __sgi + /* Under IRIX, avoid m_pullup as it makes ping <hostname> panic */ + if ((up > sizeof(hbuf)) || (m_length(m) < up)) { + ATOMIC_INCL(frstats[out].fr_pull[1]); + return -1; + } + m_copydata(m, 0, up, hbuf); + ATOMIC_INCL(frstats[out].fr_pull[0]); + ip = (ip_t *)hbuf; +# else /* __ sgi */ +# ifndef linux + if ((*mp = m_pullup(m, up)) == 0) { + ATOMIC_INCL(frstats[out].fr_pull[1]); + return -1; + } else { + ATOMIC_INCL(frstats[out].fr_pull[0]); + m = *mp; + ip = mtod(m, ip_t *); + } +# endif /* !linux */ +# endif /* __sgi */ + } else + up = 0; + } else + up = 0; +# endif /* !defined(__SVR4) && !defined(__svr4__) */ +# if SOLARIS + mb_t *m = qif->qf_m; + + if ((u_int)ip & 0x3) + return 2; + fin->fin_qfm = m; + fin->fin_qif = qif; +# endif +#endif /* _KERNEL */ + +#ifndef __FreeBSD__ + /* + * Be careful here: ip_id is in network byte order when called + * from ip_output() + */ + if ((out) && (v == 4)) + ip->ip_id = ntohs(ip->ip_id); +#endif + + changed = 0; + fin->fin_ifp = ifp; + fin->fin_v = v; + fin->fin_out = out; + fin->fin_mp = mp; + fr_makefrip(hlen, ip, fin); + +#ifdef _KERNEL +# ifdef USE_INET6 + if (v == 6) { + ATOMIC_INCL(frstats[0].fr_ipv6[out]); + if (((ip6_t *)ip)->ip6_hlim < fr_minttl) { + ATOMIC_INCL(frstats[0].fr_badttl); + if (fr_minttllog) + logit = -2; + } + } else +# endif + if (!out) { + if (fr_chksrc && !fr_verifysrc(ip->ip_src, ifp)) { + ATOMIC_INCL(frstats[0].fr_badsrc); + if (fr_chksrc == 2) + logit = -2; + } else if (ip->ip_ttl < fr_minttl) { + ATOMIC_INCL(frstats[0].fr_badttl); + if (fr_minttllog) + logit = -3; + } + } + if (drop) { +# ifdef IPFILTER_LOG + if (logit) { + fin->fin_group = logit; + pass = FR_INQUE|FR_NOMATCH|FR_LOGB; + (void) IPLLOG(pass, ip, fin, m); + } +# endif +# if !SOLARIS + m_freem(m); +# endif + return error; + } +#endif + pass = fr_pass; + if (fin->fin_fi.fi_fl & FI_SHORT) { + ATOMIC_INCL(frstats[out].fr_short); + } + + READ_ENTER(&ipf_mutex); + + if (fin->fin_fi.fi_fl & FI_SHORT) + ATOMIC_INCL(frstats[out].fr_short); + + /* + * Check auth now. This, combined with the check below to see if apass + * is 0 is to ensure that we don't count the packet twice, which can + * otherwise occur when we reprocess it. As it is, we only count it + * after it has no auth. table matchup. This also stops NAT from + * occuring until after the packet has been auth'd. + */ + apass = fr_checkauth(ip, fin); + + if (!out) { +#ifdef USE_INET6 + if (v == 6) + list = ipacct6[0][fr_active]; + else +#endif + list = ipacct[0][fr_active]; + changed = ip_natin(ip, fin); + if (!apass && (fin->fin_fr = list) && + (fr_scanlist(FR_NOMATCH, ip, fin, m) & FR_ACCOUNT)) { + ATOMIC_INCL(frstats[0].fr_acct); + } + } + + if (apass || (!(fr = ipfr_knownfrag(ip, fin)) && + !(fr = fr_checkstate(ip, fin)))) { + /* + * If a packet is found in the auth table, then skip checking + * the access lists for permission but we do need to consider + * the result as if it were from the ACL's. + */ + if (!apass) { + fc = frcache + out; + if (!bcmp((char *)fin, (char *)fc, FI_CSIZE)) { + /* + * copy cached data so we can unlock the mutex + * earlier. + */ + bcopy((char *)fc, (char *)fin, FI_COPYSIZE); + ATOMIC_INCL(frstats[out].fr_chit); + if ((fr = fin->fin_fr)) { + ATOMIC_INCL(fr->fr_hits); + pass = fr->fr_flags; + } + } else { +#ifdef USE_INET6 + if (v == 6) + list = ipfilter6[out][fr_active]; + else +#endif + list = ipfilter[out][fr_active]; + if ((fin->fin_fr = list)) + pass = fr_scanlist(fr_pass, ip, fin, m); + if (!(pass & (FR_KEEPSTATE|FR_DONTCACHE))) + bcopy((char *)fin, (char *)fc, + FI_COPYSIZE); + if (pass & FR_NOMATCH) { + ATOMIC_INCL(frstats[out].fr_nom); + } + } + fr = fin->fin_fr; + } else + pass = apass; + + /* + * If we fail to add a packet to the authorization queue, + * then we drop the packet later. However, if it was added + * then pretend we've dropped it already. + */ + if ((pass & FR_AUTH)) + if (fr_newauth((mb_t *)m, fin, ip) != 0) +#ifdef _KERNEL + m = *mp = NULL; +#else + ; +#endif + + if (pass & FR_PREAUTH) { + READ_ENTER(&ipf_auth); + if ((fin->fin_fr = ipauth) && + (pass = fr_scanlist(0, ip, fin, m))) { + ATOMIC_INCL(fr_authstats.fas_hits); + } else { + ATOMIC_INCL(fr_authstats.fas_miss); + } + RWLOCK_EXIT(&ipf_auth); + } + + fin->fin_fr = fr; + if ((pass & (FR_KEEPFRAG|FR_KEEPSTATE)) == FR_KEEPFRAG) { + if (fin->fin_fi.fi_fl & FI_FRAG) { + if (ipfr_newfrag(ip, fin, pass) == -1) { + ATOMIC_INCL(frstats[out].fr_bnfr); + } else { + ATOMIC_INCL(frstats[out].fr_nfr); + } + } else { + ATOMIC_INCL(frstats[out].fr_cfr); + } + } + if (pass & FR_KEEPSTATE) { + if (fr_addstate(ip, fin, 0) == NULL) { + ATOMIC_INCL(frstats[out].fr_bads); + } else { + ATOMIC_INCL(frstats[out].fr_ads); + } + } + } else if (fr != NULL) { + pass = fr->fr_flags; + if (pass & FR_LOGFIRST) + pass &= ~(FR_LOGFIRST|FR_LOG); + } + +#if (BSD >= 199306) && (defined(_KERNEL) || defined(KERNEL)) + if (securelevel <= 0) +#endif + if (fr && fr->fr_func && !(pass & FR_CALLNOW)) + pass = (*fr->fr_func)(pass, ip, fin); + + /* + * Only count/translate packets which will be passed on, out the + * interface. + */ + if (out && (pass & FR_PASS)) { +#ifdef USE_INET6 + if (v == 6) + list = ipacct6[1][fr_active]; + else +#endif + list = ipacct[1][fr_active]; + if ((fin->fin_fr = list) && + (fr_scanlist(FR_NOMATCH, ip, fin, m) & FR_ACCOUNT)) { + ATOMIC_INCL(frstats[1].fr_acct); + } + fin->fin_fr = fr; + changed = ip_natout(ip, fin); + } else + fin->fin_fr = fr; + RWLOCK_EXIT(&ipf_mutex); + +#ifdef IPFILTER_LOG + if ((fr_flags & FF_LOGGING) || (pass & FR_LOGMASK)) { + if ((fr_flags & FF_LOGNOMATCH) && (pass & FR_NOMATCH)) { + pass |= FF_LOGNOMATCH; + ATOMIC_INCL(frstats[out].fr_npkl); + goto logit; + } else if (((pass & FR_LOGMASK) == FR_LOGP) || + ((pass & FR_PASS) && (fr_flags & FF_LOGPASS))) { + if ((pass & FR_LOGMASK) != FR_LOGP) + pass |= FF_LOGPASS; + ATOMIC_INCL(frstats[out].fr_ppkl); + goto logit; + } else if (((pass & FR_LOGMASK) == FR_LOGB) || + ((pass & FR_BLOCK) && (fr_flags & FF_LOGBLOCK))) { + if ((pass & FR_LOGMASK) != FR_LOGB) + pass |= FF_LOGBLOCK; + ATOMIC_INCL(frstats[out].fr_bpkl); +logit: + if (!IPLLOG(pass, ip, fin, m)) { + ATOMIC_INCL(frstats[out].fr_skip); + if ((pass & (FR_PASS|FR_LOGORBLOCK)) == + (FR_PASS|FR_LOGORBLOCK)) + pass ^= FR_PASS|FR_BLOCK; + } + } + } +#endif /* IPFILTER_LOG */ + +#ifndef __FreeBSD__ + if ((out) && (v == 4)) + ip->ip_id = htons(ip->ip_id); +#endif + +#ifdef _KERNEL + /* + * Only allow FR_DUP to work if a rule matched - it makes no sense to + * set FR_DUP as a "default" as there are no instructions about where + * to send the packet. + */ + if (fr && (pass & FR_DUP)) +# if SOLARIS + mc = dupmsg(m); +# else +# ifndef linux + mc = m_copy(m, 0, M_COPYALL); +# else + ; +# endif +# endif +#endif + if (pass & FR_PASS) { + ATOMIC_INCL(frstats[out].fr_pass); + } else if (pass & FR_BLOCK) { + ATOMIC_INCL(frstats[out].fr_block); + /* + * Should we return an ICMP packet to indicate error + * status passing through the packet filter ? + * WARNING: ICMP error packets AND TCP RST packets should + * ONLY be sent in repsonse to incoming packets. Sending them + * in response to outbound packets can result in a panic on + * some operating systems. + */ + if (!out) { +#ifdef _KERNEL + if (pass & FR_RETICMP) { + int dst; + + if ((pass & FR_RETMASK) == FR_FAKEICMP) + dst = 1; + else + dst = 0; + send_icmp_err(ip, ICMP_UNREACH, fin, dst); + ATOMIC_INCL(frstats[0].fr_ret); + } else if (((pass & FR_RETMASK) == FR_RETRST) && + !(fin->fin_fi.fi_fl & FI_SHORT)) { + if (send_reset(ip, fin) == 0) { + ATOMIC_INCL(frstats[1].fr_ret); + } + } +#else + if ((pass & FR_RETMASK) == FR_RETICMP) { + verbose("- ICMP unreachable sent\n"); + ATOMIC_INCL(frstats[0].fr_ret); + } else if ((pass & FR_RETMASK) == FR_FAKEICMP) { + verbose("- forged ICMP unreachable sent\n"); + ATOMIC_INCL(frstats[0].fr_ret); + } else if (((pass & FR_RETMASK) == FR_RETRST) && + !(fin->fin_fi.fi_fl & FI_SHORT)) { + verbose("- TCP RST sent\n"); + ATOMIC_INCL(frstats[1].fr_ret); + } +#endif + } else { + if (pass & FR_RETRST) + error = ECONNRESET; + } + } + + /* + * If we didn't drop off the bottom of the list of rules (and thus + * the 'current' rule fr is not NULL), then we may have some extra + * instructions about what to do with a packet. + * Once we're finished return to our caller, freeing the packet if + * we are dropping it (* BSD ONLY *). + */ + if ((changed == -1) && (pass & FR_PASS)) { + pass &= ~FR_PASS; + pass |= FR_BLOCK; + } +#if defined(_KERNEL) +# if !SOLARIS +# if !defined(linux) + if (fr) { + frdest_t *fdp = &fr->fr_tif; + + if (((pass & FR_FASTROUTE) && !out) || + (fdp->fd_ifp && fdp->fd_ifp != (struct ifnet *)-1)) { + if (ipfr_fastroute(m, fin, fdp) == 0) + m = *mp = NULL; + } + if (mc) + ipfr_fastroute(mc, fin, &fr->fr_dif); + } + if (!(pass & FR_PASS) && m) + m_freem(m); +# ifdef __sgi + else if (changed && up && m) + m_copyback(m, 0, up, hbuf); +# endif +# endif /* !linux */ +# else /* !SOLARIS */ + if (fr) { + frdest_t *fdp = &fr->fr_tif; + + if (((pass & FR_FASTROUTE) && !out) || + (fdp->fd_ifp && fdp->fd_ifp != (struct ifnet *)-1)) { + if (ipfr_fastroute(ip, m, mp, fin, fdp) == 0) + m = *mp = NULL; + } + if (mc) + ipfr_fastroute(ip, mc, mp, fin, &fr->fr_dif); + } +# endif /* !SOLARIS */ + return (pass & FR_PASS) ? 0 : error; +#else /* _KERNEL */ + if (pass & FR_NOMATCH) + return 1; + if (pass & FR_PASS) + return 0; + if (pass & FR_AUTH) + return -2; + return -1; +#endif /* _KERNEL */ +} + + +/* + * ipf_cksum + * addr should be 16bit aligned and len is in bytes. + * length is in bytes + */ +u_short ipf_cksum(addr, len) +register u_short *addr; +register int len; +{ + register u_32_t sum = 0; + + for (sum = 0; len > 1; len -= 2) + sum += *addr++; + + /* mop up an odd byte, if necessary */ + if (len == 1) + sum += *(u_char *)addr; + + /* + * add back carry outs from top 16 bits to low 16 bits + */ + sum = (sum >> 16) + (sum & 0xffff); /* add hi 16 to low 16 */ + sum += (sum >> 16); /* add carry */ + return (u_short)(~sum); +} + + +/* + * NB: This function assumes we've pullup'd enough for all of the IP header + * and the TCP header. We also assume that data blocks aren't allocated in + * odd sizes. + */ +u_short fr_tcpsum(m, ip, tcp) +mb_t *m; +ip_t *ip; +tcphdr_t *tcp; +{ + u_short *sp, slen, ts; + u_int sum, sum2; + int hlen; + + /* + * Add up IP Header portion + */ + hlen = ip->ip_hl << 2; + slen = ip->ip_len - hlen; + sum = htons((u_short)ip->ip_p); + sum += htons(slen); + sp = (u_short *)&ip->ip_src; + sum += *sp++; /* ip_src */ + sum += *sp++; + sum += *sp++; /* ip_dst */ + sum += *sp++; + ts = tcp->th_sum; + tcp->th_sum = 0; +#ifdef KERNEL +# if SOLARIS + sum2 = ip_cksum(m, hlen, sum); /* hlen == offset */ + sum2 = (sum2 & 0xffff) + (sum2 >> 16); + sum2 = ~sum2 & 0xffff; +# else /* SOLARIS */ +# if defined(BSD) || defined(sun) +# if BSD >= 199306 + m->m_data += hlen; +# else + m->m_off += hlen; +# endif + m->m_len -= hlen; + sum2 = in_cksum(m, slen); + m->m_len += hlen; +# if BSD >= 199306 + m->m_data -= hlen; +# else + m->m_off -= hlen; +# endif + /* + * Both sum and sum2 are partial sums, so combine them together. + */ + sum = (sum & 0xffff) + (sum >> 16); + sum = ~sum & 0xffff; + sum2 += sum; + sum2 = (sum2 & 0xffff) + (sum2 >> 16); +# else /* defined(BSD) || defined(sun) */ +{ + union { + u_char c[2]; + u_short s; + } bytes; + u_short len = ip->ip_len; +# if defined(__sgi) + int add; +# endif + + /* + * Add up IP Header portion + */ + sp = (u_short *)&ip->ip_src; + len -= (ip->ip_hl << 2); + sum = ntohs(IPPROTO_TCP); + sum += htons(len); + sum += *sp++; /* ip_src */ + sum += *sp++; + sum += *sp++; /* ip_dst */ + sum += *sp++; + if (sp != (u_short *)tcp) + sp = (u_short *)tcp; + sum += *sp++; /* sport */ + sum += *sp++; /* dport */ + sum += *sp++; /* seq */ + sum += *sp++; + sum += *sp++; /* ack */ + sum += *sp++; + sum += *sp++; /* off */ + sum += *sp++; /* win */ + sum += *sp++; /* Skip over checksum */ + sum += *sp++; /* urp */ + +# ifdef __sgi + /* + * In case we had to copy the IP & TCP header out of mbufs, + * skip over the mbuf bits which are the header + */ + if ((caddr_t)ip != mtod(m, caddr_t)) { + hlen = (caddr_t)sp - (caddr_t)ip; + while (hlen) { + add = MIN(hlen, m->m_len); + sp = (u_short *)(mtod(m, caddr_t) + add); + hlen -= add; + if (add == m->m_len) { + m = m->m_next; + if (!hlen) { + if (!m) + break; + sp = mtod(m, u_short *); + } + PANIC((!m),("fr_tcpsum(1): not enough data")); + } + } + } +# endif + + if (!(len -= sizeof(*tcp))) + goto nodata; + while (len > 1) { + if (((caddr_t)sp - mtod(m, caddr_t)) >= m->m_len) { + m = m->m_next; + PANIC((!m),("fr_tcpsum(2): not enough data")); + sp = mtod(m, u_short *); + } + if (((caddr_t)(sp + 1) - mtod(m, caddr_t)) > m->m_len) { + bytes.c[0] = *(u_char *)sp; + m = m->m_next; + PANIC((!m),("fr_tcpsum(3): not enough data")); + sp = mtod(m, u_short *); + bytes.c[1] = *(u_char *)sp; + sum += bytes.s; + sp = (u_short *)((u_char *)sp + 1); + } + if ((u_long)sp & 1) { + bcopy((char *)sp++, (char *)&bytes.s, sizeof(bytes.s)); + sum += bytes.s; + } else + sum += *sp++; + len -= 2; + } + if (len) + sum += ntohs(*(u_char *)sp << 8); +nodata: + while (sum > 0xffff) + sum = (sum & 0xffff) + (sum >> 16); + sum2 = (u_short)(~sum & 0xffff); +} +# endif /* defined(BSD) || defined(sun) */ +# endif /* SOLARIS */ +#else /* KERNEL */ + sum2 = 0; +#endif /* KERNEL */ + tcp->th_sum = ts; + return sum2; +} + + +#if defined(_KERNEL) && ( ((BSD < 199306) && !SOLARIS) || defined(__sgi) ) +/* + * Copyright (c) 1982, 1986, 1988, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94 + * $Id: fil.c,v 2.35.2.30 2000/12/17 05:49:22 darrenr Exp $ + */ +/* + * Copy data from an mbuf chain starting "off" bytes from the beginning, + * continuing for "len" bytes, into the indicated buffer. + */ +void +m_copydata(m, off, len, cp) + register mb_t *m; + register int off; + register int len; + caddr_t cp; +{ + register unsigned count; + + if (off < 0 || len < 0) + panic("m_copydata"); + while (off > 0) { + if (m == 0) + panic("m_copydata"); + if (off < m->m_len) + break; + off -= m->m_len; + m = m->m_next; + } + while (len > 0) { + if (m == 0) + panic("m_copydata"); + count = MIN(m->m_len - off, len); + bcopy(mtod(m, caddr_t) + off, cp, count); + len -= count; + cp += count; + off = 0; + m = m->m_next; + } +} + + +# ifndef linux +/* + * Copy data from a buffer back into the indicated mbuf chain, + * starting "off" bytes from the beginning, extending the mbuf + * chain if necessary. + */ +void +m_copyback(m0, off, len, cp) + struct mbuf *m0; + register int off; + register int len; + caddr_t cp; +{ + register int mlen; + register struct mbuf *m = m0, *n; + int totlen = 0; + + if (m0 == 0) + return; + while (off > (mlen = m->m_len)) { + off -= mlen; + totlen += mlen; + if (m->m_next == 0) { + n = m_getclr(M_DONTWAIT, m->m_type); + if (n == 0) + goto out; + n->m_len = min(MLEN, len + off); + m->m_next = n; + } + m = m->m_next; + } + while (len > 0) { + mlen = min (m->m_len - off, len); + bcopy(cp, off + mtod(m, caddr_t), (unsigned)mlen); + cp += mlen; + len -= mlen; + mlen += off; + off = 0; + totlen += mlen; + if (len == 0) + break; + if (m->m_next == 0) { + n = m_get(M_DONTWAIT, m->m_type); + if (n == 0) + break; + n->m_len = min(MLEN, len); + m->m_next = n; + } + m = m->m_next; + } +out: +#if 0 + if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) + m->m_pkthdr.len = totlen; +#endif + return; +} +# endif /* linux */ +#endif /* (_KERNEL) && ( ((BSD < 199306) && !SOLARIS) || __sgi) */ + + +frgroup_t *fr_findgroup(num, flags, which, set, fgpp) +u_32_t num, flags; +minor_t which; +int set; +frgroup_t ***fgpp; +{ + frgroup_t *fg, **fgp; + + if (which == IPL_LOGAUTH) + fgp = &ipfgroups[2][set]; + else if (flags & FR_ACCOUNT) + fgp = &ipfgroups[1][set]; + else if (flags & (FR_OUTQUE|FR_INQUE)) + fgp = &ipfgroups[0][set]; + else + return NULL; + num &= 0xffff; + + while ((fg = *fgp)) + if (fg->fg_num == num) + break; + else + fgp = &fg->fg_next; + if (fgpp) + *fgpp = fgp; + return fg; +} + + +frgroup_t *fr_addgroup(num, fp, which, set) +u_32_t num; +frentry_t *fp; +minor_t which; +int set; +{ + frgroup_t *fg, **fgp; + + if ((fg = fr_findgroup(num, fp->fr_flags, which, set, &fgp))) + return fg; + + KMALLOC(fg, frgroup_t *); + if (fg) { + fg->fg_num = num; + fg->fg_next = *fgp; + fg->fg_head = fp; + fg->fg_start = &fp->fr_grp; + *fgp = fg; + } + return fg; +} + + +void fr_delgroup(num, flags, which, set) +u_32_t num, flags; +minor_t which; +int set; +{ + frgroup_t *fg, **fgp; + + if (!(fg = fr_findgroup(num, flags, which, set, &fgp))) + return; + + *fgp = fg->fg_next; + KFREE(fg); +} + + + +/* + * recursively flush rules from the list, descending groups as they are + * encountered. if a rule is the head of a group and it has lost all its + * group members, then also delete the group reference. + */ +static int frflushlist(set, unit, nfreedp, listp) +int set; +minor_t unit; +int *nfreedp; +frentry_t **listp; +{ + register int freed = 0, i; + register frentry_t *fp; + + while ((fp = *listp)) { + *listp = fp->fr_next; + if (fp->fr_grp) { + i = frflushlist(set, unit, nfreedp, &fp->fr_grp); + MUTEX_ENTER(&ipf_rw); + fp->fr_ref -= i; + MUTEX_EXIT(&ipf_rw); + } + + ATOMIC_DEC32(fp->fr_ref); + if (fp->fr_grhead) { + fr_delgroup(fp->fr_grhead, fp->fr_flags, + unit, set); + fp->fr_grhead = 0; + } + if (fp->fr_ref == 0) { + KFREE(fp); + freed++; + } else + fp->fr_next = NULL; + } + *nfreedp += freed; + return freed; +} + + +int frflush(unit, flags) +minor_t unit; +int flags; +{ + int flushed = 0, set; + + if (unit != IPL_LOGIPF) + return 0; + WRITE_ENTER(&ipf_mutex); + bzero((char *)frcache, sizeof(frcache[0]) * 2); + + set = fr_active; + if (flags & FR_INACTIVE) + set = 1 - set; + + if (flags & FR_OUTQUE) { +#ifdef USE_INET6 + (void) frflushlist(set, unit, &flushed, &ipfilter6[1][set]); + (void) frflushlist(set, unit, &flushed, &ipacct6[1][set]); +#endif + (void) frflushlist(set, unit, &flushed, &ipfilter[1][set]); + (void) frflushlist(set, unit, &flushed, &ipacct[1][set]); + } + if (flags & FR_INQUE) { +#ifdef USE_INET6 + (void) frflushlist(set, unit, &flushed, &ipfilter6[0][set]); + (void) frflushlist(set, unit, &flushed, &ipacct6[0][set]); +#endif + (void) frflushlist(set, unit, &flushed, &ipfilter[0][set]); + (void) frflushlist(set, unit, &flushed, &ipacct[0][set]); + } + RWLOCK_EXIT(&ipf_mutex); + return flushed; +} + + +char *memstr(src, dst, slen, dlen) +char *src, *dst; +int slen, dlen; +{ + char *s = NULL; + + while (dlen >= slen) { + if (bcmp(src, dst, slen) == 0) { + s = dst; + break; + } + dst++; + dlen--; + } + return s; +} + + +void fixskip(listp, rp, addremove) +frentry_t **listp, *rp; +int addremove; +{ + frentry_t *fp; + int rules = 0, rn = 0; + + for (fp = *listp; fp && (fp != rp); fp = fp->fr_next, rules++) + ; + + if (!fp) + return; + + for (fp = *listp; fp && (fp != rp); fp = fp->fr_next, rn++) + if (fp->fr_skip && (rn + fp->fr_skip >= rules)) + fp->fr_skip += addremove; +} + + +#ifdef _KERNEL +/* + * count consecutive 1's in bit mask. If the mask generated by counting + * consecutive 1's is different to that passed, return -1, else return # + * of bits. + */ +int countbits(ip) +u_32_t ip; +{ + u_32_t ipn; + int cnt = 0, i, j; + + ip = ipn = ntohl(ip); + for (i = 32; i; i--, ipn *= 2) + if (ipn & 0x80000000) + cnt++; + else + break; + ipn = 0; + for (i = 32, j = cnt; i; i--, j--) { + ipn *= 2; + if (j > 0) + ipn++; + } + if (ipn == ip) + return cnt; + return -1; +} + + +/* + * return the first IP Address associated with an interface + */ +int fr_ifpaddr(v, ifptr, inp) +int v; +void *ifptr; +struct in_addr *inp; +{ +# ifdef USE_INET6 + struct in6_addr *inp6 = NULL; +# endif +# if SOLARIS + ill_t *ill = ifptr; +# else + struct ifnet *ifp = ifptr; +# endif + struct in_addr in; + +# if SOLARIS +# ifdef USE_INET6 + if (v == 6) { + struct in6_addr in6; + + /* + * First is always link local. + */ + if (ill->ill_ipif->ipif_next) + in6 = ill->ill_ipif->ipif_next->ipif_v6lcl_addr; + else + bzero((char *)&in6, sizeof(in6)); + bcopy((char *)&in6, (char *)inp, sizeof(in6)); + } else +# endif + { + in.s_addr = ill->ill_ipif->ipif_local_addr; + *inp = in; + } +# else /* SOLARIS */ +# if linux + ; +# else /* linux */ + struct sockaddr_in *sin; + struct ifaddr *ifa; + +# if (__FreeBSD_version >= 300000) + ifa = TAILQ_FIRST(&ifp->if_addrhead); +# else +# if defined(__NetBSD__) || defined(__OpenBSD__) + ifa = ifp->if_addrlist.tqh_first; +# else +# if defined(__sgi) && defined(IFF_DRVRLOCK) /* IRIX 6 */ + ifa = &((struct in_ifaddr *)ifp->in_ifaddr)->ia_ifa; +# else + ifa = ifp->if_addrlist; +# endif +# endif /* __NetBSD__ || __OpenBSD__ */ +# endif /* __FreeBSD_version >= 300000 */ +# if (BSD < 199306) && !(/*IRIX6*/defined(__sgi) && defined(IFF_DRVRLOCK)) + sin = (struct sockaddr_in *)&ifa->ifa_addr; +# else + sin = (struct sockaddr_in *)ifa->ifa_addr; + while (sin && ifa) { + if ((v == 4) && (sin->sin_family == AF_INET)) + break; +# ifdef USE_INET6 + if ((v == 6) && (sin->sin_family == AF_INET6)) { + inp6 = &((struct sockaddr_in6 *)sin)->sin6_addr; + if (!IN6_IS_ADDR_LINKLOCAL(inp6) && + !IN6_IS_ADDR_LOOPBACK(inp6)) + break; + } +# endif +# if (__FreeBSD_version >= 300000) + ifa = TAILQ_NEXT(ifa, ifa_link); +# else +# if defined(__NetBSD__) || defined(__OpenBSD__) + ifa = ifa->ifa_list.tqe_next; +# else + ifa = ifa->ifa_next; +# endif +# endif /* __FreeBSD_version >= 300000 */ + if (ifa) + sin = (struct sockaddr_in *)ifa->ifa_addr; + } + if (ifa == NULL) + sin = NULL; + if (sin == NULL) + return -1; +# endif /* (BSD < 199306) && (!__sgi && IFF_DRVLOCK) */ +# ifdef USE_INET6 + if (v == 6) + bcopy((char *)inp6, (char *)inp, sizeof(*inp6)); + else +# endif + { + in = sin->sin_addr; + *inp = in; + } +# endif /* linux */ +# endif /* SOLARIS */ + return 0; +} + + +static void frsynclist(fr) +register frentry_t *fr; +{ + for (; fr; fr = fr->fr_next) { + if (fr->fr_ifa != NULL) { + fr->fr_ifa = GETUNIT(fr->fr_ifname, fr->fr_ip.fi_v); + if (fr->fr_ifa == NULL) + fr->fr_ifa = (void *)-1; + } + if (fr->fr_grp) + frsynclist(fr->fr_grp); + } +} + + +void frsync() +{ +# if !SOLARIS + register struct ifnet *ifp; + +# if defined(__OpenBSD__) || ((NetBSD >= 199511) && (NetBSD < 1991011)) || \ + (defined(__FreeBSD_version) && (__FreeBSD_version >= 300000)) +# if (NetBSD >= 199905) || defined(__OpenBSD__) + for (ifp = ifnet.tqh_first; ifp; ifp = ifp->if_list.tqe_next) +# else + for (ifp = ifnet.tqh_first; ifp; ifp = ifp->if_link.tqe_next) +# endif +# else + for (ifp = ifnet; ifp; ifp = ifp->if_next) +# endif + { + ip_natsync(ifp); + ip_statesync(ifp); + } + ip_natsync((struct ifnet *)-1); +# endif + + WRITE_ENTER(&ipf_mutex); + frsynclist(ipacct[0][fr_active]); + frsynclist(ipacct[1][fr_active]); + frsynclist(ipfilter[0][fr_active]); + frsynclist(ipfilter[1][fr_active]); +#ifdef USE_INET6 + frsynclist(ipacct6[0][fr_active]); + frsynclist(ipacct6[1][fr_active]); + frsynclist(ipfilter6[0][fr_active]); + frsynclist(ipfilter6[1][fr_active]); +#endif + RWLOCK_EXIT(&ipf_mutex); +} + + +/* + * In the functions below, bcopy() is called because the pointer being + * copied _from_ in this instance is a pointer to a char buf (which could + * end up being unaligned) and on the kernel's local stack. + */ +int ircopyptr(a, b, c) +void *a, *b; +size_t c; +{ + caddr_t ca; + int err; + +#if SOLARIS + if (copyin(a, (char *)&ca, sizeof(ca))) + return EFAULT; +#else + bcopy(a, &ca, sizeof(ca)); +#endif + err = copyin(ca, b, c); + if (err) + err = EFAULT; + return err; +} + + +int iwcopyptr(a, b, c) +void *a, *b; +size_t c; +{ + caddr_t ca; + int err; + +#if SOLARIS + if (copyin(b, (char *)&ca, sizeof(ca))) + return EFAULT; +#else + bcopy(b, &ca, sizeof(ca)); +#endif + err = copyout(a, ca, c); + if (err) + err = EFAULT; + return err; +} + +#else /* _KERNEL */ + + +/* + * return the first IP Address associated with an interface + */ +int fr_ifpaddr(v, ifptr, inp) +int v; +void *ifptr; +struct in_addr *inp; +{ + return 0; +} + + +int ircopyptr(a, b, c) +void *a, *b; +size_t c; +{ + caddr_t ca; + + bcopy(a, &ca, sizeof(ca)); + bcopy(ca, b, c); + return 0; +} + + +int iwcopyptr(a, b, c) +void *a, *b; +size_t c; +{ + caddr_t ca; + + bcopy(b, &ca, sizeof(ca)); + bcopy(a, ca, c); + return 0; +} + + +#endif + + +int fr_lock(data, lockp) +caddr_t data; +int *lockp; +{ + int arg, error; + + error = IRCOPY(data, (caddr_t)&arg, sizeof(arg)); + if (!error) { + error = IWCOPY((caddr_t)lockp, data, sizeof(*lockp)); + if (!error) + *lockp = arg; + } + return error; +} + + +void fr_getstat(fiop) +friostat_t *fiop; +{ + bcopy((char *)frstats, (char *)fiop->f_st, sizeof(filterstats_t) * 2); + fiop->f_locks[0] = fr_state_lock; + fiop->f_locks[1] = fr_nat_lock; + fiop->f_locks[2] = fr_frag_lock; + fiop->f_locks[3] = fr_auth_lock; + fiop->f_fin[0] = ipfilter[0][0]; + fiop->f_fin[1] = ipfilter[0][1]; + fiop->f_fout[0] = ipfilter[1][0]; + fiop->f_fout[1] = ipfilter[1][1]; + fiop->f_acctin[0] = ipacct[0][0]; + fiop->f_acctin[1] = ipacct[0][1]; + fiop->f_acctout[0] = ipacct[1][0]; + fiop->f_acctout[1] = ipacct[1][1]; +#ifdef USE_INET6 + fiop->f_fin6[0] = ipfilter6[0][0]; + fiop->f_fin6[1] = ipfilter6[0][1]; + fiop->f_fout6[0] = ipfilter6[1][0]; + fiop->f_fout6[1] = ipfilter6[1][1]; + fiop->f_acctin6[0] = ipacct6[0][0]; + fiop->f_acctin6[1] = ipacct6[0][1]; + fiop->f_acctout6[0] = ipacct6[1][0]; + fiop->f_acctout6[1] = ipacct6[1][1]; +#else + fiop->f_fin6[0] = NULL; + fiop->f_fin6[1] = NULL; + fiop->f_fout6[0] = NULL; + fiop->f_fout6[1] = NULL; + fiop->f_acctin6[0] = NULL; + fiop->f_acctin6[1] = NULL; + fiop->f_acctout6[0] = NULL; + fiop->f_acctout6[1] = NULL; +#endif + fiop->f_active = fr_active; + fiop->f_froute[0] = ipl_frouteok[0]; + fiop->f_froute[1] = ipl_frouteok[1]; + + fiop->f_running = fr_running; + fiop->f_groups[0][0] = ipfgroups[0][0]; + fiop->f_groups[0][1] = ipfgroups[0][1]; + fiop->f_groups[1][0] = ipfgroups[1][0]; + fiop->f_groups[1][1] = ipfgroups[1][1]; + fiop->f_groups[2][0] = ipfgroups[2][0]; + fiop->f_groups[2][1] = ipfgroups[2][1]; +#ifdef IPFILTER_LOG + fiop->f_logging = 1; +#else + fiop->f_logging = 0; +#endif + fiop->f_defpass = fr_pass; + strncpy(fiop->f_version, ipfilter_version, sizeof(fiop->f_version)); +} + + +#ifdef USE_INET6 +int icmptoicmp6types[ICMP_MAXTYPE+1] = { + ICMP6_ECHO_REPLY, /* 0: ICMP_ECHOREPLY */ + -1, /* 1: UNUSED */ + -1, /* 2: UNUSED */ + ICMP6_DST_UNREACH, /* 3: ICMP_UNREACH */ + -1, /* 4: ICMP_SOURCEQUENCH */ + ND_REDIRECT, /* 5: ICMP_REDIRECT */ + -1, /* 6: UNUSED */ + -1, /* 7: UNUSED */ + ICMP6_ECHO_REQUEST, /* 8: ICMP_ECHO */ + -1, /* 9: UNUSED */ + -1, /* 10: UNUSED */ + ICMP6_TIME_EXCEEDED, /* 11: ICMP_TIMXCEED */ + ICMP6_PARAM_PROB, /* 12: ICMP_PARAMPROB */ + -1, /* 13: ICMP_TSTAMP */ + -1, /* 14: ICMP_TSTAMPREPLY */ + -1, /* 15: ICMP_IREQ */ + -1, /* 16: ICMP_IREQREPLY */ + -1, /* 17: ICMP_MASKREQ */ + -1, /* 18: ICMP_MASKREPLY */ +}; + + +int icmptoicmp6unreach[ICMP_MAX_UNREACH] = { + ICMP6_DST_UNREACH_ADDR, /* 0: ICMP_UNREACH_NET */ + ICMP6_DST_UNREACH_ADDR, /* 1: ICMP_UNREACH_HOST */ + -1, /* 2: ICMP_UNREACH_PROTOCOL */ + ICMP6_DST_UNREACH_NOPORT, /* 3: ICMP_UNREACH_PORT */ + -1, /* 4: ICMP_UNREACH_NEEDFRAG */ + ICMP6_DST_UNREACH_NOTNEIGHBOR, /* 5: ICMP_UNREACH_SRCFAIL */ + ICMP6_DST_UNREACH_ADDR, /* 6: ICMP_UNREACH_NET_UNKNOWN */ + ICMP6_DST_UNREACH_ADDR, /* 7: ICMP_UNREACH_HOST_UNKNOWN */ + -1, /* 8: ICMP_UNREACH_ISOLATED */ + ICMP6_DST_UNREACH_ADMIN, /* 9: ICMP_UNREACH_NET_PROHIB */ + ICMP6_DST_UNREACH_ADMIN, /* 10: ICMP_UNREACH_HOST_PROHIB */ + -1, /* 11: ICMP_UNREACH_TOSNET */ + -1, /* 12: ICMP_UNREACH_TOSHOST */ + ICMP6_DST_UNREACH_ADMIN, /* 13: ICMP_UNREACH_ADMIN_PROHIBIT */ +}; +#endif diff --git a/sys/netinet/icmp6.h b/sys/netinet/icmp6.h new file mode 100644 index 0000000..3625ee4 --- /dev/null +++ b/sys/netinet/icmp6.h @@ -0,0 +1,661 @@ +/* $FreeBSD$ */ +/* $KAME: icmp6.h,v 1.18 2000/07/03 02:51:08 itojun Exp $ */ + +/* + * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ip_icmp.h 8.1 (Berkeley) 6/10/93 + */ + +#ifndef _NETINET_ICMP6_H_ +#define _NETINET_ICMP6_H_ + +#define ICMPV6_PLD_MAXLEN 1232 /* IPV6_MMTU - sizeof(struct ip6_hdr) + - sizeof(struct icmp6_hdr) */ + +struct icmp6_hdr { + u_int8_t icmp6_type; /* type field */ + u_int8_t icmp6_code; /* code field */ + u_int16_t icmp6_cksum; /* checksum field */ + union { + u_int32_t icmp6_un_data32[1]; /* type-specific field */ + u_int16_t icmp6_un_data16[2]; /* type-specific field */ + u_int8_t icmp6_un_data8[4]; /* type-specific field */ + } icmp6_dataun; +}; + +#define icmp6_data32 icmp6_dataun.icmp6_un_data32 +#define icmp6_data16 icmp6_dataun.icmp6_un_data16 +#define icmp6_data8 icmp6_dataun.icmp6_un_data8 +#define icmp6_pptr icmp6_data32[0] /* parameter prob */ +#define icmp6_mtu icmp6_data32[0] /* packet too big */ +#define icmp6_id icmp6_data16[0] /* echo request/reply */ +#define icmp6_seq icmp6_data16[1] /* echo request/reply */ +#define icmp6_maxdelay icmp6_data16[0] /* mcast group membership */ + +#define ICMP6_DST_UNREACH 1 /* dest unreachable, codes: */ +#define ICMP6_PACKET_TOO_BIG 2 /* packet too big */ +#define ICMP6_TIME_EXCEEDED 3 /* time exceeded, code: */ +#define ICMP6_PARAM_PROB 4 /* ip6 header bad */ + +#define ICMP6_ECHO_REQUEST 128 /* echo service */ +#define ICMP6_ECHO_REPLY 129 /* echo reply */ +#define ICMP6_MEMBERSHIP_QUERY 130 /* group membership query */ +#define MLD6_LISTENER_QUERY 130 /* multicast listener query */ +#define ICMP6_MEMBERSHIP_REPORT 131 /* group membership report */ +#define MLD6_LISTENER_REPORT 131 /* multicast listener report */ +#define ICMP6_MEMBERSHIP_REDUCTION 132 /* group membership termination */ +#define MLD6_LISTENER_DONE 132 /* multicast listener done */ + +#define ND_ROUTER_SOLICIT 133 /* router solicitation */ +#define ND_ROUTER_ADVERT 134 /* router advertisment */ +#define ND_NEIGHBOR_SOLICIT 135 /* neighbor solicitation */ +#define ND_NEIGHBOR_ADVERT 136 /* neighbor advertisment */ +#define ND_REDIRECT 137 /* redirect */ + +#define ICMP6_ROUTER_RENUMBERING 138 /* router renumbering */ + +#define ICMP6_WRUREQUEST 139 /* who are you request */ +#define ICMP6_WRUREPLY 140 /* who are you reply */ +#define ICMP6_FQDN_QUERY 139 /* FQDN query */ +#define ICMP6_FQDN_REPLY 140 /* FQDN reply */ +#define ICMP6_NI_QUERY 139 /* node information request */ +#define ICMP6_NI_REPLY 140 /* node information reply */ + +/* The definitions below are experimental. TBA */ +#define MLD6_MTRACE_RESP 141 /* mtrace response(to sender) */ +#define MLD6_MTRACE 142 /* mtrace messages */ + +#define ICMP6_MAXTYPE 142 + +#define ICMP6_DST_UNREACH_NOROUTE 0 /* no route to destination */ +#define ICMP6_DST_UNREACH_ADMIN 1 /* administratively prohibited */ +#define ICMP6_DST_UNREACH_NOTNEIGHBOR 2 /* not a neighbor(obsolete) */ +#define ICMP6_DST_UNREACH_BEYONDSCOPE 2 /* beyond scope of source address */ +#define ICMP6_DST_UNREACH_ADDR 3 /* address unreachable */ +#define ICMP6_DST_UNREACH_NOPORT 4 /* port unreachable */ + +#define ICMP6_TIME_EXCEED_TRANSIT 0 /* ttl==0 in transit */ +#define ICMP6_TIME_EXCEED_REASSEMBLY 1 /* ttl==0 in reass */ + +#define ICMP6_PARAMPROB_HEADER 0 /* erroneous header field */ +#define ICMP6_PARAMPROB_NEXTHEADER 1 /* unrecognized next header */ +#define ICMP6_PARAMPROB_OPTION 2 /* unrecognized option */ + +#define ICMP6_INFOMSG_MASK 0x80 /* all informational messages */ + +#define ICMP6_NI_SUBJ_IPV6 0 /* Query Subject is an IPv6 address */ +#define ICMP6_NI_SUBJ_FQDN 1 /* Query Subject is a Domain name */ +#define ICMP6_NI_SUBJ_IPV4 2 /* Query Subject is an IPv4 address */ + +#define ICMP6_NI_SUCESS 0 /* node information successful reply */ +#define ICMP6_NI_REFUSED 1 /* node information request is refused */ +#define ICMP6_NI_UNKNOWN 2 /* unknown Qtype */ + +#define ICMP6_ROUTER_RENUMBERING_COMMAND 0 /* rr command */ +#define ICMP6_ROUTER_RENUMBERING_RESULT 1 /* rr result */ +#define ICMP6_ROUTER_RENUMBERING_SEQNUM_RESET 255 /* rr seq num reset */ + +/* Used in kernel only */ +#define ND_REDIRECT_ONLINK 0 /* redirect to an on-link node */ +#define ND_REDIRECT_ROUTER 1 /* redirect to a better router */ + +/* + * Multicast Listener Discovery + */ +struct mld6_hdr { + struct icmp6_hdr mld6_hdr; + struct in6_addr mld6_addr; /* multicast address */ +}; + +#define mld6_type mld6_hdr.icmp6_type +#define mld6_code mld6_hdr.icmp6_code +#define mld6_cksum mld6_hdr.icmp6_cksum +#define mld6_maxdelay mld6_hdr.icmp6_data16[0] +#define mld6_reserved mld6_hdr.icmp6_data16[1] + +/* + * Neighbor Discovery + */ + +struct nd_router_solicit { /* router solicitation */ + struct icmp6_hdr nd_rs_hdr; + /* could be followed by options */ +}; + +#define nd_rs_type nd_rs_hdr.icmp6_type +#define nd_rs_code nd_rs_hdr.icmp6_code +#define nd_rs_cksum nd_rs_hdr.icmp6_cksum +#define nd_rs_reserved nd_rs_hdr.icmp6_data32[0] + +struct nd_router_advert { /* router advertisement */ + struct icmp6_hdr nd_ra_hdr; + u_int32_t nd_ra_reachable; /* reachable time */ + u_int32_t nd_ra_retransmit; /* retransmit timer */ + /* could be followed by options */ +}; + +#define nd_ra_type nd_ra_hdr.icmp6_type +#define nd_ra_code nd_ra_hdr.icmp6_code +#define nd_ra_cksum nd_ra_hdr.icmp6_cksum +#define nd_ra_curhoplimit nd_ra_hdr.icmp6_data8[0] +#define nd_ra_flags_reserved nd_ra_hdr.icmp6_data8[1] +#define ND_RA_FLAG_MANAGED 0x80 +#define ND_RA_FLAG_OTHER 0x40 +#define nd_ra_router_lifetime nd_ra_hdr.icmp6_data16[1] + +struct nd_neighbor_solicit { /* neighbor solicitation */ + struct icmp6_hdr nd_ns_hdr; + struct in6_addr nd_ns_target; /*target address */ + /* could be followed by options */ +}; + +#define nd_ns_type nd_ns_hdr.icmp6_type +#define nd_ns_code nd_ns_hdr.icmp6_code +#define nd_ns_cksum nd_ns_hdr.icmp6_cksum +#define nd_ns_reserved nd_ns_hdr.icmp6_data32[0] + +struct nd_neighbor_advert { /* neighbor advertisement */ + struct icmp6_hdr nd_na_hdr; + struct in6_addr nd_na_target; /* target address */ + /* could be followed by options */ +}; + +#define nd_na_type nd_na_hdr.icmp6_type +#define nd_na_code nd_na_hdr.icmp6_code +#define nd_na_cksum nd_na_hdr.icmp6_cksum +#define nd_na_flags_reserved nd_na_hdr.icmp6_data32[0] +#if BYTE_ORDER == BIG_ENDIAN +#define ND_NA_FLAG_ROUTER 0x80000000 +#define ND_NA_FLAG_SOLICITED 0x40000000 +#define ND_NA_FLAG_OVERRIDE 0x20000000 +#else +#if BYTE_ORDER == LITTLE_ENDIAN +#define ND_NA_FLAG_ROUTER 0x80 +#define ND_NA_FLAG_SOLICITED 0x40 +#define ND_NA_FLAG_OVERRIDE 0x20 +#endif +#endif + +struct nd_redirect { /* redirect */ + struct icmp6_hdr nd_rd_hdr; + struct in6_addr nd_rd_target; /* target address */ + struct in6_addr nd_rd_dst; /* destination address */ + /* could be followed by options */ +}; + +#define nd_rd_type nd_rd_hdr.icmp6_type +#define nd_rd_code nd_rd_hdr.icmp6_code +#define nd_rd_cksum nd_rd_hdr.icmp6_cksum +#define nd_rd_reserved nd_rd_hdr.icmp6_data32[0] + +struct nd_opt_hdr { /* Neighbor discovery option header */ + u_int8_t nd_opt_type; + u_int8_t nd_opt_len; + /* followed by option specific data*/ +}; + +#define ND_OPT_SOURCE_LINKADDR 1 +#define ND_OPT_TARGET_LINKADDR 2 +#define ND_OPT_PREFIX_INFORMATION 3 +#define ND_OPT_REDIRECTED_HEADER 4 +#define ND_OPT_MTU 5 + +struct nd_opt_prefix_info { /* prefix information */ + u_int8_t nd_opt_pi_type; + u_int8_t nd_opt_pi_len; + u_int8_t nd_opt_pi_prefix_len; + u_int8_t nd_opt_pi_flags_reserved; + u_int32_t nd_opt_pi_valid_time; + u_int32_t nd_opt_pi_preferred_time; + u_int32_t nd_opt_pi_reserved2; + struct in6_addr nd_opt_pi_prefix; +}; + +#define ND_OPT_PI_FLAG_ONLINK 0x80 +#define ND_OPT_PI_FLAG_AUTO 0x40 + +struct nd_opt_rd_hdr { /* redirected header */ + u_int8_t nd_opt_rh_type; + u_int8_t nd_opt_rh_len; + u_int16_t nd_opt_rh_reserved1; + u_int32_t nd_opt_rh_reserved2; + /* followed by IP header and data */ +}; + +struct nd_opt_mtu { /* MTU option */ + u_int8_t nd_opt_mtu_type; + u_int8_t nd_opt_mtu_len; + u_int16_t nd_opt_mtu_reserved; + u_int32_t nd_opt_mtu_mtu; +}; + +/* + * icmp6 namelookup + */ + +struct icmp6_namelookup { + struct icmp6_hdr icmp6_nl_hdr; + u_int8_t icmp6_nl_nonce[8]; + int32_t icmp6_nl_ttl; +#if 0 + u_int8_t icmp6_nl_len; + u_int8_t icmp6_nl_name[3]; +#endif + /* could be followed by options */ +}; + +/* + * icmp6 node information + */ +struct icmp6_nodeinfo { + struct icmp6_hdr icmp6_ni_hdr; + u_int8_t icmp6_ni_nonce[8]; + /* could be followed by reply data */ +}; + +#define ni_type icmp6_ni_hdr.icmp6_type +#define ni_code icmp6_ni_hdr.icmp6_code +#define ni_cksum icmp6_ni_hdr.icmp6_cksum +#define ni_qtype icmp6_ni_hdr.icmp6_data16[0] +#define ni_flags icmp6_ni_hdr.icmp6_data16[1] + +#define NI_QTYPE_NOOP 0 /* NOOP */ +#define NI_QTYPE_SUPTYPES 1 /* Supported Qtypes */ +#define NI_QTYPE_FQDN 2 /* FQDN */ +#define NI_QTYPE_NODEADDR 3 /* Node Addresses. XXX: spec says 2, but it may be a typo... */ + +#if BYTE_ORDER == BIG_ENDIAN +#define NI_SUPTYPE_FLAG_COMPRESS 0x1 +#define NI_FQDN_FLAG_VALIDTTL 0x1 +#elif BYTE_ORDER == LITTLE_ENDIAN +#define NI_SUPTYPE_FLAG_COMPRESS 0x0100 +#define NI_FQDN_FLAG_VALIDTTL 0x0100 +#endif + +#ifdef NAME_LOOKUPS_04 +#if BYTE_ORDER == BIG_ENDIAN +#define NI_NODEADDR_FLAG_LINKLOCAL 0x1 +#define NI_NODEADDR_FLAG_SITELOCAL 0x2 +#define NI_NODEADDR_FLAG_GLOBAL 0x4 +#define NI_NODEADDR_FLAG_ALL 0x8 +#define NI_NODEADDR_FLAG_TRUNCATE 0x10 +#define NI_NODEADDR_FLAG_ANYCAST 0x20 /* just experimental. not in spec */ +#elif BYTE_ORDER == LITTLE_ENDIAN +#define NI_NODEADDR_FLAG_LINKLOCAL 0x0100 +#define NI_NODEADDR_FLAG_SITELOCAL 0x0200 +#define NI_NODEADDR_FLAG_GLOBAL 0x0400 +#define NI_NODEADDR_FLAG_ALL 0x0800 +#define NI_NODEADDR_FLAG_TRUNCATE 0x1000 +#define NI_NODEADDR_FLAG_ANYCAST 0x2000 /* just experimental. not in spec */ +#endif +#else /* draft-ietf-ipngwg-icmp-name-lookups-05 (and later?) */ +#if BYTE_ORDER == BIG_ENDIAN +#define NI_NODEADDR_FLAG_TRUNCATE 0x1 +#define NI_NODEADDR_FLAG_ALL 0x2 +#define NI_NODEADDR_FLAG_COMPAT 0x4 +#define NI_NODEADDR_FLAG_LINKLOCAL 0x8 +#define NI_NODEADDR_FLAG_SITELOCAL 0x10 +#define NI_NODEADDR_FLAG_GLOBAL 0x20 +#define NI_NODEADDR_FLAG_ANYCAST 0x40 /* just experimental. not in spec */ +#elif BYTE_ORDER == LITTLE_ENDIAN +#define NI_NODEADDR_FLAG_TRUNCATE 0x0100 +#define NI_NODEADDR_FLAG_ALL 0x0200 +#define NI_NODEADDR_FLAG_COMPAT 0x0400 +#define NI_NODEADDR_FLAG_LINKLOCAL 0x0800 +#define NI_NODEADDR_FLAG_SITELOCAL 0x1000 +#define NI_NODEADDR_FLAG_GLOBAL 0x2000 +#define NI_NODEADDR_FLAG_ANYCAST 0x4000 /* just experimental. not in spec */ +#endif +#endif + +struct ni_reply_fqdn { + u_int32_t ni_fqdn_ttl; /* TTL */ + u_int8_t ni_fqdn_namelen; /* length in octets of the FQDN */ + u_int8_t ni_fqdn_name[3]; /* XXX: alignment */ +}; + +/* + * Router Renumbering. as router-renum-08.txt + */ +struct icmp6_router_renum { /* router renumbering header */ + struct icmp6_hdr rr_hdr; + u_int8_t rr_segnum; + u_int8_t rr_flags; + u_int16_t rr_maxdelay; + u_int32_t rr_reserved; +}; +#define ICMP6_RR_FLAGS_SEGNUM 0x80 +#define ICMP6_RR_FLAGS_TEST 0x40 +#define ICMP6_RR_FLAGS_REQRESULT 0x20 +#define ICMP6_RR_FLAGS_FORCEAPPLY 0x10 +#define ICMP6_RR_FLAGS_SPECSITE 0x08 +#define ICMP6_RR_FLAGS_PREVDONE 0x04 + +#define rr_type rr_hdr.icmp6_type +#define rr_code rr_hdr.icmp6_code +#define rr_cksum rr_hdr.icmp6_cksum +#define rr_seqnum rr_hdr.icmp6_data32[0] + +struct rr_pco_match { /* match prefix part */ + u_int8_t rpm_code; + u_int8_t rpm_len; + u_int8_t rpm_ordinal; + u_int8_t rpm_matchlen; + u_int8_t rpm_minlen; + u_int8_t rpm_maxlen; + u_int16_t rpm_reserved; + struct in6_addr rpm_prefix; +}; + +#define RPM_PCO_ADD 1 +#define RPM_PCO_CHANGE 2 +#define RPM_PCO_SETGLOBAL 3 +#define RPM_PCO_MAX 4 + +struct rr_pco_use { /* use prefix part */ + u_int8_t rpu_uselen; + u_int8_t rpu_keeplen; + u_int8_t rpu_ramask; + u_int8_t rpu_raflags; + u_int32_t rpu_vltime; + u_int32_t rpu_pltime; + u_int32_t rpu_flags; + struct in6_addr rpu_prefix; +}; +#define ICMP6_RR_PCOUSE_RAFLAGS_ONLINK 0x80 +#define ICMP6_RR_PCOUSE_RAFLAGS_AUTO 0x40 + +#if BYTE_ORDER == BIG_ENDIAN +#define ICMP6_RR_PCOUSE_FLAGS_DECRVLTIME 0x80000000 +#define ICMP6_RR_PCOUSE_FLAGS_DECRPLTIME 0x40000000 +#elif BYTE_ORDER == LITTLE_ENDIAN +#define ICMP6_RR_PCOUSE_FLAGS_DECRVLTIME 0x80 +#define ICMP6_RR_PCOUSE_FLAGS_DECRPLTIME 0x40 +#endif + +struct rr_result { /* router renumbering result message */ + u_int16_t rrr_flags; + u_int8_t rrr_ordinal; + u_int8_t rrr_matchedlen; + u_int32_t rrr_ifid; + struct in6_addr rrr_prefix; +}; +#if BYTE_ORDER == BIG_ENDIAN +#define ICMP6_RR_RESULT_FLAGS_OOB 0x0002 +#define ICMP6_RR_RESULT_FLAGS_FORBIDDEN 0x0001 +#elif BYTE_ORDER == LITTLE_ENDIAN +#define ICMP6_RR_RESULT_FLAGS_OOB 0x02 +#define ICMP6_RR_RESULT_FLAGS_FORBIDDEN 0x01 +#endif + +/* + * icmp6 filter structures. + */ + +struct icmp6_filter { + u_int32_t icmp6_filt[8]; +}; + +#ifdef _KERNEL +#define ICMP6_FILTER_SETPASSALL(filterp) \ +do { \ + int i; u_char *p; \ + p = (u_char *)filterp; \ + for (i = 0; i < sizeof(struct icmp6_filter); i++) \ + p[i] = 0xff; \ +} while (0) +#define ICMP6_FILTER_SETBLOCKALL(filterp) \ + bzero(filterp, sizeof(struct icmp6_filter)) +#else /* _KERNEL */ +#define ICMP6_FILTER_SETPASSALL(filterp) \ + memset(filterp, 0xff, sizeof(struct icmp6_filter)) +#define ICMP6_FILTER_SETBLOCKALL(filterp) \ + memset(filterp, 0x00, sizeof(struct icmp6_filter)) +#endif /* _KERNEL */ + +#define ICMP6_FILTER_SETPASS(type, filterp) \ + (((filterp)->icmp6_filt[(type) >> 5]) |= (1 << ((type) & 31))) +#define ICMP6_FILTER_SETBLOCK(type, filterp) \ + (((filterp)->icmp6_filt[(type) >> 5]) &= ~(1 << ((type) & 31))) +#define ICMP6_FILTER_WILLPASS(type, filterp) \ + ((((filterp)->icmp6_filt[(type) >> 5]) & (1 << ((type) & 31))) != 0) +#define ICMP6_FILTER_WILLBLOCK(type, filterp) \ + ((((filterp)->icmp6_filt[(type) >> 5]) & (1 << ((type) & 31))) == 0) + +/* + * Variables related to this implementation + * of the internet control message protocol version 6. + */ +struct icmp6errstat { + u_quad_t icp6errs_dst_unreach_noroute; + u_quad_t icp6errs_dst_unreach_admin; + u_quad_t icp6errs_dst_unreach_beyondscope; + u_quad_t icp6errs_dst_unreach_addr; + u_quad_t icp6errs_dst_unreach_noport; + u_quad_t icp6errs_packet_too_big; + u_quad_t icp6errs_time_exceed_transit; + u_quad_t icp6errs_time_exceed_reassembly; + u_quad_t icp6errs_paramprob_header; + u_quad_t icp6errs_paramprob_nextheader; + u_quad_t icp6errs_paramprob_option; + u_quad_t icp6errs_redirect; /* we regard redirect as an error here */ + u_quad_t icp6errs_unknown; +}; + +struct icmp6stat { +/* statistics related to icmp6 packets generated */ + u_quad_t icp6s_error; /* # of calls to icmp6_error */ + u_quad_t icp6s_canterror; /* no error 'cuz old was icmp */ + u_quad_t icp6s_toofreq; /* no error 'cuz rate limitation */ + u_quad_t icp6s_outhist[256]; +/* statistics related to input message processed */ + u_quad_t icp6s_badcode; /* icmp6_code out of range */ + u_quad_t icp6s_tooshort; /* packet < sizeof(struct icmp6_hdr) */ + u_quad_t icp6s_checksum; /* bad checksum */ + u_quad_t icp6s_badlen; /* calculated bound mismatch */ + u_quad_t icp6s_reflect; /* number of responses */ + u_quad_t icp6s_inhist[256]; + u_quad_t icp6s_nd_toomanyopt; /* too many ND options */ + struct icmp6errstat icp6s_outerrhist; +#define icp6s_odst_unreach_noroute \ + icp6s_outerrhist.icp6errs_dst_unreach_noroute +#define icp6s_odst_unreach_admin icp6s_outerrhist.icp6errs_dst_unreach_admin +#define icp6s_odst_unreach_beyondscope \ + icp6s_outerrhist.icp6errs_dst_unreach_beyondscope +#define icp6s_odst_unreach_addr icp6s_outerrhist.icp6errs_dst_unreach_addr +#define icp6s_odst_unreach_noport icp6s_outerrhist.icp6errs_dst_unreach_noport +#define icp6s_opacket_too_big icp6s_outerrhist.icp6errs_packet_too_big +#define icp6s_otime_exceed_transit \ + icp6s_outerrhist.icp6errs_time_exceed_transit +#define icp6s_otime_exceed_reassembly \ + icp6s_outerrhist.icp6errs_time_exceed_reassembly +#define icp6s_oparamprob_header icp6s_outerrhist.icp6errs_paramprob_header +#define icp6s_oparamprob_nextheader \ + icp6s_outerrhist.icp6errs_paramprob_nextheader +#define icp6s_oparamprob_option icp6s_outerrhist.icp6errs_paramprob_option +#define icp6s_oredirect icp6s_outerrhist.icp6errs_redirect +#define icp6s_ounknown icp6s_outerrhist.icp6errs_unknown +}; + +/* + * Names for ICMP sysctl objects + */ +#define ICMPV6CTL_STATS 1 +#define ICMPV6CTL_REDIRACCEPT 2 /* accept/process redirects */ +#define ICMPV6CTL_REDIRTIMEOUT 3 /* redirect cache time */ +#define ICMPV6CTL_ERRRATELIMIT 5 /* ICMPv6 error rate limitation */ +#define ICMPV6CTL_ND6_PRUNE 6 +#define ICMPV6CTL_ND6_DELAY 8 +#define ICMPV6CTL_ND6_UMAXTRIES 9 +#define ICMPV6CTL_ND6_MMAXTRIES 10 +#define ICMPV6CTL_ND6_USELOOPBACK 11 +/*#define ICMPV6CTL_ND6_PROXYALL 12 obsoleted, do not reuse here */ +#define ICMPV6CTL_NODEINFO 13 +#define ICMPV6CTL_ERRPPSLIMIT 14 /* ICMPv6 error pps limitation */ +#define ICMPV6CTL_ND6_MAXNUDHINT 15 +#define ICMPV6CTL_MAXID 16 + +#define ICMPV6CTL_NAMES { \ + { 0, 0 }, \ + { 0, 0 }, \ + { "rediraccept", CTLTYPE_INT }, \ + { "redirtimeout", CTLTYPE_INT }, \ + { 0, 0 }, \ + { "errratelimit", CTLTYPE_INT }, \ + { "nd6_prune", CTLTYPE_INT }, \ + { 0, 0 }, \ + { "nd6_delay", CTLTYPE_INT }, \ + { "nd6_umaxtries", CTLTYPE_INT }, \ + { "nd6_mmaxtries", CTLTYPE_INT }, \ + { "nd6_useloopback", CTLTYPE_INT }, \ + { 0, 0 }, \ + { "nodeinfo", CTLTYPE_INT }, \ + { "errppslimit", CTLTYPE_INT }, \ + { "nd6_maxnudhint", CTLTYPE_INT }, \ +} + +#define RTF_PROBEMTU RTF_PROTO1 + +#ifdef _KERNEL +# ifdef __STDC__ +struct rtentry; +struct rttimer; +struct in6_multi; +# endif +void icmp6_init __P((void)); +void icmp6_paramerror __P((struct mbuf *, int)); +void icmp6_error __P((struct mbuf *, int, int, int)); +int icmp6_input __P((struct mbuf **, int *, int)); +void icmp6_fasttimo __P((void)); +void icmp6_reflect __P((struct mbuf *, size_t)); +void icmp6_prepare __P((struct mbuf *)); +void icmp6_redirect_input __P((struct mbuf *, int)); +void icmp6_redirect_output __P((struct mbuf *, struct rtentry *)); + +/* XXX: is this the right place for these macros? */ +#define icmp6_ifstat_inc(ifp, tag) \ +do { \ + if ((ifp) && (ifp)->if_index <= if_index \ + && (ifp)->if_index < icmp6_ifstatmax \ + && icmp6_ifstat && icmp6_ifstat[(ifp)->if_index]) { \ + icmp6_ifstat[(ifp)->if_index]->tag++; \ + } \ +} while (0) + +#define icmp6_ifoutstat_inc(ifp, type, code) \ +do { \ + icmp6_ifstat_inc(ifp, ifs6_out_msg); \ + if (type < ICMP6_INFOMSG_MASK) \ + icmp6_ifstat_inc(ifp, ifs6_out_error); \ + switch(type) { \ + case ICMP6_DST_UNREACH: \ + icmp6_ifstat_inc(ifp, ifs6_out_dstunreach); \ + if (code == ICMP6_DST_UNREACH_ADMIN) \ + icmp6_ifstat_inc(ifp, ifs6_out_adminprohib); \ + break; \ + case ICMP6_PACKET_TOO_BIG: \ + icmp6_ifstat_inc(ifp, ifs6_out_pkttoobig); \ + break; \ + case ICMP6_TIME_EXCEEDED: \ + icmp6_ifstat_inc(ifp, ifs6_out_timeexceed); \ + break; \ + case ICMP6_PARAM_PROB: \ + icmp6_ifstat_inc(ifp, ifs6_out_paramprob); \ + break; \ + case ICMP6_ECHO_REQUEST: \ + icmp6_ifstat_inc(ifp, ifs6_out_echo); \ + break; \ + case ICMP6_ECHO_REPLY: \ + icmp6_ifstat_inc(ifp, ifs6_out_echoreply); \ + break; \ + case MLD6_LISTENER_QUERY: \ + icmp6_ifstat_inc(ifp, ifs6_out_mldquery); \ + break; \ + case MLD6_LISTENER_REPORT: \ + icmp6_ifstat_inc(ifp, ifs6_out_mldreport); \ + break; \ + case MLD6_LISTENER_DONE: \ + icmp6_ifstat_inc(ifp, ifs6_out_mlddone); \ + break; \ + case ND_ROUTER_SOLICIT: \ + icmp6_ifstat_inc(ifp, ifs6_out_routersolicit); \ + break; \ + case ND_ROUTER_ADVERT: \ + icmp6_ifstat_inc(ifp, ifs6_out_routeradvert); \ + break; \ + case ND_NEIGHBOR_SOLICIT: \ + icmp6_ifstat_inc(ifp, ifs6_out_neighborsolicit); \ + break; \ + case ND_NEIGHBOR_ADVERT: \ + icmp6_ifstat_inc(ifp, ifs6_out_neighboradvert); \ + break; \ + case ND_REDIRECT: \ + icmp6_ifstat_inc(ifp, ifs6_out_redirect); \ + break; \ + } \ +} while (0) + +extern int icmp6_rediraccept; /* accept/process redirects */ +extern int icmp6_redirtimeout; /* cache time for redirect routes */ +#endif /* _KERNEL */ + +#endif /* not _NETINET_ICMP6_H_ */ diff --git a/sys/netinet/icmp_var.h b/sys/netinet/icmp_var.h new file mode 100644 index 0000000..814e932 --- /dev/null +++ b/sys/netinet/icmp_var.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)icmp_var.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_ICMP_VAR_H_ +#define _NETINET_ICMP_VAR_H_ + + +/* + * Variables related to this implementation + * of the internet control message protocol. + */ +struct icmpstat { +/* statistics related to icmp packets generated */ + u_long icps_error; /* # of calls to icmp_error */ + u_long icps_oldshort; /* no error 'cuz old ip too short */ + u_long icps_oldicmp; /* no error 'cuz old was icmp */ + u_long icps_outhist[ICMP_MAXTYPE + 1]; +/* statistics related to input messages processed */ + u_long icps_badcode; /* icmp_code out of range */ + u_long icps_tooshort; /* packet < ICMP_MINLEN */ + u_long icps_checksum; /* bad checksum */ + u_long icps_badlen; /* calculated bound mismatch */ + u_long icps_reflect; /* number of responses */ + u_long icps_inhist[ICMP_MAXTYPE + 1]; + u_long icps_bmcastecho; /* b/mcast echo requests dropped */ + u_long icps_bmcasttstamp; /* b/mcast tstamp requests dropped */ +}; + +/* + * Names for ICMP sysctl objects + */ +#define ICMPCTL_MASKREPL 1 /* allow replies to netmask requests */ +#define ICMPCTL_STATS 2 /* statistics (read-only) */ +#define ICMPCTL_ICMPLIM 3 +#define ICMPCTL_MAXID 4 + +#define ICMPCTL_NAMES { \ + { 0, 0 }, \ + { "maskrepl", CTLTYPE_INT }, \ + { "stats", CTLTYPE_STRUCT }, \ + { "icmplim", CTLTYPE_INT }, \ +} + +#ifdef _KERNEL +SYSCTL_DECL(_net_inet_icmp); +extern int badport_bandlim __P((int)); +#define BANDLIM_UNLIMITED -1 +#define BANDLIM_ICMP_UNREACH 0 +#define BANDLIM_ICMP_ECHO 1 +#define BANDLIM_ICMP_TSTAMP 2 +#define BANDLIM_RST_CLOSEDPORT 3 /* No connection, and no listeners */ +#define BANDLIM_RST_OPENPORT 4 /* No connection, listener */ +#define BANDLIM_MAX 4 +#endif + +#endif diff --git a/sys/netinet/if_atm.c b/sys/netinet/if_atm.c new file mode 100644 index 0000000..04b49bf --- /dev/null +++ b/sys/netinet/if_atm.c @@ -0,0 +1,280 @@ +/* $NetBSD: if_atm.c,v 1.6 1996/10/13 02:03:01 christos Exp $ */ + +/* + * + * Copyright (c) 1996 Charles D. Cranor and Washington University. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor and + * Washington University. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * IP <=> ATM address resolution. + */ + +#include "opt_inet.h" +#include "opt_inet6.h" +#include "opt_natm.h" + +#if defined(INET) || defined(INET6) + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/queue.h> +#include <sys/mbuf.h> +#include <sys/socket.h> +#include <sys/sockio.h> +#include <sys/syslog.h> + +#include <net/if.h> +#include <net/if_dl.h> +#include <net/route.h> +#include <net/if_atm.h> + +#include <netinet/in.h> +#include <netinet/if_atm.h> + +#ifdef NATM +#include <netnatm/natm.h> +#endif + + +#define SDL(s) ((struct sockaddr_dl *)s) + +/* + * atm_rtrequest: handle ATM rt request (in support of generic code) + * inputs: "req" = request code + * "rt" = route entry + * "sa" = sockaddr + */ + +void +atm_rtrequest(req, rt, sa) + int req; + register struct rtentry *rt; + struct sockaddr *sa; +{ + register struct sockaddr *gate = rt->rt_gateway; + struct atm_pseudoioctl api; +#ifdef NATM + struct sockaddr_in *sin; + struct natmpcb *npcb = NULL; + struct atm_pseudohdr *aph; +#endif + static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK}; + + if (rt->rt_flags & RTF_GATEWAY) /* link level requests only */ + return; + + switch (req) { + + case RTM_RESOLVE: /* resolve: only happens when cloning */ + printf("atm_rtrequest: RTM_RESOLVE request detected?\n"); + break; + + case RTM_ADD: + + /* + * route added by a command (e.g. ifconfig, route, arp...). + * + * first check to see if this is not a host route, in which + * case we are being called via "ifconfig" to set the address. + */ + + if ((rt->rt_flags & RTF_HOST) == 0) { + rt_setgate(rt,rt_key(rt),(struct sockaddr *)&null_sdl); + gate = rt->rt_gateway; + SDL(gate)->sdl_type = rt->rt_ifp->if_type; + SDL(gate)->sdl_index = rt->rt_ifp->if_index; + break; + } + + if ((rt->rt_flags & RTF_CLONING) != 0) { + printf("atm_rtrequest: cloning route detected?\n"); + break; + } + if (gate->sa_family != AF_LINK || + gate->sa_len < sizeof(null_sdl)) { + log(LOG_DEBUG, "atm_rtrequest: bad gateway value"); + break; + } + +#ifdef DIAGNOSTIC + if (rt->rt_ifp->if_ioctl == NULL) panic("atm null ioctl"); +#endif + +#ifdef NATM + /* + * let native ATM know we are using this VCI/VPI + * (i.e. reserve it) + */ + sin = (struct sockaddr_in *) rt_key(rt); + if (sin->sin_family != AF_INET) + goto failed; + aph = (struct atm_pseudohdr *) LLADDR(SDL(gate)); + npcb = npcb_add(NULL, rt->rt_ifp, ATM_PH_VCI(aph), + ATM_PH_VPI(aph)); + if (npcb == NULL) + goto failed; + npcb->npcb_flags |= NPCB_IP; + npcb->ipaddr.s_addr = sin->sin_addr.s_addr; + /* XXX: move npcb to llinfo when ATM ARP is ready */ + rt->rt_llinfo = (caddr_t) npcb; + rt->rt_flags |= RTF_LLINFO; +#endif + /* + * let the lower level know this circuit is active + */ + bcopy(LLADDR(SDL(gate)), &api.aph, sizeof(api.aph)); + api.rxhand = NULL; + if (rt->rt_ifp->if_ioctl(rt->rt_ifp, SIOCATMENA, + (caddr_t)&api) != 0) { + printf("atm: couldn't add VC\n"); + goto failed; + } + + SDL(gate)->sdl_type = rt->rt_ifp->if_type; + SDL(gate)->sdl_index = rt->rt_ifp->if_index; + + break; + +failed: +#ifdef NATM + if (npcb) { + npcb_free(npcb, NPCB_DESTROY); + rt->rt_llinfo = NULL; + rt->rt_flags &= ~RTF_LLINFO; + } +#endif + rtrequest(RTM_DELETE, rt_key(rt), (struct sockaddr *)0, + rt_mask(rt), 0, (struct rtentry **) 0); + break; + + case RTM_DELETE: + +#ifdef NATM + /* + * tell native ATM we are done with this VC + */ + + if (rt->rt_flags & RTF_LLINFO) { + npcb_free((struct natmpcb *)rt->rt_llinfo, + NPCB_DESTROY); + rt->rt_llinfo = NULL; + rt->rt_flags &= ~RTF_LLINFO; + } +#endif + /* + * tell the lower layer to disable this circuit + */ + + bcopy(LLADDR(SDL(gate)), &api.aph, sizeof(api.aph)); + api.rxhand = NULL; + (void)rt->rt_ifp->if_ioctl(rt->rt_ifp, SIOCATMDIS, + (caddr_t)&api); + + break; + } +} + +/* + * atmresolve: + * inputs: + * [1] "rt" = the link level route to use (or null if need to look one up) + * [2] "m" = mbuf containing the data to be sent + * [3] "dst" = sockaddr_in (IP) address of dest. + * output: + * [4] "desten" = ATM pseudo header which we will fill in VPI/VCI info + * return: + * 0 == resolve FAILED; note that "m" gets m_freem'd in this case + * 1 == resolve OK; desten contains result + * + * XXX: will need more work if we wish to support ATMARP in the kernel, + * but this is enough for PVCs entered via the "route" command. + */ + +int +atmresolve(rt, m, dst, desten) + +register struct rtentry *rt; +struct mbuf *m; +register struct sockaddr *dst; +register struct atm_pseudohdr *desten; /* OUT */ + +{ + struct sockaddr_dl *sdl; + + if (m->m_flags & (M_BCAST|M_MCAST)) { + log(LOG_INFO, "atmresolve: BCAST/MCAST packet detected/dumped"); + goto bad; + } + + if (rt == NULL) { + rt = RTALLOC1(dst, 0); + if (rt == NULL) goto bad; /* failed */ + rt->rt_refcnt--; /* don't keep LL references */ + if ((rt->rt_flags & RTF_GATEWAY) != 0 || + (rt->rt_flags & RTF_LLINFO) == 0 || + /* XXX: are we using LLINFO? */ + rt->rt_gateway->sa_family != AF_LINK) { + goto bad; + } + } + + /* + * note that rt_gateway is a sockaddr_dl which contains the + * atm_pseudohdr data structure for this route. we currently + * don't need any rt_llinfo info (but will if we want to support + * ATM ARP [c.f. if_ether.c]). + */ + + sdl = SDL(rt->rt_gateway); + + /* + * Check the address family and length is valid, the address + * is resolved; otherwise, try to resolve. + */ + + + if (sdl->sdl_family == AF_LINK && sdl->sdl_alen == sizeof(*desten)) { + bcopy(LLADDR(sdl), desten, sdl->sdl_alen); + return(1); /* ok, go for it! */ + } + + /* + * we got an entry, but it doesn't have valid link address + * info in it (it is prob. the interface route, which has + * sdl_alen == 0). dump packet. (fall through to "bad"). + */ + +bad: + m_freem(m); + return(0); +} +#endif /* INET */ diff --git a/sys/netinet/if_atm.h b/sys/netinet/if_atm.h new file mode 100644 index 0000000..b448253 --- /dev/null +++ b/sys/netinet/if_atm.h @@ -0,0 +1,47 @@ +/* $FreeBSD$ */ +/* $NetBSD: if_atm.h,v 1.2 1996/07/03 17:17:17 chuck Exp $ */ + +/* + * + * Copyright (c) 1996 Charles D. Cranor and Washington University. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor and + * Washington University. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * if_atm.h + */ + +struct atm_pseudohdr; +struct mbuf; +struct rtentry; +struct sockaddr; + +void atm_rtrequest __P((int, struct rtentry *, struct sockaddr *)); +int atmresolve __P((struct rtentry *, struct mbuf *, struct sockaddr *, + struct atm_pseudohdr *)); diff --git a/sys/netinet/if_ether.c b/sys/netinet/if_ether.c new file mode 100644 index 0000000..de5d906 --- /dev/null +++ b/sys/netinet/if_ether.c @@ -0,0 +1,847 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)if_ether.c 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +/* + * Ethernet address resolution protocol. + * TODO: + * add "inuse/lock" bit (or ref. count) along with valid bit + */ + +#include "opt_inet.h" +#include "opt_bdg.h" + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/queue.h> +#include <sys/sysctl.h> +#include <sys/systm.h> +#include <sys/mbuf.h> +#include <sys/malloc.h> +#include <sys/socket.h> +#include <sys/syslog.h> + +#include <net/if.h> +#include <net/if_dl.h> +#include <net/if_types.h> +#include <net/route.h> +#include <net/netisr.h> +#include <net/if_llc.h> +#ifdef BRIDGE +#include <net/ethernet.h> +#include <net/bridge.h> +#endif + +#include <netinet/in.h> +#include <netinet/in_var.h> +#include <netinet/if_ether.h> + +#include <net/iso88025.h> + +#define SIN(s) ((struct sockaddr_in *)s) +#define SDL(s) ((struct sockaddr_dl *)s) + +SYSCTL_DECL(_net_link_ether); +SYSCTL_NODE(_net_link_ether, PF_INET, inet, CTLFLAG_RW, 0, ""); + +/* timer values */ +static int arpt_prune = (5*60*1); /* walk list every 5 minutes */ +static int arpt_keep = (20*60); /* once resolved, good for 20 more minutes */ +static int arpt_down = 20; /* once declared down, don't send for 20 sec */ + +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, prune_intvl, CTLFLAG_RW, + &arpt_prune, 0, ""); +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_age, CTLFLAG_RW, + &arpt_keep, 0, ""); +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, host_down_time, CTLFLAG_RW, + &arpt_down, 0, ""); + +#define rt_expire rt_rmx.rmx_expire + +struct llinfo_arp { + LIST_ENTRY(llinfo_arp) la_le; + struct rtentry *la_rt; + struct mbuf *la_hold; /* last packet until resolved/timeout */ + long la_asked; /* last time we QUERIED for this addr */ +#define la_timer la_rt->rt_rmx.rmx_expire /* deletion time in seconds */ +}; + +static LIST_HEAD(, llinfo_arp) llinfo_arp; + +struct ifqueue arpintrq; +static int arp_inuse, arp_allocated; + +static int arp_maxtries = 5; +static int useloopback = 1; /* use loopback interface for local traffic */ +static int arp_proxyall = 0; + +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxtries, CTLFLAG_RW, + &arp_maxtries, 0, ""); +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, useloopback, CTLFLAG_RW, + &useloopback, 0, ""); +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, proxyall, CTLFLAG_RW, + &arp_proxyall, 0, ""); + +static void arp_init __P((void)); +static void arp_rtrequest __P((int, struct rtentry *, struct sockaddr *)); +static void arprequest __P((struct arpcom *, + struct in_addr *, struct in_addr *, u_char *)); +static void arpintr __P((void)); +static void arptfree __P((struct llinfo_arp *)); +static void arptimer __P((void *)); +static struct llinfo_arp + *arplookup __P((u_long, int, int)); +#ifdef INET +static void in_arpinput __P((struct mbuf *)); +#endif + +/* + * Timeout routine. Age arp_tab entries periodically. + */ +/* ARGSUSED */ +static void +arptimer(ignored_arg) + void *ignored_arg; +{ + int s = splnet(); + register struct llinfo_arp *la = LIST_FIRST(&llinfo_arp); + struct llinfo_arp *ola; + + timeout(arptimer, (caddr_t)0, arpt_prune * hz); + while ((ola = la) != 0) { + register struct rtentry *rt = la->la_rt; + la = LIST_NEXT(la, la_le); + if (rt->rt_expire && rt->rt_expire <= time_second) + arptfree(ola); /* timer has expired, clear */ + } + splx(s); +} + +/* + * Parallel to llc_rtrequest. + */ +static void +arp_rtrequest(req, rt, sa) + int req; + register struct rtentry *rt; + struct sockaddr *sa; +{ + register struct sockaddr *gate = rt->rt_gateway; + register struct llinfo_arp *la = (struct llinfo_arp *)rt->rt_llinfo; + static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK}; + static int arpinit_done; + + if (!arpinit_done) { + arpinit_done = 1; + LIST_INIT(&llinfo_arp); + timeout(arptimer, (caddr_t)0, hz); + register_netisr(NETISR_ARP, arpintr); + } + if (rt->rt_flags & RTF_GATEWAY) + return; + switch (req) { + + case RTM_ADD: + /* + * XXX: If this is a manually added route to interface + * such as older version of routed or gated might provide, + * restore cloning bit. + */ + if ((rt->rt_flags & RTF_HOST) == 0 && + SIN(rt_mask(rt))->sin_addr.s_addr != 0xffffffff) + rt->rt_flags |= RTF_CLONING; + if (rt->rt_flags & RTF_CLONING) { + /* + * Case 1: This route should come from a route to iface. + */ + rt_setgate(rt, rt_key(rt), + (struct sockaddr *)&null_sdl); + gate = rt->rt_gateway; + SDL(gate)->sdl_type = rt->rt_ifp->if_type; + SDL(gate)->sdl_index = rt->rt_ifp->if_index; + rt->rt_expire = time_second; + break; + } + /* Announce a new entry if requested. */ + if (rt->rt_flags & RTF_ANNOUNCE) + arprequest((struct arpcom *)rt->rt_ifp, + &SIN(rt_key(rt))->sin_addr, + &SIN(rt_key(rt))->sin_addr, + (u_char *)LLADDR(SDL(gate))); + /*FALLTHROUGH*/ + case RTM_RESOLVE: + if (gate->sa_family != AF_LINK || + gate->sa_len < sizeof(null_sdl)) { + log(LOG_DEBUG, "arp_rtrequest: bad gateway value\n"); + break; + } + SDL(gate)->sdl_type = rt->rt_ifp->if_type; + SDL(gate)->sdl_index = rt->rt_ifp->if_index; + if (la != 0) + break; /* This happens on a route change */ + /* + * Case 2: This route may come from cloning, or a manual route + * add with a LL address. + */ + R_Malloc(la, struct llinfo_arp *, sizeof(*la)); + rt->rt_llinfo = (caddr_t)la; + if (la == 0) { + log(LOG_DEBUG, "arp_rtrequest: malloc failed\n"); + break; + } + arp_inuse++, arp_allocated++; + Bzero(la, sizeof(*la)); + la->la_rt = rt; + rt->rt_flags |= RTF_LLINFO; + LIST_INSERT_HEAD(&llinfo_arp, la, la_le); + +#ifdef INET + /* + * This keeps the multicast addresses from showing up + * in `arp -a' listings as unresolved. It's not actually + * functional. Then the same for broadcast. + */ + if (IN_MULTICAST(ntohl(SIN(rt_key(rt))->sin_addr.s_addr))) { + ETHER_MAP_IP_MULTICAST(&SIN(rt_key(rt))->sin_addr, + LLADDR(SDL(gate))); + SDL(gate)->sdl_alen = 6; + rt->rt_expire = 0; + } + if (in_broadcast(SIN(rt_key(rt))->sin_addr, rt->rt_ifp)) { + memcpy(LLADDR(SDL(gate)), etherbroadcastaddr, 6); + SDL(gate)->sdl_alen = 6; + rt->rt_expire = 0; + } +#endif + + if (SIN(rt_key(rt))->sin_addr.s_addr == + (IA_SIN(rt->rt_ifa))->sin_addr.s_addr) { + /* + * This test used to be + * if (loif.if_flags & IFF_UP) + * It allowed local traffic to be forced + * through the hardware by configuring the loopback down. + * However, it causes problems during network configuration + * for boards that can't receive packets they send. + * It is now necessary to clear "useloopback" and remove + * the route to force traffic out to the hardware. + */ + rt->rt_expire = 0; + Bcopy(((struct arpcom *)rt->rt_ifp)->ac_enaddr, + LLADDR(SDL(gate)), SDL(gate)->sdl_alen = 6); + if (useloopback) + rt->rt_ifp = loif; + + } + break; + + case RTM_DELETE: + if (la == 0) + break; + arp_inuse--; + LIST_REMOVE(la, la_le); + rt->rt_llinfo = 0; + rt->rt_flags &= ~RTF_LLINFO; + if (la->la_hold) + m_freem(la->la_hold); + Free((caddr_t)la); + } +} + +/* + * Broadcast an ARP request. Caller specifies: + * - arp header source ip address + * - arp header target ip address + * - arp header source ethernet address + */ +static void +arprequest(ac, sip, tip, enaddr) + register struct arpcom *ac; + register struct in_addr *sip, *tip; + register u_char *enaddr; +{ + register struct mbuf *m; + register struct ether_header *eh; + register struct ether_arp *ea; + struct sockaddr sa; + static u_char llcx[] = { 0x82, 0x40, LLC_SNAP_LSAP, LLC_SNAP_LSAP, + LLC_UI, 0x00, 0x00, 0x00, 0x08, 0x06 }; + + if ((m = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL) + return; + m->m_pkthdr.rcvif = (struct ifnet *)0; + switch (ac->ac_if.if_type) { + case IFT_ISO88025: + m->m_len = sizeof(*ea) + sizeof(llcx); + m->m_pkthdr.len = sizeof(*ea) + sizeof(llcx); + MH_ALIGN(m, sizeof(*ea) + sizeof(llcx)); + (void)memcpy(mtod(m, caddr_t), llcx, sizeof(llcx)); + (void)memcpy(sa.sa_data, etherbroadcastaddr, 6); + (void)memcpy(sa.sa_data + 6, enaddr, 6); + sa.sa_data[6] |= TR_RII; + sa.sa_data[12] = TR_AC; + sa.sa_data[13] = TR_LLC_FRAME; + ea = (struct ether_arp *)(mtod(m, char *) + sizeof(llcx)); + bzero((caddr_t)ea, sizeof (*ea)); + ea->arp_hrd = htons(ARPHRD_IEEE802); + break; + case IFT_FDDI: + case IFT_ETHER: + /* + * This may not be correct for types not explicitly + * listed, but this is our best guess + */ + default: + m->m_len = sizeof(*ea); + m->m_pkthdr.len = sizeof(*ea); + MH_ALIGN(m, sizeof(*ea)); + ea = mtod(m, struct ether_arp *); + eh = (struct ether_header *)sa.sa_data; + bzero((caddr_t)ea, sizeof (*ea)); + /* if_output will not swap */ + eh->ether_type = htons(ETHERTYPE_ARP); + (void)memcpy(eh->ether_dhost, etherbroadcastaddr, + sizeof(eh->ether_dhost)); + ea->arp_hrd = htons(ARPHRD_ETHER); + break; + } + ea->arp_pro = htons(ETHERTYPE_IP); + ea->arp_hln = sizeof(ea->arp_sha); /* hardware address length */ + ea->arp_pln = sizeof(ea->arp_spa); /* protocol address length */ + ea->arp_op = htons(ARPOP_REQUEST); + (void)memcpy(ea->arp_sha, enaddr, sizeof(ea->arp_sha)); + (void)memcpy(ea->arp_spa, sip, sizeof(ea->arp_spa)); + (void)memcpy(ea->arp_tpa, tip, sizeof(ea->arp_tpa)); + sa.sa_family = AF_UNSPEC; + sa.sa_len = sizeof(sa); + (*ac->ac_if.if_output)(&ac->ac_if, m, &sa, (struct rtentry *)0); +} + +/* + * Resolve an IP address into an ethernet address. If success, + * desten is filled in. If there is no entry in arptab, + * set one up and broadcast a request for the IP address. + * Hold onto this mbuf and resend it once the address + * is finally resolved. A return value of 1 indicates + * that desten has been filled in and the packet should be sent + * normally; a 0 return indicates that the packet has been + * taken over here, either now or for later transmission. + */ +int +arpresolve(ac, rt, m, dst, desten, rt0) + register struct arpcom *ac; + register struct rtentry *rt; + struct mbuf *m; + register struct sockaddr *dst; + register u_char *desten; + struct rtentry *rt0; +{ + register struct llinfo_arp *la = 0; + struct sockaddr_dl *sdl; + + if (m->m_flags & M_BCAST) { /* broadcast */ + (void)memcpy(desten, etherbroadcastaddr, sizeof(etherbroadcastaddr)); + return (1); + } + if (m->m_flags & M_MCAST) { /* multicast */ + ETHER_MAP_IP_MULTICAST(&SIN(dst)->sin_addr, desten); + return(1); + } + if (rt) + la = (struct llinfo_arp *)rt->rt_llinfo; + if (la == 0) { + la = arplookup(SIN(dst)->sin_addr.s_addr, 1, 0); + if (la) + rt = la->la_rt; + } + if (la == 0 || rt == 0) { + log(LOG_DEBUG, "arpresolve: can't allocate llinfo for %s%s%s\n", + inet_ntoa(SIN(dst)->sin_addr), la ? "la" : "", + rt ? "rt" : ""); + m_freem(m); + return (0); + } + sdl = SDL(rt->rt_gateway); + /* + * Check the address family and length is valid, the address + * is resolved; otherwise, try to resolve. + */ + if ((rt->rt_expire == 0 || rt->rt_expire > time_second) && + sdl->sdl_family == AF_LINK && sdl->sdl_alen != 0) { + bcopy(LLADDR(sdl), desten, sdl->sdl_alen); + return 1; + } + /* + * There is an arptab entry, but no ethernet address + * response yet. Replace the held mbuf with this + * latest one. + */ + if (la->la_hold) + m_freem(la->la_hold); + la->la_hold = m; + if (rt->rt_expire) { + rt->rt_flags &= ~RTF_REJECT; + if (la->la_asked == 0 || rt->rt_expire != time_second) { + rt->rt_expire = time_second; + if (la->la_asked++ < arp_maxtries) + arprequest(ac, + &SIN(rt->rt_ifa->ifa_addr)->sin_addr, + &SIN(dst)->sin_addr, ac->ac_enaddr); + else { + rt->rt_flags |= RTF_REJECT; + rt->rt_expire += arpt_down; + la->la_asked = 0; + } + + } + } + return (0); +} + +/* + * Common length and type checks are done here, + * then the protocol-specific routine is called. + */ +static void +arpintr() +{ + register struct mbuf *m; + register struct arphdr *ar; + int s; + + while (arpintrq.ifq_head) { + s = splimp(); + IF_DEQUEUE(&arpintrq, m); + splx(s); + if (m == 0 || (m->m_flags & M_PKTHDR) == 0) + panic("arpintr"); + + if (m->m_len < sizeof(struct arphdr) && + ((m = m_pullup(m, sizeof(struct arphdr))) == NULL)) { + log(LOG_ERR, "arp: runt packet -- m_pullup failed\n"); + continue; + } + ar = mtod(m, struct arphdr *); + + if (ntohs(ar->ar_hrd) != ARPHRD_ETHER + && ntohs(ar->ar_hrd) != ARPHRD_IEEE802) { + log(LOG_ERR, + "arp: unknown hardware address format (0x%2D)\n", + (unsigned char *)&ar->ar_hrd, ""); + m_freem(m); + continue; + } + + if (m->m_pkthdr.len < sizeof(struct arphdr) + 2 * ar->ar_hln + + 2 * ar->ar_pln) { + log(LOG_ERR, "arp: runt packet\n"); + m_freem(m); + continue; + } + + switch (ntohs(ar->ar_pro)) { +#ifdef INET + case ETHERTYPE_IP: + in_arpinput(m); + continue; +#endif + } + m_freem(m); + } +} + +#ifdef INET +/* + * ARP for Internet protocols on 10 Mb/s Ethernet. + * Algorithm is that given in RFC 826. + * In addition, a sanity check is performed on the sender + * protocol address, to catch impersonators. + * We no longer handle negotiations for use of trailer protocol: + * Formerly, ARP replied for protocol type ETHERTYPE_TRAIL sent + * along with IP replies if we wanted trailers sent to us, + * and also sent them in response to IP replies. + * This allowed either end to announce the desire to receive + * trailer packets. + * We no longer reply to requests for ETHERTYPE_TRAIL protocol either, + * but formerly didn't normally send requests. + */ +static int log_arp_wrong_iface = 1; + +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_wrong_iface, CTLFLAG_RW, + &log_arp_wrong_iface, 0, + "log arp packets arriving on the wrong interface"); + +static void +in_arpinput(m) + struct mbuf *m; +{ + register struct ether_arp *ea; + register struct arpcom *ac = (struct arpcom *)m->m_pkthdr.rcvif; + struct ether_header *eh; + struct iso88025_header *th = (struct iso88025_header *)0; + register struct llinfo_arp *la = 0; + register struct rtentry *rt; + struct in_ifaddr *ia, *maybe_ia = 0; + struct sockaddr_dl *sdl; + struct sockaddr sa; + struct in_addr isaddr, itaddr, myaddr; + int op, rif_len; + + if (m->m_len < sizeof(struct ether_arp) && + (m = m_pullup(m, sizeof(struct ether_arp))) == NULL) { + log(LOG_ERR, "in_arp: runt packet -- m_pullup failed\n"); + return; + } + + ea = mtod(m, struct ether_arp *); + op = ntohs(ea->arp_op); + (void)memcpy(&isaddr, ea->arp_spa, sizeof (isaddr)); + (void)memcpy(&itaddr, ea->arp_tpa, sizeof (itaddr)); + TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { + /* + * For a bridge, we want to check the address irrespective + * of the receive interface. (This will change slightly + * when we have clusters of interfaces). + */ +#ifdef BRIDGE +#define BRIDGE_TEST (do_bridge) +#else +#define BRIDGE_TEST (0) /* cc will optimise the test away */ +#endif + if ((BRIDGE_TEST) || (ia->ia_ifp == &ac->ac_if)) { + maybe_ia = ia; + if ((itaddr.s_addr == ia->ia_addr.sin_addr.s_addr) || + (isaddr.s_addr == ia->ia_addr.sin_addr.s_addr)) { + break; + } + } + } + if (maybe_ia == 0) { + m_freem(m); + return; + } + myaddr = ia ? ia->ia_addr.sin_addr : maybe_ia->ia_addr.sin_addr; + if (!bcmp((caddr_t)ea->arp_sha, (caddr_t)ac->ac_enaddr, + sizeof (ea->arp_sha))) { + m_freem(m); /* it's from me, ignore it. */ + return; + } + if (!bcmp((caddr_t)ea->arp_sha, (caddr_t)etherbroadcastaddr, + sizeof (ea->arp_sha))) { + log(LOG_ERR, + "arp: ether address is broadcast for IP address %s!\n", + inet_ntoa(isaddr)); + m_freem(m); + return; + } + if (isaddr.s_addr == myaddr.s_addr) { + log(LOG_ERR, + "arp: %6D is using my IP address %s!\n", + ea->arp_sha, ":", inet_ntoa(isaddr)); + itaddr = myaddr; + goto reply; + } + la = arplookup(isaddr.s_addr, itaddr.s_addr == myaddr.s_addr, 0); + if (la && (rt = la->la_rt) && (sdl = SDL(rt->rt_gateway))) { + /* the following is not an error when doing bridging */ + if (!BRIDGE_TEST && rt->rt_ifp != &ac->ac_if) { + if (log_arp_wrong_iface) + log(LOG_ERR, "arp: %s is on %s%d but got reply from %6D on %s%d\n", + inet_ntoa(isaddr), + rt->rt_ifp->if_name, rt->rt_ifp->if_unit, + ea->arp_sha, ":", + ac->ac_if.if_name, ac->ac_if.if_unit); + goto reply; + } + if (sdl->sdl_alen && + bcmp((caddr_t)ea->arp_sha, LLADDR(sdl), sdl->sdl_alen)) { + if (rt->rt_expire) + log(LOG_INFO, "arp: %s moved from %6D to %6D on %s%d\n", + inet_ntoa(isaddr), (u_char *)LLADDR(sdl), ":", + ea->arp_sha, ":", + ac->ac_if.if_name, ac->ac_if.if_unit); + else { + log(LOG_ERR, + "arp: %6D attempts to modify permanent entry for %s on %s%d\n", + ea->arp_sha, ":", inet_ntoa(isaddr), + ac->ac_if.if_name, ac->ac_if.if_unit); + goto reply; + } + } + (void)memcpy(LLADDR(sdl), ea->arp_sha, sizeof(ea->arp_sha)); + sdl->sdl_alen = sizeof(ea->arp_sha); + sdl->sdl_rcf = (u_short)0; + /* + * If we receive an arp from a token-ring station over + * a token-ring nic then try to save the source + * routing info. + */ + if (ac->ac_if.if_type == IFT_ISO88025) { + th = (struct iso88025_header *)m->m_pkthdr.header; + rif_len = TR_RCF_RIFLEN(th->rcf); + if ((th->iso88025_shost[0] & TR_RII) && + (rif_len > 2)) { + sdl->sdl_rcf = th->rcf; + sdl->sdl_rcf ^= htons(TR_RCF_DIR); + memcpy(sdl->sdl_route, th->rd, rif_len - 2); + sdl->sdl_rcf &= ~htons(TR_RCF_BCST_MASK); + /* + * Set up source routing information for + * reply packet (XXX) + */ + m->m_data -= rif_len; + m->m_len += rif_len; + m->m_pkthdr.len += rif_len; + } else { + th->iso88025_shost[0] &= ~TR_RII; + } + m->m_data -= 8; + m->m_len += 8; + m->m_pkthdr.len += 8; + th->rcf = sdl->sdl_rcf; + } else { + sdl->sdl_rcf = (u_short)0; + } + if (rt->rt_expire) + rt->rt_expire = time_second + arpt_keep; + rt->rt_flags &= ~RTF_REJECT; + la->la_asked = 0; + if (la->la_hold) { + (*ac->ac_if.if_output)(&ac->ac_if, la->la_hold, + rt_key(rt), rt); + la->la_hold = 0; + } + } +reply: + if (op != ARPOP_REQUEST) { + m_freem(m); + return; + } + if (itaddr.s_addr == myaddr.s_addr) { + /* I am the target */ + (void)memcpy(ea->arp_tha, ea->arp_sha, sizeof(ea->arp_sha)); + (void)memcpy(ea->arp_sha, ac->ac_enaddr, sizeof(ea->arp_sha)); + } else { + la = arplookup(itaddr.s_addr, 0, SIN_PROXY); + if (la == NULL) { + struct sockaddr_in sin; + + if (!arp_proxyall) { + m_freem(m); + return; + } + + bzero(&sin, sizeof sin); + sin.sin_family = AF_INET; + sin.sin_len = sizeof sin; + sin.sin_addr = itaddr; + + rt = rtalloc1((struct sockaddr *)&sin, 0, 0UL); + if (!rt) { + m_freem(m); + return; + } + /* + * Don't send proxies for nodes on the same interface + * as this one came out of, or we'll get into a fight + * over who claims what Ether address. + */ + if (rt->rt_ifp == &ac->ac_if) { + rtfree(rt); + m_freem(m); + return; + } + (void)memcpy(ea->arp_tha, ea->arp_sha, sizeof(ea->arp_sha)); + (void)memcpy(ea->arp_sha, ac->ac_enaddr, sizeof(ea->arp_sha)); + rtfree(rt); + + /* + * Also check that the node which sent the ARP packet + * is on the the interface we expect it to be on. This + * avoids ARP chaos if an interface is connected to the + * wrong network. + */ + sin.sin_addr = isaddr; + + rt = rtalloc1((struct sockaddr *)&sin, 0, 0UL); + if (!rt) { + m_freem(m); + return; + } + if (rt->rt_ifp != &ac->ac_if) { + log(LOG_INFO, "arp_proxy: ignoring request" + " from %s via %s%d, expecting %s%d\n", + inet_ntoa(isaddr), ac->ac_if.if_name, + ac->ac_if.if_unit, rt->rt_ifp->if_name, + rt->rt_ifp->if_unit); + rtfree(rt); + m_freem(m); + return; + } + rtfree(rt); + +#ifdef DEBUG_PROXY + printf("arp: proxying for %s\n", + inet_ntoa(itaddr)); +#endif + } else { + rt = la->la_rt; + (void)memcpy(ea->arp_tha, ea->arp_sha, sizeof(ea->arp_sha)); + sdl = SDL(rt->rt_gateway); + (void)memcpy(ea->arp_sha, LLADDR(sdl), sizeof(ea->arp_sha)); + } + } + + (void)memcpy(ea->arp_tpa, ea->arp_spa, sizeof(ea->arp_spa)); + (void)memcpy(ea->arp_spa, &itaddr, sizeof(ea->arp_spa)); + ea->arp_op = htons(ARPOP_REPLY); + ea->arp_pro = htons(ETHERTYPE_IP); /* let's be sure! */ + switch (ac->ac_if.if_type) { + case IFT_ISO88025: + /* Re-arrange the source/dest address */ + memcpy(th->iso88025_dhost, th->iso88025_shost, + sizeof(th->iso88025_dhost)); + memcpy(th->iso88025_shost, ac->ac_enaddr, + sizeof(th->iso88025_shost)); + /* Set the source routing bit if neccesary */ + if (th->iso88025_dhost[0] & TR_RII) { + th->iso88025_dhost[0] &= ~TR_RII; + if (TR_RCF_RIFLEN(th->rcf) > 2) + th->iso88025_shost[0] |= TR_RII; + } + /* Copy the addresses, ac and fc into sa_data */ + memcpy(sa.sa_data, th->iso88025_dhost, + sizeof(th->iso88025_dhost) * 2); + sa.sa_data[(sizeof(th->iso88025_dhost) * 2)] = TR_AC; + sa.sa_data[(sizeof(th->iso88025_dhost) * 2) + 1] = TR_LLC_FRAME; + break; + case IFT_ETHER: + case IFT_FDDI: + /* + * May not be correct for types not explictly + * listed, but it is our best guess. + */ + default: + eh = (struct ether_header *)sa.sa_data; + (void)memcpy(eh->ether_dhost, ea->arp_tha, + sizeof(eh->ether_dhost)); + eh->ether_type = htons(ETHERTYPE_ARP); + break; + } + sa.sa_family = AF_UNSPEC; + sa.sa_len = sizeof(sa); + (*ac->ac_if.if_output)(&ac->ac_if, m, &sa, (struct rtentry *)0); + return; +} +#endif + +/* + * Free an arp entry. + */ +static void +arptfree(la) + register struct llinfo_arp *la; +{ + register struct rtentry *rt = la->la_rt; + register struct sockaddr_dl *sdl; + if (rt == 0) + panic("arptfree"); + if (rt->rt_refcnt > 0 && (sdl = SDL(rt->rt_gateway)) && + sdl->sdl_family == AF_LINK) { + sdl->sdl_alen = 0; + la->la_asked = 0; + rt->rt_flags &= ~RTF_REJECT; + return; + } + rtrequest(RTM_DELETE, rt_key(rt), (struct sockaddr *)0, rt_mask(rt), + 0, (struct rtentry **)0); +} +/* + * Lookup or enter a new address in arptab. + */ +static struct llinfo_arp * +arplookup(addr, create, proxy) + u_long addr; + int create, proxy; +{ + register struct rtentry *rt; + static struct sockaddr_inarp sin = {sizeof(sin), AF_INET }; + const char *why = 0; + + sin.sin_addr.s_addr = addr; + sin.sin_other = proxy ? SIN_PROXY : 0; + rt = rtalloc1((struct sockaddr *)&sin, create, 0UL); + if (rt == 0) + return (0); + rt->rt_refcnt--; + + if (rt->rt_flags & RTF_GATEWAY) + why = "host is not on local network"; + else if ((rt->rt_flags & RTF_LLINFO) == 0) + why = "could not allocate llinfo"; + else if (rt->rt_gateway->sa_family != AF_LINK) + why = "gateway route is not ours"; + + if (why && create) { + log(LOG_DEBUG, "arplookup %s failed: %s\n", + inet_ntoa(sin.sin_addr), why); + return 0; + } else if (why) { + return 0; + } + return ((struct llinfo_arp *)rt->rt_llinfo); +} + +void +arp_ifinit(ac, ifa) + struct arpcom *ac; + struct ifaddr *ifa; +{ + if (ntohl(IA_SIN(ifa)->sin_addr.s_addr) != INADDR_ANY) + arprequest(ac, &IA_SIN(ifa)->sin_addr, + &IA_SIN(ifa)->sin_addr, ac->ac_enaddr); + ifa->ifa_rtrequest = arp_rtrequest; + ifa->ifa_flags |= RTF_CLONING; +} + +static void +arp_init(void) +{ + + arpintrq.ifq_maxlen = 50; + mtx_init(&arpintrq.ifq_mtx, "arp_inq", MTX_DEF); +} + +SYSINIT(arp, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, arp_init, 0); diff --git a/sys/netinet/if_ether.h b/sys/netinet/if_ether.h new file mode 100644 index 0000000..2d590ad --- /dev/null +++ b/sys/netinet/if_ether.h @@ -0,0 +1,122 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)if_ether.h 8.3 (Berkeley) 5/2/95 + * $FreeBSD$ + */ + +#ifndef _NETINET_IF_ETHER_H_ +#define _NETINET_IF_ETHER_H_ + +#include <net/ethernet.h> +#include <net/if_arp.h> + +/* + * Macro to map an IP multicast address to an Ethernet multicast address. + * The high-order 25 bits of the Ethernet address are statically assigned, + * and the low-order 23 bits are taken from the low end of the IP address. + */ +#define ETHER_MAP_IP_MULTICAST(ipaddr, enaddr) \ + /* struct in_addr *ipaddr; */ \ + /* u_char enaddr[ETHER_ADDR_LEN]; */ \ +{ \ + (enaddr)[0] = 0x01; \ + (enaddr)[1] = 0x00; \ + (enaddr)[2] = 0x5e; \ + (enaddr)[3] = ((u_char *)ipaddr)[1] & 0x7f; \ + (enaddr)[4] = ((u_char *)ipaddr)[2]; \ + (enaddr)[5] = ((u_char *)ipaddr)[3]; \ +} +/* + * Macro to map an IP6 multicast address to an Ethernet multicast address. + * The high-order 16 bits of the Ethernet address are statically assigned, + * and the low-order 32 bits are taken from the low end of the IP6 address. + */ +#define ETHER_MAP_IPV6_MULTICAST(ip6addr, enaddr) \ +/* struct in6_addr *ip6addr; */ \ +/* u_char enaddr[ETHER_ADDR_LEN]; */ \ +{ \ + (enaddr)[0] = 0x33; \ + (enaddr)[1] = 0x33; \ + (enaddr)[2] = ((u_char *)ip6addr)[12]; \ + (enaddr)[3] = ((u_char *)ip6addr)[13]; \ + (enaddr)[4] = ((u_char *)ip6addr)[14]; \ + (enaddr)[5] = ((u_char *)ip6addr)[15]; \ +} + +/* + * Ethernet Address Resolution Protocol. + * + * See RFC 826 for protocol description. Structure below is adapted + * to resolving internet addresses. Field names used correspond to + * RFC 826. + */ +struct ether_arp { + struct arphdr ea_hdr; /* fixed-size header */ + u_char arp_sha[ETHER_ADDR_LEN]; /* sender hardware address */ + u_char arp_spa[4]; /* sender protocol address */ + u_char arp_tha[ETHER_ADDR_LEN]; /* target hardware address */ + u_char arp_tpa[4]; /* target protocol address */ +}; +#define arp_hrd ea_hdr.ar_hrd +#define arp_pro ea_hdr.ar_pro +#define arp_hln ea_hdr.ar_hln +#define arp_pln ea_hdr.ar_pln +#define arp_op ea_hdr.ar_op + +struct sockaddr_inarp { + u_char sin_len; + u_char sin_family; + u_short sin_port; + struct in_addr sin_addr; + struct in_addr sin_srcaddr; + u_short sin_tos; + u_short sin_other; +#define SIN_PROXY 1 +}; +/* + * IP and ethernet specific routing flags + */ +#define RTF_USETRAILERS RTF_PROTO1 /* use trailers */ +#define RTF_ANNOUNCE RTF_PROTO2 /* announce new arp entry */ + +#ifdef _KERNEL +extern u_char ether_ipmulticast_min[ETHER_ADDR_LEN]; +extern u_char ether_ipmulticast_max[ETHER_ADDR_LEN]; +extern struct ifqueue arpintrq; + +int arpresolve __P((struct arpcom *, struct rtentry *, struct mbuf *, + struct sockaddr *, u_char *, struct rtentry *)); +void arp_ifinit __P((struct arpcom *, struct ifaddr *)); +#endif + +#endif diff --git a/sys/netinet/if_fddi.h b/sys/netinet/if_fddi.h new file mode 100644 index 0000000..9f83882 --- /dev/null +++ b/sys/netinet/if_fddi.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * Copyright (c) 1995 Matt Thomas (thomas@lkg.dec.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)if_fddi.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_IF_FDDI_H_ +#define _NETINET_IF_FDDI_H_ + +/* + * Structure of an 100Mb/s FDDI header. + */ +struct fddi_header { + u_char fddi_fc; + u_char fddi_dhost[6]; + u_char fddi_shost[6]; +}; + +#define FDDIIPMTU 4352 +#define FDDIMTU 4470 +#define FDDIMIN 3 + +#define FDDIFC_C 0x80 /* 0b10000000 */ +#define FDDIFC_L 0x40 /* 0b01000000 */ +#define FDDIFC_F 0x30 /* 0b00110000 */ +#define FDDIFC_Z 0x0F /* 0b00001111 */ + +#define FDDIFC_LLC_ASYNC 0x50 +#define FDDIFC_LLC_PRIO0 0 +#define FDDIFC_LLC_PRIO1 1 +#define FDDIFC_LLC_PRIO2 2 +#define FDDIFC_LLC_PRIO3 3 +#define FDDIFC_LLC_PRIO4 4 +#define FDDIFC_LLC_PRIO5 5 +#define FDDIFC_LLC_PRIO6 6 +#define FDDIFC_LLC_PRIO7 7 +#define FDDIFC_LLC_SYNC 0xd0 +#define FDDIFC_SMT 0x40 + +#if defined(_KERNEL) +#define fddibroadcastaddr etherbroadcastaddr +#define fddi_ipmulticast_min ether_ipmulticast_min +#define fddi_ipmulticast_max ether_ipmulticast_max +#define fddi_addmulti ether_addmulti +#define fddi_delmulti ether_delmulti +#define fddi_sprintf ether_sprintf + +void fddi_ifattach __P((struct ifnet *)); +void fddi_input __P((struct ifnet *, struct fddi_header *, struct mbuf *)); +int fddi_output __P((struct ifnet *, + struct mbuf *, struct sockaddr *, struct rtentry *)); + +#endif + +#endif diff --git a/sys/netinet/igmp.c b/sys/netinet/igmp.c new file mode 100644 index 0000000..77be25b --- /dev/null +++ b/sys/netinet/igmp.c @@ -0,0 +1,487 @@ +/* + * Copyright (c) 1988 Stephen Deering. + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Stephen Deering of Stanford University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)igmp.c 8.1 (Berkeley) 7/19/93 + * $FreeBSD$ + */ + +/* + * Internet Group Management Protocol (IGMP) routines. + * + * Written by Steve Deering, Stanford, May 1988. + * Modified by Rosen Sharma, Stanford, Aug 1994. + * Modified by Bill Fenner, Xerox PARC, Feb 1995. + * Modified to fully comply to IGMPv2 by Bill Fenner, Oct 1995. + * + * MULTICAST Revision: 3.5.1.4 + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/socket.h> +#include <sys/protosw.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_var.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> +#include <netinet/igmp.h> +#include <netinet/igmp_var.h> + +#include <machine/in_cksum.h> + +static MALLOC_DEFINE(M_IGMP, "igmp", "igmp state"); + +static struct router_info * + find_rti __P((struct ifnet *ifp)); + +static struct igmpstat igmpstat; + +SYSCTL_STRUCT(_net_inet_igmp, IGMPCTL_STATS, stats, CTLFLAG_RD, + &igmpstat, igmpstat, ""); + +static int igmp_timers_are_running; +static u_long igmp_all_hosts_group; +static u_long igmp_all_rtrs_group; +static struct mbuf *router_alert; +static struct router_info *Head; + +static void igmp_sendpkt __P((struct in_multi *, int, unsigned long)); + +void +igmp_init() +{ + struct ipoption *ra; + + /* + * To avoid byte-swapping the same value over and over again. + */ + igmp_all_hosts_group = htonl(INADDR_ALLHOSTS_GROUP); + igmp_all_rtrs_group = htonl(INADDR_ALLRTRS_GROUP); + + igmp_timers_are_running = 0; + + /* + * Construct a Router Alert option to use in outgoing packets + */ + MGET(router_alert, M_DONTWAIT, MT_DATA); + ra = mtod(router_alert, struct ipoption *); + ra->ipopt_dst.s_addr = 0; + ra->ipopt_list[0] = IPOPT_RA; /* Router Alert Option */ + ra->ipopt_list[1] = 0x04; /* 4 bytes long */ + ra->ipopt_list[2] = 0x00; + ra->ipopt_list[3] = 0x00; + router_alert->m_len = sizeof(ra->ipopt_dst) + ra->ipopt_list[1]; + + Head = (struct router_info *) 0; +} + +static struct router_info * +find_rti(ifp) + struct ifnet *ifp; +{ + register struct router_info *rti = Head; + +#ifdef IGMP_DEBUG + printf("[igmp.c, _find_rti] --> entering \n"); +#endif + while (rti) { + if (rti->rti_ifp == ifp) { +#ifdef IGMP_DEBUG + printf("[igmp.c, _find_rti] --> found old entry \n"); +#endif + return rti; + } + rti = rti->rti_next; + } + MALLOC(rti, struct router_info *, sizeof *rti, M_IGMP, M_NOWAIT); + rti->rti_ifp = ifp; + rti->rti_type = IGMP_V2_ROUTER; + rti->rti_time = 0; + rti->rti_next = Head; + Head = rti; +#ifdef IGMP_DEBUG + printf("[igmp.c, _find_rti] --> created an entry \n"); +#endif + return rti; +} + +void +igmp_input(m, off, proto) + register struct mbuf *m; + int off, proto; +{ + register int iphlen = off; + register struct igmp *igmp; + register struct ip *ip; + register int igmplen; + register struct ifnet *ifp = m->m_pkthdr.rcvif; + register int minlen; + register struct in_multi *inm; + register struct in_ifaddr *ia; + struct in_multistep step; + struct router_info *rti; + + int timer; /** timer value in the igmp query header **/ + + ++igmpstat.igps_rcv_total; + + ip = mtod(m, struct ip *); + igmplen = ip->ip_len; + + /* + * Validate lengths + */ + if (igmplen < IGMP_MINLEN) { + ++igmpstat.igps_rcv_tooshort; + m_freem(m); + return; + } + minlen = iphlen + IGMP_MINLEN; + if ((m->m_flags & M_EXT || m->m_len < minlen) && + (m = m_pullup(m, minlen)) == 0) { + ++igmpstat.igps_rcv_tooshort; + return; + } + + /* + * Validate checksum + */ + m->m_data += iphlen; + m->m_len -= iphlen; + igmp = mtod(m, struct igmp *); + if (in_cksum(m, igmplen)) { + ++igmpstat.igps_rcv_badsum; + m_freem(m); + return; + } + m->m_data -= iphlen; + m->m_len += iphlen; + + ip = mtod(m, struct ip *); + timer = igmp->igmp_code * PR_FASTHZ / IGMP_TIMER_SCALE; + if (timer == 0) + timer = 1; + rti = find_rti(ifp); + + /* + * In the IGMPv2 specification, there are 3 states and a flag. + * + * In Non-Member state, we simply don't have a membership record. + * In Delaying Member state, our timer is running (inm->inm_timer) + * In Idle Member state, our timer is not running (inm->inm_timer==0) + * + * The flag is inm->inm_state, it is set to IGMP_OTHERMEMBER if + * we have heard a report from another member, or IGMP_IREPORTEDLAST + * if I sent the last report. + */ + switch (igmp->igmp_type) { + + case IGMP_MEMBERSHIP_QUERY: + ++igmpstat.igps_rcv_queries; + + if (ifp->if_flags & IFF_LOOPBACK) + break; + + if (igmp->igmp_code == 0) { + /* + * Old router. Remember that the querier on this + * interface is old, and set the timer to the + * value in RFC 1112. + */ + + rti->rti_type = IGMP_V1_ROUTER; + rti->rti_time = 0; + + timer = IGMP_MAX_HOST_REPORT_DELAY * PR_FASTHZ; + + if (ip->ip_dst.s_addr != igmp_all_hosts_group || + igmp->igmp_group.s_addr != 0) { + ++igmpstat.igps_rcv_badqueries; + m_freem(m); + return; + } + } else { + /* + * New router. Simply do the new validity check. + */ + + if (igmp->igmp_group.s_addr != 0 && + !IN_MULTICAST(ntohl(igmp->igmp_group.s_addr))) { + ++igmpstat.igps_rcv_badqueries; + m_freem(m); + return; + } + } + + /* + * - Start the timers in all of our membership records + * that the query applies to for the interface on + * which the query arrived excl. those that belong + * to the "all-hosts" group (224.0.0.1). + * - Restart any timer that is already running but has + * a value longer than the requested timeout. + * - Use the value specified in the query message as + * the maximum timeout. + */ + IN_FIRST_MULTI(step, inm); + while (inm != NULL) { + if (inm->inm_ifp == ifp && + inm->inm_addr.s_addr != igmp_all_hosts_group && + (igmp->igmp_group.s_addr == 0 || + igmp->igmp_group.s_addr == inm->inm_addr.s_addr)) { + if (inm->inm_timer == 0 || + inm->inm_timer > timer) { + inm->inm_timer = + IGMP_RANDOM_DELAY(timer); + igmp_timers_are_running = 1; + } + } + IN_NEXT_MULTI(step, inm); + } + + break; + + case IGMP_V1_MEMBERSHIP_REPORT: + case IGMP_V2_MEMBERSHIP_REPORT: + /* + * For fast leave to work, we have to know that we are the + * last person to send a report for this group. Reports + * can potentially get looped back if we are a multicast + * router, so discard reports sourced by me. + */ + IFP_TO_IA(ifp, ia); + if (ia && ip->ip_src.s_addr == IA_SIN(ia)->sin_addr.s_addr) + break; + + ++igmpstat.igps_rcv_reports; + + if (ifp->if_flags & IFF_LOOPBACK) + break; + + if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr))) { + ++igmpstat.igps_rcv_badreports; + m_freem(m); + return; + } + + /* + * KLUDGE: if the IP source address of the report has an + * unspecified (i.e., zero) subnet number, as is allowed for + * a booting host, replace it with the correct subnet number + * so that a process-level multicast routing demon can + * determine which subnet it arrived from. This is necessary + * to compensate for the lack of any way for a process to + * determine the arrival interface of an incoming packet. + */ + if ((ntohl(ip->ip_src.s_addr) & IN_CLASSA_NET) == 0) + if (ia) ip->ip_src.s_addr = htonl(ia->ia_subnet); + + /* + * If we belong to the group being reported, stop + * our timer for that group. + */ + IN_LOOKUP_MULTI(igmp->igmp_group, ifp, inm); + + if (inm != NULL) { + inm->inm_timer = 0; + ++igmpstat.igps_rcv_ourreports; + + inm->inm_state = IGMP_OTHERMEMBER; + } + + break; + } + + /* + * Pass all valid IGMP packets up to any process(es) listening + * on a raw IGMP socket. + */ + rip_input(m, off, proto); +} + +void +igmp_joingroup(inm) + struct in_multi *inm; +{ + int s = splnet(); + + if (inm->inm_addr.s_addr == igmp_all_hosts_group + || inm->inm_ifp->if_flags & IFF_LOOPBACK) { + inm->inm_timer = 0; + inm->inm_state = IGMP_OTHERMEMBER; + } else { + inm->inm_rti = find_rti(inm->inm_ifp); + igmp_sendpkt(inm, inm->inm_rti->rti_type, 0); + inm->inm_timer = IGMP_RANDOM_DELAY( + IGMP_MAX_HOST_REPORT_DELAY*PR_FASTHZ); + inm->inm_state = IGMP_IREPORTEDLAST; + igmp_timers_are_running = 1; + } + splx(s); +} + +void +igmp_leavegroup(inm) + struct in_multi *inm; +{ + if (inm->inm_state == IGMP_IREPORTEDLAST && + inm->inm_addr.s_addr != igmp_all_hosts_group && + !(inm->inm_ifp->if_flags & IFF_LOOPBACK) && + inm->inm_rti->rti_type != IGMP_V1_ROUTER) + igmp_sendpkt(inm, IGMP_V2_LEAVE_GROUP, igmp_all_rtrs_group); +} + +void +igmp_fasttimo() +{ + register struct in_multi *inm; + struct in_multistep step; + int s; + + /* + * Quick check to see if any work needs to be done, in order + * to minimize the overhead of fasttimo processing. + */ + + if (!igmp_timers_are_running) + return; + + s = splnet(); + igmp_timers_are_running = 0; + IN_FIRST_MULTI(step, inm); + while (inm != NULL) { + if (inm->inm_timer == 0) { + /* do nothing */ + } else if (--inm->inm_timer == 0) { + igmp_sendpkt(inm, inm->inm_rti->rti_type, 0); + inm->inm_state = IGMP_IREPORTEDLAST; + } else { + igmp_timers_are_running = 1; + } + IN_NEXT_MULTI(step, inm); + } + splx(s); +} + +void +igmp_slowtimo() +{ + int s = splnet(); + register struct router_info *rti = Head; + +#ifdef IGMP_DEBUG + printf("[igmp.c,_slowtimo] -- > entering \n"); +#endif + while (rti) { + if (rti->rti_type == IGMP_V1_ROUTER) { + rti->rti_time++; + if (rti->rti_time >= IGMP_AGE_THRESHOLD) { + rti->rti_type = IGMP_V2_ROUTER; + } + } + rti = rti->rti_next; + } +#ifdef IGMP_DEBUG + printf("[igmp.c,_slowtimo] -- > exiting \n"); +#endif + splx(s); +} + +static struct route igmprt; + +static void +igmp_sendpkt(inm, type, addr) + struct in_multi *inm; + int type; + unsigned long addr; +{ + struct mbuf *m; + struct igmp *igmp; + struct ip *ip; + struct ip_moptions imo; + + MGETHDR(m, M_DONTWAIT, MT_HEADER); + if (m == NULL) + return; + + m->m_pkthdr.rcvif = loif; + m->m_pkthdr.len = sizeof(struct ip) + IGMP_MINLEN; + MH_ALIGN(m, IGMP_MINLEN + sizeof(struct ip)); + m->m_data += sizeof(struct ip); + m->m_len = IGMP_MINLEN; + igmp = mtod(m, struct igmp *); + igmp->igmp_type = type; + igmp->igmp_code = 0; + igmp->igmp_group = inm->inm_addr; + igmp->igmp_cksum = 0; + igmp->igmp_cksum = in_cksum(m, IGMP_MINLEN); + + m->m_data -= sizeof(struct ip); + m->m_len += sizeof(struct ip); + ip = mtod(m, struct ip *); + ip->ip_tos = 0; + ip->ip_len = sizeof(struct ip) + IGMP_MINLEN; + ip->ip_off = 0; + ip->ip_p = IPPROTO_IGMP; + ip->ip_src.s_addr = INADDR_ANY; + ip->ip_dst.s_addr = addr ? addr : igmp->igmp_group.s_addr; + + imo.imo_multicast_ifp = inm->inm_ifp; + imo.imo_multicast_ttl = 1; + imo.imo_multicast_vif = -1; + /* + * Request loopback of the report if we are acting as a multicast + * router, so that the process-level routing demon can hear it. + */ + imo.imo_multicast_loop = (ip_mrouter != NULL); + + /* + * XXX + * Do we have to worry about reentrancy here? Don't think so. + */ + ip_output(m, router_alert, &igmprt, 0, &imo); + + ++igmpstat.igps_snd_reports; +} diff --git a/sys/netinet/igmp.h b/sys/netinet/igmp.h new file mode 100644 index 0000000..7d943d6 --- /dev/null +++ b/sys/netinet/igmp.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 1988 Stephen Deering. + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Stephen Deering of Stanford University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)igmp.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_IGMP_H_ +#define _NETINET_IGMP_H_ + +/* + * Internet Group Management Protocol (IGMP) definitions. + * + * Written by Steve Deering, Stanford, May 1988. + * + * MULTICAST Revision: 3.5.1.2 + */ + +/* + * IGMP packet format. + */ +struct igmp { + u_char igmp_type; /* version & type of IGMP message */ + u_char igmp_code; /* subtype for routing msgs */ + u_short igmp_cksum; /* IP-style checksum */ + struct in_addr igmp_group; /* group address being reported */ +}; /* (zero for queries) */ + +#define IGMP_MINLEN 8 + +/* + * Message types, including version number. + */ +#define IGMP_MEMBERSHIP_QUERY 0x11 /* membership query */ +#define IGMP_V1_MEMBERSHIP_REPORT 0x12 /* Ver. 1 membership report */ +#define IGMP_V2_MEMBERSHIP_REPORT 0x16 /* Ver. 2 membership report */ +#define IGMP_V2_LEAVE_GROUP 0x17 /* Leave-group message */ + +#define IGMP_DVMRP 0x13 /* DVMRP routing message */ +#define IGMP_PIM 0x14 /* PIM routing message */ + +#define IGMP_MTRACE_RESP 0x1e /* traceroute resp.(to sender)*/ +#define IGMP_MTRACE 0x1f /* mcast traceroute messages */ + +#define IGMP_MAX_HOST_REPORT_DELAY 10 /* max delay for response to */ + /* query (in seconds) according */ + /* to RFC1112 */ + + +#define IGMP_TIMER_SCALE 10 /* denotes that the igmp code field */ + /* specifies time in 10th of seconds*/ + +/* + * The following four defininitions are for backwards compatibility. + * They should be removed as soon as all applications are updated to + * use the new constant names. + */ +#define IGMP_HOST_MEMBERSHIP_QUERY IGMP_MEMBERSHIP_QUERY +#define IGMP_HOST_MEMBERSHIP_REPORT IGMP_V1_MEMBERSHIP_REPORT +#define IGMP_HOST_NEW_MEMBERSHIP_REPORT IGMP_V2_MEMBERSHIP_REPORT +#define IGMP_HOST_LEAVE_MESSAGE IGMP_V2_LEAVE_GROUP + +#endif /* _NETINET_IGMP_H_ */ diff --git a/sys/netinet/igmp_var.h b/sys/netinet/igmp_var.h new file mode 100644 index 0000000..5688f24 --- /dev/null +++ b/sys/netinet/igmp_var.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) 1988 Stephen Deering. + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Stephen Deering of Stanford University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)igmp_var.h 8.1 (Berkeley) 7/19/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_IGMP_VAR_H_ +#define _NETINET_IGMP_VAR_H_ + +/* + * Internet Group Management Protocol (IGMP), + * implementation-specific definitions. + * + * Written by Steve Deering, Stanford, May 1988. + * + * MULTICAST Revision: 3.5.1.3 + */ + +struct igmpstat { + u_int igps_rcv_total; /* total IGMP messages received */ + u_int igps_rcv_tooshort; /* received with too few bytes */ + u_int igps_rcv_badsum; /* received with bad checksum */ + u_int igps_rcv_queries; /* received membership queries */ + u_int igps_rcv_badqueries; /* received invalid queries */ + u_int igps_rcv_reports; /* received membership reports */ + u_int igps_rcv_badreports; /* received invalid reports */ + u_int igps_rcv_ourreports; /* received reports for our groups */ + u_int igps_snd_reports; /* sent membership reports */ +}; + +#ifdef _KERNEL +#define IGMP_RANDOM_DELAY(X) (random() % (X) + 1) + +/* + * States for IGMPv2's leave processing + */ +#define IGMP_OTHERMEMBER 0 +#define IGMP_IREPORTEDLAST 1 + +/* + * We must remember what version the subnet's querier is. + * We conveniently use the IGMP message type for the proper + * membership report to keep this state. + */ +#define IGMP_V1_ROUTER IGMP_V1_MEMBERSHIP_REPORT +#define IGMP_V2_ROUTER IGMP_V2_MEMBERSHIP_REPORT + +/* + * Revert to new router if we haven't heard from an old router in + * this amount of time. + */ +#define IGMP_AGE_THRESHOLD 540 + +void igmp_init __P((void)); +void igmp_input __P((struct mbuf *, int, int)); +void igmp_joingroup __P((struct in_multi *)); +void igmp_leavegroup __P((struct in_multi *)); +void igmp_fasttimo __P((void)); +void igmp_slowtimo __P((void)); + +SYSCTL_DECL(_net_inet_igmp); + +#endif + +/* + * Names for IGMP sysctl objects + */ +#define IGMPCTL_STATS 1 /* statistics (read-only) */ +#define IGMPCTL_MAXID 2 + +#define IGMPCTL_NAMES { \ + { 0, 0 }, \ + { "stats", CTLTYPE_STRUCT }, \ +} +#endif diff --git a/sys/netinet/in.c b/sys/netinet/in.c new file mode 100644 index 0000000..95abe3f --- /dev/null +++ b/sys/netinet/in.c @@ -0,0 +1,861 @@ +/* + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in.c 8.4 (Berkeley) 1/9/95 + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sockio.h> +#include <sys/malloc.h> +#include <sys/socket.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> + +#include <net/if.h> +#include <net/if_types.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_var.h> + +#include <netinet/igmp_var.h> + +#include "gif.h" +#if NGIF > 0 +#include <net/if_gif.h> +#endif + +static MALLOC_DEFINE(M_IPMADDR, "in_multi", "internet multicast address"); + +static int in_mask2len __P((struct in_addr *)); +static void in_len2mask __P((struct in_addr *, int)); +static int in_lifaddr_ioctl __P((struct socket *, u_long, caddr_t, + struct ifnet *, struct proc *)); + +static void in_socktrim __P((struct sockaddr_in *)); +static int in_ifinit __P((struct ifnet *, + struct in_ifaddr *, struct sockaddr_in *, int)); + +static int subnetsarelocal = 0; +SYSCTL_INT(_net_inet_ip, OID_AUTO, subnets_are_local, CTLFLAG_RW, + &subnetsarelocal, 0, ""); + +struct in_multihead in_multihead; /* XXX BSS initialization */ + +/* + * Return 1 if an internet address is for a ``local'' host + * (one to which we have a connection). If subnetsarelocal + * is true, this includes other subnets of the local net. + * Otherwise, it includes only the directly-connected (sub)nets. + */ +int +in_localaddr(in) + struct in_addr in; +{ + register u_long i = ntohl(in.s_addr); + register struct in_ifaddr *ia; + + if (subnetsarelocal) { + TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) + if ((i & ia->ia_netmask) == ia->ia_net) + return (1); + } else { + TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) + if ((i & ia->ia_subnetmask) == ia->ia_subnet) + return (1); + } + return (0); +} + +/* + * Determine whether an IP address is in a reserved set of addresses + * that may not be forwarded, or whether datagrams to that destination + * may be forwarded. + */ +int +in_canforward(in) + struct in_addr in; +{ + register u_long i = ntohl(in.s_addr); + register u_long net; + + if (IN_EXPERIMENTAL(i) || IN_MULTICAST(i)) + return (0); + if (IN_CLASSA(i)) { + net = i & IN_CLASSA_NET; + if (net == 0 || net == (IN_LOOPBACKNET << IN_CLASSA_NSHIFT)) + return (0); + } + return (1); +} + +/* + * Trim a mask in a sockaddr + */ +static void +in_socktrim(ap) +struct sockaddr_in *ap; +{ + register char *cplim = (char *) &ap->sin_addr; + register char *cp = (char *) (&ap->sin_addr + 1); + + ap->sin_len = 0; + while (--cp >= cplim) + if (*cp) { + (ap)->sin_len = cp - (char *) (ap) + 1; + break; + } +} + +static int +in_mask2len(mask) + struct in_addr *mask; +{ + int x, y; + u_char *p; + + p = (u_char *)mask; + for (x = 0; x < sizeof(*mask); x++) { + if (p[x] != 0xff) + break; + } + y = 0; + if (x < sizeof(*mask)) { + for (y = 0; y < 8; y++) { + if ((p[x] & (0x80 >> y)) == 0) + break; + } + } + return x * 8 + y; +} + +static void +in_len2mask(mask, len) + struct in_addr *mask; + int len; +{ + int i; + u_char *p; + + p = (u_char *)mask; + bzero(mask, sizeof(*mask)); + for (i = 0; i < len / 8; i++) + p[i] = 0xff; + if (len % 8) + p[i] = (0xff00 >> (len % 8)) & 0xff; +} + +static int in_interfaces; /* number of external internet interfaces */ + +/* + * Generic internet control operations (ioctl's). + * Ifp is 0 if not an interface-specific ioctl. + */ +/* ARGSUSED */ +int +in_control(so, cmd, data, ifp, p) + struct socket *so; + u_long cmd; + caddr_t data; + register struct ifnet *ifp; + struct proc *p; +{ + register struct ifreq *ifr = (struct ifreq *)data; + register struct in_ifaddr *ia = 0, *iap; + register struct ifaddr *ifa; + struct in_ifaddr *oia; + struct in_aliasreq *ifra = (struct in_aliasreq *)data; + struct sockaddr_in oldaddr; + int error, hostIsNew, maskIsNew, s; + u_long i; + +#if NGIF > 0 + if (ifp && ifp->if_type == IFT_GIF) { + switch (cmd) { + case SIOCSIFPHYADDR: + case SIOCDIFPHYADDR: + if (p && + (error = suser(p)) != 0) + return(error); + case SIOCGIFPSRCADDR: + case SIOCGIFPDSTADDR: + return gif_ioctl(ifp, cmd, data); + } + } +#endif + + switch (cmd) { + case SIOCALIFADDR: + case SIOCDLIFADDR: + if (p && (error = suser(p)) != 0) + return error; + /*fall through*/ + case SIOCGLIFADDR: + if (!ifp) + return EINVAL; + return in_lifaddr_ioctl(so, cmd, data, ifp, p); + } + + /* + * Find address for this interface, if it exists. + * + * If an alias address was specified, find that one instead of + * the first one on the interface. + */ + if (ifp) + TAILQ_FOREACH(iap, &in_ifaddrhead, ia_link) + if (iap->ia_ifp == ifp) { + if (((struct sockaddr_in *)&ifr->ifr_addr)->sin_addr.s_addr == + iap->ia_addr.sin_addr.s_addr) { + ia = iap; + break; + } else if (ia == NULL) { + ia = iap; + if (ifr->ifr_addr.sa_family != AF_INET) + break; + } + } + + switch (cmd) { + + case SIOCAIFADDR: + case SIOCDIFADDR: + if (ifp == 0) + return (EADDRNOTAVAIL); + if (ifra->ifra_addr.sin_family == AF_INET) { + for (oia = ia; ia; ia = TAILQ_NEXT(ia, ia_link)) { + if (ia->ia_ifp == ifp && + ia->ia_addr.sin_addr.s_addr == + ifra->ifra_addr.sin_addr.s_addr) + break; + } + if ((ifp->if_flags & IFF_POINTOPOINT) + && (cmd == SIOCAIFADDR) + && (ifra->ifra_dstaddr.sin_addr.s_addr + == INADDR_ANY)) { + return EDESTADDRREQ; + } + } + if (cmd == SIOCDIFADDR && ia == 0) + return (EADDRNOTAVAIL); + /* FALLTHROUGH */ + case SIOCSIFADDR: + case SIOCSIFNETMASK: + case SIOCSIFDSTADDR: + if (p && (error = suser(p)) != 0) + return error; + + if (ifp == 0) + return (EADDRNOTAVAIL); + if (ia == (struct in_ifaddr *)0) { + ia = (struct in_ifaddr *) + malloc(sizeof *ia, M_IFADDR, M_WAITOK | M_ZERO); + if (ia == (struct in_ifaddr *)NULL) + return (ENOBUFS); + /* + * Protect from ipintr() traversing address list + * while we're modifying it. + */ + s = splnet(); + + TAILQ_INSERT_TAIL(&in_ifaddrhead, ia, ia_link); + ifa = &ia->ia_ifa; + TAILQ_INSERT_TAIL(&ifp->if_addrhead, ifa, ifa_link); + + ifa->ifa_addr = (struct sockaddr *)&ia->ia_addr; + ifa->ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr; + ifa->ifa_netmask = (struct sockaddr *)&ia->ia_sockmask; + ia->ia_sockmask.sin_len = 8; + if (ifp->if_flags & IFF_BROADCAST) { + ia->ia_broadaddr.sin_len = sizeof(ia->ia_addr); + ia->ia_broadaddr.sin_family = AF_INET; + } + ia->ia_ifp = ifp; + if (!(ifp->if_flags & IFF_LOOPBACK)) + in_interfaces++; + splx(s); + } + break; + + case SIOCSIFBRDADDR: + if (p && (error = suser(p)) != 0) + return error; + /* FALLTHROUGH */ + + case SIOCGIFADDR: + case SIOCGIFNETMASK: + case SIOCGIFDSTADDR: + case SIOCGIFBRDADDR: + if (ia == (struct in_ifaddr *)0) + return (EADDRNOTAVAIL); + break; + } + switch (cmd) { + + case SIOCGIFADDR: + *((struct sockaddr_in *)&ifr->ifr_addr) = ia->ia_addr; + break; + + case SIOCGIFBRDADDR: + if ((ifp->if_flags & IFF_BROADCAST) == 0) + return (EINVAL); + *((struct sockaddr_in *)&ifr->ifr_dstaddr) = ia->ia_broadaddr; + break; + + case SIOCGIFDSTADDR: + if ((ifp->if_flags & IFF_POINTOPOINT) == 0) + return (EINVAL); + *((struct sockaddr_in *)&ifr->ifr_dstaddr) = ia->ia_dstaddr; + break; + + case SIOCGIFNETMASK: + *((struct sockaddr_in *)&ifr->ifr_addr) = ia->ia_sockmask; + break; + + case SIOCSIFDSTADDR: + if ((ifp->if_flags & IFF_POINTOPOINT) == 0) + return (EINVAL); + oldaddr = ia->ia_dstaddr; + ia->ia_dstaddr = *(struct sockaddr_in *)&ifr->ifr_dstaddr; + if (ifp->if_ioctl && (error = (*ifp->if_ioctl) + (ifp, SIOCSIFDSTADDR, (caddr_t)ia))) { + ia->ia_dstaddr = oldaddr; + return (error); + } + if (ia->ia_flags & IFA_ROUTE) { + ia->ia_ifa.ifa_dstaddr = (struct sockaddr *)&oldaddr; + rtinit(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST); + ia->ia_ifa.ifa_dstaddr = + (struct sockaddr *)&ia->ia_dstaddr; + rtinit(&(ia->ia_ifa), (int)RTM_ADD, RTF_HOST|RTF_UP); + } + break; + + case SIOCSIFBRDADDR: + if ((ifp->if_flags & IFF_BROADCAST) == 0) + return (EINVAL); + ia->ia_broadaddr = *(struct sockaddr_in *)&ifr->ifr_broadaddr; + break; + + case SIOCSIFADDR: + return (in_ifinit(ifp, ia, + (struct sockaddr_in *) &ifr->ifr_addr, 1)); + + case SIOCSIFNETMASK: + i = ifra->ifra_addr.sin_addr.s_addr; + ia->ia_subnetmask = ntohl(ia->ia_sockmask.sin_addr.s_addr = i); + break; + + case SIOCAIFADDR: + maskIsNew = 0; + hostIsNew = 1; + error = 0; + if (ia->ia_addr.sin_family == AF_INET) { + if (ifra->ifra_addr.sin_len == 0) { + ifra->ifra_addr = ia->ia_addr; + hostIsNew = 0; + } else if (ifra->ifra_addr.sin_addr.s_addr == + ia->ia_addr.sin_addr.s_addr) + hostIsNew = 0; + } + if (ifra->ifra_mask.sin_len) { + in_ifscrub(ifp, ia); + ia->ia_sockmask = ifra->ifra_mask; + ia->ia_subnetmask = + ntohl(ia->ia_sockmask.sin_addr.s_addr); + maskIsNew = 1; + } + if ((ifp->if_flags & IFF_POINTOPOINT) && + (ifra->ifra_dstaddr.sin_family == AF_INET)) { + in_ifscrub(ifp, ia); + ia->ia_dstaddr = ifra->ifra_dstaddr; + maskIsNew = 1; /* We lie; but the effect's the same */ + } + if (ifra->ifra_addr.sin_family == AF_INET && + (hostIsNew || maskIsNew)) + error = in_ifinit(ifp, ia, &ifra->ifra_addr, 0); + if ((ifp->if_flags & IFF_BROADCAST) && + (ifra->ifra_broadaddr.sin_family == AF_INET)) + ia->ia_broadaddr = ifra->ifra_broadaddr; + return (error); + + case SIOCDIFADDR: + /* + * in_ifscrub kills the interface route. + */ + in_ifscrub(ifp, ia); + /* + * in_ifadown gets rid of all the rest of + * the routes. This is not quite the right + * thing to do, but at least if we are running + * a routing process they will come back. + */ + in_ifadown(&ia->ia_ifa, 1); + + /* + * Protect from ipintr() traversing address list + * while we're modifying it. + */ + s = splnet(); + + ifa = &ia->ia_ifa; + TAILQ_REMOVE(&ifp->if_addrhead, ifa, ifa_link); + oia = ia; + TAILQ_REMOVE(&in_ifaddrhead, oia, ia_link); + IFAFREE(&oia->ia_ifa); + splx(s); + break; + + default: + if (ifp == 0 || ifp->if_ioctl == 0) + return (EOPNOTSUPP); + return ((*ifp->if_ioctl)(ifp, cmd, data)); + } + return (0); +} + +/* + * SIOC[GAD]LIFADDR. + * SIOCGLIFADDR: get first address. (?!?) + * SIOCGLIFADDR with IFLR_PREFIX: + * get first address that matches the specified prefix. + * SIOCALIFADDR: add the specified address. + * SIOCALIFADDR with IFLR_PREFIX: + * EINVAL since we can't deduce hostid part of the address. + * SIOCDLIFADDR: delete the specified address. + * SIOCDLIFADDR with IFLR_PREFIX: + * delete the first address that matches the specified prefix. + * return values: + * EINVAL on invalid parameters + * EADDRNOTAVAIL on prefix match failed/specified address not found + * other values may be returned from in_ioctl() + */ +static int +in_lifaddr_ioctl(so, cmd, data, ifp, p) + struct socket *so; + u_long cmd; + caddr_t data; + struct ifnet *ifp; + struct proc *p; +{ + struct if_laddrreq *iflr = (struct if_laddrreq *)data; + struct ifaddr *ifa; + + /* sanity checks */ + if (!data || !ifp) { + panic("invalid argument to in_lifaddr_ioctl"); + /*NOTRECHED*/ + } + + switch (cmd) { + case SIOCGLIFADDR: + /* address must be specified on GET with IFLR_PREFIX */ + if ((iflr->flags & IFLR_PREFIX) == 0) + break; + /*FALLTHROUGH*/ + case SIOCALIFADDR: + case SIOCDLIFADDR: + /* address must be specified on ADD and DELETE */ + if (iflr->addr.ss_family != AF_INET) + return EINVAL; + if (iflr->addr.ss_len != sizeof(struct sockaddr_in)) + return EINVAL; + /* XXX need improvement */ + if (iflr->dstaddr.ss_family + && iflr->dstaddr.ss_family != AF_INET) + return EINVAL; + if (iflr->dstaddr.ss_family + && iflr->dstaddr.ss_len != sizeof(struct sockaddr_in)) + return EINVAL; + break; + default: /*shouldn't happen*/ + return EOPNOTSUPP; + } + if (sizeof(struct in_addr) * 8 < iflr->prefixlen) + return EINVAL; + + switch (cmd) { + case SIOCALIFADDR: + { + struct in_aliasreq ifra; + + if (iflr->flags & IFLR_PREFIX) + return EINVAL; + + /* copy args to in_aliasreq, perform ioctl(SIOCAIFADDR_IN6). */ + bzero(&ifra, sizeof(ifra)); + bcopy(iflr->iflr_name, ifra.ifra_name, + sizeof(ifra.ifra_name)); + + bcopy(&iflr->addr, &ifra.ifra_addr, iflr->addr.ss_len); + + if (iflr->dstaddr.ss_family) { /*XXX*/ + bcopy(&iflr->dstaddr, &ifra.ifra_dstaddr, + iflr->dstaddr.ss_len); + } + + ifra.ifra_mask.sin_family = AF_INET; + ifra.ifra_mask.sin_len = sizeof(struct sockaddr_in); + in_len2mask(&ifra.ifra_mask.sin_addr, iflr->prefixlen); + + return in_control(so, SIOCAIFADDR, (caddr_t)&ifra, ifp, p); + } + case SIOCGLIFADDR: + case SIOCDLIFADDR: + { + struct in_ifaddr *ia; + struct in_addr mask, candidate, match; + struct sockaddr_in *sin; + int cmp; + + bzero(&mask, sizeof(mask)); + if (iflr->flags & IFLR_PREFIX) { + /* lookup a prefix rather than address. */ + in_len2mask(&mask, iflr->prefixlen); + + sin = (struct sockaddr_in *)&iflr->addr; + match.s_addr = sin->sin_addr.s_addr; + match.s_addr &= mask.s_addr; + + /* if you set extra bits, that's wrong */ + if (match.s_addr != sin->sin_addr.s_addr) + return EINVAL; + + cmp = 1; + } else { + if (cmd == SIOCGLIFADDR) { + /* on getting an address, take the 1st match */ + cmp = 0; /*XXX*/ + } else { + /* on deleting an address, do exact match */ + in_len2mask(&mask, 32); + sin = (struct sockaddr_in *)&iflr->addr; + match.s_addr = sin->sin_addr.s_addr; + + cmp = 1; + } + } + + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + if (ifa->ifa_addr->sa_family != AF_INET6) + continue; + if (!cmp) + break; + candidate.s_addr = ((struct sockaddr_in *)&ifa->ifa_addr)->sin_addr.s_addr; + candidate.s_addr &= mask.s_addr; + if (candidate.s_addr == match.s_addr) + break; + } + if (!ifa) + return EADDRNOTAVAIL; + ia = (struct in_ifaddr *)ifa; + + if (cmd == SIOCGLIFADDR) { + /* fill in the if_laddrreq structure */ + bcopy(&ia->ia_addr, &iflr->addr, ia->ia_addr.sin_len); + + if ((ifp->if_flags & IFF_POINTOPOINT) != 0) { + bcopy(&ia->ia_dstaddr, &iflr->dstaddr, + ia->ia_dstaddr.sin_len); + } else + bzero(&iflr->dstaddr, sizeof(iflr->dstaddr)); + + iflr->prefixlen = + in_mask2len(&ia->ia_sockmask.sin_addr); + + iflr->flags = 0; /*XXX*/ + + return 0; + } else { + struct in_aliasreq ifra; + + /* fill in_aliasreq and do ioctl(SIOCDIFADDR_IN6) */ + bzero(&ifra, sizeof(ifra)); + bcopy(iflr->iflr_name, ifra.ifra_name, + sizeof(ifra.ifra_name)); + + bcopy(&ia->ia_addr, &ifra.ifra_addr, + ia->ia_addr.sin_len); + if ((ifp->if_flags & IFF_POINTOPOINT) != 0) { + bcopy(&ia->ia_dstaddr, &ifra.ifra_dstaddr, + ia->ia_dstaddr.sin_len); + } + bcopy(&ia->ia_sockmask, &ifra.ifra_dstaddr, + ia->ia_sockmask.sin_len); + + return in_control(so, SIOCDIFADDR, (caddr_t)&ifra, + ifp, p); + } + } + } + + return EOPNOTSUPP; /*just for safety*/ +} + +/* + * Delete any existing route for an interface. + */ +void +in_ifscrub(ifp, ia) + register struct ifnet *ifp; + register struct in_ifaddr *ia; +{ + + if ((ia->ia_flags & IFA_ROUTE) == 0) + return; + if (ifp->if_flags & (IFF_LOOPBACK|IFF_POINTOPOINT)) + rtinit(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST); + else + rtinit(&(ia->ia_ifa), (int)RTM_DELETE, 0); + ia->ia_flags &= ~IFA_ROUTE; +} + +/* + * Initialize an interface's internet address + * and routing table entry. + */ +static int +in_ifinit(ifp, ia, sin, scrub) + register struct ifnet *ifp; + register struct in_ifaddr *ia; + struct sockaddr_in *sin; + int scrub; +{ + register u_long i = ntohl(sin->sin_addr.s_addr); + struct sockaddr_in oldaddr; + int s = splimp(), flags = RTF_UP, error; + + oldaddr = ia->ia_addr; + ia->ia_addr = *sin; + /* + * Give the interface a chance to initialize + * if this is its first address, + * and to validate the address if necessary. + */ + if (ifp->if_ioctl && + (error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia))) { + splx(s); + ia->ia_addr = oldaddr; + return (error); + } + splx(s); + if (scrub) { + ia->ia_ifa.ifa_addr = (struct sockaddr *)&oldaddr; + in_ifscrub(ifp, ia); + ia->ia_ifa.ifa_addr = (struct sockaddr *)&ia->ia_addr; + } + if (IN_CLASSA(i)) + ia->ia_netmask = IN_CLASSA_NET; + else if (IN_CLASSB(i)) + ia->ia_netmask = IN_CLASSB_NET; + else + ia->ia_netmask = IN_CLASSC_NET; + /* + * The subnet mask usually includes at least the standard network part, + * but may may be smaller in the case of supernetting. + * If it is set, we believe it. + */ + if (ia->ia_subnetmask == 0) { + ia->ia_subnetmask = ia->ia_netmask; + ia->ia_sockmask.sin_addr.s_addr = htonl(ia->ia_subnetmask); + } else + ia->ia_netmask &= ia->ia_subnetmask; + ia->ia_net = i & ia->ia_netmask; + ia->ia_subnet = i & ia->ia_subnetmask; + in_socktrim(&ia->ia_sockmask); + /* + * Add route for the network. + */ + ia->ia_ifa.ifa_metric = ifp->if_metric; + if (ifp->if_flags & IFF_BROADCAST) { + ia->ia_broadaddr.sin_addr.s_addr = + htonl(ia->ia_subnet | ~ia->ia_subnetmask); + ia->ia_netbroadcast.s_addr = + htonl(ia->ia_net | ~ ia->ia_netmask); + } else if (ifp->if_flags & IFF_LOOPBACK) { + ia->ia_ifa.ifa_dstaddr = ia->ia_ifa.ifa_addr; + flags |= RTF_HOST; + } else if (ifp->if_flags & IFF_POINTOPOINT) { + if (ia->ia_dstaddr.sin_family != AF_INET) + return (0); + flags |= RTF_HOST; + } + if ((error = rtinit(&(ia->ia_ifa), (int)RTM_ADD, flags)) == 0) + ia->ia_flags |= IFA_ROUTE; + + /* + * If the interface supports multicast, join the "all hosts" + * multicast group on that interface. + */ + if (ifp->if_flags & IFF_MULTICAST) { + struct in_addr addr; + + addr.s_addr = htonl(INADDR_ALLHOSTS_GROUP); + in_addmulti(&addr, ifp); + } + return (error); +} + + +/* + * Return 1 if the address might be a local broadcast address. + */ +int +in_broadcast(in, ifp) + struct in_addr in; + struct ifnet *ifp; +{ + register struct ifaddr *ifa; + u_long t; + + if (in.s_addr == INADDR_BROADCAST || + in.s_addr == INADDR_ANY) + return 1; + if ((ifp->if_flags & IFF_BROADCAST) == 0) + return 0; + t = ntohl(in.s_addr); + /* + * Look through the list of addresses for a match + * with a broadcast address. + */ +#define ia ((struct in_ifaddr *)ifa) + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) + if (ifa->ifa_addr->sa_family == AF_INET && + (in.s_addr == ia->ia_broadaddr.sin_addr.s_addr || + in.s_addr == ia->ia_netbroadcast.s_addr || + /* + * Check for old-style (host 0) broadcast. + */ + t == ia->ia_subnet || t == ia->ia_net) && + /* + * Check for an all one subnetmask. These + * only exist when an interface gets a secondary + * address. + */ + ia->ia_subnetmask != (u_long)0xffffffff) + return 1; + return (0); +#undef ia +} +/* + * Add an address to the list of IP multicast addresses for a given interface. + */ +struct in_multi * +in_addmulti(ap, ifp) + register struct in_addr *ap; + register struct ifnet *ifp; +{ + register struct in_multi *inm; + int error; + struct sockaddr_in sin; + struct ifmultiaddr *ifma; + int s = splnet(); + + /* + * Call generic routine to add membership or increment + * refcount. It wants addresses in the form of a sockaddr, + * so we build one here (being careful to zero the unused bytes). + */ + bzero(&sin, sizeof sin); + sin.sin_family = AF_INET; + sin.sin_len = sizeof sin; + sin.sin_addr = *ap; + error = if_addmulti(ifp, (struct sockaddr *)&sin, &ifma); + if (error) { + splx(s); + return 0; + } + + /* + * If ifma->ifma_protospec is null, then if_addmulti() created + * a new record. Otherwise, we are done. + */ + if (ifma->ifma_protospec != 0) { + splx(s); + return ifma->ifma_protospec; + } + + /* XXX - if_addmulti uses M_WAITOK. Can this really be called + at interrupt time? If so, need to fix if_addmulti. XXX */ + inm = (struct in_multi *)malloc(sizeof(*inm), M_IPMADDR, + M_NOWAIT | M_ZERO); + if (inm == NULL) { + splx(s); + return (NULL); + } + + inm->inm_addr = *ap; + inm->inm_ifp = ifp; + inm->inm_ifma = ifma; + ifma->ifma_protospec = inm; + LIST_INSERT_HEAD(&in_multihead, inm, inm_link); + + /* + * Let IGMP know that we have joined a new IP multicast group. + */ + igmp_joingroup(inm); + splx(s); + return (inm); +} + +/* + * Delete a multicast address record. + */ +void +in_delmulti(inm) + register struct in_multi *inm; +{ + struct ifmultiaddr *ifma = inm->inm_ifma; + struct in_multi my_inm; + int s = splnet(); + + my_inm.inm_ifp = NULL ; /* don't send the leave msg */ + if (ifma->ifma_refcount == 1) { + /* + * No remaining claims to this record; let IGMP know that + * we are leaving the multicast group. + * But do it after the if_delmulti() which might reset + * the interface and nuke the packet. + */ + my_inm = *inm ; + ifma->ifma_protospec = 0; + LIST_REMOVE(inm, inm_link); + free(inm, M_IPMADDR); + } + /* XXX - should be separate API for when we have an ifma? */ + if_delmulti(ifma->ifma_ifp, ifma->ifma_addr); + if (my_inm.inm_ifp != NULL) + igmp_leavegroup(&my_inm); + splx(s); +} diff --git a/sys/netinet/in.h b/sys/netinet/in.h new file mode 100644 index 0000000..65f8a8e --- /dev/null +++ b/sys/netinet/in.h @@ -0,0 +1,492 @@ +/* + * Copyright (c) 1982, 1986, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in.h 8.3 (Berkeley) 1/3/94 + * $FreeBSD$ + */ + +#ifndef _NETINET_IN_H_ +#define _NETINET_IN_H_ + +/* + * Constants and structures defined by the internet system, + * Per RFC 790, September 1981, and numerous additions. + */ + +/* + * Protocols (RFC 1700) + */ +#define IPPROTO_IP 0 /* dummy for IP */ +#define IPPROTO_HOPOPTS 0 /* IP6 hop-by-hop options */ +#define IPPROTO_ICMP 1 /* control message protocol */ +#define IPPROTO_IGMP 2 /* group mgmt protocol */ +#define IPPROTO_GGP 3 /* gateway^2 (deprecated) */ +#define IPPROTO_IPV4 4 /* IPv4 encapsulation */ +#define IPPROTO_IPIP IPPROTO_IPV4 /* for compatibility */ +#define IPPROTO_TCP 6 /* tcp */ +#define IPPROTO_ST 7 /* Stream protocol II */ +#define IPPROTO_EGP 8 /* exterior gateway protocol */ +#define IPPROTO_PIGP 9 /* private interior gateway */ +#define IPPROTO_RCCMON 10 /* BBN RCC Monitoring */ +#define IPPROTO_NVPII 11 /* network voice protocol*/ +#define IPPROTO_PUP 12 /* pup */ +#define IPPROTO_ARGUS 13 /* Argus */ +#define IPPROTO_EMCON 14 /* EMCON */ +#define IPPROTO_XNET 15 /* Cross Net Debugger */ +#define IPPROTO_CHAOS 16 /* Chaos*/ +#define IPPROTO_UDP 17 /* user datagram protocol */ +#define IPPROTO_MUX 18 /* Multiplexing */ +#define IPPROTO_MEAS 19 /* DCN Measurement Subsystems */ +#define IPPROTO_HMP 20 /* Host Monitoring */ +#define IPPROTO_PRM 21 /* Packet Radio Measurement */ +#define IPPROTO_IDP 22 /* xns idp */ +#define IPPROTO_TRUNK1 23 /* Trunk-1 */ +#define IPPROTO_TRUNK2 24 /* Trunk-2 */ +#define IPPROTO_LEAF1 25 /* Leaf-1 */ +#define IPPROTO_LEAF2 26 /* Leaf-2 */ +#define IPPROTO_RDP 27 /* Reliable Data */ +#define IPPROTO_IRTP 28 /* Reliable Transaction */ +#define IPPROTO_TP 29 /* tp-4 w/ class negotiation */ +#define IPPROTO_BLT 30 /* Bulk Data Transfer */ +#define IPPROTO_NSP 31 /* Network Services */ +#define IPPROTO_INP 32 /* Merit Internodal */ +#define IPPROTO_SEP 33 /* Sequential Exchange */ +#define IPPROTO_3PC 34 /* Third Party Connect */ +#define IPPROTO_IDPR 35 /* InterDomain Policy Routing */ +#define IPPROTO_XTP 36 /* XTP */ +#define IPPROTO_DDP 37 /* Datagram Delivery */ +#define IPPROTO_CMTP 38 /* Control Message Transport */ +#define IPPROTO_TPXX 39 /* TP++ Transport */ +#define IPPROTO_IL 40 /* IL transport protocol */ +#define IPPROTO_IPV6 41 /* IP6 header */ +#define IPPROTO_SDRP 42 /* Source Demand Routing */ +#define IPPROTO_ROUTING 43 /* IP6 routing header */ +#define IPPROTO_FRAGMENT 44 /* IP6 fragmentation header */ +#define IPPROTO_IDRP 45 /* InterDomain Routing*/ +#define IPPROTO_RSVP 46 /* resource reservation */ +#define IPPROTO_GRE 47 /* General Routing Encap. */ +#define IPPROTO_MHRP 48 /* Mobile Host Routing */ +#define IPPROTO_BHA 49 /* BHA */ +#define IPPROTO_ESP 50 /* IP6 Encap Sec. Payload */ +#define IPPROTO_AH 51 /* IP6 Auth Header */ +#define IPPROTO_INLSP 52 /* Integ. Net Layer Security */ +#define IPPROTO_SWIPE 53 /* IP with encryption */ +#define IPPROTO_NHRP 54 /* Next Hop Resolution */ +#define IPPROTO_MOBILE 55 /* IP Mobility */ +#define IPPROTO_TLSP 56 /* Transport Layer Security */ +#define IPPROTO_SKIP 57 /* SKIP */ +#define IPPROTO_ICMPV6 58 /* ICMP6 */ +#define IPPROTO_NONE 59 /* IP6 no next header */ +#define IPPROTO_DSTOPTS 60 /* IP6 destination option */ +#define IPPROTO_AHIP 61 /* any host internal protocol */ +#define IPPROTO_CFTP 62 /* CFTP */ +#define IPPROTO_HELLO 63 /* "hello" routing protocol */ +#define IPPROTO_SATEXPAK 64 /* SATNET/Backroom EXPAK */ +#define IPPROTO_KRYPTOLAN 65 /* Kryptolan */ +#define IPPROTO_RVD 66 /* Remote Virtual Disk */ +#define IPPROTO_IPPC 67 /* Pluribus Packet Core */ +#define IPPROTO_ADFS 68 /* Any distributed FS */ +#define IPPROTO_SATMON 69 /* Satnet Monitoring */ +#define IPPROTO_VISA 70 /* VISA Protocol */ +#define IPPROTO_IPCV 71 /* Packet Core Utility */ +#define IPPROTO_CPNX 72 /* Comp. Prot. Net. Executive */ +#define IPPROTO_CPHB 73 /* Comp. Prot. HeartBeat */ +#define IPPROTO_WSN 74 /* Wang Span Network */ +#define IPPROTO_PVP 75 /* Packet Video Protocol */ +#define IPPROTO_BRSATMON 76 /* BackRoom SATNET Monitoring */ +#define IPPROTO_ND 77 /* Sun net disk proto (temp.) */ +#define IPPROTO_WBMON 78 /* WIDEBAND Monitoring */ +#define IPPROTO_WBEXPAK 79 /* WIDEBAND EXPAK */ +#define IPPROTO_EON 80 /* ISO cnlp */ +#define IPPROTO_VMTP 81 /* VMTP */ +#define IPPROTO_SVMTP 82 /* Secure VMTP */ +#define IPPROTO_VINES 83 /* Banyon VINES */ +#define IPPROTO_TTP 84 /* TTP */ +#define IPPROTO_IGP 85 /* NSFNET-IGP */ +#define IPPROTO_DGP 86 /* dissimilar gateway prot. */ +#define IPPROTO_TCF 87 /* TCF */ +#define IPPROTO_IGRP 88 /* Cisco/GXS IGRP */ +#define IPPROTO_OSPFIGP 89 /* OSPFIGP */ +#define IPPROTO_SRPC 90 /* Strite RPC protocol */ +#define IPPROTO_LARP 91 /* Locus Address Resoloution */ +#define IPPROTO_MTP 92 /* Multicast Transport */ +#define IPPROTO_AX25 93 /* AX.25 Frames */ +#define IPPROTO_IPEIP 94 /* IP encapsulated in IP */ +#define IPPROTO_MICP 95 /* Mobile Int.ing control */ +#define IPPROTO_SCCSP 96 /* Semaphore Comm. security */ +#define IPPROTO_ETHERIP 97 /* Ethernet IP encapsulation */ +#define IPPROTO_ENCAP 98 /* encapsulation header */ +#define IPPROTO_APES 99 /* any private encr. scheme */ +#define IPPROTO_GMTP 100 /* GMTP*/ +#define IPPROTO_IPCOMP 108 /* payload compression (IPComp) */ +/* 101-254: Partly Unassigned */ +#define IPPROTO_PIM 103 /* Protocol Independent Mcast */ +#define IPPROTO_PGM 113 /* PGM */ +/* 255: Reserved */ +/* BSD Private, local use, namespace incursion */ +#define IPPROTO_DIVERT 254 /* divert pseudo-protocol */ +#define IPPROTO_RAW 255 /* raw IP packet */ +#define IPPROTO_MAX 256 + +/* last return value of *_input(), meaning "all job for this pkt is done". */ +#define IPPROTO_DONE 257 + +/* + * Local port number conventions: + * + * When a user does a bind(2) or connect(2) with a port number of zero, + * a non-conflicting local port address is chosen. + * The default range is IPPORT_RESERVED through + * IPPORT_USERRESERVED, although that is settable by sysctl. + * + * A user may set the IPPROTO_IP option IP_PORTRANGE to change this + * default assignment range. + * + * The value IP_PORTRANGE_DEFAULT causes the default behavior. + * + * The value IP_PORTRANGE_HIGH changes the range of candidate port numbers + * into the "high" range. These are reserved for client outbound connections + * which do not want to be filtered by any firewalls. + * + * The value IP_PORTRANGE_LOW changes the range to the "low" are + * that is (by convention) restricted to privileged processes. This + * convention is based on "vouchsafe" principles only. It is only secure + * if you trust the remote host to restrict these ports. + * + * The default range of ports and the high range can be changed by + * sysctl(3). (net.inet.ip.port{hi,low}{first,last}_auto) + * + * Changing those values has bad security implications if you are + * using a a stateless firewall that is allowing packets outside of that + * range in order to allow transparent outgoing connections. + * + * Such a firewall configuration will generally depend on the use of these + * default values. If you change them, you may find your Security + * Administrator looking for you with a heavy object. + * + * For a slightly more orthodox text view on this: + * + * ftp://ftp.isi.edu/in-notes/iana/assignments/port-numbers + * + * port numbers are divided into three ranges: + * + * 0 - 1023 Well Known Ports + * 1024 - 49151 Registered Ports + * 49152 - 65535 Dynamic and/or Private Ports + * + */ + +/* + * Ports < IPPORT_RESERVED are reserved for + * privileged processes (e.g. root). (IP_PORTRANGE_LOW) + * Ports > IPPORT_USERRESERVED are reserved + * for servers, not necessarily privileged. (IP_PORTRANGE_DEFAULT) + */ +#define IPPORT_RESERVED 1024 +#define IPPORT_USERRESERVED 5000 + +/* + * Default local port range to use by setting IP_PORTRANGE_HIGH + */ +#define IPPORT_HIFIRSTAUTO 49152 +#define IPPORT_HILASTAUTO 65535 + +/* + * Scanning for a free reserved port return a value below IPPORT_RESERVED, + * but higher than IPPORT_RESERVEDSTART. Traditionally the start value was + * 512, but that conflicts with some well-known-services that firewalls may + * have a fit if we use. + */ +#define IPPORT_RESERVEDSTART 600 + +/* + * Internet address (a structure for historical reasons) + */ +struct in_addr { + in_addr_t s_addr; +}; + +/* + * Definitions of bits in internet address integers. + * On subnets, the decomposition of addresses to host and net parts + * is done according to subnet mask, not the masks here. + */ +#define IN_CLASSA(i) (((u_int32_t)(i) & 0x80000000) == 0) +#define IN_CLASSA_NET 0xff000000 +#define IN_CLASSA_NSHIFT 24 +#define IN_CLASSA_HOST 0x00ffffff +#define IN_CLASSA_MAX 128 + +#define IN_CLASSB(i) (((u_int32_t)(i) & 0xc0000000) == 0x80000000) +#define IN_CLASSB_NET 0xffff0000 +#define IN_CLASSB_NSHIFT 16 +#define IN_CLASSB_HOST 0x0000ffff +#define IN_CLASSB_MAX 65536 + +#define IN_CLASSC(i) (((u_int32_t)(i) & 0xe0000000) == 0xc0000000) +#define IN_CLASSC_NET 0xffffff00 +#define IN_CLASSC_NSHIFT 8 +#define IN_CLASSC_HOST 0x000000ff + +#define IN_CLASSD(i) (((u_int32_t)(i) & 0xf0000000) == 0xe0000000) +#define IN_CLASSD_NET 0xf0000000 /* These ones aren't really */ +#define IN_CLASSD_NSHIFT 28 /* net and host fields, but */ +#define IN_CLASSD_HOST 0x0fffffff /* routing needn't know. */ +#define IN_MULTICAST(i) IN_CLASSD(i) + +#define IN_EXPERIMENTAL(i) (((u_int32_t)(i) & 0xf0000000) == 0xf0000000) +#define IN_BADCLASS(i) (((u_int32_t)(i) & 0xf0000000) == 0xf0000000) + +#define INADDR_ANY (u_int32_t)0x00000000 +#define INADDR_LOOPBACK (u_int32_t)0x7f000001 +#define INADDR_BROADCAST (u_int32_t)0xffffffff /* must be masked */ +#ifndef _KERNEL +#define INADDR_NONE 0xffffffff /* -1 return */ +#endif + +#define INADDR_UNSPEC_GROUP (u_int32_t)0xe0000000 /* 224.0.0.0 */ +#define INADDR_ALLHOSTS_GROUP (u_int32_t)0xe0000001 /* 224.0.0.1 */ +#define INADDR_ALLRTRS_GROUP (u_int32_t)0xe0000002 /* 224.0.0.2 */ +#define INADDR_MAX_LOCAL_GROUP (u_int32_t)0xe00000ff /* 224.0.0.255 */ + +#define IN_LOOPBACKNET 127 /* official! */ + +/* + * Socket address, internet style. + */ +struct sockaddr_in { + u_char sin_len; + u_char sin_family; + u_short sin_port; + struct in_addr sin_addr; + char sin_zero[8]; +}; + +#define INET_ADDRSTRLEN 16 + +/* + * Structure used to describe IP options. + * Used to store options internally, to pass them to a process, + * or to restore options retrieved earlier. + * The ip_dst is used for the first-hop gateway when using a source route + * (this gets put into the header proper). + */ +struct ip_opts { + struct in_addr ip_dst; /* first hop, 0 w/o src rt */ + char ip_opts[40]; /* actually variable in size */ +}; + +/* + * Options for use with [gs]etsockopt at the IP level. + * First word of comment is data type; bool is stored in int. + */ +#define IP_OPTIONS 1 /* buf/ip_opts; set/get IP options */ +#define IP_HDRINCL 2 /* int; header is included with data */ +#define IP_TOS 3 /* int; IP type of service and preced. */ +#define IP_TTL 4 /* int; IP time to live */ +#define IP_RECVOPTS 5 /* bool; receive all IP opts w/dgram */ +#define IP_RECVRETOPTS 6 /* bool; receive IP opts for response */ +#define IP_RECVDSTADDR 7 /* bool; receive IP dst addr w/dgram */ +#define IP_RETOPTS 8 /* ip_opts; set/get IP options */ +#define IP_MULTICAST_IF 9 /* u_char; set/get IP multicast i/f */ +#define IP_MULTICAST_TTL 10 /* u_char; set/get IP multicast ttl */ +#define IP_MULTICAST_LOOP 11 /* u_char; set/get IP multicast loopback */ +#define IP_ADD_MEMBERSHIP 12 /* ip_mreq; add an IP group membership */ +#define IP_DROP_MEMBERSHIP 13 /* ip_mreq; drop an IP group membership */ +#define IP_MULTICAST_VIF 14 /* set/get IP mcast virt. iface */ +#define IP_RSVP_ON 15 /* enable RSVP in kernel */ +#define IP_RSVP_OFF 16 /* disable RSVP in kernel */ +#define IP_RSVP_VIF_ON 17 /* set RSVP per-vif socket */ +#define IP_RSVP_VIF_OFF 18 /* unset RSVP per-vif socket */ +#define IP_PORTRANGE 19 /* int; range to choose for unspec port */ +#define IP_RECVIF 20 /* bool; receive reception if w/dgram */ +/* for IPSEC */ +#define IP_IPSEC_POLICY 21 /* int; set/get security policy */ +#define IP_FAITH 22 /* bool; accept FAITH'ed connections */ + +#define IP_FW_ADD 50 /* add a firewall rule to chain */ +#define IP_FW_DEL 51 /* delete a firewall rule from chain */ +#define IP_FW_FLUSH 52 /* flush firewall rule chain */ +#define IP_FW_ZERO 53 /* clear single/all firewall counter(s) */ +#define IP_FW_GET 54 /* get entire firewall rule chain */ +#define IP_FW_RESETLOG 55 /* reset logging counters */ + +#define IP_DUMMYNET_CONFIGURE 60 /* add/configure a dummynet pipe */ +#define IP_DUMMYNET_DEL 61 /* delete a dummynet pipe from chain */ +#define IP_DUMMYNET_FLUSH 62 /* flush dummynet */ +#define IP_DUMMYNET_GET 64 /* get entire dummynet pipes */ + +/* + * Defaults and limits for options + */ +#define IP_DEFAULT_MULTICAST_TTL 1 /* normally limit m'casts to 1 hop */ +#define IP_DEFAULT_MULTICAST_LOOP 1 /* normally hear sends if a member */ +#define IP_MAX_MEMBERSHIPS 20 /* per socket */ + +/* + * Argument structure for IP_ADD_MEMBERSHIP and IP_DROP_MEMBERSHIP. + */ +struct ip_mreq { + struct in_addr imr_multiaddr; /* IP multicast address of group */ + struct in_addr imr_interface; /* local IP address of interface */ +}; + +/* + * Argument for IP_PORTRANGE: + * - which range to search when port is unspecified at bind() or connect() + */ +#define IP_PORTRANGE_DEFAULT 0 /* default range */ +#define IP_PORTRANGE_HIGH 1 /* "high" - request firewall bypass */ +#define IP_PORTRANGE_LOW 2 /* "low" - vouchsafe security */ + +/* + * Definitions for inet sysctl operations. + * + * Third level is protocol number. + * Fourth level is desired variable within that protocol. + */ +#define IPPROTO_MAXID (IPPROTO_AH + 1) /* don't list to IPPROTO_MAX */ + +#define CTL_IPPROTO_NAMES { \ + { "ip", CTLTYPE_NODE }, \ + { "icmp", CTLTYPE_NODE }, \ + { "igmp", CTLTYPE_NODE }, \ + { "ggp", CTLTYPE_NODE }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { "tcp", CTLTYPE_NODE }, \ + { 0, 0 }, \ + { "egp", CTLTYPE_NODE }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { "pup", CTLTYPE_NODE }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { "udp", CTLTYPE_NODE }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { "idp", CTLTYPE_NODE }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { "ipsec", CTLTYPE_NODE }, \ +} + +/* + * Names for IP sysctl objects + */ +#define IPCTL_FORWARDING 1 /* act as router */ +#define IPCTL_SENDREDIRECTS 2 /* may send redirects when forwarding */ +#define IPCTL_DEFTTL 3 /* default TTL */ +#ifdef notyet +#define IPCTL_DEFMTU 4 /* default MTU */ +#endif +#define IPCTL_RTEXPIRE 5 /* cloned route expiration time */ +#define IPCTL_RTMINEXPIRE 6 /* min value for expiration time */ +#define IPCTL_RTMAXCACHE 7 /* trigger level for dynamic expire */ +#define IPCTL_SOURCEROUTE 8 /* may perform source routes */ +#define IPCTL_DIRECTEDBROADCAST 9 /* may re-broadcast received packets */ +#define IPCTL_INTRQMAXLEN 10 /* max length of netisr queue */ +#define IPCTL_INTRQDROPS 11 /* number of netisr q drops */ +#define IPCTL_STATS 12 /* ipstat structure */ +#define IPCTL_ACCEPTSOURCEROUTE 13 /* may accept source routed packets */ +#define IPCTL_FASTFORWARDING 14 /* use fast IP forwarding code */ +#define IPCTL_KEEPFAITH 15 /* FAITH IPv4->IPv6 translater ctl */ +#define IPCTL_GIF_TTL 16 /* default TTL for gif encap packet */ +#define IPCTL_MAXID 17 + +#define IPCTL_NAMES { \ + { 0, 0 }, \ + { "forwarding", CTLTYPE_INT }, \ + { "redirect", CTLTYPE_INT }, \ + { "ttl", CTLTYPE_INT }, \ + { "mtu", CTLTYPE_INT }, \ + { "rtexpire", CTLTYPE_INT }, \ + { "rtminexpire", CTLTYPE_INT }, \ + { "rtmaxcache", CTLTYPE_INT }, \ + { "sourceroute", CTLTYPE_INT }, \ + { "directed-broadcast", CTLTYPE_INT }, \ + { "intr-queue-maxlen", CTLTYPE_INT }, \ + { "intr-queue-drops", CTLTYPE_INT }, \ + { "stats", CTLTYPE_STRUCT }, \ + { "accept_sourceroute", CTLTYPE_INT }, \ + { "fastforwarding", CTLTYPE_INT }, \ +} + +/* INET6 stuff */ +#define __KAME_NETINET_IN_H_INCLUDED_ +#include <netinet6/in6.h> +#undef __KAME_NETINET_IN_H_INCLUDED_ + +#ifdef _KERNEL +struct ifnet; struct mbuf; /* forward declarations for Standard C */ +struct proc; + +int in_broadcast __P((struct in_addr, struct ifnet *)); +int in_canforward __P((struct in_addr)); +int in_localaddr __P((struct in_addr)); +char *inet_ntoa __P((struct in_addr)); /* in libkern */ +char *inet_ntoa_r __P((struct in_addr ina, char *buf)); /* in libkern */ + +#endif + +#endif diff --git a/sys/netinet/in_cksum.c b/sys/netinet/in_cksum.c new file mode 100644 index 0000000..eaf1493 --- /dev/null +++ b/sys/netinet/in_cksum.c @@ -0,0 +1,150 @@ +/* + * Copyright (c) 1988, 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in_cksum.c 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/mbuf.h> + +/* + * Checksum routine for Internet Protocol family headers (Portable Version). + * + * This routine is very heavily used in the network + * code and should be modified for each CPU to be as fast as possible. + */ + +#define ADDCARRY(x) (x > 65535 ? x -= 65535 : x) +#define REDUCE {l_util.l = sum; sum = l_util.s[0] + l_util.s[1]; ADDCARRY(sum);} + +int +in_cksum(m, len) + register struct mbuf *m; + register int len; +{ + register u_short *w; + register int sum = 0; + register int mlen = 0; + int byte_swapped = 0; + + union { + char c[2]; + u_short s; + } s_util; + union { + u_short s[2]; + long l; + } l_util; + + for (;m && len; m = m->m_next) { + if (m->m_len == 0) + continue; + w = mtod(m, u_short *); + if (mlen == -1) { + /* + * The first byte of this mbuf is the continuation + * of a word spanning between this mbuf and the + * last mbuf. + * + * s_util.c[0] is already saved when scanning previous + * mbuf. + */ + s_util.c[1] = *(char *)w; + sum += s_util.s; + w = (u_short *)((char *)w + 1); + mlen = m->m_len - 1; + len--; + } else + mlen = m->m_len; + if (len < mlen) + mlen = len; + len -= mlen; + /* + * Force to even boundary. + */ + if ((1 & (int) w) && (mlen > 0)) { + REDUCE; + sum <<= 8; + s_util.c[0] = *(u_char *)w; + w = (u_short *)((char *)w + 1); + mlen--; + byte_swapped = 1; + } + /* + * Unroll the loop to make overhead from + * branches &c small. + */ + while ((mlen -= 32) >= 0) { + sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3]; + sum += w[4]; sum += w[5]; sum += w[6]; sum += w[7]; + sum += w[8]; sum += w[9]; sum += w[10]; sum += w[11]; + sum += w[12]; sum += w[13]; sum += w[14]; sum += w[15]; + w += 16; + } + mlen += 32; + while ((mlen -= 8) >= 0) { + sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3]; + w += 4; + } + mlen += 8; + if (mlen == 0 && byte_swapped == 0) + continue; + REDUCE; + while ((mlen -= 2) >= 0) { + sum += *w++; + } + if (byte_swapped) { + REDUCE; + sum <<= 8; + byte_swapped = 0; + if (mlen == -1) { + s_util.c[1] = *(char *)w; + sum += s_util.s; + mlen = 0; + } else + mlen = -1; + } else if (mlen == -1) + s_util.c[0] = *(char *)w; + } + if (len) + printf("cksum: out of data\n"); + if (mlen == -1) { + /* The last mbuf has odd # of bytes. Follow the + standard (the odd byte may be shifted left by 8 bits + or not as determined by endian-ness of the machine) */ + s_util.c[1] = 0; + sum += s_util.s; + } + REDUCE; + return (~sum & 0xffff); +} diff --git a/sys/netinet/in_gif.c b/sys/netinet/in_gif.c new file mode 100644 index 0000000..17955ad --- /dev/null +++ b/sys/netinet/in_gif.c @@ -0,0 +1,382 @@ +/* $FreeBSD$ */ +/* $KAME: in_gif.c,v 1.44 2000/08/15 07:24:24 itojun Exp $ */ + +/* + * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "opt_mrouting.h" +#include "opt_inet.h" +#include "opt_inet6.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/socket.h> +#include <sys/sockio.h> +#include <sys/mbuf.h> +#include <sys/errno.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> + +#include <sys/malloc.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> +#include <netinet/in_gif.h> +#include <netinet/in_var.h> +#include <netinet/ip_encap.h> +#include <netinet/ip_ecn.h> + +#ifdef INET6 +#include <netinet/ip6.h> +#endif + +#ifdef MROUTING +#include <netinet/ip_mroute.h> +#endif /* MROUTING */ + +#include <net/if_gif.h> + +#include "gif.h" + +#include <machine/stdarg.h> + +#include <net/net_osdep.h> + +#if NGIF > 0 +int ip_gif_ttl = GIF_TTL; +#else +int ip_gif_ttl = 0; +#endif +SYSCTL_INT(_net_inet_ip, IPCTL_GIF_TTL, gifttl, CTLFLAG_RW, + &ip_gif_ttl, 0, ""); + +int +in_gif_output(ifp, family, m, rt) + struct ifnet *ifp; + int family; + struct mbuf *m; + struct rtentry *rt; +{ + register struct gif_softc *sc = (struct gif_softc*)ifp; + struct sockaddr_in *dst = (struct sockaddr_in *)&sc->gif_ro.ro_dst; + struct sockaddr_in *sin_src = (struct sockaddr_in *)sc->gif_psrc; + struct sockaddr_in *sin_dst = (struct sockaddr_in *)sc->gif_pdst; + struct ip iphdr; /* capsule IP header, host byte ordered */ + int proto, error; + u_int8_t tos; + + if (sin_src == NULL || sin_dst == NULL || + sin_src->sin_family != AF_INET || + sin_dst->sin_family != AF_INET) { + m_freem(m); + return EAFNOSUPPORT; + } + + switch (family) { +#ifdef INET + case AF_INET: + { + struct ip *ip; + + proto = IPPROTO_IPV4; + if (m->m_len < sizeof(*ip)) { + m = m_pullup(m, sizeof(*ip)); + if (!m) + return ENOBUFS; + } + ip = mtod(m, struct ip *); + tos = ip->ip_tos; + break; + } +#endif /*INET*/ +#ifdef INET6 + case AF_INET6: + { + struct ip6_hdr *ip6; + proto = IPPROTO_IPV6; + if (m->m_len < sizeof(*ip6)) { + m = m_pullup(m, sizeof(*ip6)); + if (!m) + return ENOBUFS; + } + ip6 = mtod(m, struct ip6_hdr *); + tos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; + break; + } +#endif /*INET6*/ + default: +#ifdef DEBUG + printf("in_gif_output: warning: unknown family %d passed\n", + family); +#endif + m_freem(m); + return EAFNOSUPPORT; + } + + bzero(&iphdr, sizeof(iphdr)); + iphdr.ip_src = sin_src->sin_addr; + if (ifp->if_flags & IFF_LINK0) { + /* multi-destination mode */ + if (sin_dst->sin_addr.s_addr != INADDR_ANY) + iphdr.ip_dst = sin_dst->sin_addr; + else if (rt) { + if (family != AF_INET) { + m_freem(m); + return EINVAL; /*XXX*/ + } + iphdr.ip_dst = ((struct sockaddr_in *) + (rt->rt_gateway))->sin_addr; + } else { + m_freem(m); + return ENETUNREACH; + } + } else { + /* bidirectional configured tunnel mode */ + if (sin_dst->sin_addr.s_addr != INADDR_ANY) + iphdr.ip_dst = sin_dst->sin_addr; + else { + m_freem(m); + return ENETUNREACH; + } + } + iphdr.ip_p = proto; + /* version will be set in ip_output() */ + iphdr.ip_ttl = ip_gif_ttl; + iphdr.ip_len = m->m_pkthdr.len + sizeof(struct ip); + if (ifp->if_flags & IFF_LINK1) + ip_ecn_ingress(ECN_ALLOWED, &iphdr.ip_tos, &tos); + + /* prepend new IP header */ + M_PREPEND(m, sizeof(struct ip), M_DONTWAIT); + if (m && m->m_len < sizeof(struct ip)) + m = m_pullup(m, sizeof(struct ip)); + if (m == NULL) { + printf("ENOBUFS in in_gif_output %d\n", __LINE__); + return ENOBUFS; + } + bcopy(&iphdr, mtod(m, struct ip *), sizeof(struct ip)); + + if (dst->sin_family != sin_dst->sin_family || + dst->sin_addr.s_addr != sin_dst->sin_addr.s_addr) { + /* cache route doesn't match */ + dst->sin_family = sin_dst->sin_family; + dst->sin_len = sizeof(struct sockaddr_in); + dst->sin_addr = sin_dst->sin_addr; + if (sc->gif_ro.ro_rt) { + RTFREE(sc->gif_ro.ro_rt); + sc->gif_ro.ro_rt = NULL; + } +#if 0 + sc->gif_if.if_mtu = GIF_MTU; +#endif + } + + if (sc->gif_ro.ro_rt == NULL) { + rtalloc(&sc->gif_ro); + if (sc->gif_ro.ro_rt == NULL) { + m_freem(m); + return ENETUNREACH; + } + + /* if it constitutes infinite encapsulation, punt. */ + if (sc->gif_ro.ro_rt->rt_ifp == ifp) { + m_freem(m); + return ENETUNREACH; /*XXX*/ + } +#if 0 + ifp->if_mtu = sc->gif_ro.ro_rt->rt_ifp->if_mtu + - sizeof(struct ip); +#endif + } + + error = ip_output(m, NULL, &sc->gif_ro, 0, NULL); + return(error); +} + +void +#if __STDC__ +in_gif_input(struct mbuf *m, ...) +#else +in_gif_input(m, va_alist) + struct mbuf *m; + va_dcl +#endif +{ + int off, proto; + struct ifnet *gifp = NULL; + struct ip *ip; + va_list ap; + int af; + u_int8_t otos; + + va_start(ap, m); + off = va_arg(ap, int); + proto = va_arg(ap, int); + va_end(ap); + + ip = mtod(m, struct ip *); + + gifp = (struct ifnet *)encap_getarg(m); + + if (gifp == NULL || (gifp->if_flags & IFF_UP) == 0) { + m_freem(m); + ipstat.ips_nogif++; + return; + } + + otos = ip->ip_tos; + m_adj(m, off); + + switch (proto) { +#ifdef INET + case IPPROTO_IPV4: + { + struct ip *ip; + af = AF_INET; + if (m->m_len < sizeof(*ip)) { + m = m_pullup(m, sizeof(*ip)); + if (!m) + return; + } + ip = mtod(m, struct ip *); + if (gifp->if_flags & IFF_LINK1) + ip_ecn_egress(ECN_ALLOWED, &otos, &ip->ip_tos); + break; + } +#endif +#ifdef INET6 + case IPPROTO_IPV6: + { + struct ip6_hdr *ip6; + u_int8_t itos; + af = AF_INET6; + if (m->m_len < sizeof(*ip6)) { + m = m_pullup(m, sizeof(*ip6)); + if (!m) + return; + } + ip6 = mtod(m, struct ip6_hdr *); + itos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; + if (gifp->if_flags & IFF_LINK1) + ip_ecn_egress(ECN_ALLOWED, &otos, &itos); + ip6->ip6_flow &= ~htonl(0xff << 20); + ip6->ip6_flow |= htonl((u_int32_t)itos << 20); + break; + } +#endif /* INET6 */ + default: + ipstat.ips_nogif++; + m_freem(m); + return; + } + gif_input(m, af, gifp); + return; +} + +/* + * we know that we are in IFF_UP, outer address available, and outer family + * matched the physical addr family. see gif_encapcheck(). + */ +int +gif_encapcheck4(m, off, proto, arg) + const struct mbuf *m; + int off; + int proto; + void *arg; +{ + struct ip ip; + struct gif_softc *sc; + struct sockaddr_in *src, *dst; + int addrmatch; + struct in_ifaddr *ia4; + + /* sanity check done in caller */ + sc = (struct gif_softc *)arg; + src = (struct sockaddr_in *)sc->gif_psrc; + dst = (struct sockaddr_in *)sc->gif_pdst; + + /* LINTED const cast */ + m_copydata((struct mbuf *)m, 0, sizeof(ip), (caddr_t)&ip); + + /* check for address match */ + addrmatch = 0; + if (src->sin_addr.s_addr == ip.ip_dst.s_addr) + addrmatch |= 1; + if (dst->sin_addr.s_addr == ip.ip_src.s_addr) + addrmatch |= 2; + else if ((sc->gif_if.if_flags & IFF_LINK0) != 0 && + dst->sin_addr.s_addr == INADDR_ANY) { + addrmatch |= 2; /* we accept any source */ + } + if (addrmatch != 3) + return 0; + + /* martian filters on outer source - NOT done in ip_input! */ + if (IN_MULTICAST(ntohl(ip.ip_src.s_addr))) + return 0; + switch ((ntohl(ip.ip_src.s_addr) & 0xff000000) >> 24) { + case 0: case 127: case 255: + return 0; + } + /* reject packets with broadcast on source */ + TAILQ_FOREACH(ia4, &in_ifaddrhead, ia_link) + { + if ((ia4->ia_ifa.ifa_ifp->if_flags & IFF_BROADCAST) == 0) + continue; + if (ip.ip_src.s_addr == ia4->ia_broadaddr.sin_addr.s_addr) + return 0; + } + + /* ingress filters on outer source */ + if ((m->m_flags & M_PKTHDR) != 0 && m->m_pkthdr.rcvif) { + struct sockaddr_in sin; + struct rtentry *rt; + + bzero(&sin, sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_len = sizeof(struct sockaddr_in); + sin.sin_addr = ip.ip_src; + rt = rtalloc1((struct sockaddr *)&sin, 0, 0UL); + if (!rt) + return 0; + if (rt->rt_ifp != m->m_pkthdr.rcvif) { + rtfree(rt); + return 0; + } + rtfree(rt); + } + + /* prioritize: IFF_LINK0 mode is less preferred */ + return (sc->gif_if.if_flags & IFF_LINK0) ? 32 : 32 * 2; +} diff --git a/sys/netinet/in_gif.h b/sys/netinet/in_gif.h new file mode 100644 index 0000000..de03c6e --- /dev/null +++ b/sys/netinet/in_gif.h @@ -0,0 +1,44 @@ +/* $FreeBSD$ */ +/* $KAME: in_gif.h,v 1.5 2000/04/14 08:36:02 itojun Exp $ */ + +/* + * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _NETINET_IN_GIF_H_ +#define _NETINET_IN_GIF_H_ + +#define GIF_TTL 30 + +extern int ip_gif_ttl; + +void in_gif_input __P((struct mbuf *, ...)); +int in_gif_output __P((struct ifnet *, int, struct mbuf *, struct rtentry *)); +int gif_encapcheck4 __P((const struct mbuf *, int, int, void *)); + +#endif /*_NETINET_IN_GIF_H_*/ diff --git a/sys/netinet/in_hostcache.c b/sys/netinet/in_hostcache.c new file mode 100644 index 0000000..36a92fd --- /dev/null +++ b/sys/netinet/in_hostcache.c @@ -0,0 +1,157 @@ +/* + * Copyright 1997 Massachusetts Institute of Technology + * + * Permission to use, copy, modify, and distribute this software and + * its documentation for any purpose and without fee is hereby + * granted, provided that both the above copyright notice and this + * permission notice appear in all copies, that both the above + * copyright notice and this permission notice appear in all + * supporting documentation, and that the name of M.I.T. not be used + * in advertising or publicity pertaining to distribution of the + * software without specific, written prior permission. M.I.T. makes + * no representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied + * warranty. + * + * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS + * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT + * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/socket.h> +#include <sys/socketvar.h> + +#include <net/hostcache.h> +#include <net/if.h> +#include <net/if_var.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_hostcache.h> +#include <netinet/tcp.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> + +/* + * Manage the IP per-host cache (really a thin veneer over the generic + * per-host cache code). + */ + +/* Look up an entry -- can be called from interrupt context. */ +struct in_hcentry * +inhc_lookup(struct sockaddr_in *sin) +{ + struct hcentry *hc; + + hc = hc_get((struct sockaddr *)sin); + return ((struct in_hcentry *)hc); +} + +/* Look up and possibly create an entry -- must be called from user mode. */ +struct in_hcentry * +inhc_alloc(struct sockaddr_in *sin) +{ + struct in_hcentry *inhc; + struct rtentry *rt; + int error; + /* xxx mutual exclusion for smp */ + + inhc = inhc_lookup(sin); + if (inhc != 0) + return inhc; + + rt = rtalloc1(inhc->inhc_hc.hc_host, 1, 0); + if (rt == 0) + return 0; + + MALLOC(inhc, struct in_hcentry *, sizeof *inhc, M_HOSTCACHE, + M_WAITOK | M_ZERO); + inhc->inhc_hc.hc_host = dup_sockaddr((struct sockaddr *)sin, 1); + if (in_broadcast(sin->sin_addr, rt->rt_ifp)) + inhc->inhc_flags |= INHC_BROADCAST; + else if (((struct sockaddr_in *)rt->rt_ifa->ifa_addr)->sin_addr.s_addr + == sin->sin_addr.s_addr) + inhc->inhc_flags |= INHC_LOCAL; + else if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) + inhc->inhc_flags |= INHC_MULTICAST; + inhc->inhc_pmtu = rt->rt_rmx.rmx_mtu; + inhc->inhc_recvpipe = rt->rt_rmx.rmx_recvpipe; + inhc->inhc_sendpipe = rt->rt_rmx.rmx_sendpipe; + inhc->inhc_ssthresh = rt->rt_rmx.rmx_ssthresh; + if (rt->rt_rmx.rmx_locks & RTV_RTT) + inhc->inhc_rttmin = rt->rt_rmx.rmx_rtt + / (RTM_RTTUNIT / TCP_RTT_SCALE); + inhc->inhc_hc.hc_rt = rt; + error = hc_insert(&inhc->inhc_hc); + if (error != 0) { + RTFREE(rt); + FREE(inhc, M_HOSTCACHE); + return 0; + } + /* + * We don't return the structure directly because hc_get() needs + * to be allowed to do its own processing. + */ + return (inhc_lookup(sin)); +} + +/* + * This is Van Jacobson's hash function for IPv4 addresses. + * It is designed to work with a power-of-two-sized hash table. + */ +static u_long +inhc_hash(struct sockaddr *sa, u_long nbuckets) +{ + u_long ip; + + ip = ((struct sockaddr_in *)sa)->sin_addr.s_addr; + return ((ip ^ (ip >> 23) ^ (ip >> 17)) & ~(nbuckets - 1)); +} + +/* + * We don't need to do any special work... if there are no references, + * as the caller has already ensured, then it's OK to kill. + */ +static int +inhc_delete(struct hcentry *hc) +{ + return 0; +} + +/* + * Return the next increment for the number of buckets in the hash table. + * Zero means ``do not bump''. + */ +static u_long +inhc_bump(u_long oldsize) +{ + if (oldsize < 512) + return (oldsize << 1); + return 0; +} + +static struct hccallback inhc_cb = { + inhc_hash, inhc_delete, inhc_bump +}; + +int +inhc_init(void) +{ + + return (hc_init(AF_INET, &inhc_cb, 128, 0)); +} + diff --git a/sys/netinet/in_hostcache.h b/sys/netinet/in_hostcache.h new file mode 100644 index 0000000..710756a --- /dev/null +++ b/sys/netinet/in_hostcache.h @@ -0,0 +1,83 @@ +/* + * Copyright 1997 Massachusetts Institute of Technology + * + * Permission to use, copy, modify, and distribute this software and + * its documentation for any purpose and without fee is hereby + * granted, provided that both the above copyright notice and this + * permission notice appear in all copies, that both the above + * copyright notice and this permission notice appear in all + * supporting documentation, and that the name of M.I.T. not be used + * in advertising or publicity pertaining to distribution of the + * software without specific, written prior permission. M.I.T. makes + * no representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied + * warranty. + * + * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS + * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT + * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _NETINET_IN_HOSTCACHE_H +#define _NETINET_IN_HOSTCACHE_H 1 + +/* + * This file defines the particular structures contained in the host cache + * for the use of IP. + */ + +/* + * An IP host cache entry. Note that we include the srtt/var here, + * with the expectation that it might be used to keep a persistent, + * cross-connection view of this statistic. + */ +struct in_hcentry { + struct hcentry inhc_hc; + u_long inhc_pmtu; + u_long inhc_recvpipe; + u_long inhc_sendpipe; + u_long inhc_pksent; + u_long inhc_flags; + u_long inhc_ssthresh; + int inhc_srtt; /* VJ RTT estimator */ + int inhc_srttvar; /* VJ */ + u_int inhc_rttmin; /* VJ */ + int inhc_rxt; /* TCP retransmit timeout */ + u_long inhc_cc; /* deliberate type pun with tcp_cc */ + u_long inhc_ccsent; /* as above */ + u_short inhc_mssopt; +}; + +#define inhc_addr(inhc) ((struct sockaddr_in *)(inhc)->inhc_hc.hc_host) + +/* Flags for inhc_flags... */ +#define INHC_LOCAL 0x0001 /* this address is local */ +#define INHC_BROADCAST 0x0002 /* this address is broadcast */ +#define INHC_MULTICAST 0x0004 /* this address is multicast */ +#define INHC_REDUCEDMTU 0x0008 /* we reduced the mtu via PMTU discovery */ + +#ifdef _KERNEL +/* + * inhc_alloc can block while adding a new entry to the cache; + * inhc_lookup will does not add new entries and so can be called + * in non-process context. + */ +struct in_hcentry *inhc_alloc(struct sockaddr_in *sin); +int inhc_init(void); +struct in_hcentry *inhc_lookup(struct sockaddr_in *sin); +#define inhc_ref(inhc) (hc_ref(&(inhc)->inhc_hc)) +#define inhc_rele(inhc) (hc_rele(&(inhc)->inhc_hc)) +#endif + +#endif /* _NETINET_IN_HOSTCACHE_H */ diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c new file mode 100644 index 0000000..b24b404 --- /dev/null +++ b/sys/netinet/in_pcb.c @@ -0,0 +1,1014 @@ +/* + * Copyright (c) 1982, 1986, 1991, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95 + * $FreeBSD$ + */ + +#include "opt_ipsec.h" +#include "opt_inet6.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/domain.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/proc.h> +#include <sys/jail.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> + +#include <machine/limits.h> + +#include <vm/vm_zone.h> + +#include <net/if.h> +#include <net/if_types.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_pcb.h> +#include <netinet/in_var.h> +#include <netinet/ip_var.h> +#ifdef INET6 +#include <netinet/ip6.h> +#include <netinet6/ip6_var.h> +#endif /* INET6 */ + +#include "faith.h" + +#ifdef IPSEC +#include <netinet6/ipsec.h> +#include <netkey/key.h> +#endif /* IPSEC */ + +struct in_addr zeroin_addr; + +/* + * These configure the range of local port addresses assigned to + * "unspecified" outgoing connections/packets/whatever. + */ +int ipport_lowfirstauto = IPPORT_RESERVED - 1; /* 1023 */ +int ipport_lowlastauto = IPPORT_RESERVEDSTART; /* 600 */ +int ipport_firstauto = IPPORT_RESERVED; /* 1024 */ +int ipport_lastauto = IPPORT_USERRESERVED; /* 5000 */ +int ipport_hifirstauto = IPPORT_HIFIRSTAUTO; /* 49152 */ +int ipport_hilastauto = IPPORT_HILASTAUTO; /* 65535 */ + +#define RANGECHK(var, min, max) \ + if ((var) < (min)) { (var) = (min); } \ + else if ((var) > (max)) { (var) = (max); } + +static int +sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) +{ + int error = sysctl_handle_int(oidp, + oidp->oid_arg1, oidp->oid_arg2, req); + if (!error) { + RANGECHK(ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); + RANGECHK(ipport_lowlastauto, 1, IPPORT_RESERVED - 1); + RANGECHK(ipport_firstauto, IPPORT_RESERVED, USHRT_MAX); + RANGECHK(ipport_lastauto, IPPORT_RESERVED, USHRT_MAX); + RANGECHK(ipport_hifirstauto, IPPORT_RESERVED, USHRT_MAX); + RANGECHK(ipport_hilastauto, IPPORT_RESERVED, USHRT_MAX); + } + return error; +} + +#undef RANGECHK + +SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports"); + +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLTYPE_INT|CTLFLAG_RW, + &ipport_lowfirstauto, 0, &sysctl_net_ipport_check, "I", ""); +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, CTLTYPE_INT|CTLFLAG_RW, + &ipport_lowlastauto, 0, &sysctl_net_ipport_check, "I", ""); +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, CTLTYPE_INT|CTLFLAG_RW, + &ipport_firstauto, 0, &sysctl_net_ipport_check, "I", ""); +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, CTLTYPE_INT|CTLFLAG_RW, + &ipport_lastauto, 0, &sysctl_net_ipport_check, "I", ""); +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, CTLTYPE_INT|CTLFLAG_RW, + &ipport_hifirstauto, 0, &sysctl_net_ipport_check, "I", ""); +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLTYPE_INT|CTLFLAG_RW, + &ipport_hilastauto, 0, &sysctl_net_ipport_check, "I", ""); + +/* + * in_pcb.c: manage the Protocol Control Blocks. + * + * NOTE: It is assumed that most of these functions will be called at + * splnet(). XXX - There are, unfortunately, a few exceptions to this + * rule that should be fixed. + */ + +/* + * Allocate a PCB and associate it with the socket. + */ +int +in_pcballoc(so, pcbinfo, p) + struct socket *so; + struct inpcbinfo *pcbinfo; + struct proc *p; +{ + register struct inpcb *inp; + + inp = zalloc(pcbinfo->ipi_zone); + if (inp == NULL) + return (ENOBUFS); + bzero((caddr_t)inp, sizeof(*inp)); + inp->inp_gencnt = ++pcbinfo->ipi_gencnt; + inp->inp_pcbinfo = pcbinfo; + inp->inp_socket = so; +#if defined(INET6) + if (ip6_mapped_addr_on) + inp->inp_flags &= ~IN6P_BINDV6ONLY; + else + inp->inp_flags |= IN6P_BINDV6ONLY; +#endif + LIST_INSERT_HEAD(pcbinfo->listhead, inp, inp_list); + pcbinfo->ipi_count++; + so->so_pcb = (caddr_t)inp; + return (0); +} + +int +in_pcbbind(inp, nam, p) + register struct inpcb *inp; + struct sockaddr *nam; + struct proc *p; +{ + register struct socket *so = inp->inp_socket; + unsigned short *lastport; + struct sockaddr_in *sin; + struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; + u_short lport = 0; + int wild = 0, reuseport = (so->so_options & SO_REUSEPORT); + int error, prison = 0; + + if (TAILQ_EMPTY(&in_ifaddrhead)) /* XXX broken! */ + return (EADDRNOTAVAIL); + if (inp->inp_lport || inp->inp_laddr.s_addr != INADDR_ANY) + return (EINVAL); + if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0) + wild = 1; + if (nam) { + sin = (struct sockaddr_in *)nam; + if (nam->sa_len != sizeof (*sin)) + return (EINVAL); +#ifdef notdef + /* + * We should check the family, but old programs + * incorrectly fail to initialize it. + */ + if (sin->sin_family != AF_INET) + return (EAFNOSUPPORT); +#endif + if (sin->sin_addr.s_addr != INADDR_ANY) + if (prison_ip(p->p_ucred, 0, &sin->sin_addr.s_addr)) + return(EINVAL); + lport = sin->sin_port; + if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { + /* + * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; + * allow complete duplication of binding if + * SO_REUSEPORT is set, or if SO_REUSEADDR is set + * and a multicast address is bound on both + * new and duplicated sockets. + */ + if (so->so_options & SO_REUSEADDR) + reuseport = SO_REUSEADDR|SO_REUSEPORT; + } else if (sin->sin_addr.s_addr != INADDR_ANY) { + sin->sin_port = 0; /* yech... */ + if (ifa_ifwithaddr((struct sockaddr *)sin) == 0) + return (EADDRNOTAVAIL); + } + if (lport) { + struct inpcb *t; + /* GROSS */ + if (ntohs(lport) < IPPORT_RESERVED && p && + suser_xxx(0, p, PRISON_ROOT)) + return (EACCES); + if (p && jailed(p->p_ucred)) + prison = 1; + if (so->so_cred->cr_uid != 0 && + !IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { + t = in_pcblookup_local(inp->inp_pcbinfo, + sin->sin_addr, lport, + prison ? 0 : INPLOOKUP_WILDCARD); + if (t && + (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || + ntohl(t->inp_laddr.s_addr) != INADDR_ANY || + (t->inp_socket->so_options & + SO_REUSEPORT) == 0) && + (so->so_cred->cr_uid != + t->inp_socket->so_cred->cr_uid)) { +#if defined(INET6) + if ((inp->inp_flags & + IN6P_BINDV6ONLY) != 0 || + ntohl(sin->sin_addr.s_addr) != + INADDR_ANY || + ntohl(t->inp_laddr.s_addr) != + INADDR_ANY || + INP_SOCKAF(so) == + INP_SOCKAF(t->inp_socket)) +#endif /* defined(INET6) */ + return (EADDRINUSE); + } + } + if (prison && + prison_ip(p->p_ucred, 0, &sin->sin_addr.s_addr)) + return (EADDRNOTAVAIL); + t = in_pcblookup_local(pcbinfo, sin->sin_addr, + lport, prison ? 0 : wild); + if (t && + (reuseport & t->inp_socket->so_options) == 0) { +#if defined(INET6) + if ((inp->inp_flags & IN6P_BINDV6ONLY) != 0 || + ntohl(sin->sin_addr.s_addr) != + INADDR_ANY || + ntohl(t->inp_laddr.s_addr) != + INADDR_ANY || + INP_SOCKAF(so) == + INP_SOCKAF(t->inp_socket)) +#endif /* defined(INET6) */ + return (EADDRINUSE); + } + } + inp->inp_laddr = sin->sin_addr; + } + if (lport == 0) { + ushort first, last; + int count; + + if (inp->inp_laddr.s_addr != INADDR_ANY) + if (prison_ip(p->p_ucred, 0, &inp->inp_laddr.s_addr )) { + inp->inp_laddr.s_addr = INADDR_ANY; + return (EINVAL); + } + inp->inp_flags |= INP_ANONPORT; + + if (inp->inp_flags & INP_HIGHPORT) { + first = ipport_hifirstauto; /* sysctl */ + last = ipport_hilastauto; + lastport = &pcbinfo->lasthi; + } else if (inp->inp_flags & INP_LOWPORT) { + if (p && (error = suser_xxx(0, p, PRISON_ROOT))) { + inp->inp_laddr.s_addr = INADDR_ANY; + return error; + } + first = ipport_lowfirstauto; /* 1023 */ + last = ipport_lowlastauto; /* 600 */ + lastport = &pcbinfo->lastlow; + } else { + first = ipport_firstauto; /* sysctl */ + last = ipport_lastauto; + lastport = &pcbinfo->lastport; + } + /* + * Simple check to ensure all ports are not used up causing + * a deadlock here. + * + * We split the two cases (up and down) so that the direction + * is not being tested on each round of the loop. + */ + if (first > last) { + /* + * counting down + */ + count = first - last; + + do { + if (count-- < 0) { /* completely used? */ + inp->inp_laddr.s_addr = INADDR_ANY; + return (EADDRNOTAVAIL); + } + --*lastport; + if (*lastport > first || *lastport < last) + *lastport = first; + lport = htons(*lastport); + } while (in_pcblookup_local(pcbinfo, + inp->inp_laddr, lport, wild)); + } else { + /* + * counting up + */ + count = last - first; + + do { + if (count-- < 0) { /* completely used? */ + /* + * Undo any address bind that may have + * occurred above. + */ + inp->inp_laddr.s_addr = INADDR_ANY; + return (EADDRNOTAVAIL); + } + ++*lastport; + if (*lastport < first || *lastport > last) + *lastport = first; + lport = htons(*lastport); + } while (in_pcblookup_local(pcbinfo, + inp->inp_laddr, lport, wild)); + } + } + inp->inp_lport = lport; + if (prison_ip(p->p_ucred, 0, &inp->inp_laddr.s_addr)) { + inp->inp_laddr.s_addr = INADDR_ANY; + inp->inp_lport = 0; + return(EINVAL); + } + if (in_pcbinshash(inp) != 0) { + inp->inp_laddr.s_addr = INADDR_ANY; + inp->inp_lport = 0; + return (EAGAIN); + } + return (0); +} + +/* + * Transform old in_pcbconnect() into an inner subroutine for new + * in_pcbconnect(): Do some validity-checking on the remote + * address (in mbuf 'nam') and then determine local host address + * (i.e., which interface) to use to access that remote host. + * + * This preserves definition of in_pcbconnect(), while supporting a + * slightly different version for T/TCP. (This is more than + * a bit of a kludge, but cleaning up the internal interfaces would + * have forced minor changes in every protocol). + */ + +int +in_pcbladdr(inp, nam, plocal_sin) + register struct inpcb *inp; + struct sockaddr *nam; + struct sockaddr_in **plocal_sin; +{ + struct in_ifaddr *ia; + register struct sockaddr_in *sin = (struct sockaddr_in *)nam; + + if (nam->sa_len != sizeof (*sin)) + return (EINVAL); + if (sin->sin_family != AF_INET) + return (EAFNOSUPPORT); + if (sin->sin_port == 0) + return (EADDRNOTAVAIL); + if (!TAILQ_EMPTY(&in_ifaddrhead)) { + /* + * If the destination address is INADDR_ANY, + * use the primary local address. + * If the supplied address is INADDR_BROADCAST, + * and the primary interface supports broadcast, + * choose the broadcast address for that interface. + */ +#define satosin(sa) ((struct sockaddr_in *)(sa)) +#define sintosa(sin) ((struct sockaddr *)(sin)) +#define ifatoia(ifa) ((struct in_ifaddr *)(ifa)) + if (sin->sin_addr.s_addr == INADDR_ANY) + sin->sin_addr = IA_SIN(TAILQ_FIRST(&in_ifaddrhead))->sin_addr; + else if (sin->sin_addr.s_addr == (u_long)INADDR_BROADCAST && + (TAILQ_FIRST(&in_ifaddrhead)->ia_ifp->if_flags & IFF_BROADCAST)) + sin->sin_addr = satosin(&TAILQ_FIRST(&in_ifaddrhead)->ia_broadaddr)->sin_addr; + } + if (inp->inp_laddr.s_addr == INADDR_ANY) { + register struct route *ro; + + ia = (struct in_ifaddr *)0; + /* + * If route is known or can be allocated now, + * our src addr is taken from the i/f, else punt. + */ + ro = &inp->inp_route; + if (ro->ro_rt && + (satosin(&ro->ro_dst)->sin_addr.s_addr != + sin->sin_addr.s_addr || + inp->inp_socket->so_options & SO_DONTROUTE)) { + RTFREE(ro->ro_rt); + ro->ro_rt = (struct rtentry *)0; + } + if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0 && /*XXX*/ + (ro->ro_rt == (struct rtentry *)0 || + ro->ro_rt->rt_ifp == (struct ifnet *)0)) { + /* No route yet, so try to acquire one */ + ro->ro_dst.sa_family = AF_INET; + ro->ro_dst.sa_len = sizeof(struct sockaddr_in); + ((struct sockaddr_in *) &ro->ro_dst)->sin_addr = + sin->sin_addr; + rtalloc(ro); + } + /* + * If we found a route, use the address + * corresponding to the outgoing interface + * unless it is the loopback (in case a route + * to our address on another net goes to loopback). + */ + if (ro->ro_rt && !(ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK)) + ia = ifatoia(ro->ro_rt->rt_ifa); + if (ia == 0) { + u_short fport = sin->sin_port; + + sin->sin_port = 0; + ia = ifatoia(ifa_ifwithdstaddr(sintosa(sin))); + if (ia == 0) + ia = ifatoia(ifa_ifwithnet(sintosa(sin))); + sin->sin_port = fport; + if (ia == 0) + ia = TAILQ_FIRST(&in_ifaddrhead); + if (ia == 0) + return (EADDRNOTAVAIL); + } + /* + * If the destination address is multicast and an outgoing + * interface has been set as a multicast option, use the + * address of that interface as our source address. + */ + if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) && + inp->inp_moptions != NULL) { + struct ip_moptions *imo; + struct ifnet *ifp; + + imo = inp->inp_moptions; + if (imo->imo_multicast_ifp != NULL) { + ifp = imo->imo_multicast_ifp; + TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) + if (ia->ia_ifp == ifp) + break; + if (ia == 0) + return (EADDRNOTAVAIL); + } + } + /* + * Don't do pcblookup call here; return interface in plocal_sin + * and exit to caller, that will do the lookup. + */ + *plocal_sin = &ia->ia_addr; + + } + return(0); +} + +/* + * Outer subroutine: + * Connect from a socket to a specified address. + * Both address and port must be specified in argument sin. + * If don't have a local address for this socket yet, + * then pick one. + */ +int +in_pcbconnect(inp, nam, p) + register struct inpcb *inp; + struct sockaddr *nam; + struct proc *p; +{ + struct sockaddr_in *ifaddr; + struct sockaddr_in *sin = (struct sockaddr_in *)nam; + struct sockaddr_in sa; + struct ucred *cred; + int error; + + cred = inp->inp_socket->so_cred; + if (inp->inp_laddr.s_addr == INADDR_ANY && jailed(cred)) { + bzero(&sa, sizeof (sa)); + sa.sin_addr.s_addr = htonl(cred->cr_prison->pr_ip); + sa.sin_len=sizeof (sa); + sa.sin_family = AF_INET; + error = in_pcbbind(inp, (struct sockaddr *)&sa, p); + if (error) + return (error); + } + /* + * Call inner routine, to assign local interface address. + */ + if ((error = in_pcbladdr(inp, nam, &ifaddr)) != 0) + return(error); + + if (in_pcblookup_hash(inp->inp_pcbinfo, sin->sin_addr, sin->sin_port, + inp->inp_laddr.s_addr ? inp->inp_laddr : ifaddr->sin_addr, + inp->inp_lport, 0, NULL) != NULL) { + return (EADDRINUSE); + } + if (inp->inp_laddr.s_addr == INADDR_ANY) { + if (inp->inp_lport == 0) { + error = in_pcbbind(inp, (struct sockaddr *)0, p); + if (error) + return (error); + } + inp->inp_laddr = ifaddr->sin_addr; + } + inp->inp_faddr = sin->sin_addr; + inp->inp_fport = sin->sin_port; + in_pcbrehash(inp); + return (0); +} + +void +in_pcbdisconnect(inp) + struct inpcb *inp; +{ + + inp->inp_faddr.s_addr = INADDR_ANY; + inp->inp_fport = 0; + in_pcbrehash(inp); + if (inp->inp_socket->so_state & SS_NOFDREF) + in_pcbdetach(inp); +} + +void +in_pcbdetach(inp) + struct inpcb *inp; +{ + struct socket *so = inp->inp_socket; + struct inpcbinfo *ipi = inp->inp_pcbinfo; + struct rtentry *rt = inp->inp_route.ro_rt; + +#ifdef IPSEC + ipsec4_delete_pcbpolicy(inp); +#endif /*IPSEC*/ + inp->inp_gencnt = ++ipi->ipi_gencnt; + in_pcbremlists(inp); + so->so_pcb = 0; + sofree(so); + if (inp->inp_options) + (void)m_free(inp->inp_options); + if (rt) { + /* + * route deletion requires reference count to be <= zero + */ + if ((rt->rt_flags & RTF_DELCLONE) && + (rt->rt_flags & RTF_WASCLONED) && + (rt->rt_refcnt <= 1)) { + rt->rt_refcnt--; + rt->rt_flags &= ~RTF_UP; + rtrequest(RTM_DELETE, rt_key(rt), + rt->rt_gateway, rt_mask(rt), + rt->rt_flags, (struct rtentry **)0); + } + else + rtfree(rt); + } + ip_freemoptions(inp->inp_moptions); + inp->inp_vflag = 0; + zfree(ipi->ipi_zone, inp); +} + +/* + * The calling convention of in_setsockaddr() and in_setpeeraddr() was + * modified to match the pru_sockaddr() and pru_peeraddr() entry points + * in struct pr_usrreqs, so that protocols can just reference then directly + * without the need for a wrapper function. The socket must have a valid + * (i.e., non-nil) PCB, but it should be impossible to get an invalid one + * except through a kernel programming error, so it is acceptable to panic + * (or in this case trap) if the PCB is invalid. (Actually, we don't trap + * because there actually /is/ a programming error somewhere... XXX) + */ +int +in_setsockaddr(so, nam) + struct socket *so; + struct sockaddr **nam; +{ + int s; + register struct inpcb *inp; + register struct sockaddr_in *sin; + + /* + * Do the malloc first in case it blocks. + */ + MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME, + M_WAITOK | M_ZERO); + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + + s = splnet(); + inp = sotoinpcb(so); + if (!inp) { + splx(s); + free(sin, M_SONAME); + return ECONNRESET; + } + sin->sin_port = inp->inp_lport; + sin->sin_addr = inp->inp_laddr; + splx(s); + + *nam = (struct sockaddr *)sin; + return 0; +} + +int +in_setpeeraddr(so, nam) + struct socket *so; + struct sockaddr **nam; +{ + int s; + struct inpcb *inp; + register struct sockaddr_in *sin; + + /* + * Do the malloc first in case it blocks. + */ + MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME, + M_WAITOK | M_ZERO); + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + + s = splnet(); + inp = sotoinpcb(so); + if (!inp) { + splx(s); + free(sin, M_SONAME); + return ECONNRESET; + } + sin->sin_port = inp->inp_fport; + sin->sin_addr = inp->inp_faddr; + splx(s); + + *nam = (struct sockaddr *)sin; + return 0; +} + +void +in_pcbnotifyall(head, faddr, errno, notify) + struct inpcbhead *head; + struct in_addr faddr; + int errno; + void (*notify) __P((struct inpcb *, int)); +{ + struct inpcb *inp, *ninp; + int s; + + s = splnet(); + for (inp = LIST_FIRST(head); inp != NULL; inp = ninp) { + ninp = LIST_NEXT(inp, inp_list); +#ifdef INET6 + if ((inp->inp_vflag & INP_IPV4) == 0) + continue; +#endif + if (inp->inp_faddr.s_addr != faddr.s_addr || + inp->inp_socket == NULL) + continue; + (*notify)(inp, errno); + } + splx(s); +} + +/* + * Check for alternatives when higher level complains + * about service problems. For now, invalidate cached + * routing information. If the route was created dynamically + * (by a redirect), time to try a default gateway again. + */ +void +in_losing(inp) + struct inpcb *inp; +{ + register struct rtentry *rt; + struct rt_addrinfo info; + + if ((rt = inp->inp_route.ro_rt)) { + inp->inp_route.ro_rt = 0; + bzero((caddr_t)&info, sizeof(info)); + info.rti_info[RTAX_DST] = + (struct sockaddr *)&inp->inp_route.ro_dst; + info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; + info.rti_info[RTAX_NETMASK] = rt_mask(rt); + rt_missmsg(RTM_LOSING, &info, rt->rt_flags, 0); + if (rt->rt_flags & RTF_DYNAMIC) + (void) rtrequest(RTM_DELETE, rt_key(rt), + rt->rt_gateway, rt_mask(rt), rt->rt_flags, + (struct rtentry **)0); + else + /* + * A new route can be allocated + * the next time output is attempted. + */ + rtfree(rt); + } +} + +/* + * After a routing change, flush old routing + * and allocate a (hopefully) better one. + */ +void +in_rtchange(inp, errno) + register struct inpcb *inp; + int errno; +{ + if (inp->inp_route.ro_rt) { + rtfree(inp->inp_route.ro_rt); + inp->inp_route.ro_rt = 0; + /* + * A new route can be allocated the next time + * output is attempted. + */ + } +} + +/* + * Lookup a PCB based on the local address and port. + */ +struct inpcb * +in_pcblookup_local(pcbinfo, laddr, lport_arg, wild_okay) + struct inpcbinfo *pcbinfo; + struct in_addr laddr; + u_int lport_arg; + int wild_okay; +{ + register struct inpcb *inp; + int matchwild = 3, wildcard; + u_short lport = lport_arg; + + if (!wild_okay) { + struct inpcbhead *head; + /* + * Look for an unconnected (wildcard foreign addr) PCB that + * matches the local address and port we're looking for. + */ + head = &pcbinfo->hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->hashmask)]; + LIST_FOREACH(inp, head, inp_hash) { +#ifdef INET6 + if ((inp->inp_vflag & INP_IPV4) == 0) + continue; +#endif + if (inp->inp_faddr.s_addr == INADDR_ANY && + inp->inp_laddr.s_addr == laddr.s_addr && + inp->inp_lport == lport) { + /* + * Found. + */ + return (inp); + } + } + /* + * Not found. + */ + return (NULL); + } else { + struct inpcbporthead *porthash; + struct inpcbport *phd; + struct inpcb *match = NULL; + /* + * Best fit PCB lookup. + * + * First see if this local port is in use by looking on the + * port hash list. + */ + porthash = &pcbinfo->porthashbase[INP_PCBPORTHASH(lport, + pcbinfo->porthashmask)]; + LIST_FOREACH(phd, porthash, phd_hash) { + if (phd->phd_port == lport) + break; + } + if (phd != NULL) { + /* + * Port is in use by one or more PCBs. Look for best + * fit. + */ + LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { + wildcard = 0; +#ifdef INET6 + if ((inp->inp_vflag & INP_IPV4) == 0) + continue; +#endif + if (inp->inp_faddr.s_addr != INADDR_ANY) + wildcard++; + if (inp->inp_laddr.s_addr != INADDR_ANY) { + if (laddr.s_addr == INADDR_ANY) + wildcard++; + else if (inp->inp_laddr.s_addr != laddr.s_addr) + continue; + } else { + if (laddr.s_addr != INADDR_ANY) + wildcard++; + } + if (wildcard < matchwild) { + match = inp; + matchwild = wildcard; + if (matchwild == 0) { + break; + } + } + } + } + return (match); + } +} + +/* + * Lookup PCB in hash list. + */ +struct inpcb * +in_pcblookup_hash(pcbinfo, faddr, fport_arg, laddr, lport_arg, wildcard, + ifp) + struct inpcbinfo *pcbinfo; + struct in_addr faddr, laddr; + u_int fport_arg, lport_arg; + int wildcard; + struct ifnet *ifp; +{ + struct inpcbhead *head; + register struct inpcb *inp; + u_short fport = fport_arg, lport = lport_arg; + + /* + * First look for an exact match. + */ + head = &pcbinfo->hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, pcbinfo->hashmask)]; + LIST_FOREACH(inp, head, inp_hash) { +#ifdef INET6 + if ((inp->inp_vflag & INP_IPV4) == 0) + continue; +#endif + if (inp->inp_faddr.s_addr == faddr.s_addr && + inp->inp_laddr.s_addr == laddr.s_addr && + inp->inp_fport == fport && + inp->inp_lport == lport) { + /* + * Found. + */ + return (inp); + } + } + if (wildcard) { + struct inpcb *local_wild = NULL; +#if defined(INET6) + struct inpcb *local_wild_mapped = NULL; +#endif /* defined(INET6) */ + + head = &pcbinfo->hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->hashmask)]; + LIST_FOREACH(inp, head, inp_hash) { +#ifdef INET6 + if ((inp->inp_vflag & INP_IPV4) == 0) + continue; +#endif + if (inp->inp_faddr.s_addr == INADDR_ANY && + inp->inp_lport == lport) { +#if defined(NFAITH) && NFAITH > 0 + if (ifp && ifp->if_type == IFT_FAITH && + (inp->inp_flags & INP_FAITH) == 0) + continue; +#endif + if (inp->inp_laddr.s_addr == laddr.s_addr) + return (inp); + else if (inp->inp_laddr.s_addr == INADDR_ANY) { +#if defined(INET6) + if (INP_CHECK_SOCKAF(inp->inp_socket, + AF_INET6)) + local_wild_mapped = inp; + else +#endif /* defined(INET6) */ + local_wild = inp; + } + } + } +#if defined(INET6) + if (local_wild == NULL) + return (local_wild_mapped); +#endif /* defined(INET6) */ + return (local_wild); + } + + /* + * Not found. + */ + return (NULL); +} + +/* + * Insert PCB onto various hash lists. + */ +int +in_pcbinshash(inp) + struct inpcb *inp; +{ + struct inpcbhead *pcbhash; + struct inpcbporthead *pcbporthash; + struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; + struct inpcbport *phd; + u_int32_t hashkey_faddr; + +#ifdef INET6 + if (inp->inp_vflag & INP_IPV6) + hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */; + else +#endif /* INET6 */ + hashkey_faddr = inp->inp_faddr.s_addr; + + pcbhash = &pcbinfo->hashbase[INP_PCBHASH(hashkey_faddr, + inp->inp_lport, inp->inp_fport, pcbinfo->hashmask)]; + + pcbporthash = &pcbinfo->porthashbase[INP_PCBPORTHASH(inp->inp_lport, + pcbinfo->porthashmask)]; + + /* + * Go through port list and look for a head for this lport. + */ + LIST_FOREACH(phd, pcbporthash, phd_hash) { + if (phd->phd_port == inp->inp_lport) + break; + } + /* + * If none exists, malloc one and tack it on. + */ + if (phd == NULL) { + MALLOC(phd, struct inpcbport *, sizeof(struct inpcbport), M_PCB, M_NOWAIT); + if (phd == NULL) { + return (ENOBUFS); /* XXX */ + } + phd->phd_port = inp->inp_lport; + LIST_INIT(&phd->phd_pcblist); + LIST_INSERT_HEAD(pcbporthash, phd, phd_hash); + } + inp->inp_phd = phd; + LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); + LIST_INSERT_HEAD(pcbhash, inp, inp_hash); + return (0); +} + +/* + * Move PCB to the proper hash bucket when { faddr, fport } have been + * changed. NOTE: This does not handle the case of the lport changing (the + * hashed port list would have to be updated as well), so the lport must + * not change after in_pcbinshash() has been called. + */ +void +in_pcbrehash(inp) + struct inpcb *inp; +{ + struct inpcbhead *head; + u_int32_t hashkey_faddr; + +#ifdef INET6 + if (inp->inp_vflag & INP_IPV6) + hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */; + else +#endif /* INET6 */ + hashkey_faddr = inp->inp_faddr.s_addr; + + head = &inp->inp_pcbinfo->hashbase[INP_PCBHASH(hashkey_faddr, + inp->inp_lport, inp->inp_fport, inp->inp_pcbinfo->hashmask)]; + + LIST_REMOVE(inp, inp_hash); + LIST_INSERT_HEAD(head, inp, inp_hash); +} + +/* + * Remove PCB from various lists. + */ +void +in_pcbremlists(inp) + struct inpcb *inp; +{ + inp->inp_gencnt = ++inp->inp_pcbinfo->ipi_gencnt; + if (inp->inp_lport) { + struct inpcbport *phd = inp->inp_phd; + + LIST_REMOVE(inp, inp_hash); + LIST_REMOVE(inp, inp_portlist); + if (LIST_FIRST(&phd->phd_pcblist) == NULL) { + LIST_REMOVE(phd, phd_hash); + free(phd, M_PCB); + } + } + LIST_REMOVE(inp, inp_list); + inp->inp_pcbinfo->ipi_count--; +} + +int +prison_xinpcb(struct proc *p, struct inpcb *inp) +{ + if (!jailed(p->p_ucred)) + return (0); + if (ntohl(inp->inp_laddr.s_addr) == p->p_ucred->cr_prison->pr_ip) + return (0); + return (1); +} diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h new file mode 100644 index 0000000..f4abb4d --- /dev/null +++ b/sys/netinet/in_pcb.h @@ -0,0 +1,302 @@ +/* + * Copyright (c) 1982, 1986, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in_pcb.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_IN_PCB_H_ +#define _NETINET_IN_PCB_H_ + +#include <sys/queue.h> + + +#include <netinet6/ipsec.h> /* for IPSEC */ + +#define in6pcb inpcb /* for KAME src sync over BSD*'s */ +#define in6p_sp inp_sp /* for KAME src sync over BSD*'s */ + +/* + * Common structure pcb for internet protocol implementation. + * Here are stored pointers to local and foreign host table + * entries, local and foreign socket numbers, and pointers + * up (to a socket structure) and down (to a protocol-specific) + * control block. + */ +LIST_HEAD(inpcbhead, inpcb); +LIST_HEAD(inpcbporthead, inpcbport); +typedef u_quad_t inp_gen_t; + +/* + * PCB with AF_INET6 null bind'ed laddr can receive AF_INET input packet. + * So, AF_INET6 null laddr is also used as AF_INET null laddr, + * by utilize following structure. (At last, same as INRIA) + */ +struct in_addr_4in6 { + u_int32_t ia46_pad32[3]; + struct in_addr ia46_addr4; +}; + +/* + * NB: the zone allocator is type-stable EXCEPT FOR THE FIRST TWO LONGS + * of the structure. Therefore, it is important that the members in + * that position not contain any information which is required to be + * stable. + */ +struct icmp6_filter; + +struct inpcb { + LIST_ENTRY(inpcb) inp_hash; /* hash list */ + u_short inp_fport; /* foreign port */ + u_short inp_lport; /* local port */ + LIST_ENTRY(inpcb) inp_list; /* list for all PCBs of this proto */ + u_int32_t inp_flow; + + /* protocol dependent part, local and foreign addr */ + union { + /* foreign host table entry */ + struct in_addr_4in6 inp46_foreign; + struct in6_addr inp6_foreign; + } inp_dependfaddr; + union { + /* local host table entry */ + struct in_addr_4in6 inp46_local; + struct in6_addr inp6_local; + } inp_dependladdr; + + caddr_t inp_ppcb; /* pointer to per-protocol pcb */ + struct inpcbinfo *inp_pcbinfo; /* PCB list info */ + struct socket *inp_socket; /* back pointer to socket */ + /* list for this PCB's local port */ + int inp_flags; /* generic IP/datagram flags */ + + /* protocol dependent part; cached route */ + union { + /* placeholder for routing entry */ + struct route inp4_route; + struct route_in6 inp6_route; + } inp_dependroute; + + struct inpcbpolicy *inp_sp; /* for IPSEC */ + u_char inp_vflag; +#define INP_IPV4 0x1 +#define INP_IPV6 0x2 + u_char inp_ip_ttl; /* time to live proto */ + u_char inp_ip_p; /* protocol proto */ + + /* protocol dependent part; options */ + struct { + u_char inp4_ip_tos; /* type of service proto */ + struct mbuf *inp4_options; /* IP options */ + struct ip_moptions *inp4_moptions; /* IP multicast options */ + } inp_depend4; +#define inp_faddr inp_dependfaddr.inp46_foreign.ia46_addr4 +#define inp_laddr inp_dependladdr.inp46_local.ia46_addr4 +#define inp_route inp_dependroute.inp4_route +#define inp_ip_tos inp_depend4.inp4_ip_tos +#define inp_options inp_depend4.inp4_options +#define inp_moptions inp_depend4.inp4_moptions + struct { + /* IP options */ + struct mbuf *inp6_options; + /* IP6 options for outgoing packets */ + struct ip6_pktopts *inp6_outputopts; + /* IP multicast options */ + struct ip6_moptions *inp6_moptions; + /* ICMPv6 code type filter */ + struct icmp6_filter *inp6_icmp6filt; + /* IPV6_CHECKSUM setsockopt */ + int inp6_cksum; + u_short inp6_ifindex; + short inp6_hops; + u_int8_t inp6_hlim; + } inp_depend6; + LIST_ENTRY(inpcb) inp_portlist; + struct inpcbport *inp_phd; /* head of this list */ + inp_gen_t inp_gencnt; /* generation count of this instance */ +#define in6p_faddr inp_dependfaddr.inp6_foreign +#define in6p_laddr inp_dependladdr.inp6_local +#define in6p_route inp_dependroute.inp6_route +#define in6p_ip6_hlim inp_depend6.inp6_hlim +#define in6p_hops inp_depend6.inp6_hops /* default hop limit */ +#define in6p_ip6_nxt inp_ip_p +#define in6p_flowinfo inp_flow +#define in6p_vflag inp_vflag +#define in6p_options inp_depend6.inp6_options +#define in6p_outputopts inp_depend6.inp6_outputopts +#define in6p_moptions inp_depend6.inp6_moptions +#define in6p_icmp6filt inp_depend6.inp6_icmp6filt +#define in6p_cksum inp_depend6.inp6_cksum +#define inp6_ifindex inp_depend6.inp6_ifindex +#define in6p_flags inp_flags /* for KAME src sync over BSD*'s */ +#define in6p_socket inp_socket /* for KAME src sync over BSD*'s */ +#define in6p_lport inp_lport /* for KAME src sync over BSD*'s */ +#define in6p_fport inp_fport /* for KAME src sync over BSD*'s */ +#define in6p_ppcb inp_ppcb /* for KAME src sync over BSD*'s */ +}; +/* + * The range of the generation count, as used in this implementation, + * is 9e19. We would have to create 300 billion connections per + * second for this number to roll over in a year. This seems sufficiently + * unlikely that we simply don't concern ourselves with that possibility. + */ + +/* + * Interface exported to userland by various protocols which use + * inpcbs. Hack alert -- only define if struct xsocket is in scope. + */ +#ifdef _SYS_SOCKETVAR_H_ +struct xinpcb { + size_t xi_len; /* length of this structure */ + struct inpcb xi_inp; + struct xsocket xi_socket; + u_quad_t xi_alignment_hack; +}; + +struct xinpgen { + size_t xig_len; /* length of this structure */ + u_int xig_count; /* number of PCBs at this time */ + inp_gen_t xig_gen; /* generation count at this time */ + so_gen_t xig_sogen; /* socket generation count at this time */ +}; +#endif /* _SYS_SOCKETVAR_H_ */ + +struct inpcbport { + LIST_ENTRY(inpcbport) phd_hash; + struct inpcbhead phd_pcblist; + u_short phd_port; +}; + +struct inpcbinfo { /* XXX documentation, prefixes */ + struct inpcbhead *hashbase; + u_long hashmask; + struct inpcbporthead *porthashbase; + u_long porthashmask; + struct inpcbhead *listhead; + u_short lastport; + u_short lastlow; + u_short lasthi; + struct vm_zone *ipi_zone; /* zone to allocate pcbs from */ + u_int ipi_count; /* number of pcbs in this list */ + u_quad_t ipi_gencnt; /* current generation count */ +}; + +#define INP_PCBHASH(faddr, lport, fport, mask) \ + (((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) & (mask)) +#define INP_PCBPORTHASH(lport, mask) \ + (ntohs((lport)) & (mask)) + +/* flags in inp_flags: */ +#define INP_RECVOPTS 0x01 /* receive incoming IP options */ +#define INP_RECVRETOPTS 0x02 /* receive IP options for reply */ +#define INP_RECVDSTADDR 0x04 /* receive IP dst address */ +#define INP_HDRINCL 0x08 /* user supplies entire IP header */ +#define INP_HIGHPORT 0x10 /* user wants "high" port binding */ +#define INP_LOWPORT 0x20 /* user wants "low" port binding */ +#define INP_ANONPORT 0x40 /* port chosen for user */ +#define INP_RECVIF 0x80 /* receive incoming interface */ +#define INP_MTUDISC 0x100 /* user can do MTU discovery */ +#define INP_FAITH 0x200 /* accept FAITH'ed connections */ +#define IN6P_PKTINFO 0x010000 +#define IN6P_HOPLIMIT 0x020000 +#define IN6P_NEXTHOP 0x040000 +#define IN6P_HOPOPTS 0x080000 +#define IN6P_DSTOPTS 0x100000 +#define IN6P_RTHDR 0x200000 +#define IN6P_BINDV6ONLY 0x400000 +#define INP_CONTROLOPTS (INP_RECVOPTS|INP_RECVRETOPTS|INP_RECVDSTADDR|\ + INP_RECVIF|\ + IN6P_PKTINFO|IN6P_HOPLIMIT|IN6P_NEXTHOP|\ + IN6P_HOPOPTS|IN6P_DSTOPTS|IN6P_RTHDR) + +#define INP_UNMAPPABLEOPTS (IN6P_HOPOPTS|IN6P_DSTOPTS|IN6P_RTHDR) + + /* for KAME src sync over BSD*'s */ +#define IN6P_RECVOPTS INP_RECVOPTS +#define IN6P_RECVRETOPTS INP_RECVRETOPTS +#define IN6P_RECVDSTADDR INP_RECVDSTADDR +#define IN6P_HDRINCL INP_HDRINCL +#define IN6P_HIGHPORT INP_HIGHPORT +#define IN6P_LOWPORT INP_LOWPORT +#define IN6P_ANONPORT INP_ANONPORT +#define IN6P_RECVIF INP_RECVIF +#define IN6P_MTUDISC INP_MTUDISC +#define IN6P_FAITH INP_FAITH +#define IN6P_CONTROLOPTS INP_CONTROLOPTS + /* + * socket AF version is {newer than,or include} + * actual datagram AF version + */ + +#define INPLOOKUP_WILDCARD 1 +#define sotoinpcb(so) ((struct inpcb *)(so)->so_pcb) +#define sotoin6pcb(so) sotoinpcb(so) /* for KAME src sync over BSD*'s */ + +#define INP_SOCKAF(so) so->so_proto->pr_domain->dom_family + +#define INP_CHECK_SOCKAF(so, af) (INP_SOCKAF(so) == af) + +#ifdef _KERNEL +extern int ipport_lowfirstauto; +extern int ipport_lowlastauto; +extern int ipport_firstauto; +extern int ipport_lastauto; +extern int ipport_hifirstauto; +extern int ipport_hilastauto; + +void in_losing __P((struct inpcb *)); +void in_rtchange __P((struct inpcb *, int)); +int in_pcballoc __P((struct socket *, struct inpcbinfo *, struct proc *)); +int in_pcbbind __P((struct inpcb *, struct sockaddr *, struct proc *)); +int in_pcbconnect __P((struct inpcb *, struct sockaddr *, struct proc *)); +void in_pcbdetach __P((struct inpcb *)); +void in_pcbdisconnect __P((struct inpcb *)); +int in_pcbinshash __P((struct inpcb *)); +int in_pcbladdr __P((struct inpcb *, struct sockaddr *, + struct sockaddr_in **)); +struct inpcb * + in_pcblookup_local __P((struct inpcbinfo *, + struct in_addr, u_int, int)); +struct inpcb * + in_pcblookup_hash __P((struct inpcbinfo *, + struct in_addr, u_int, struct in_addr, u_int, + int, struct ifnet *)); +void in_pcbnotifyall __P((struct inpcbhead *, struct in_addr, + int, void (*)(struct inpcb *, int))); +void in_pcbrehash __P((struct inpcb *)); +int in_setpeeraddr __P((struct socket *so, struct sockaddr **nam)); +int in_setsockaddr __P((struct socket *so, struct sockaddr **nam)); +void in_pcbremlists __P((struct inpcb *inp)); +int prison_xinpcb __P((struct proc *p, struct inpcb *inp)); +#endif /* _KERNEL */ + +#endif /* !_NETINET_IN_PCB_H_ */ diff --git a/sys/netinet/in_proto.c b/sys/netinet/in_proto.c new file mode 100644 index 0000000..02e6313 --- /dev/null +++ b/sys/netinet/in_proto.c @@ -0,0 +1,255 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in_proto.c 8.2 (Berkeley) 2/9/95 + * $FreeBSD$ + */ + +#include "opt_ipdivert.h" +#include "opt_ipx.h" +#include "opt_ipsec.h" +#include "opt_inet6.h" + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/socket.h> +#include <sys/domain.h> +#include <sys/protosw.h> +#include <sys/queue.h> +#include <sys/sysctl.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> +#include <netinet/ip_icmp.h> +#include <netinet/igmp_var.h> +#include <netinet/tcp.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#include <netinet/udp.h> +#include <netinet/udp_var.h> +#include <netinet/ip_encap.h> + +#include <netinet/ipprotosw.h> + +/* + * TCP/IP protocol family: IP, ICMP, UDP, TCP. + */ + +#ifdef IPSEC +#include <netinet6/ipsec.h> +#include <netinet6/ah.h> +#ifdef IPSEC_ESP +#include <netinet6/esp.h> +#endif +#endif /* IPSEC */ + +#include "gif.h" +#if NGIF > 0 +#include <netinet/in_gif.h> +#endif + +#include "stf.h" +#if NSTF > 0 +#include <net/if_stf.h> +#endif + +#ifdef IPXIP +#include <netipx/ipx_ip.h> +#endif + +#ifdef NSIP +#include <netns/ns.h> +#include <netns/ns_if.h> +#endif + +extern struct domain inetdomain; +static struct pr_usrreqs nousrreqs; + +struct ipprotosw inetsw[] = { +{ 0, &inetdomain, 0, 0, + 0, 0, 0, 0, + 0, + ip_init, 0, ip_slowtimo, ip_drain, + &nousrreqs +}, +{ SOCK_DGRAM, &inetdomain, IPPROTO_UDP, PR_ATOMIC|PR_ADDR, + udp_input, 0, udp_ctlinput, ip_ctloutput, + 0, + udp_init, 0, 0, 0, + &udp_usrreqs +}, +{ SOCK_STREAM, &inetdomain, IPPROTO_TCP, + PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD, + tcp_input, 0, tcp_ctlinput, tcp_ctloutput, + 0, + tcp_init, 0, tcp_slowtimo, tcp_drain, + &tcp_usrreqs +}, +{ SOCK_RAW, &inetdomain, IPPROTO_RAW, PR_ATOMIC|PR_ADDR, + rip_input, 0, rip_ctlinput, rip_ctloutput, + 0, + 0, 0, 0, 0, + &rip_usrreqs +}, +{ SOCK_RAW, &inetdomain, IPPROTO_ICMP, PR_ATOMIC|PR_ADDR, + icmp_input, 0, 0, rip_ctloutput, + 0, + 0, 0, 0, 0, + &rip_usrreqs +}, +{ SOCK_RAW, &inetdomain, IPPROTO_IGMP, PR_ATOMIC|PR_ADDR, + igmp_input, 0, 0, rip_ctloutput, + 0, + igmp_init, igmp_fasttimo, igmp_slowtimo, 0, + &rip_usrreqs +}, +{ SOCK_RAW, &inetdomain, IPPROTO_RSVP, PR_ATOMIC|PR_ADDR, + rsvp_input, 0, 0, rip_ctloutput, + 0, + 0, 0, 0, 0, + &rip_usrreqs +}, +#ifdef IPSEC +{ SOCK_RAW, &inetdomain, IPPROTO_AH, PR_ATOMIC|PR_ADDR, + ah4_input, 0, 0, 0, + 0, + 0, 0, 0, 0, + &nousrreqs +}, +#ifdef IPSEC_ESP +{ SOCK_RAW, &inetdomain, IPPROTO_ESP, PR_ATOMIC|PR_ADDR, + esp4_input, 0, 0, 0, + 0, + 0, 0, 0, 0, + &nousrreqs +}, +#endif +#endif /* IPSEC */ +{ SOCK_RAW, &inetdomain, IPPROTO_IPV4, PR_ATOMIC|PR_ADDR, + encap4_input, 0, 0, rip_ctloutput, + 0, + encap_init, 0, 0, 0, + &nousrreqs +}, +# ifdef INET6 +{ SOCK_RAW, &inetdomain, IPPROTO_IPV6, PR_ATOMIC|PR_ADDR, + encap4_input, 0, 0, rip_ctloutput, + 0, + 0, 0, 0, 0, + &nousrreqs +}, +#endif +#ifdef IPDIVERT +{ SOCK_RAW, &inetdomain, IPPROTO_DIVERT, PR_ATOMIC|PR_ADDR, + div_input, 0, 0, ip_ctloutput, + 0, + div_init, 0, 0, 0, + &div_usrreqs, +}, +#endif +#ifdef IPXIP +{ SOCK_RAW, &inetdomain, IPPROTO_IDP, PR_ATOMIC|PR_ADDR, + ipxip_input, 0, ipxip_ctlinput, 0, + 0, + 0, 0, 0, 0, + &rip_usrreqs +}, +#endif +#ifdef NSIP +{ SOCK_RAW, &inetdomain, IPPROTO_IDP, PR_ATOMIC|PR_ADDR, + idpip_input, 0, nsip_ctlinput, 0, + 0, + 0, 0, 0, 0, + &rip_usrreqs +}, +#endif + /* raw wildcard */ +{ SOCK_RAW, &inetdomain, 0, PR_ATOMIC|PR_ADDR, + rip_input, 0, 0, rip_ctloutput, + 0, + rip_init, 0, 0, 0, + &rip_usrreqs +}, +}; + +#if NGIF > 0 +struct ipprotosw in_gif_protosw = +{ SOCK_RAW, &inetdomain, 0/*IPPROTO_IPV[46]*/, PR_ATOMIC|PR_ADDR, + in_gif_input, rip_output, 0, rip_ctloutput, + 0, + 0, 0, 0, 0, + &rip_usrreqs +}; +#endif /*NGIF*/ + +#if NSTF > 0 +struct ipprotosw in_stf_protosw = +{ SOCK_RAW, &inetdomain, IPPROTO_IPV6, PR_ATOMIC|PR_ADDR, + in_stf_input, rip_output, 0, rip_ctloutput, + 0, + 0, 0, 0, 0, + &rip_usrreqs +}; +#endif /*NSTF*/ + +extern int in_inithead __P((void **, int)); + +struct domain inetdomain = + { AF_INET, "internet", 0, 0, 0, + (struct protosw *)inetsw, + (struct protosw *)&inetsw[sizeof(inetsw)/sizeof(inetsw[0])], 0, + in_inithead, 32, sizeof(struct sockaddr_in) + }; + +DOMAIN_SET(inet); + +SYSCTL_NODE(_net, PF_INET, inet, CTLFLAG_RW, 0, + "Internet Family"); + +SYSCTL_NODE(_net_inet, IPPROTO_IP, ip, CTLFLAG_RW, 0, "IP"); +SYSCTL_NODE(_net_inet, IPPROTO_ICMP, icmp, CTLFLAG_RW, 0, "ICMP"); +SYSCTL_NODE(_net_inet, IPPROTO_UDP, udp, CTLFLAG_RW, 0, "UDP"); +SYSCTL_NODE(_net_inet, IPPROTO_TCP, tcp, CTLFLAG_RW, 0, "TCP"); +SYSCTL_NODE(_net_inet, IPPROTO_IGMP, igmp, CTLFLAG_RW, 0, "IGMP"); +#ifdef IPSEC +SYSCTL_NODE(_net_inet, IPPROTO_AH, ipsec, CTLFLAG_RW, 0, "IPSEC"); +#endif /* IPSEC */ +SYSCTL_NODE(_net_inet, IPPROTO_RAW, raw, CTLFLAG_RW, 0, "RAW"); +#ifdef IPDIVERT +SYSCTL_NODE(_net_inet, IPPROTO_DIVERT, divert, CTLFLAG_RW, 0, "DIVERT"); +#endif + diff --git a/sys/netinet/in_rmx.c b/sys/netinet/in_rmx.c new file mode 100644 index 0000000..bfd65e6 --- /dev/null +++ b/sys/netinet/in_rmx.c @@ -0,0 +1,428 @@ +/* + * Copyright 1994, 1995 Massachusetts Institute of Technology + * + * Permission to use, copy, modify, and distribute this software and + * its documentation for any purpose and without fee is hereby + * granted, provided that both the above copyright notice and this + * permission notice appear in all copies, that both the above + * copyright notice and this permission notice appear in all + * supporting documentation, and that the name of M.I.T. not be used + * in advertising or publicity pertaining to distribution of the + * software without specific, written prior permission. M.I.T. makes + * no representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied + * warranty. + * + * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS + * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT + * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * This code does two things necessary for the enhanced TCP metrics to + * function in a useful manner: + * 1) It marks all non-host routes as `cloning', thus ensuring that + * every actual reference to such a route actually gets turned + * into a reference to a host route to the specific destination + * requested. + * 2) When such routes lose all their references, it arranges for them + * to be deleted in some random collection of circumstances, so that + * a large quantity of stale routing data is not kept in kernel memory + * indefinitely. See in_rtqtimo() below for the exact mechanism. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/socket.h> +#include <sys/mbuf.h> +#include <sys/syslog.h> + +#include <net/if.h> +#include <net/route.h> +#include <netinet/in.h> +#include <netinet/in_var.h> +#include <netinet/ip_var.h> + +extern int in_inithead __P((void **head, int off)); + +#define RTPRF_OURS RTF_PROTO3 /* set on routes we manage */ + +/* + * Do what we need to do when inserting a route. + */ +static struct radix_node * +in_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, + struct radix_node *treenodes) +{ + struct rtentry *rt = (struct rtentry *)treenodes; + struct sockaddr_in *sin = (struct sockaddr_in *)rt_key(rt); + struct radix_node *ret; + + /* + * For IP, all unicast non-host routes are automatically cloning. + */ + if(IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) + rt->rt_flags |= RTF_MULTICAST; + + if(!(rt->rt_flags & (RTF_HOST | RTF_CLONING | RTF_MULTICAST))) { + rt->rt_flags |= RTF_PRCLONING; + } + + /* + * A little bit of help for both IP output and input: + * For host routes, we make sure that RTF_BROADCAST + * is set for anything that looks like a broadcast address. + * This way, we can avoid an expensive call to in_broadcast() + * in ip_output() most of the time (because the route passed + * to ip_output() is almost always a host route). + * + * We also do the same for local addresses, with the thought + * that this might one day be used to speed up ip_input(). + * + * We also mark routes to multicast addresses as such, because + * it's easy to do and might be useful (but this is much more + * dubious since it's so easy to inspect the address). (This + * is done above.) + */ + if (rt->rt_flags & RTF_HOST) { + if (in_broadcast(sin->sin_addr, rt->rt_ifp)) { + rt->rt_flags |= RTF_BROADCAST; + } else { +#define satosin(sa) ((struct sockaddr_in *)sa) + if (satosin(rt->rt_ifa->ifa_addr)->sin_addr.s_addr + == sin->sin_addr.s_addr) + rt->rt_flags |= RTF_LOCAL; +#undef satosin + } + } + + if (!rt->rt_rmx.rmx_mtu && !(rt->rt_rmx.rmx_locks & RTV_MTU) + && rt->rt_ifp) + rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu; + + ret = rn_addroute(v_arg, n_arg, head, treenodes); + if (ret == NULL && rt->rt_flags & RTF_HOST) { + struct rtentry *rt2; + /* + * We are trying to add a host route, but can't. + * Find out if it is because of an + * ARP entry and delete it if so. + */ + rt2 = rtalloc1((struct sockaddr *)sin, 0, + RTF_CLONING | RTF_PRCLONING); + if (rt2) { + if (rt2->rt_flags & RTF_LLINFO && + rt2->rt_flags & RTF_HOST && + rt2->rt_gateway && + rt2->rt_gateway->sa_family == AF_LINK) { + rtrequest(RTM_DELETE, + (struct sockaddr *)rt_key(rt2), + rt2->rt_gateway, + rt_mask(rt2), rt2->rt_flags, 0); + ret = rn_addroute(v_arg, n_arg, head, + treenodes); + } + RTFREE(rt2); + } + } + + /* + * If the new route created successfully, and we are forwarding, + * and there is a cached route, free it. Otherwise, we may end + * up using the wrong route. + */ + if (ret != NULL && ipforwarding && ipforward_rt.ro_rt) { + RTFREE(ipforward_rt.ro_rt); + ipforward_rt.ro_rt = 0; + } + + return ret; +} + +/* + * This code is the inverse of in_clsroute: on first reference, if we + * were managing the route, stop doing so and set the expiration timer + * back off again. + */ +static struct radix_node * +in_matroute(void *v_arg, struct radix_node_head *head) +{ + struct radix_node *rn = rn_match(v_arg, head); + struct rtentry *rt = (struct rtentry *)rn; + + if(rt && rt->rt_refcnt == 0) { /* this is first reference */ + if(rt->rt_flags & RTPRF_OURS) { + rt->rt_flags &= ~RTPRF_OURS; + rt->rt_rmx.rmx_expire = 0; + } + } + return rn; +} + +static int rtq_reallyold = 60*60; + /* one hour is ``really old'' */ +SYSCTL_INT(_net_inet_ip, IPCTL_RTEXPIRE, rtexpire, CTLFLAG_RW, + &rtq_reallyold , 0, + "Default expiration time on dynamically learned routes"); + +static int rtq_minreallyold = 10; + /* never automatically crank down to less */ +SYSCTL_INT(_net_inet_ip, IPCTL_RTMINEXPIRE, rtminexpire, CTLFLAG_RW, + &rtq_minreallyold , 0, + "Minimum time to attempt to hold onto dynamically learned routes"); + +static int rtq_toomany = 128; + /* 128 cached routes is ``too many'' */ +SYSCTL_INT(_net_inet_ip, IPCTL_RTMAXCACHE, rtmaxcache, CTLFLAG_RW, + &rtq_toomany , 0, "Upper limit on dynamically learned routes"); + +/* + * On last reference drop, mark the route as belong to us so that it can be + * timed out. + */ +static void +in_clsroute(struct radix_node *rn, struct radix_node_head *head) +{ + struct rtentry *rt = (struct rtentry *)rn; + + if(!(rt->rt_flags & RTF_UP)) + return; /* prophylactic measures */ + + if((rt->rt_flags & (RTF_LLINFO | RTF_HOST)) != RTF_HOST) + return; + + if((rt->rt_flags & (RTF_WASCLONED | RTPRF_OURS)) + != RTF_WASCLONED) + return; + + /* + * As requested by David Greenman: + * If rtq_reallyold is 0, just delete the route without + * waiting for a timeout cycle to kill it. + */ + if(rtq_reallyold != 0) { + rt->rt_flags |= RTPRF_OURS; + rt->rt_rmx.rmx_expire = time_second + rtq_reallyold; + } else { + rtrequest(RTM_DELETE, + (struct sockaddr *)rt_key(rt), + rt->rt_gateway, rt_mask(rt), + rt->rt_flags, 0); + } +} + +struct rtqk_arg { + struct radix_node_head *rnh; + int draining; + int killed; + int found; + int updating; + time_t nextstop; +}; + +/* + * Get rid of old routes. When draining, this deletes everything, even when + * the timeout is not expired yet. When updating, this makes sure that + * nothing has a timeout longer than the current value of rtq_reallyold. + */ +static int +in_rtqkill(struct radix_node *rn, void *rock) +{ + struct rtqk_arg *ap = rock; + struct rtentry *rt = (struct rtentry *)rn; + int err; + + if(rt->rt_flags & RTPRF_OURS) { + ap->found++; + + if(ap->draining || rt->rt_rmx.rmx_expire <= time_second) { + if(rt->rt_refcnt > 0) + panic("rtqkill route really not free"); + + err = rtrequest(RTM_DELETE, + (struct sockaddr *)rt_key(rt), + rt->rt_gateway, rt_mask(rt), + rt->rt_flags, 0); + if(err) { + log(LOG_WARNING, "in_rtqkill: error %d\n", err); + } else { + ap->killed++; + } + } else { + if(ap->updating + && (rt->rt_rmx.rmx_expire - time_second + > rtq_reallyold)) { + rt->rt_rmx.rmx_expire = time_second + + rtq_reallyold; + } + ap->nextstop = lmin(ap->nextstop, + rt->rt_rmx.rmx_expire); + } + } + + return 0; +} + +#define RTQ_TIMEOUT 60*10 /* run no less than once every ten minutes */ +static int rtq_timeout = RTQ_TIMEOUT; + +static void +in_rtqtimo(void *rock) +{ + struct radix_node_head *rnh = rock; + struct rtqk_arg arg; + struct timeval atv; + static time_t last_adjusted_timeout = 0; + int s; + + arg.found = arg.killed = 0; + arg.rnh = rnh; + arg.nextstop = time_second + rtq_timeout; + arg.draining = arg.updating = 0; + s = splnet(); + rnh->rnh_walktree(rnh, in_rtqkill, &arg); + splx(s); + + /* + * Attempt to be somewhat dynamic about this: + * If there are ``too many'' routes sitting around taking up space, + * then crank down the timeout, and see if we can't make some more + * go away. However, we make sure that we will never adjust more + * than once in rtq_timeout seconds, to keep from cranking down too + * hard. + */ + if((arg.found - arg.killed > rtq_toomany) + && (time_second - last_adjusted_timeout >= rtq_timeout) + && rtq_reallyold > rtq_minreallyold) { + rtq_reallyold = 2*rtq_reallyold / 3; + if(rtq_reallyold < rtq_minreallyold) { + rtq_reallyold = rtq_minreallyold; + } + + last_adjusted_timeout = time_second; +#ifdef DIAGNOSTIC + log(LOG_DEBUG, "in_rtqtimo: adjusted rtq_reallyold to %d\n", + rtq_reallyold); +#endif + arg.found = arg.killed = 0; + arg.updating = 1; + s = splnet(); + rnh->rnh_walktree(rnh, in_rtqkill, &arg); + splx(s); + } + + atv.tv_usec = 0; + atv.tv_sec = arg.nextstop - time_second; + timeout(in_rtqtimo, rock, tvtohz(&atv)); +} + +void +in_rtqdrain(void) +{ + struct radix_node_head *rnh = rt_tables[AF_INET]; + struct rtqk_arg arg; + int s; + arg.found = arg.killed = 0; + arg.rnh = rnh; + arg.nextstop = 0; + arg.draining = 1; + arg.updating = 0; + s = splnet(); + rnh->rnh_walktree(rnh, in_rtqkill, &arg); + splx(s); +} + +/* + * Initialize our routing tree. + */ +int +in_inithead(void **head, int off) +{ + struct radix_node_head *rnh; + + if(!rn_inithead(head, off)) + return 0; + + if(head != (void **)&rt_tables[AF_INET]) /* BOGUS! */ + return 1; /* only do this for the real routing table */ + + rnh = *head; + rnh->rnh_addaddr = in_addroute; + rnh->rnh_matchaddr = in_matroute; + rnh->rnh_close = in_clsroute; + in_rtqtimo(rnh); /* kick off timeout first time */ + return 1; +} + + +/* + * This zaps old routes when the interface goes down or interface + * address is deleted. In the latter case, it deletes static routes + * that point to this address. If we don't do this, we may end up + * using the old address in the future. The ones we always want to + * get rid of are things like ARP entries, since the user might down + * the interface, walk over to a completely different network, and + * plug back in. + */ +struct in_ifadown_arg { + struct radix_node_head *rnh; + struct ifaddr *ifa; + int del; +}; + +static int +in_ifadownkill(struct radix_node *rn, void *xap) +{ + struct in_ifadown_arg *ap = xap; + struct rtentry *rt = (struct rtentry *)rn; + int err; + + if (rt->rt_ifa == ap->ifa && + (ap->del || !(rt->rt_flags & RTF_STATIC))) { + /* + * We need to disable the automatic prune that happens + * in this case in rtrequest() because it will blow + * away the pointers that rn_walktree() needs in order + * continue our descent. We will end up deleting all + * the routes that rtrequest() would have in any case, + * so that behavior is not needed there. + */ + rt->rt_flags &= ~(RTF_CLONING | RTF_PRCLONING); + err = rtrequest(RTM_DELETE, (struct sockaddr *)rt_key(rt), + rt->rt_gateway, rt_mask(rt), rt->rt_flags, 0); + if (err) { + log(LOG_WARNING, "in_ifadownkill: error %d\n", err); + } + } + return 0; +} + +int +in_ifadown(struct ifaddr *ifa, int delete) +{ + struct in_ifadown_arg arg; + struct radix_node_head *rnh; + + if (ifa->ifa_addr->sa_family != AF_INET) + return 1; + + arg.rnh = rnh = rt_tables[AF_INET]; + arg.ifa = ifa; + arg.del = delete; + rnh->rnh_walktree(rnh, in_ifadownkill, &arg); + ifa->ifa_flags &= ~IFA_ROUTE; + return 0; +} diff --git a/sys/netinet/in_systm.h b/sys/netinet/in_systm.h new file mode 100644 index 0000000..13fa81d --- /dev/null +++ b/sys/netinet/in_systm.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in_systm.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_IN_SYSTM_H_ +#define _NETINET_IN_SYSTM_H_ + +/* + * Miscellaneous internetwork + * definitions for kernel. + */ + +/* + * Network types. + * + * Internally the system keeps counters in the headers with the bytes + * swapped so that VAX instructions will work on them. It reverses + * the bytes before transmission at each protocol level. The n_ types + * represent the types with the bytes in ``high-ender'' order. + */ +typedef u_int16_t n_short; /* short as received from the net */ +typedef u_int32_t n_long; /* long as received from the net */ + +typedef u_int32_t n_time; /* ms since 00:00 GMT, byte rev */ + +#ifdef _KERNEL +n_time iptime __P((void)); +#endif + +#endif diff --git a/sys/netinet/in_var.h b/sys/netinet/in_var.h new file mode 100644 index 0000000..f5c1464 --- /dev/null +++ b/sys/netinet/in_var.h @@ -0,0 +1,235 @@ +/* + * Copyright (c) 1985, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in_var.h 8.2 (Berkeley) 1/9/95 + * $FreeBSD$ + */ + +#ifndef _NETINET_IN_VAR_H_ +#define _NETINET_IN_VAR_H_ + +#include <sys/queue.h> + +/* + * Interface address, Internet version. One of these structures + * is allocated for each Internet address on an interface. + * The ifaddr structure contains the protocol-independent part + * of the structure and is assumed to be first. + */ +struct in_ifaddr { + struct ifaddr ia_ifa; /* protocol-independent info */ +#define ia_ifp ia_ifa.ifa_ifp +#define ia_flags ia_ifa.ifa_flags + /* ia_{,sub}net{,mask} in host order */ + u_long ia_net; /* network number of interface */ + u_long ia_netmask; /* mask of net part */ + u_long ia_subnet; /* subnet number, including net */ + u_long ia_subnetmask; /* mask of subnet part */ + struct in_addr ia_netbroadcast; /* to recognize net broadcasts */ + TAILQ_ENTRY(in_ifaddr) ia_link; /* tailq macro glue */ + struct sockaddr_in ia_addr; /* reserve space for interface name */ + struct sockaddr_in ia_dstaddr; /* reserve space for broadcast addr */ +#define ia_broadaddr ia_dstaddr + struct sockaddr_in ia_sockmask; /* reserve space for general netmask */ +}; + +struct in_aliasreq { + char ifra_name[IFNAMSIZ]; /* if name, e.g. "en0" */ + struct sockaddr_in ifra_addr; + struct sockaddr_in ifra_broadaddr; +#define ifra_dstaddr ifra_broadaddr + struct sockaddr_in ifra_mask; +}; +/* + * Given a pointer to an in_ifaddr (ifaddr), + * return a pointer to the addr as a sockaddr_in. + */ +#define IA_SIN(ia) (&(((struct in_ifaddr *)(ia))->ia_addr)) +#define IA_DSTSIN(ia) (&(((struct in_ifaddr *)(ia))->ia_dstaddr)) + +#define IN_LNAOF(in, ifa) \ + ((ntohl((in).s_addr) & ~((struct in_ifaddr *)(ifa)->ia_subnetmask)) + + +#ifdef _KERNEL +extern TAILQ_HEAD(in_ifaddrhead, in_ifaddr) in_ifaddrhead; +extern struct ifqueue ipintrq; /* ip packet input queue */ +extern struct in_addr zeroin_addr; +extern u_char inetctlerrmap[]; + +/* + * Macro for finding the interface (ifnet structure) corresponding to one + * of our IP addresses. + */ +#define INADDR_TO_IFP(addr, ifp) \ + /* struct in_addr addr; */ \ + /* struct ifnet *ifp; */ \ +{ \ + register struct in_ifaddr *ia; \ +\ + for (ia = TAILQ_FIRST(&in_ifaddrhead); \ + ia != NULL && ((ia->ia_ifp->if_flags & IFF_POINTOPOINT)? \ + IA_DSTSIN(ia):IA_SIN(ia))->sin_addr.s_addr != (addr).s_addr; \ + ia = TAILQ_NEXT(ia, ia_link)) \ + continue; \ + if (ia == NULL) \ + TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) \ + if (ia->ia_ifp->if_flags & IFF_POINTOPOINT && \ + IA_SIN(ia)->sin_addr.s_addr == (addr).s_addr) \ + break; \ + (ifp) = (ia == NULL) ? NULL : ia->ia_ifp; \ +} + +/* + * Macro for finding the internet address structure (in_ifaddr) corresponding + * to a given interface (ifnet structure). + */ +#define IFP_TO_IA(ifp, ia) \ + /* struct ifnet *ifp; */ \ + /* struct in_ifaddr *ia; */ \ +{ \ + for ((ia) = TAILQ_FIRST(&in_ifaddrhead); \ + (ia) != NULL && (ia)->ia_ifp != (ifp); \ + (ia) = TAILQ_NEXT((ia), ia_link)) \ + continue; \ +} +#endif + +/* + * This information should be part of the ifnet structure but we don't wish + * to change that - as it might break a number of things + */ + +struct router_info { + struct ifnet *rti_ifp; + int rti_type; /* type of router which is querier on this interface */ + int rti_time; /* # of slow timeouts since last old query */ + struct router_info *rti_next; +}; + +/* + * Internet multicast address structure. There is one of these for each IP + * multicast group to which this host belongs on a given network interface. + * For every entry on the interface's if_multiaddrs list which represents + * an IP multicast group, there is one of these structures. They are also + * kept on a system-wide list to make it easier to keep our legacy IGMP code + * compatible with the rest of the world (see IN_FIRST_MULTI et al, below). + */ +struct in_multi { + LIST_ENTRY(in_multi) inm_link; /* queue macro glue */ + struct in_addr inm_addr; /* IP multicast address, convenience */ + struct ifnet *inm_ifp; /* back pointer to ifnet */ + struct ifmultiaddr *inm_ifma; /* back pointer to ifmultiaddr */ + u_int inm_timer; /* IGMP membership report timer */ + u_int inm_state; /* state of the membership */ + struct router_info *inm_rti; /* router info*/ +}; + +#ifdef _KERNEL + +#ifdef SYSCTL_DECL +SYSCTL_DECL(_net_inet_ip); +SYSCTL_DECL(_net_inet_raw); +#endif + +extern LIST_HEAD(in_multihead, in_multi) in_multihead; + +/* + * Structure used by macros below to remember position when stepping through + * all of the in_multi records. + */ +struct in_multistep { + struct in_multi *i_inm; +}; + +/* + * Macro for looking up the in_multi record for a given IP multicast address + * on a given interface. If no matching record is found, "inm" is set null. + */ +#define IN_LOOKUP_MULTI(addr, ifp, inm) \ + /* struct in_addr addr; */ \ + /* struct ifnet *ifp; */ \ + /* struct in_multi *inm; */ \ +do { \ + register struct ifmultiaddr *ifma; \ +\ + TAILQ_FOREACH(ifma, &((ifp)->if_multiaddrs), ifma_link) { \ + if (ifma->ifma_addr->sa_family == AF_INET \ + && ((struct sockaddr_in *)ifma->ifma_addr)->sin_addr.s_addr == \ + (addr).s_addr) \ + break; \ + } \ + (inm) = ifma ? ifma->ifma_protospec : 0; \ +} while(0) + +/* + * Macro to step through all of the in_multi records, one at a time. + * The current position is remembered in "step", which the caller must + * provide. IN_FIRST_MULTI(), below, must be called to initialize "step" + * and get the first record. Both macros return a NULL "inm" when there + * are no remaining records. + */ +#define IN_NEXT_MULTI(step, inm) \ + /* struct in_multistep step; */ \ + /* struct in_multi *inm; */ \ +do { \ + if (((inm) = (step).i_inm) != NULL) \ + (step).i_inm = LIST_NEXT((step).i_inm, inm_link); \ +} while(0) + +#define IN_FIRST_MULTI(step, inm) \ + /* struct in_multistep step; */ \ + /* struct in_multi *inm; */ \ +do { \ + (step).i_inm = LIST_FIRST(&in_multihead); \ + IN_NEXT_MULTI((step), (inm)); \ +} while(0) + +struct route; +struct in_multi *in_addmulti __P((struct in_addr *, struct ifnet *)); +void in_delmulti __P((struct in_multi *)); +int in_control __P((struct socket *, u_long, caddr_t, struct ifnet *, + struct proc *)); +void in_rtqdrain __P((void)); +void ip_input __P((struct mbuf *)); +int in_ifadown __P((struct ifaddr *ifa, int)); +void in_ifscrub __P((struct ifnet *, struct in_ifaddr *)); +int ipflow_fastforward __P((struct mbuf *)); +void ipflow_create __P((const struct route *, struct mbuf *)); +void ipflow_slowtimo __P((void)); + +#endif /* _KERNEL */ + +/* INET6 stuff */ +#include <netinet6/in6_var.h> + +#endif /* _NETINET_IN_VAR_H_ */ diff --git a/sys/netinet/ip.h b/sys/netinet/ip.h new file mode 100644 index 0000000..7f92ba7 --- /dev/null +++ b/sys/netinet/ip.h @@ -0,0 +1,188 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ip.h 8.2 (Berkeley) 6/1/94 + * $FreeBSD$ + */ + +#ifndef _NETINET_IP_H_ +#define _NETINET_IP_H_ + +/* + * Definitions for internet protocol version 4. + * Per RFC 791, September 1981. + */ +#define IPVERSION 4 + +/* + * Structure of an internet header, naked of options. + */ +struct ip { +#ifdef _IP_VHL + u_char ip_vhl; /* version << 4 | header length >> 2 */ +#else +#if BYTE_ORDER == LITTLE_ENDIAN + u_int ip_hl:4, /* header length */ + ip_v:4; /* version */ +#endif +#if BYTE_ORDER == BIG_ENDIAN + u_int ip_v:4, /* version */ + ip_hl:4; /* header length */ +#endif +#endif /* not _IP_VHL */ + u_char ip_tos; /* type of service */ + u_short ip_len; /* total length */ + u_short ip_id; /* identification */ + u_short ip_off; /* fragment offset field */ +#define IP_RF 0x8000 /* reserved fragment flag */ +#define IP_DF 0x4000 /* dont fragment flag */ +#define IP_MF 0x2000 /* more fragments flag */ +#define IP_OFFMASK 0x1fff /* mask for fragmenting bits */ + u_char ip_ttl; /* time to live */ + u_char ip_p; /* protocol */ + u_short ip_sum; /* checksum */ + struct in_addr ip_src,ip_dst; /* source and dest address */ +}; + +#ifdef _IP_VHL +#define IP_MAKE_VHL(v, hl) ((v) << 4 | (hl)) +#define IP_VHL_HL(vhl) ((vhl) & 0x0f) +#define IP_VHL_V(vhl) ((vhl) >> 4) +#define IP_VHL_BORING 0x45 +#endif + +#define IP_MAXPACKET 65535 /* maximum packet size */ + +/* + * Definitions for IP type of service (ip_tos) + */ +#define IPTOS_LOWDELAY 0x10 +#define IPTOS_THROUGHPUT 0x08 +#define IPTOS_RELIABILITY 0x04 +#define IPTOS_MINCOST 0x02 +/* ECN bits proposed by Sally Floyd */ +#define IPTOS_CE 0x01 /* congestion experienced */ +#define IPTOS_ECT 0x02 /* ECN-capable transport */ + + +/* + * Definitions for IP precedence (also in ip_tos) (hopefully unused) + */ +#define IPTOS_PREC_NETCONTROL 0xe0 +#define IPTOS_PREC_INTERNETCONTROL 0xc0 +#define IPTOS_PREC_CRITIC_ECP 0xa0 +#define IPTOS_PREC_FLASHOVERRIDE 0x80 +#define IPTOS_PREC_FLASH 0x60 +#define IPTOS_PREC_IMMEDIATE 0x40 +#define IPTOS_PREC_PRIORITY 0x20 +#define IPTOS_PREC_ROUTINE 0x00 + +/* + * Definitions for options. + */ +#define IPOPT_COPIED(o) ((o)&0x80) +#define IPOPT_CLASS(o) ((o)&0x60) +#define IPOPT_NUMBER(o) ((o)&0x1f) + +#define IPOPT_CONTROL 0x00 +#define IPOPT_RESERVED1 0x20 +#define IPOPT_DEBMEAS 0x40 +#define IPOPT_RESERVED2 0x60 + +#define IPOPT_EOL 0 /* end of option list */ +#define IPOPT_NOP 1 /* no operation */ + +#define IPOPT_RR 7 /* record packet route */ +#define IPOPT_TS 68 /* timestamp */ +#define IPOPT_SECURITY 130 /* provide s,c,h,tcc */ +#define IPOPT_LSRR 131 /* loose source route */ +#define IPOPT_SATID 136 /* satnet id */ +#define IPOPT_SSRR 137 /* strict source route */ +#define IPOPT_RA 148 /* router alert */ + +/* + * Offsets to fields in options other than EOL and NOP. + */ +#define IPOPT_OPTVAL 0 /* option ID */ +#define IPOPT_OLEN 1 /* option length */ +#define IPOPT_OFFSET 2 /* offset within option */ +#define IPOPT_MINOFF 4 /* min value of above */ + +/* + * Time stamp option structure. + */ +struct ip_timestamp { + u_char ipt_code; /* IPOPT_TS */ + u_char ipt_len; /* size of structure (variable) */ + u_char ipt_ptr; /* index of current entry */ +#if BYTE_ORDER == LITTLE_ENDIAN + u_int ipt_flg:4, /* flags, see below */ + ipt_oflw:4; /* overflow counter */ +#endif +#if BYTE_ORDER == BIG_ENDIAN + u_int ipt_oflw:4, /* overflow counter */ + ipt_flg:4; /* flags, see below */ +#endif + union ipt_timestamp { + n_long ipt_time[1]; + struct ipt_ta { + struct in_addr ipt_addr; + n_long ipt_time; + } ipt_ta[1]; + } ipt_timestamp; +}; + +/* flag bits for ipt_flg */ +#define IPOPT_TS_TSONLY 0 /* timestamps only */ +#define IPOPT_TS_TSANDADDR 1 /* timestamps and addresses */ +#define IPOPT_TS_PRESPEC 3 /* specified modules only */ + +/* bits for security (not byte swapped) */ +#define IPOPT_SECUR_UNCLASS 0x0000 +#define IPOPT_SECUR_CONFID 0xf135 +#define IPOPT_SECUR_EFTO 0x789a +#define IPOPT_SECUR_MMMM 0xbc4d +#define IPOPT_SECUR_RESTR 0xaf13 +#define IPOPT_SECUR_SECRET 0xd788 +#define IPOPT_SECUR_TOPSECRET 0x6bc5 + +/* + * Internet implementation parameters. + */ +#define MAXTTL 255 /* maximum time to live (seconds) */ +#define IPDEFTTL 64 /* default ttl, from RFC 1340 */ +#define IPFRAGTTL 60 /* time to live for frags, slowhz */ +#define IPTTLDEC 1 /* subtracted when forwarding */ + +#define IP_MSS 576 /* default maximum segment size */ + +#endif diff --git a/sys/netinet/ip6.h b/sys/netinet/ip6.h new file mode 100644 index 0000000..77d1ab6 --- /dev/null +++ b/sys/netinet/ip6.h @@ -0,0 +1,296 @@ +/* $FreeBSD$ */ +/* $KAME: ip6.h,v 1.9 2000/07/02 21:01:32 itojun Exp $ */ + +/* + * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ip.h 8.1 (Berkeley) 6/10/93 + */ + +#ifndef _NETINET_IP6_H_ +#define _NETINET_IP6_H_ + +/* + * Definition for internet protocol version 6. + * RFC 2460 + */ + +struct ip6_hdr { + union { + struct ip6_hdrctl { + u_int32_t ip6_un1_flow; /* 20 bits of flow-ID */ + u_int16_t ip6_un1_plen; /* payload length */ + u_int8_t ip6_un1_nxt; /* next header */ + u_int8_t ip6_un1_hlim; /* hop limit */ + } ip6_un1; + u_int8_t ip6_un2_vfc; /* 4 bits version, top 4 bits class */ + } ip6_ctlun; + struct in6_addr ip6_src; /* source address */ + struct in6_addr ip6_dst; /* destination address */ +}; + +#define ip6_vfc ip6_ctlun.ip6_un2_vfc +#define ip6_flow ip6_ctlun.ip6_un1.ip6_un1_flow +#define ip6_plen ip6_ctlun.ip6_un1.ip6_un1_plen +#define ip6_nxt ip6_ctlun.ip6_un1.ip6_un1_nxt +#define ip6_hlim ip6_ctlun.ip6_un1.ip6_un1_hlim +#define ip6_hops ip6_ctlun.ip6_un1.ip6_un1_hlim + +#define IPV6_VERSION 0x60 +#define IPV6_VERSION_MASK 0xf0 + +#if BYTE_ORDER == BIG_ENDIAN +#define IPV6_FLOWINFO_MASK 0x0fffffff /* flow info (28 bits) */ +#define IPV6_FLOWLABEL_MASK 0x000fffff /* flow label (20 bits) */ +#else +#if BYTE_ORDER == LITTLE_ENDIAN +#define IPV6_FLOWINFO_MASK 0xffffff0f /* flow info (28 bits) */ +#define IPV6_FLOWLABEL_MASK 0xffff0f00 /* flow label (20 bits) */ +#endif /* LITTLE_ENDIAN */ +#endif +/* ECN bits proposed by Sally Floyd */ +#define IP6TOS_CE 0x01 /* congestion experienced */ +#define IP6TOS_ECT 0x02 /* ECN-capable transport */ + +/* + * Extension Headers + */ + +struct ip6_ext { + u_char ip6e_nxt; + u_char ip6e_len; +}; + +/* Hop-by-Hop options header */ +/* XXX should we pad it to force alignment on an 8-byte boundary? */ +struct ip6_hbh { + u_int8_t ip6h_nxt; /* next header */ + u_int8_t ip6h_len; /* length in units of 8 octets */ + /* followed by options */ +}; + +/* Destination options header */ +/* XXX should we pad it to force alignment on an 8-byte boundary? */ +struct ip6_dest { + u_int8_t ip6d_nxt; /* next header */ + u_int8_t ip6d_len; /* length in units of 8 octets */ + /* followed by options */ +}; + +/* Option types and related macros */ +#define IP6OPT_PAD1 0x00 /* 00 0 00000 */ +#define IP6OPT_PADN 0x01 /* 00 0 00001 */ +#define IP6OPT_JUMBO 0xC2 /* 11 0 00010 = 194 */ +#define IP6OPT_JUMBO_LEN 6 +#define IP6OPT_RTALERT 0x05 /* 00 0 00101 */ +#define IP6OPT_RTALERT_LEN 4 +#define IP6OPT_RTALERT_MLD 0 /* Datagram contains an MLD message */ +#define IP6OPT_RTALERT_RSVP 1 /* Datagram contains an RSVP message */ +#define IP6OPT_RTALERT_ACTNET 2 /* contains an Active Networks msg */ +#define IP6OPT_MINLEN 2 + +#define IP6OPT_TYPE(o) ((o) & 0xC0) +#define IP6OPT_TYPE_SKIP 0x00 +#define IP6OPT_TYPE_DISCARD 0x40 +#define IP6OPT_TYPE_FORCEICMP 0x80 +#define IP6OPT_TYPE_ICMP 0xC0 + +#define IP6OPT_MUTABLE 0x20 + +/* Routing header */ +struct ip6_rthdr { + u_int8_t ip6r_nxt; /* next header */ + u_int8_t ip6r_len; /* length in units of 8 octets */ + u_int8_t ip6r_type; /* routing type */ + u_int8_t ip6r_segleft; /* segments left */ + /* followed by routing type specific data */ +}; + +/* Type 0 Routing header */ +struct ip6_rthdr0 { + u_int8_t ip6r0_nxt; /* next header */ + u_int8_t ip6r0_len; /* length in units of 8 octets */ + u_int8_t ip6r0_type; /* always zero */ + u_int8_t ip6r0_segleft; /* segments left */ + u_int8_t ip6r0_reserved; /* reserved field */ + u_int8_t ip6r0_slmap[3]; /* strict/loose bit map */ + struct in6_addr ip6r0_addr[1]; /* up to 23 addresses */ +}; + +/* Fragment header */ +struct ip6_frag { + u_int8_t ip6f_nxt; /* next header */ + u_int8_t ip6f_reserved; /* reserved field */ + u_int16_t ip6f_offlg; /* offset, reserved, and flag */ + u_int32_t ip6f_ident; /* identification */ +}; + +#if BYTE_ORDER == BIG_ENDIAN +#define IP6F_OFF_MASK 0xfff8 /* mask out offset from _offlg */ +#define IP6F_RESERVED_MASK 0x0006 /* reserved bits in ip6f_offlg */ +#define IP6F_MORE_FRAG 0x0001 /* more-fragments flag */ +#else /* BYTE_ORDER == LITTLE_ENDIAN */ +#define IP6F_OFF_MASK 0xf8ff /* mask out offset from _offlg */ +#define IP6F_RESERVED_MASK 0x0600 /* reserved bits in ip6f_offlg */ +#define IP6F_MORE_FRAG 0x0100 /* more-fragments flag */ +#endif /* BYTE_ORDER == LITTLE_ENDIAN */ + +/* + * Internet implementation parameters. + */ +#define IPV6_MAXHLIM 255 /* maximun hoplimit */ +#define IPV6_DEFHLIM 64 /* default hlim */ +#define IPV6_FRAGTTL 120 /* ttl for fragment packets, in slowtimo tick */ +#define IPV6_HLIMDEC 1 /* subtracted when forwaeding */ + +#define IPV6_MMTU 1280 /* minimal MTU and reassembly. 1024 + 256 */ +#define IPV6_MAXPACKET 65535 /* ip6 max packet size without Jumbo payload*/ + +#ifdef _KERNEL +/* + * IP6_EXTHDR_CHECK ensures that region between the IP6 header and the + * target header (including IPv6 itself, extension headers and + * TCP/UDP/ICMP6 headers) are continuous. KAME requires drivers + * to store incoming data into one internal mbuf or one or more external + * mbufs(never into two or more internal mbufs). Thus, the third case is + * supposed to never be matched but is prepared just in case. + */ + +#define IP6_EXTHDR_CHECK(m, off, hlen, ret) \ +do { \ + if ((m)->m_next != NULL) { \ + if (((m)->m_flags & M_LOOP) && \ + ((m)->m_len < (off) + (hlen)) && \ + (((m) = m_pullup((m), (off) + (hlen))) == NULL)) { \ + ip6stat.ip6s_exthdrtoolong++; \ + return ret; \ + } else if ((m)->m_flags & M_EXT) { \ + if ((m)->m_len < (off) + (hlen)) { \ + ip6stat.ip6s_exthdrtoolong++; \ + m_freem(m); \ + return ret; \ + } \ + } else { \ + if ((m)->m_len < (off) + (hlen)) { \ + ip6stat.ip6s_exthdrtoolong++; \ + m_freem(m); \ + return ret; \ + } \ + } \ + } else { \ + if ((m)->m_len < (off) + (hlen)) { \ + ip6stat.ip6s_tooshort++; \ + in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated); \ + m_freem(m); \ + return ret; \ + } \ + } \ +} while (0) + +/* + * IP6_EXTHDR_GET ensures that intermediate protocol header (from "off" to + * "len") is located in single mbuf, on contiguous memory region. + * The pointer to the region will be returned to pointer variable "val", + * with type "typ". + * IP6_EXTHDR_GET0 does the same, except that it aligns the structure at the + * very top of mbuf. GET0 is likely to make memory copy than GET. + * + * XXX we're now testing this, needs m_pulldown() + */ +#define IP6_EXTHDR_GET(val, typ, m, off, len) \ +do { \ + struct mbuf *t; \ + int tmp; \ + if ((m)->m_len >= (off) + (len)) \ + (val) = (typ)(mtod((m), caddr_t) + (off)); \ + else { \ + t = m_pulldown((m), (off), (len), &tmp); \ + if (t) { \ + if (t->m_len < tmp + (len)) \ + panic("m_pulldown malfunction"); \ + (val) = (typ)(mtod(t, caddr_t) + tmp); \ + } else { \ + (val) = (typ)NULL; \ + (m) = NULL; \ + } \ + } \ +} while (0) + +#define IP6_EXTHDR_GET0(val, typ, m, off, len) \ +do { \ + struct mbuf *t; \ + if ((off) == 0) \ + (val) = (typ)mtod(m, caddr_t); \ + else { \ + t = m_pulldown((m), (off), (len), NULL); \ + if (t) { \ + if (t->m_len < (len)) \ + panic("m_pulldown malfunction"); \ + (val) = (typ)mtod(t, caddr_t); \ + } else { \ + (val) = (typ)NULL; \ + (m) = NULL; \ + } \ + } \ +} while (0) +#endif /*_KERNEL*/ + +#endif /* not _NETINET_IP6_H_ */ diff --git a/sys/netinet/ip_auth.c b/sys/netinet/ip_auth.c new file mode 100644 index 0000000..2931cc6 --- /dev/null +++ b/sys/netinet/ip_auth.c @@ -0,0 +1,543 @@ +/* + * Copyright (C) 1998-2000 by Darren Reed & Guido van Rooij. + * + * Redistribution and use in source and binary forms are permitted + * provided that this notice is preserved and due credit is given + * to the original author and the contributors. + */ +#if !defined(lint) +/*static const char rcsid[] = "@(#)$Id: ip_auth.c,v 2.1.2.2 2000/01/16 10:12:14 darrenr Exp $";*/ +static const char rcsid[] = "@(#)$FreeBSD$"; +#endif + +#include <sys/errno.h> +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/file.h> +#if !defined(_KERNEL) && !defined(KERNEL) +# include <stdio.h> +# include <stdlib.h> +# include <string.h> +#endif +#if (defined(KERNEL) || defined(_KERNEL)) && (__FreeBSD_version >= 220000) +# include <sys/filio.h> +# include <sys/fcntl.h> +#else +# include <sys/ioctl.h> +#endif +#include <sys/uio.h> +#ifndef linux +# include <sys/protosw.h> +#endif +#include <sys/socket.h> +#if (defined(_KERNEL) || defined(KERNEL)) && !defined(linux) +# include <sys/systm.h> +#endif +#if !defined(__SVR4) && !defined(__svr4__) +# ifndef linux +# include <sys/mbuf.h> +# endif +#else +# include <sys/filio.h> +# include <sys/byteorder.h> +# ifdef _KERNEL +# include <sys/dditypes.h> +# endif +# include <sys/stream.h> +# include <sys/kmem.h> +#endif +#if (_BSDI_VERSION >= 199802) || (__FreeBSD_version >= 400000) +# include <sys/queue.h> +#endif +#if defined(__NetBSD__) || defined(__OpenBSD__) || defined(bsdi) +# include <machine/cpu.h> +#endif +#include <net/if.h> +#ifdef sun +# include <net/af.h> +#endif +#include <net/route.h> +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#ifndef KERNEL +# define KERNEL +# define NOT_KERNEL +#endif +#ifndef linux +# include <netinet/ip_var.h> +#endif +#ifdef NOT_KERNEL +# undef KERNEL +#endif +#ifdef __sgi +# ifdef IFF_DRVRLOCK /* IRIX6 */ +# include <sys/hashing.h> +# endif +#endif +#include <netinet/tcp.h> +#if defined(__sgi) && !defined(IFF_DRVRLOCK) /* IRIX < 6 */ +extern struct ifqueue ipintrq; /* ip packet input queue */ +#else +# ifndef linux +# if __FreeBSD_version >= 300000 +# include <net/if_var.h> +# endif +# include <netinet/in_var.h> +# include <netinet/tcp_fsm.h> +# endif +#endif +#include <netinet/udp.h> +#include <netinet/ip_icmp.h> +#include "netinet/ip_compat.h" +#include <netinet/tcpip.h> +#include "netinet/ip_fil.h" +#include "netinet/ip_auth.h" +#if !SOLARIS && !defined(linux) +# include <net/netisr.h> +# ifdef __FreeBSD__ +# include <machine/cpufunc.h> +# endif +#endif +#if (__FreeBSD_version >= 300000) +# include <sys/malloc.h> +# if (defined(_KERNEL) || defined(KERNEL)) && !defined(IPFILTER_LKM) +# include <sys/libkern.h> +# include <sys/systm.h> +# endif +#endif + + + +#if (SOLARIS || defined(__sgi)) && defined(_KERNEL) +extern KRWLOCK_T ipf_auth; +extern kmutex_t ipf_authmx; +# if SOLARIS +extern kcondvar_t ipfauthwait; +# endif +#endif +#ifdef linux +static struct wait_queue *ipfauthwait = NULL; +#endif + +int fr_authsize = FR_NUMAUTH; +int fr_authused = 0; +int fr_defaultauthage = 600; +int fr_auth_lock = 0; +fr_authstat_t fr_authstats; +static frauth_t fr_auth[FR_NUMAUTH]; +mb_t *fr_authpkts[FR_NUMAUTH]; +static int fr_authstart = 0, fr_authend = 0, fr_authnext = 0; +static frauthent_t *fae_list = NULL; +frentry_t *ipauth = NULL; + + +/* + * Check if a packet has authorization. If the packet is found to match an + * authorization result and that would result in a feedback loop (i.e. it + * will end up returning FR_AUTH) then return FR_BLOCK instead. + */ +u_32_t fr_checkauth(ip, fin) +ip_t *ip; +fr_info_t *fin; +{ + u_short id = ip->ip_id; + u_32_t pass; + int i; + + if (fr_auth_lock) + return 0; + + READ_ENTER(&ipf_auth); + for (i = fr_authstart; i != fr_authend; ) { + /* + * index becomes -2 only after an SIOCAUTHW. Check this in + * case the same packet gets sent again and it hasn't yet been + * auth'd. + */ + if ((fr_auth[i].fra_index == -2) && + (id == fr_auth[i].fra_info.fin_id) && + !bcmp((char *)fin,(char *)&fr_auth[i].fra_info,FI_CSIZE)) { + /* + * Avoid feedback loop. + */ + if (!(pass = fr_auth[i].fra_pass) || (pass & FR_AUTH)) + pass = FR_BLOCK; + RWLOCK_EXIT(&ipf_auth); + WRITE_ENTER(&ipf_auth); + fr_authstats.fas_hits++; + fr_auth[i].fra_index = -1; + fr_authused--; + if (i == fr_authstart) { + while (fr_auth[i].fra_index == -1) { + i++; + if (i == FR_NUMAUTH) + i = 0; + fr_authstart = i; + if (i == fr_authend) + break; + } + if (fr_authstart == fr_authend) { + fr_authnext = 0; + fr_authstart = fr_authend = 0; + } + } + RWLOCK_EXIT(&ipf_auth); + return pass; + } + i++; + if (i == FR_NUMAUTH) + i = 0; + } + fr_authstats.fas_miss++; + RWLOCK_EXIT(&ipf_auth); + return 0; +} + + +/* + * Check if we have room in the auth array to hold details for another packet. + * If we do, store it and wake up any user programs which are waiting to + * hear about these events. + */ +int fr_newauth(m, fin, ip) +mb_t *m; +fr_info_t *fin; +ip_t *ip; +{ +#if defined(_KERNEL) && SOLARIS + qif_t *qif = fin->fin_qif; +#endif + int i; + + if (fr_auth_lock) + return 0; + + WRITE_ENTER(&ipf_auth); + if (fr_authstart > fr_authend) { + fr_authstats.fas_nospace++; + RWLOCK_EXIT(&ipf_auth); + return 0; + } else { + if ((fr_authstart == 0) && (fr_authend == FR_NUMAUTH - 1)) { + fr_authstats.fas_nospace++; + RWLOCK_EXIT(&ipf_auth); + return 0; + } + } + + fr_authstats.fas_added++; + fr_authused++; + i = fr_authend++; + if (fr_authend == FR_NUMAUTH) + fr_authend = 0; + RWLOCK_EXIT(&ipf_auth); + fr_auth[i].fra_index = i; + fr_auth[i].fra_pass = 0; + fr_auth[i].fra_age = fr_defaultauthage; + bcopy((char *)fin, (char *)&fr_auth[i].fra_info, sizeof(*fin)); +#if !defined(sparc) && !defined(m68k) + /* + * No need to copyback here as we want to undo the changes, not keep + * them. + */ +# if SOLARIS && defined(_KERNEL) + if ((ip == (ip_t *)m->b_rptr) && (ip->ip_v == 4)) +# endif + { + register u_short bo; + + bo = ip->ip_len; + ip->ip_len = htons(bo); +# if !SOLARIS && !defined(__NetBSD__) && !defined(__FreeBSD__) + /* 4.4BSD converts this ip_input.c, but I don't in solaris.c */ + bo = ip->ip_id; + ip->ip_id = htons(bo); +# endif + bo = ip->ip_off; + ip->ip_off = htons(bo); + } +#endif +#if SOLARIS && defined(_KERNEL) + m->b_rptr -= qif->qf_off; + fr_authpkts[i] = *(mblk_t **)fin->fin_mp; + fr_auth[i].fra_q = qif->qf_q; + cv_signal(&ipfauthwait); +#else + fr_authpkts[i] = m; +# if defined(linux) && defined(_KERNEL) + wake_up_interruptible(&ipfauthwait); +# else + WAKEUP(&fr_authnext); +# endif +#endif + return 1; +} + + +int fr_auth_ioctl(data, cmd, fr, frptr) +caddr_t data; +#if defined(__NetBSD__) || defined(__OpenBSD__) || (FreeBSD_version >= 300003) +u_long cmd; +#else +int cmd; +#endif +frentry_t *fr, **frptr; +{ + mb_t *m; + frauth_t auth, *au = &auth; + frauthent_t *fae, **faep; + int i, error = 0; + + switch (cmd) + { + case SIOCSTLCK : + error = fr_lock(data, &fr_auth_lock); + break; + case SIOCINIFR : + case SIOCRMIFR : + case SIOCADIFR : + error = EINVAL; + break; + case SIOCINAFR : + error = EINVAL; + break; + case SIOCRMAFR : + case SIOCADAFR : + for (faep = &fae_list; (fae = *faep); ) + if (&fae->fae_fr == fr) + break; + else + faep = &fae->fae_next; + if (cmd == SIOCRMAFR) { + if (!fae) + error = ESRCH; + else { + WRITE_ENTER(&ipf_auth); + *faep = fae->fae_next; + *frptr = fr->fr_next; + RWLOCK_EXIT(&ipf_auth); + KFREE(fae); + } + } else { + KMALLOC(fae, frauthent_t *); + if (fae != NULL) { + bcopy((char *)fr, (char *)&fae->fae_fr, + sizeof(*fr)); + WRITE_ENTER(&ipf_auth); + fae->fae_age = fr_defaultauthage; + fae->fae_fr.fr_hits = 0; + fae->fae_fr.fr_next = *frptr; + *frptr = &fae->fae_fr; + fae->fae_next = *faep; + *faep = fae; + ipauth = &fae_list->fae_fr; + RWLOCK_EXIT(&ipf_auth); + } else + error = ENOMEM; + } + break; + case SIOCATHST: + READ_ENTER(&ipf_auth); + fr_authstats.fas_faelist = fae_list; + RWLOCK_EXIT(&ipf_auth); + error = IWCOPYPTR((char *)&fr_authstats, data, + sizeof(fr_authstats)); + break; + case SIOCAUTHW: +fr_authioctlloop: + READ_ENTER(&ipf_auth); + if ((fr_authnext != fr_authend) && fr_authpkts[fr_authnext]) { + error = IWCOPYPTR((char *)&fr_auth[fr_authnext], data, + sizeof(frauth_t)); + RWLOCK_EXIT(&ipf_auth); + if (error) + break; + WRITE_ENTER(&ipf_auth); + fr_authnext++; + if (fr_authnext == FR_NUMAUTH) + fr_authnext = 0; + RWLOCK_EXIT(&ipf_auth); + return 0; + } +#ifdef _KERNEL +# if SOLARIS + mutex_enter(&ipf_authmx); + if (!cv_wait_sig(&ipfauthwait, &ipf_authmx)) { + mutex_exit(&ipf_authmx); + return EINTR; + } + mutex_exit(&ipf_authmx); +# else +# ifdef linux + interruptible_sleep_on(&ipfauthwait); + if (current->signal & ~current->blocked) + error = -EINTR; +# else + error = SLEEP(&fr_authnext, "fr_authnext"); +# endif +# endif +#endif + RWLOCK_EXIT(&ipf_auth); + if (!error) + goto fr_authioctlloop; + break; + case SIOCAUTHR: + error = IRCOPYPTR(data, (caddr_t)&auth, sizeof(auth)); + if (error) + return error; + WRITE_ENTER(&ipf_auth); + i = au->fra_index; + if ((i < 0) || (i > FR_NUMAUTH) || + (fr_auth[i].fra_info.fin_id != au->fra_info.fin_id)) { + RWLOCK_EXIT(&ipf_auth); + return EINVAL; + } + m = fr_authpkts[i]; + fr_auth[i].fra_index = -2; + fr_auth[i].fra_pass = au->fra_pass; + fr_authpkts[i] = NULL; +#ifdef _KERNEL + RWLOCK_EXIT(&ipf_auth); +# ifndef linux + if (m && au->fra_info.fin_out) { +# if SOLARIS + error = fr_qout(fr_auth[i].fra_q, m); +# else /* SOLARIS */ +# if (_BSDI_VERSION >= 199802) || defined(__OpenBSD__) + error = ip_output(m, NULL, NULL, IP_FORWARDING, NULL, + NULL); +# else + error = ip_output(m, NULL, NULL, IP_FORWARDING, NULL); +# endif +# endif /* SOLARIS */ + if (error) + fr_authstats.fas_sendfail++; + else + fr_authstats.fas_sendok++; + } else if (m) { +# if SOLARIS + error = fr_qin(fr_auth[i].fra_q, m); +# else /* SOLARIS */ + if (! IF_HANDOFF(&ipintrq, m, NULL)) + error = ENOBUFS; + else + schednetisr(NETISR_IP); +# endif /* SOLARIS */ + if (error) + fr_authstats.fas_quefail++; + else + fr_authstats.fas_queok++; + } else + error = EINVAL; +# endif +# if SOLARIS + if (error) + error = EINVAL; +# else + /* + * If we experience an error which will result in the packet + * not being processed, make sure we advance to the next one. + */ + if (error == ENOBUFS) { + fr_authused--; + fr_auth[i].fra_index = -1; + fr_auth[i].fra_pass = 0; + if (i == fr_authstart) { + while (fr_auth[i].fra_index == -1) { + i++; + if (i == FR_NUMAUTH) + i = 0; + fr_authstart = i; + if (i == fr_authend) + break; + } + if (fr_authstart == fr_authend) { + fr_authnext = 0; + fr_authstart = fr_authend = 0; + } + } + } +# endif +#endif /* _KERNEL */ + break; + default : + error = EINVAL; + break; + } + return error; +} + + +#ifdef _KERNEL +/* + * Free all network buffer memory used to keep saved packets. + */ +void fr_authunload() +{ + register int i; + register frauthent_t *fae, **faep; + mb_t *m; + + WRITE_ENTER(&ipf_auth); + for (i = 0; i < FR_NUMAUTH; i++) { + if ((m = fr_authpkts[i])) { + FREE_MB_T(m); + fr_authpkts[i] = NULL; + fr_auth[i].fra_index = -1; + } + } + + + for (faep = &fae_list; (fae = *faep); ) { + *faep = fae->fae_next; + KFREE(fae); + } + ipauth = NULL; + RWLOCK_EXIT(&ipf_auth); +} + + +/* + * Slowly expire held auth records. Timeouts are set + * in expectation of this being called twice per second. + */ +void fr_authexpire() +{ + register int i; + register frauth_t *fra; + register frauthent_t *fae, **faep; + mb_t *m; +#if !SOLARIS + int s; +#endif + + if (fr_auth_lock) + return; + + SPL_NET(s); + WRITE_ENTER(&ipf_auth); + for (i = 0, fra = fr_auth; i < FR_NUMAUTH; i++, fra++) { + if ((!--fra->fra_age) && (m = fr_authpkts[i])) { + FREE_MB_T(m); + fr_authpkts[i] = NULL; + fr_auth[i].fra_index = -1; + fr_authstats.fas_expire++; + fr_authused--; + } + } + + for (faep = &fae_list; (fae = *faep); ) { + if (!--fae->fae_age) { + *faep = fae->fae_next; + KFREE(fae); + fr_authstats.fas_expire++; + } else + faep = &fae->fae_next; + } + ipauth = &fae_list->fae_fr; + RWLOCK_EXIT(&ipf_auth); + SPL_X(s); +} +#endif diff --git a/sys/netinet/ip_auth.h b/sys/netinet/ip_auth.h new file mode 100644 index 0000000..2851c3d --- /dev/null +++ b/sys/netinet/ip_auth.h @@ -0,0 +1,63 @@ +/* + * Copyright (C) 1997-2000 by Darren Reed & Guido Van Rooij. + * + * Redistribution and use in source and binary forms are permitted + * provided that this notice is preserved and due credit is given + * to the original author and the contributors. + * + * $Id: ip_auth.h,v 2.1 1999/08/04 17:29:54 darrenr Exp $ + * $FreeBSD$ + * + */ +#ifndef __IP_AUTH_H__ +#define __IP_AUTH_H__ + +#define FR_NUMAUTH 32 + +typedef struct frauth { + int fra_age; + int fra_index; + u_32_t fra_pass; + fr_info_t fra_info; +#if SOLARIS + queue_t *fra_q; +#endif +} frauth_t; + +typedef struct frauthent { + struct frentry fae_fr; + struct frauthent *fae_next; + u_long fae_age; +} frauthent_t; + +typedef struct fr_authstat { + U_QUAD_T fas_hits; + U_QUAD_T fas_miss; + u_long fas_nospace; + u_long fas_added; + u_long fas_sendfail; + u_long fas_sendok; + u_long fas_queok; + u_long fas_quefail; + u_long fas_expire; + frauthent_t *fas_faelist; +} fr_authstat_t; + + +extern frentry_t *ipauth; +extern struct fr_authstat fr_authstats; +extern int fr_defaultauthage; +extern int fr_authsize; +extern int fr_authused; +extern int fr_auth_lock; +extern u_32_t fr_checkauth __P((ip_t *, fr_info_t *)); +extern void fr_authexpire __P((void)); +extern void fr_authunload __P((void)); +extern mb_t *fr_authpkts[]; +extern int fr_newauth __P((mb_t *, fr_info_t *, ip_t *)); +#if defined(__NetBSD__) || defined(__OpenBSD__) +extern int fr_auth_ioctl __P((caddr_t, u_long, frentry_t *, frentry_t **)); +#else +extern int fr_auth_ioctl __P((caddr_t, int, frentry_t *, frentry_t **)); +#endif +#endif /* __IP_AUTH_H__ */ diff --git a/sys/netinet/ip_compat.h b/sys/netinet/ip_compat.h new file mode 100644 index 0000000..aefd50d --- /dev/null +++ b/sys/netinet/ip_compat.h @@ -0,0 +1,1013 @@ +/* + * Copyright (C) 1993-2000 by Darren Reed. + * + * Redistribution and use in source and binary forms are permitted + * provided that this notice is preserved and due credit is given + * to the original author and the contributors. + * + * @(#)ip_compat.h 1.8 1/14/96 + * $Id: ip_compat.h,v 2.26.2.9 2001/01/14 14:58:01 darrenr Exp $ + * $FreeBSD$ + */ + +#ifndef __IP_COMPAT_H__ +#define __IP_COMPAT_H__ + +#ifndef __P +# ifdef __STDC__ +# define __P(x) x +# else +# define __P(x) () +# endif +#endif +#ifndef __STDC__ +# undef const +# define const +#endif + +#ifndef SOLARIS +#define SOLARIS (defined(sun) && (defined(__svr4__) || defined(__SVR4))) +#endif +#if SOLARIS2 >= 8 +# ifndef USE_INET6 +# define USE_INET6 +# endif +#endif + +#if defined(_KERNEL) || defined(KERNEL) || defined(__KERNEL__) +# undef KERNEL +# undef _KERNEL +# undef __KERNEL__ +# define KERNEL +# define _KERNEL +# define __KERNEL__ +#endif + +#if defined(__SVR4) || defined(__svr4__) || defined(__sgi) +#define index strchr +# if !defined(KERNEL) +# define bzero(a,b) memset(a,0,b) +# define bcmp memcmp +# define bcopy(a,b,c) memmove(b,a,c) +# endif +#endif + +#ifndef offsetof +#define offsetof(t,m) (int)((&((t *)0L)->m)) +#endif + +#if defined(__sgi) || defined(bsdi) +struct ether_addr { + u_char ether_addr_octet[6]; +}; +#endif + +#if defined(__sgi) && !defined(IPFILTER_LKM) +# ifdef __STDC__ +# define IPL_EXTERN(ep) ipfilter##ep +# else +# define IPL_EXTERN(ep) ipfilter/**/ep +# endif +#else +# ifdef __STDC__ +# define IPL_EXTERN(ep) ipl##ep +# else +# define IPL_EXTERN(ep) ipl/**/ep +# endif +#endif + +#ifdef linux +# include <sys/sysmacros.h> +#endif +#if SOLARIS +# define MTYPE(m) ((m)->b_datap->db_type) +# include <sys/isa_defs.h> +# include <sys/ioccom.h> +# include <sys/sysmacros.h> +# include <sys/kmem.h> +/* + * because Solaris 2 defines these in two places :-/ + */ +# undef IPOPT_EOL +# undef IPOPT_NOP +# undef IPOPT_LSRR +# undef IPOPT_RR +# undef IPOPT_SSRR +# ifndef KERNEL +# define _KERNEL +# undef RES_INIT +# if SOLARIS2 >= 8 +# include <netinet/ip6.h> +# endif +# include <inet/common.h> +# include <inet/ip.h> +# include <inet/ip_ire.h> +# undef _KERNEL +# else /* _KERNEL */ +# if SOLARIS2 >= 8 +# include <netinet/ip6.h> +# endif +# include <inet/common.h> +# include <inet/ip.h> +# include <inet/ip_ire.h> +# endif /* _KERNEL */ +# if SOLARIS2 >= 8 +# include <inet/ip_if.h> +# include <netinet/ip6.h> +# define ipif_local_addr ipif_lcl_addr +/* Only defined in private include file */ +# ifndef V4_PART_OF_V6 +# define V4_PART_OF_V6(v6) v6.s6_addr32[3] +# endif +# endif +#else +# if !defined(__sgi) +typedef int minor_t; +#endif +#endif /* SOLARIS */ +#define IPMINLEN(i, h) ((i)->ip_len >= ((i)->ip_hl * 4 + sizeof(struct h))) + +#if defined(__FreeBSD__) && (__FreeBSD__ >= 5) && defined(_KERNEL) +# include <machine/in_cksum.h> +#endif + +#ifndef IP_OFFMASK +#define IP_OFFMASK 0x1fff +#endif + +#if BSD > 199306 +# define USE_QUAD_T +# define U_QUAD_T u_quad_t +# define QUAD_T quad_t +#else /* BSD > 199306 */ +# define U_QUAD_T u_long +# define QUAD_T long +#endif /* BSD > 199306 */ + + +/* + * These operating systems already take care of the problem for us. + */ +#if defined(__NetBSD__) || defined(__OpenBSD__) || defined(__FreeBSD__) || \ + defined(__sgi) +typedef u_int32_t u_32_t; +# if defined(_KERNEL) && !defined(IPFILTER_LKM) +# if defined(__NetBSD_Version__) && (__NetBSD_Version__ >= 104110000) +# include "opt_inet.h" +# endif +# if defined(__FreeBSD_version) && (__FreeBSD_version >= 400000) && \ + !defined(KLD_MODULE) +# include "opt_inet6.h" +# endif +# ifdef INET6 +# define USE_INET6 +# endif +# endif +#else +/* + * Really, any arch where sizeof(long) != sizeof(int). + */ +# if defined(__alpha__) || defined(__alpha) || defined(_LP64) +typedef unsigned int u_32_t; +# else +# if SOLARIS2 >= 6 +typedef uint32_t u_32_t; +# else +typedef unsigned int u_32_t; +# endif +# endif +#endif /* __NetBSD__ || __OpenBSD__ || __FreeBSD__ || __sgi */ + +#ifdef USE_INET6 +# if defined(__NetBSD__) || defined(__OpenBSD__) || defined(__FreeBSD__) +# include <netinet/ip6.h> +# ifdef _KERNEL +# include <netinet6/ip6_var.h> +# endif +typedef struct ip6_hdr ip6_t; +# endif +union i6addr { + u_32_t i6[4]; + struct in_addr in4; + struct in6_addr in6; +}; +#else +union i6addr { + u_32_t i6[4]; + struct in_addr in4; +}; +#endif + +#define IP6CMP(a,b) bcmp((char *)&(a), (char *)&(b), sizeof(a)) +#define IP6EQ(a,b) (bcmp((char *)&(a), (char *)&(b), sizeof(a)) == 0) +#define IP6NEQ(a,b) (bcmp((char *)&(a), (char *)&(b), sizeof(a)) != 0) + +#ifndef MAX +#define MAX(a,b) (((a) > (b)) ? (a) : (b)) +#endif + +/* + * Security Options for Intenet Protocol (IPSO) as defined in RFC 1108. + * + * Basic Option + * + * 00000001 - (Reserved 4) + * 00111101 - Top Secret + * 01011010 - Secret + * 10010110 - Confidential + * 01100110 - (Reserved 3) + * 11001100 - (Reserved 2) + * 10101011 - Unclassified + * 11110001 - (Reserved 1) + */ +#define IPSO_CLASS_RES4 0x01 +#define IPSO_CLASS_TOPS 0x3d +#define IPSO_CLASS_SECR 0x5a +#define IPSO_CLASS_CONF 0x96 +#define IPSO_CLASS_RES3 0x66 +#define IPSO_CLASS_RES2 0xcc +#define IPSO_CLASS_UNCL 0xab +#define IPSO_CLASS_RES1 0xf1 + +#define IPSO_AUTH_GENSER 0x80 +#define IPSO_AUTH_ESI 0x40 +#define IPSO_AUTH_SCI 0x20 +#define IPSO_AUTH_NSA 0x10 +#define IPSO_AUTH_DOE 0x08 +#define IPSO_AUTH_UN 0x06 +#define IPSO_AUTH_FTE 0x01 + +/* + * IP option #defines + */ +/*#define IPOPT_RR 7 */ +#define IPOPT_ZSU 10 /* ZSU */ +#define IPOPT_MTUP 11 /* MTUP */ +#define IPOPT_MTUR 12 /* MTUR */ +#define IPOPT_ENCODE 15 /* ENCODE */ +/*#define IPOPT_TS 68 */ +#define IPOPT_TR 82 /* TR */ +/*#define IPOPT_SECURITY 130 */ +/*#define IPOPT_LSRR 131 */ +#define IPOPT_E_SEC 133 /* E-SEC */ +#define IPOPT_CIPSO 134 /* CIPSO */ +/*#define IPOPT_SATID 136 */ +#ifndef IPOPT_SID +# define IPOPT_SID IPOPT_SATID +#endif +/*#define IPOPT_SSRR 137 */ +#define IPOPT_ADDEXT 147 /* ADDEXT */ +#define IPOPT_VISA 142 /* VISA */ +#define IPOPT_IMITD 144 /* IMITD */ +#define IPOPT_EIP 145 /* EIP */ +#define IPOPT_FINN 205 /* FINN */ + + +#if defined(__FreeBSD__) && (defined(KERNEL) || defined(_KERNEL)) +# ifdef IPFILTER_LKM +# include <sys/param.h> +# define ACTUALLY_LKM_NOT_KERNEL +# else +# include <sys/param.h> +# endif +# if __FreeBSD__ < 3 +# include <machine/spl.h> +# else +# if __FreeBSD__ == 3 +# if defined(IPFILTER_LKM) && !defined(ACTUALLY_LKM_NOT_KERNEL) +# define ACTUALLY_LKM_NOT_KERNEL +# endif +# endif +# endif +#endif /* __FreeBSD__ && KERNEL */ + +/* + * Build some macros and #defines to enable the same code to compile anywhere + * Well, that's the idea, anyway :-) + */ +#if !SOLARIS || (SOLARIS2 < 6) || !defined(KERNEL) +# define ATOMIC_INCL ATOMIC_INC +# define ATOMIC_INC64 ATOMIC_INC +# define ATOMIC_INC32 ATOMIC_INC +# define ATOMIC_INC16 ATOMIC_INC +# define ATOMIC_DECL ATOMIC_DEC +# define ATOMIC_DEC64 ATOMIC_DEC +# define ATOMIC_DEC32 ATOMIC_DEC +# define ATOMIC_DEC16 ATOMIC_DEC +#endif +#ifdef __sgi +# define hz HZ +# include <sys/ksynch.h> +# define IPF_LOCK_PL plhi +# include <sys/sema.h> +#undef kmutex_t +typedef struct { + lock_t *l; + int pl; +} kmutex_t; +# undef MUTEX_INIT +# undef MUTEX_DESTROY +#endif +#ifdef KERNEL +# if SOLARIS +# if SOLARIS2 >= 6 +# include <sys/atomic.h> +# if SOLARIS2 == 6 +# define ATOMIC_INCL(x) atomic_add_long((uint32_t*)&(x), 1) +# define ATOMIC_DECL(x) atomic_add_long((uint32_t*)&(x), -1) +# else +# define ATOMIC_INCL(x) atomic_add_long(&(x), 1) +# define ATOMIC_DECL(x) atomic_add_long(&(x), -1) +# endif +# define ATOMIC_INC64(x) atomic_add_64((uint64_t*)&(x), 1) +# define ATOMIC_INC32(x) atomic_add_32((uint32_t*)&(x), 1) +# define ATOMIC_INC16(x) atomic_add_16((uint16_t*)&(x), 1) +# define ATOMIC_DEC64(x) atomic_add_64((uint64_t*)&(x), -1) +# define ATOMIC_DEC32(x) atomic_add_32((uint32_t*)&(x), -1) +# define ATOMIC_DEC16(x) atomic_add_16((uint16_t*)&(x), -1) +# else +# define ATOMIC_INC(x) { mutex_enter(&ipf_rw); (x)++; \ + mutex_exit(&ipf_rw); } +# define ATOMIC_DEC(x) { mutex_enter(&ipf_rw); (x)--; \ + mutex_exit(&ipf_rw); } +# endif +# define MUTEX_ENTER(x) mutex_enter(x) +# if 1 +# define KRWLOCK_T krwlock_t +# define READ_ENTER(x) rw_enter(x, RW_READER) +# define WRITE_ENTER(x) rw_enter(x, RW_WRITER) +# define RW_UPGRADE(x) { if (rw_tryupgrade(x) == 0) { \ + rw_exit(x); \ + rw_enter(x, RW_WRITER); } \ + } +# define MUTEX_DOWNGRADE(x) rw_downgrade(x) +# define RWLOCK_INIT(x, y, z) rw_init((x), (y), RW_DRIVER, (z)) +# define RWLOCK_EXIT(x) rw_exit(x) +# define RW_DESTROY(x) rw_destroy(x) +# else +# define KRWLOCK_T kmutex_t +# define READ_ENTER(x) mutex_enter(x) +# define WRITE_ENTER(x) mutex_enter(x) +# define MUTEX_DOWNGRADE(x) ; +# define RWLOCK_INIT(x, y, z) mutex_init((x), (y), MUTEX_DRIVER, (z)) +# define RWLOCK_EXIT(x) mutex_exit(x) +# define RW_DESTROY(x) mutex_destroy(x) +# endif +# define MUTEX_INIT(x, y, z) mutex_init((x), (y), MUTEX_DRIVER, (z)) +# define MUTEX_DESTROY(x) mutex_destroy(x) +# define MUTEX_EXIT(x) mutex_exit(x) +# define MTOD(m,t) (t)((m)->b_rptr) +# define IRCOPY(a,b,c) copyin((caddr_t)(a), (caddr_t)(b), (c)) +# define IWCOPY(a,b,c) copyout((caddr_t)(a), (caddr_t)(b), (c)) +# define IRCOPYPTR ircopyptr +# define IWCOPYPTR iwcopyptr +# define FREE_MB_T(m) freemsg(m) +# define SPL_NET(x) ; +# define SPL_IMP(x) ; +# undef SPL_X +# define SPL_X(x) ; +# ifdef sparc +# define ntohs(x) (x) +# define ntohl(x) (x) +# define htons(x) (x) +# define htonl(x) (x) +# endif /* sparc */ +# define KMALLOC(a,b) (a) = (b)kmem_alloc(sizeof(*(a)), KM_NOSLEEP) +# define KMALLOCS(a,b,c) (a) = (b)kmem_alloc((c), KM_NOSLEEP) +# define GET_MINOR(x) getminor(x) +typedef struct qif { + struct qif *qf_next; + ill_t *qf_ill; + kmutex_t qf_lock; + void *qf_iptr; + void *qf_optr; + queue_t *qf_in; + queue_t *qf_out; + struct qinit *qf_wqinfo; + struct qinit *qf_rqinfo; + struct qinit qf_wqinit; + struct qinit qf_rqinit; + mblk_t *qf_m; /* These three fields are for passing data up from */ + queue_t *qf_q; /* fr_qin and fr_qout to the packet processing. */ + size_t qf_off; + size_t qf_len; /* this field is used for in ipfr_fastroute */ + char qf_name[8]; + /* + * in case the ILL has disappeared... + */ + size_t qf_hl; /* header length */ + int qf_sap; +} qif_t; +extern ill_t *get_unit __P((char *, int)); +# define GETUNIT(n, v) get_unit(n, v) +# define IFNAME(x) ((ill_t *)x)->ill_name +# else /* SOLARIS */ +# if defined(__sgi) +# define ATOMIC_INC(x) { MUTEX_ENTER(&ipf_rw); \ + (x)++; MUTEX_EXIT(&ipf_rw); } +# define ATOMIC_DEC(x) { MUTEX_ENTER(&ipf_rw); \ + (x)--; MUTEX_EXIT(&ipf_rw); } +# define MUTEX_ENTER(x) (x)->pl = LOCK((x)->l, IPF_LOCK_PL); +# define KRWLOCK_T kmutex_t +# define READ_ENTER(x) MUTEX_ENTER(x) +# define WRITE_ENTER(x) MUTEX_ENTER(x) +# define RW_UPGRADE(x) ; +# define MUTEX_DOWNGRADE(x) ; +# define RWLOCK_EXIT(x) MUTEX_EXIT(x) +# define MUTEX_EXIT(x) UNLOCK((x)->l, (x)->pl); +# define MUTEX_INIT(x,y,z) (x)->l = LOCK_ALLOC((uchar_t)-1, IPF_LOCK_PL, (lkinfo_t *)-1, KM_NOSLEEP) +# define MUTEX_DESTROY(x) LOCK_DEALLOC((x)->l) +# else /* __sgi */ +# define ATOMIC_INC(x) (x)++ +# define ATOMIC_DEC(x) (x)-- +# define MUTEX_ENTER(x) ; +# define READ_ENTER(x) ; +# define WRITE_ENTER(x) ; +# define RW_UPGRADE(x) ; +# define MUTEX_DOWNGRADE(x) ; +# define RWLOCK_EXIT(x) ; +# define MUTEX_EXIT(x) ; +# define MUTEX_INIT(x,y,z) ; +# define MUTEX_DESTROY(x) ; +# endif /* __sgi */ +# ifndef linux +# define FREE_MB_T(m) m_freem(m) +# define MTOD(m,t) mtod(m,t) +# define IRCOPY(a,b,c) (bcopy((a), (b), (c)), 0) +# define IWCOPY(a,b,c) (bcopy((a), (b), (c)), 0) +# define IRCOPYPTR ircopyptr +# define IWCOPYPTR iwcopyptr +# endif /* !linux */ +# endif /* SOLARIS */ + +# ifdef sun +# if !SOLARIS +# include <sys/kmem_alloc.h> +# define GETUNIT(n, v) ifunit(n, IFNAMSIZ) +# define IFNAME(x) ((struct ifnet *)x)->if_name +# endif +# else +# ifndef linux +# define GETUNIT(n, v) ifunit(n) +# if (defined(NetBSD) && (NetBSD <= 1991011) && (NetBSD >= 199606)) || \ + (defined(OpenBSD) && (OpenBSD >= 199603)) +# define IFNAME(x) ((struct ifnet *)x)->if_xname +# else +# define IFNAME(x) ((struct ifnet *)x)->if_name +# endif +# endif +# endif /* sun */ + +# if defined(sun) && !defined(linux) || defined(__sgi) +# define UIOMOVE(a,b,c,d) uiomove((caddr_t)a,b,c,d) +# define SLEEP(id, n) sleep((id), PZERO+1) +# define WAKEUP(id) wakeup(id) +# define KFREE(x) kmem_free((char *)(x), sizeof(*(x))) +# define KFREES(x,s) kmem_free((char *)(x), (s)) +# if !SOLARIS +extern void m_copydata __P((struct mbuf *, int, int, caddr_t)); +extern void m_copyback __P((struct mbuf *, int, int, caddr_t)); +# endif +# ifdef __sgi +# include <sys/kmem.h> +# include <sys/ddi.h> +# define KMALLOC(a,b) (a) = (b)kmem_alloc(sizeof(*(a)), KM_NOSLEEP) +# define KMALLOCS(a,b,c) (a) = (b)kmem_alloc((c), KM_NOSLEEP) +# define GET_MINOR(x) getminor(x) +# else +# if !SOLARIS +# define KMALLOC(a,b) (a) = (b)new_kmem_alloc(sizeof(*(a)), \ + KMEM_NOSLEEP) +# define KMALLOCS(a,b,c) (a) = (b)new_kmem_alloc((c), KMEM_NOSLEEP) +# endif /* SOLARIS */ +# endif /* __sgi */ +# endif /* sun && !linux */ +# ifndef GET_MINOR +# define GET_MINOR(x) minor(x) +# endif +# if (BSD >= 199306) || defined(__FreeBSD__) +# include <vm/vm.h> +# if !defined(__FreeBSD__) || (defined (__FreeBSD__) && __FreeBSD__>=3) +# include <vm/vm_extern.h> +# include <sys/proc.h> +extern vm_map_t kmem_map; +# else /* !__FreeBSD__ || (__FreeBSD__ && __FreeBSD__>=3) */ +# include <vm/vm_kern.h> +# endif /* !__FreeBSD__ || (__FreeBSD__ && __FreeBSD__>=3) */ +# ifdef M_PFIL +# define KMALLOC(a, b) MALLOC((a), b, sizeof(*(a)), M_PFIL, M_NOWAIT) +# define KMALLOCS(a, b, c) MALLOC((a), b, (c), M_PFIL, M_NOWAIT) +# define KFREE(x) FREE((x), M_PFIL) +# define KFREES(x,s) FREE((x), M_PFIL) +# else +# define KMALLOC(a, b) MALLOC((a), b, sizeof(*(a)), M_TEMP, M_NOWAIT) +# define KMALLOCS(a, b, c) MALLOC((a), b, (c), M_TEMP, M_NOWAIT) +# define KFREE(x) FREE((x), M_TEMP) +# define KFREES(x,s) FREE((x), M_TEMP) +# endif /* M_PFIL */ +# define UIOMOVE(a,b,c,d) uiomove(a,b,d) +# define SLEEP(id, n) tsleep((id), PPAUSE|PCATCH, n, 0) +# define WAKEUP(id) wakeup(id) +# endif /* BSD */ +# if defined(NetBSD) && NetBSD <= 1991011 && NetBSD >= 199407 +# define SPL_NET(x) x = splsoftnet() +# define SPL_X(x) (void) splx(x) +# else +# if !SOLARIS && !defined(linux) +# define SPL_IMP(x) x = splimp() +# define SPL_NET(x) x = splnet() +# define SPL_X(x) (void) splx(x) +# endif +# endif /* NetBSD && NetBSD <= 1991011 && NetBSD >= 199407 */ +# define PANIC(x,y) if (x) panic y +#else /* KERNEL */ +# define SLEEP(x,y) ; +# define WAKEUP(x) ; +# define PANIC(x,y) ; +# define ATOMIC_INC(x) (x)++ +# define ATOMIC_DEC(x) (x)-- +# define MUTEX_ENTER(x) ; +# define READ_ENTER(x) ; +# define MUTEX_INIT(x,y,z) ; +# define MUTEX_DESTROY(x) ; +# define WRITE_ENTER(x) ; +# define RW_UPGRADE(x) ; +# define MUTEX_DOWNGRADE(x) ; +# define RWLOCK_EXIT(x) ; +# define MUTEX_EXIT(x) ; +# define SPL_NET(x) ; +# define SPL_IMP(x) ; +# undef SPL_X +# define SPL_X(x) ; +# define KMALLOC(a,b) (a) = (b)malloc(sizeof(*a)) +# define KMALLOCS(a,b,c) (a) = (b)malloc(c) +# define KFREE(x) free(x) +# define KFREES(x,s) free(x) +# define GETUNIT(x, v) get_unit(x,v) +# define IRCOPY(a,b,c) (bcopy((a), (b), (c)), 0) +# define IWCOPY(a,b,c) (bcopy((a), (b), (c)), 0) +# define IRCOPYPTR ircopyptr +# define IWCOPYPTR iwcopyptr +#endif /* KERNEL */ + +#if SOLARIS +typedef mblk_t mb_t; +# if SOLARIS2 >= 7 +# ifdef lint +# define ALIGN32(ptr) (ptr ? 0L : 0L) +# define ALIGN16(ptr) (ptr ? 0L : 0L) +# else +# define ALIGN32(ptr) (ptr) +# define ALIGN16(ptr) (ptr) +# endif +# endif +#else +# ifdef linux +# ifndef kernel +typedef struct mb { + struct mb *next; + u_int len; + u_char *data; +} mb_t; +# else +typedef struct sk_buff mb_t; +# endif +# else +typedef struct mbuf mb_t; +# endif +#endif /* SOLARIS */ + +#if defined(linux) || defined(__sgi) +/* + * These #ifdef's are here mainly for linux, but who knows, they may + * not be in other places or maybe one day linux will grow up and some + * of these will turn up there too. + */ +#ifndef ICMP_MINLEN +# define ICMP_MINLEN 8 +#endif +#ifndef ICMP_UNREACH +# define ICMP_UNREACH ICMP_DEST_UNREACH +#endif +#ifndef ICMP_SOURCEQUENCH +# define ICMP_SOURCEQUENCH ICMP_SOURCE_QUENCH +#endif +#ifndef ICMP_TIMXCEED +# define ICMP_TIMXCEED ICMP_TIME_EXCEEDED +#endif +#ifndef ICMP_PARAMPROB +# define ICMP_PARAMPROB ICMP_PARAMETERPROB +#endif +#ifndef ICMP_TSTAMP +# define ICMP_TSTAMP ICMP_TIMESTAMP +#endif +#ifndef ICMP_TSTAMPREPLY +# define ICMP_TSTAMPREPLY ICMP_TIMESTAMPREPLY +#endif +#ifndef ICMP_IREQ +# define ICMP_IREQ ICMP_INFO_REQUEST +#endif +#ifndef ICMP_IREQREPLY +# define ICMP_IREQREPLY ICMP_INFO_REPLY +#endif +#ifndef ICMP_MASKREQ +# define ICMP_MASKREQ ICMP_ADDRESS +#endif +#ifndef ICMP_MASKREPLY +# define ICMP_MASKREPLY ICMP_ADDRESSREPLY +#endif +#ifndef IPVERSION +# define IPVERSION 4 +#endif +#ifndef IPOPT_MINOFF +# define IPOPT_MINOFF 4 +#endif +#ifndef IPOPT_COPIED +# define IPOPT_COPIED(x) ((x)&0x80) +#endif +#ifndef IPOPT_EOL +# define IPOPT_EOL 0 +#endif +#ifndef IPOPT_NOP +# define IPOPT_NOP 1 +#endif +#ifndef IP_MF +# define IP_MF ((u_short)0x2000) +#endif +#ifndef ETHERTYPE_IP +# define ETHERTYPE_IP ((u_short)0x0800) +#endif +#ifndef TH_FIN +# define TH_FIN 0x01 +#endif +#ifndef TH_SYN +# define TH_SYN 0x02 +#endif +#ifndef TH_RST +# define TH_RST 0x04 +#endif +#ifndef TH_PUSH +# define TH_PUSH 0x08 +#endif +#ifndef TH_ACK +# define TH_ACK 0x10 +#endif +#ifndef TH_URG +# define TH_URG 0x20 +#endif +#ifndef IPOPT_EOL +# define IPOPT_EOL 0 +#endif +#ifndef IPOPT_NOP +# define IPOPT_NOP 1 +#endif +#ifndef IPOPT_RR +# define IPOPT_RR 7 +#endif +#ifndef IPOPT_TS +# define IPOPT_TS 68 +#endif +#ifndef IPOPT_SECURITY +# define IPOPT_SECURITY 130 +#endif +#ifndef IPOPT_LSRR +# define IPOPT_LSRR 131 +#endif +#ifndef IPOPT_SATID +# define IPOPT_SATID 136 +#endif +#ifndef IPOPT_SSRR +# define IPOPT_SSRR 137 +#endif +#ifndef IPOPT_SECUR_UNCLASS +# define IPOPT_SECUR_UNCLASS ((u_short)0x0000) +#endif +#ifndef IPOPT_SECUR_CONFID +# define IPOPT_SECUR_CONFID ((u_short)0xf135) +#endif +#ifndef IPOPT_SECUR_EFTO +# define IPOPT_SECUR_EFTO ((u_short)0x789a) +#endif +#ifndef IPOPT_SECUR_MMMM +# define IPOPT_SECUR_MMMM ((u_short)0xbc4d) +#endif +#ifndef IPOPT_SECUR_RESTR +# define IPOPT_SECUR_RESTR ((u_short)0xaf13) +#endif +#ifndef IPOPT_SECUR_SECRET +# define IPOPT_SECUR_SECRET ((u_short)0xd788) +#endif +#ifndef IPOPT_SECUR_TOPSECRET +# define IPOPT_SECUR_TOPSECRET ((u_short)0x6bc5) +#endif +#ifndef IPOPT_OLEN +# define IPOPT_OLEN 1 +#endif +#endif /* linux || __sgi */ + +#ifdef linux +#include <linux/in_systm.h> +/* + * TCP States + */ +#define TCPS_CLOSED 0 /* closed */ +#define TCPS_LISTEN 1 /* listening for connection */ +#define TCPS_SYN_SENT 2 /* active, have sent syn */ +#define TCPS_SYN_RECEIVED 3 /* have send and received syn */ +/* states < TCPS_ESTABLISHED are those where connections not established */ +#define TCPS_ESTABLISHED 4 /* established */ +#define TCPS_CLOSE_WAIT 5 /* rcvd fin, waiting for close */ +/* states > TCPS_CLOSE_WAIT are those where user has closed */ +#define TCPS_FIN_WAIT_1 6 /* have closed, sent fin */ +#define TCPS_CLOSING 7 /* closed xchd FIN; await FIN ACK */ +#define TCPS_LAST_ACK 8 /* had fin and close; await FIN ACK */ +/* states > TCPS_CLOSE_WAIT && < TCPS_FIN_WAIT_2 await ACK of FIN */ +#define TCPS_FIN_WAIT_2 9 /* have closed, fin is acked */ +#define TCPS_TIME_WAIT 10 /* in 2*msl quiet wait after close */ + +/* + * file flags. + */ +#ifdef WRITE +#define FWRITE WRITE +#define FREAD READ +#else +#define FWRITE _IOC_WRITE +#define FREAD _IOC_READ +#endif +/* + * mbuf related problems. + */ +#define mtod(m,t) (t)((m)->data) +#define m_len len +#define m_next next + +#ifdef IP_DF +#undef IP_DF +#endif +#define IP_DF 0x4000 + +typedef struct { + __u16 th_sport; + __u16 th_dport; + __u32 th_seq; + __u32 th_ack; +# if defined(__i386__) || defined(__MIPSEL__) || defined(__alpha__) ||\ + defined(vax) + __u8 th_res:4; + __u8 th_off:4; +#else + __u8 th_off:4; + __u8 th_res:4; +#endif + __u8 th_flags; + __u16 th_win; + __u16 th_sum; + __u16 th_urp; +} tcphdr_t; + +typedef struct { + __u16 uh_sport; + __u16 uh_dport; + __u16 uh_ulen; + __u16 uh_sum; +} udphdr_t; + +typedef struct { +# if defined(__i386__) || defined(__MIPSEL__) || defined(__alpha__) ||\ + defined(vax) + __u8 ip_hl:4; + __u8 ip_v:4; +# else + __u8 ip_v:4; + __u8 ip_hl:4; +# endif + __u8 ip_tos; + __u16 ip_len; + __u16 ip_id; + __u16 ip_off; + __u8 ip_ttl; + __u8 ip_p; + __u16 ip_sum; + struct in_addr ip_src; + struct in_addr ip_dst; +} ip_t; + +/* + * Structure of an icmp header. + */ +typedef struct icmp { + __u8 icmp_type; /* type of message, see below */ + __u8 icmp_code; /* type sub code */ + __u16 icmp_cksum; /* ones complement cksum of struct */ + union { + __u8 ih_pptr; /* ICMP_PARAMPROB */ + struct in_addr ih_gwaddr; /* ICMP_REDIRECT */ + struct ih_idseq { + __u16 icd_id; + __u16 icd_seq; + } ih_idseq; + int ih_void; + } icmp_hun; +# define icmp_pptr icmp_hun.ih_pptr +# define icmp_gwaddr icmp_hun.ih_gwaddr +# define icmp_id icmp_hun.ih_idseq.icd_id +# define icmp_seq icmp_hun.ih_idseq.icd_seq +# define icmp_void icmp_hun.ih_void + union { + struct id_ts { + n_time its_otime; + n_time its_rtime; + n_time its_ttime; + } id_ts; + struct id_ip { + ip_t idi_ip; + /* options and then 64 bits of data */ + } id_ip; + u_long id_mask; + char id_data[1]; + } icmp_dun; +# define icmp_otime icmp_dun.id_ts.its_otime +# define icmp_rtime icmp_dun.id_ts.its_rtime +# define icmp_ttime icmp_dun.id_ts.its_ttime +# define icmp_ip icmp_dun.id_ip.idi_ip +# define icmp_mask icmp_dun.id_mask +# define icmp_data icmp_dun.id_data +} icmphdr_t; + +# ifndef LINUX_IPOVLY +# define LINUX_IPOVLY +struct ipovly { + caddr_t ih_next, ih_prev; /* for protocol sequence q's */ + u_char ih_x1; /* (unused) */ + u_char ih_pr; /* protocol */ + short ih_len; /* protocol length */ + struct in_addr ih_src; /* source internet address */ + struct in_addr ih_dst; /* destination internet address */ +}; +# endif + +typedef struct { + __u8 ether_dhost[6]; + __u8 ether_shost[6]; + __u16 ether_type; +} ether_header_t; + +typedef struct uio { + int uio_resid; + int uio_rw; + caddr_t uio_buf; +} uio_t; + +# define UIO_READ 0 +# define UIO_WRITE 1 +# define UIOMOVE(a, b, c, d) uiomove(a,b,c,d) + +/* + * For masking struct ifnet onto struct device + */ +# define if_name name + +# ifdef KERNEL +# define GETUNIT(x, v) dev_get(x) +# define FREE_MB_T(m) kfree_skb(m, FREE_WRITE) +# define uniqtime do_gettimeofday +# undef INT_MAX +# undef UINT_MAX +# undef LONG_MAX +# undef ULONG_MAX +# include <linux/netdevice.h> +# define SPL_X(x) +# define SPL_NET(x) +# define SPL_IMP(x) + +# define bcmp(a,b,c) memcmp(a,b,c) +# define bcopy(a,b,c) memcpy(b,a,c) +# define bzero(a,c) memset(a,0,c) + +# define UNITNAME(n) dev_get((n)) + +# define KMALLOC(a,b) (a) = (b)kmalloc(sizeof(*(a)), GFP_ATOMIC) +# define KMALLOCS(a,b,c) (a) = (b)kmalloc((c), GFP_ATOMIC) +# define KFREE(x) kfree_s((x), sizeof(*(x))) +# define KFREES(x,s) kfree_s((x), (s)) +#define IRCOPY(const void *a, void *b, size_t c) { \ + int error; \ + + error = verify_area(VERIFY_READ, a ,c); \ + if (!error) \ + memcpy_fromfs(b, a, c); \ + return error; \ +} +static inline int IWCOPY(const void *a, void *b, size_t c) +{ + int error; + + error = verify_area(VERIFY_WRITE, b, c); + if (!error) + memcpy_tofs(b, a, c); + return error; +} +static inline int IRCOPYPTR(const void *a, void *b, size_t c) { + caddr_t ca; + int error; + + error = verify_area(VERIFY_READ, a ,sizeof(ca)); + if (!error) { + memcpy_fromfs(ca, a, sizeof(ca)); + error = verify_area(VERIFY_READ, ca , c); + if (!error) + memcpy_fromfs(b, ca, c); + } + return error; +} +static inline int IWCOPYPTR(const void *a, void *b, size_t c) { + caddr_t ca; + int error; + + + error = verify_area(VERIFY_READ, b ,sizeof(ca)); + if (!error) { + memcpy_fromfs(ca, b, sizeof(ca)); + error = verify_area(VERIFY_WRITE, ca, c); + if (!error) + memcpy_tofs(ca, a, c); + } + return error; +} +# else +# define __KERNEL__ +# undef INT_MAX +# undef UINT_MAX +# undef LONG_MAX +# undef ULONG_MAX +# define s8 __s8 +# define u8 __u8 +# define s16 __s16 +# define u16 __u16 +# define s32 __s32 +# define u32 __u32 +# include <linux/netdevice.h> +# undef __KERNEL__ +# endif +# define ifnet device +#else +typedef struct tcphdr tcphdr_t; +typedef struct udphdr udphdr_t; +typedef struct icmp icmphdr_t; +typedef struct ip ip_t; +typedef struct ether_header ether_header_t; +#endif /* linux */ +typedef struct tcpiphdr tcpiphdr_t; + +#if defined(hpux) || defined(linux) +struct ether_addr { + char ether_addr_octet[6]; +}; +#endif + +/* + * XXX - This is one of those *awful* hacks which nobody likes + */ +#ifdef ultrix +#define A_A +#else +#define A_A & +#endif + +#ifndef ICMP_ROUTERADVERT +# define ICMP_ROUTERADVERT 9 +#endif +#ifndef ICMP_ROUTERSOLICIT +# define ICMP_ROUTERSOLICIT 10 +#endif +#undef ICMP_MAX_UNREACH +#define ICMP_MAX_UNREACH 14 +#undef ICMP_MAXTYPE +#define ICMP_MAXTYPE 18 +/* + * ICMP error replies have an IP header (20 bytes), 8 bytes of ICMP data, + * another IP header and then 64 bits of data, totalling 56. Of course, + * the last 64 bits is dependant on that being available. + */ +#define ICMPERR_ICMPHLEN 8 +#define ICMPERR_IPICMPHLEN (20 + 8) +#define ICMPERR_MINPKTLEN (20 + 8 + 20) +#define ICMPERR_MAXPKTLEN (20 + 8 + 20 + 8) +#define ICMP6ERR_MINPKTLEN (40 + 8) +#define ICMP6ERR_IPICMPHLEN (40 + 8 + 40) + +/* + * ECN is a new addition to TCP - RFC 2481 + */ +#ifndef TH_ECN +# define TH_ECN 0x40 +#endif +#ifndef TH_CWR +# define TH_CWR 0x80 +#endif +#define TH_ECNALL (TH_ECN|TH_CWR) + +#define TCPF_ALL (TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG|TH_ECN|TH_CWR) + +#endif /* __IP_COMPAT_H__ */ diff --git a/sys/netinet/ip_divert.c b/sys/netinet/ip_divert.c new file mode 100644 index 0000000..df5ec22 --- /dev/null +++ b/sys/netinet/ip_divert.c @@ -0,0 +1,543 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include "opt_inet.h" +#include "opt_ipfw.h" +#include "opt_ipdivert.h" +#include "opt_ipsec.h" + +#ifndef INET +#error "IPDIVERT requires INET." +#endif + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/socket.h> +#include <sys/protosw.h> +#include <sys/socketvar.h> +#include <sys/sysctl.h> +#include <sys/systm.h> + +#include <vm/vm_zone.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/in_pcb.h> +#include <netinet/in_var.h> +#include <netinet/ip_var.h> + +/* + * Divert sockets + */ + +/* + * Allocate enough space to hold a full IP packet + */ +#define DIVSNDQ (65536 + 100) +#define DIVRCVQ (65536 + 100) + +/* + * A 16 bit cookie is passed to and from the user process. + * The user process can send it back to help the caller know + * something about where the packet originally came from. + * + * In the case of ipfw, then the cookie is the rule that sent + * us here. On reinjection is is the rule after which processing + * should continue. Leaving it the same will make processing start + * at the rule number after that which sent it here. Setting it to + * 0 will restart processing at the beginning. + * + * For divert_packet(), ip_divert_cookie is an input value only. + * For div_output(), ip_divert_cookie is an output value only. + */ +u_int16_t ip_divert_cookie; + +/* Internal variables */ +static struct inpcbhead divcb; +static struct inpcbinfo divcbinfo; + +static u_long div_sendspace = DIVSNDQ; /* XXX sysctl ? */ +static u_long div_recvspace = DIVRCVQ; /* XXX sysctl ? */ + +/* Optimization: have this preinitialized */ +static struct sockaddr_in divsrc = { sizeof(divsrc), AF_INET }; + +/* Internal functions */ +static int div_output(struct socket *so, + struct mbuf *m, struct sockaddr *addr, struct mbuf *control); + +/* + * Initialize divert connection block queue. + */ +void +div_init(void) +{ + LIST_INIT(&divcb); + divcbinfo.listhead = &divcb; + /* + * XXX We don't use the hash list for divert IP, but it's easier + * to allocate a one entry hash list than it is to check all + * over the place for hashbase == NULL. + */ + divcbinfo.hashbase = hashinit(1, M_PCB, &divcbinfo.hashmask); + divcbinfo.porthashbase = hashinit(1, M_PCB, &divcbinfo.porthashmask); + divcbinfo.ipi_zone = zinit("divcb", sizeof(struct inpcb), + maxsockets, ZONE_INTERRUPT, 0); +} + +/* + * IPPROTO_DIVERT is not a real IP protocol; don't allow any packets + * with that protocol number to enter the system from the outside. + */ +void +div_input(struct mbuf *m, int off, int proto) +{ + ipstat.ips_noproto++; + m_freem(m); +} + +/* + * Divert a packet by passing it up to the divert socket at port 'port'. + * + * Setup generic address and protocol structures for div_input routine, + * then pass them along with mbuf chain. + */ +void +divert_packet(struct mbuf *m, int incoming, int port) +{ + struct ip *ip; + struct inpcb *inp; + struct socket *sa; + u_int16_t nport; + + /* Sanity check */ + KASSERT(port != 0, ("%s: port=0", __FUNCTION__)); + + /* Record and reset divert cookie */ + divsrc.sin_port = ip_divert_cookie; + ip_divert_cookie = 0; + + /* Assure header */ + if (m->m_len < sizeof(struct ip) && + (m = m_pullup(m, sizeof(struct ip))) == 0) { + return; + } + ip = mtod(m, struct ip *); + + /* + * Record receive interface address, if any. + * But only for incoming packets. + */ + divsrc.sin_addr.s_addr = 0; + if (incoming) { + struct ifaddr *ifa; + + /* Sanity check */ + KASSERT((m->m_flags & M_PKTHDR), ("%s: !PKTHDR", __FUNCTION__)); + + /* Find IP address for receive interface */ + TAILQ_FOREACH(ifa, &m->m_pkthdr.rcvif->if_addrhead, ifa_link) { + if (ifa->ifa_addr == NULL) + continue; + if (ifa->ifa_addr->sa_family != AF_INET) + continue; + divsrc.sin_addr = + ((struct sockaddr_in *) ifa->ifa_addr)->sin_addr; + break; + } + } + /* + * Record the incoming interface name whenever we have one. + */ + bzero(&divsrc.sin_zero, sizeof(divsrc.sin_zero)); + if (m->m_pkthdr.rcvif) { + /* + * Hide the actual interface name in there in the + * sin_zero array. XXX This needs to be moved to a + * different sockaddr type for divert, e.g. + * sockaddr_div with multiple fields like + * sockaddr_dl. Presently we have only 7 bytes + * but that will do for now as most interfaces + * are 4 or less + 2 or less bytes for unit. + * There is probably a faster way of doing this, + * possibly taking it from the sockaddr_dl on the iface. + * This solves the problem of a P2P link and a LAN interface + * having the same address, which can result in the wrong + * interface being assigned to the packet when fed back + * into the divert socket. Theoretically if the daemon saves + * and re-uses the sockaddr_in as suggested in the man pages, + * this iface name will come along for the ride. + * (see div_output for the other half of this.) + */ + snprintf(divsrc.sin_zero, sizeof(divsrc.sin_zero), + "%s%d", m->m_pkthdr.rcvif->if_name, + m->m_pkthdr.rcvif->if_unit); + } + + /* Put packet on socket queue, if any */ + sa = NULL; + nport = htons((u_int16_t)port); + LIST_FOREACH(inp, &divcb, inp_list) { + if (inp->inp_lport == nport) + sa = inp->inp_socket; + } + if (sa) { + if (sbappendaddr(&sa->so_rcv, (struct sockaddr *)&divsrc, + m, (struct mbuf *)0) == 0) + m_freem(m); + else + sorwakeup(sa); + } else { + m_freem(m); + ipstat.ips_noproto++; + ipstat.ips_delivered--; + } +} + +/* + * Deliver packet back into the IP processing machinery. + * + * If no address specified, or address is 0.0.0.0, send to ip_output(); + * otherwise, send to ip_input() and mark as having been received on + * the interface with that address. + */ +static int +div_output(so, m, addr, control) + struct socket *so; + register struct mbuf *m; + struct sockaddr *addr; + struct mbuf *control; +{ + register struct inpcb *const inp = sotoinpcb(so); + register struct ip *const ip = mtod(m, struct ip *); + struct sockaddr_in *sin = (struct sockaddr_in *)addr; + int error = 0; + + if (control) + m_freem(control); /* XXX */ + + /* Loopback avoidance and state recovery */ + if (sin) { + int len = 0; + char *c = sin->sin_zero; + + ip_divert_cookie = sin->sin_port; + + /* + * Find receive interface with the given name or IP address. + * The name is user supplied data so don't trust it's size or + * that it is zero terminated. The name has priority. + * We are presently assuming that the sockaddr_in + * has not been replaced by a sockaddr_div, so we limit it + * to 16 bytes in total. the name is stuffed (if it exists) + * in the sin_zero[] field. + */ + while (*c++ && (len++ < sizeof(sin->sin_zero))); + if ((len > 0) && (len < sizeof(sin->sin_zero))) + m->m_pkthdr.rcvif = ifunit(sin->sin_zero); + } else { + ip_divert_cookie = 0; + } + + /* Reinject packet into the system as incoming or outgoing */ + if (!sin || sin->sin_addr.s_addr == 0) { + /* + * Don't allow both user specified and setsockopt options, + * and don't allow packet length sizes that will crash + */ + if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options) || + ((u_short)ntohs(ip->ip_len) > m->m_pkthdr.len)) { + error = EINVAL; + goto cantsend; + } + + /* Convert fields to host order for ip_output() */ + NTOHS(ip->ip_len); + NTOHS(ip->ip_off); + + /* Send packet to output processing */ + ipstat.ips_rawout++; /* XXX */ + error = ip_output(m, inp->inp_options, &inp->inp_route, + (so->so_options & SO_DONTROUTE) | + IP_ALLOWBROADCAST | IP_RAWOUTPUT, + inp->inp_moptions); + } else { + struct ifaddr *ifa; + + /* If no luck with the name above. check by IP address. */ + if (m->m_pkthdr.rcvif == NULL) { + /* + * Make sure there are no distractions + * for ifa_ifwithaddr. Clear the port and the ifname. + * Maybe zap all 8 bytes at once using a 64bit write? + */ + bzero(sin->sin_zero, sizeof(sin->sin_zero)); + /* *((u_int64_t *)sin->sin_zero) = 0; */ /* XXX ?? */ + sin->sin_port = 0; + if (!(ifa = ifa_ifwithaddr((struct sockaddr *) sin))) { + error = EADDRNOTAVAIL; + goto cantsend; + } + m->m_pkthdr.rcvif = ifa->ifa_ifp; + } + + /* Send packet to input processing */ + ip_input(m); + } + + /* paranoid: Reset for next time (and other packets) */ + /* almost definitly already done in the ipfw filter but.. */ + ip_divert_cookie = 0; + return error; + +cantsend: + m_freem(m); + ip_divert_cookie = 0; + return error; +} + +static int +div_attach(struct socket *so, int proto, struct proc *p) +{ + struct inpcb *inp; + int error, s; + + inp = sotoinpcb(so); + if (inp) + panic("div_attach"); + if (p && (error = suser(p)) != 0) + return error; + + error = soreserve(so, div_sendspace, div_recvspace); + if (error) + return error; + s = splnet(); + error = in_pcballoc(so, &divcbinfo, p); + splx(s); + if (error) + return error; + inp = (struct inpcb *)so->so_pcb; + inp->inp_ip_p = proto; + inp->inp_vflag |= INP_IPV4; + inp->inp_flags |= INP_HDRINCL; + /* The socket is always "connected" because + we always know "where" to send the packet */ + so->so_state |= SS_ISCONNECTED; +#ifdef IPSEC + error = ipsec_init_policy(so, &inp->inp_sp); + if (error != 0) { + in_pcbdetach(inp); + return error; + } +#endif /*IPSEC*/ + return 0; +} + +static int +div_detach(struct socket *so) +{ + struct inpcb *inp; + + inp = sotoinpcb(so); + if (inp == 0) + panic("div_detach"); + in_pcbdetach(inp); + return 0; +} + +static int +div_abort(struct socket *so) +{ + soisdisconnected(so); + return div_detach(so); +} + +static int +div_disconnect(struct socket *so) +{ + if ((so->so_state & SS_ISCONNECTED) == 0) + return ENOTCONN; + return div_abort(so); +} + +static int +div_bind(struct socket *so, struct sockaddr *nam, struct proc *p) +{ + struct inpcb *inp; + int s; + int error; + + s = splnet(); + inp = sotoinpcb(so); + /* in_pcbbind assumes that the socket is a sockaddr_in + * and in_pcbbind requires a valid address. Since divert + * sockets don't we need to make sure the address is + * filled in properly. + * XXX -- divert should not be abusing in_pcbind + * and should probably have its own family. + */ + if (nam->sa_family != AF_INET) { + error = EAFNOSUPPORT; + } else { + ((struct sockaddr_in *)nam)->sin_addr.s_addr = INADDR_ANY; + error = in_pcbbind(inp, nam, p); + } + splx(s); + return error; +} + +static int +div_shutdown(struct socket *so) +{ + socantsendmore(so); + return 0; +} + +static int +div_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, + struct mbuf *control, struct proc *p) +{ + /* Packet must have a header (but that's about it) */ + if (m->m_len < sizeof (struct ip) && + (m = m_pullup(m, sizeof (struct ip))) == 0) { + ipstat.ips_toosmall++; + m_freem(m); + return EINVAL; + } + + /* Send packet */ + return div_output(so, m, nam, control); +} + +static int +div_pcblist(SYSCTL_HANDLER_ARGS) +{ + int error, i, n, s; + struct inpcb *inp, **inp_list; + inp_gen_t gencnt; + struct xinpgen xig; + + /* + * The process of preparing the TCB list is too time-consuming and + * resource-intensive to repeat twice on every request. + */ + if (req->oldptr == 0) { + n = divcbinfo.ipi_count; + req->oldidx = 2 * (sizeof xig) + + (n + n/8) * sizeof(struct xinpcb); + return 0; + } + + if (req->newptr != 0) + return EPERM; + + /* + * OK, now we're committed to doing something. + */ + s = splnet(); + gencnt = divcbinfo.ipi_gencnt; + n = divcbinfo.ipi_count; + splx(s); + + xig.xig_len = sizeof xig; + xig.xig_count = n; + xig.xig_gen = gencnt; + xig.xig_sogen = so_gencnt; + error = SYSCTL_OUT(req, &xig, sizeof xig); + if (error) + return error; + + inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); + if (inp_list == 0) + return ENOMEM; + + s = splnet(); + for (inp = LIST_FIRST(divcbinfo.listhead), i = 0; inp && i < n; + inp = LIST_NEXT(inp, inp_list)) { + if (inp->inp_gencnt <= gencnt && !prison_xinpcb(req->p, inp)) + inp_list[i++] = inp; + } + splx(s); + n = i; + + error = 0; + for (i = 0; i < n; i++) { + inp = inp_list[i]; + if (inp->inp_gencnt <= gencnt) { + struct xinpcb xi; + xi.xi_len = sizeof xi; + /* XXX should avoid extra copy */ + bcopy(inp, &xi.xi_inp, sizeof *inp); + if (inp->inp_socket) + sotoxsocket(inp->inp_socket, &xi.xi_socket); + error = SYSCTL_OUT(req, &xi, sizeof xi); + } + } + if (!error) { + /* + * Give the user an updated idea of our state. + * If the generation differs from what we told + * her before, she knows that something happened + * while we were processing this request, and it + * might be necessary to retry. + */ + s = splnet(); + xig.xig_gen = divcbinfo.ipi_gencnt; + xig.xig_sogen = so_gencnt; + xig.xig_count = divcbinfo.ipi_count; + splx(s); + error = SYSCTL_OUT(req, &xig, sizeof xig); + } + free(inp_list, M_TEMP); + return error; +} + +SYSCTL_DECL(_net_inet_divert); +SYSCTL_PROC(_net_inet_divert, OID_AUTO, pcblist, CTLFLAG_RD, 0, 0, + div_pcblist, "S,xinpcb", "List of active divert sockets"); + +struct pr_usrreqs div_usrreqs = { + div_abort, pru_accept_notsupp, div_attach, div_bind, + pru_connect_notsupp, pru_connect2_notsupp, in_control, div_detach, + div_disconnect, pru_listen_notsupp, in_setpeeraddr, pru_rcvd_notsupp, + pru_rcvoob_notsupp, div_send, pru_sense_null, div_shutdown, + in_setsockaddr, sosend, soreceive, sopoll +}; diff --git a/sys/netinet/ip_dummynet.c b/sys/netinet/ip_dummynet.c new file mode 100644 index 0000000..8f69866 --- /dev/null +++ b/sys/netinet/ip_dummynet.c @@ -0,0 +1,1904 @@ +/* + * Copyright (c) 1998-2001 Luigi Rizzo, Universita` di Pisa + * Portions Copyright (c) 2000 Akamba Corp. + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#define DEB(x) +#define DDB(x) x + +/* + * This module implements IP dummynet, a bandwidth limiter/delay emulator + * used in conjunction with the ipfw package. + * Description of the data structures used is in ip_dummynet.h + * Here you mainly find the following blocks of code: + * + variable declarations; + * + heap management functions; + * + scheduler and dummynet functions; + * + configuration and initialization. + * + * NOTA BENE: critical sections are protected by splimp()/splx() + * pairs. One would think that splnet() is enough as for most of + * the netinet code, but it is not so because when used with + * bridging, dummynet is invoked at splimp(). + * + * Most important Changes: + * + * 010124: Fixed WF2Q behaviour + * 010122: Fixed spl protection. + * 000601: WF2Q support + * 000106: large rewrite, use heaps to handle very many pipes. + * 980513: initial release + * + * include files marked with XXX are probably not needed + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/queue.h> /* XXX */ +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/time.h> +#include <sys/sysctl.h> +#include <net/if.h> +#include <net/route.h> +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/in_var.h> +#include <netinet/ip.h> +#include <netinet/ip_fw.h> +#include <netinet/ip_dummynet.h> +#include <netinet/ip_var.h> + +#include "opt_bdg.h" +#ifdef BRIDGE +#include <netinet/if_ether.h> /* for struct arpcom */ +#include <net/bridge.h> +#endif + +/* + * We keep a private variable for the simulation time, but we could + * probably use an existing one ("softticks" in sys/kern/kern_timer.c) + */ +static dn_key curr_time = 0 ; /* current simulation time */ + +static int dn_hash_size = 64 ; /* default hash size */ + +/* statistics on number of queue searches and search steps */ +static int searches, search_steps ; +static int pipe_expire = 1 ; /* expire queue if empty */ +static int dn_max_ratio = 16 ; /* max queues/buckets ratio */ + +static int red_lookup_depth = 256; /* RED - default lookup table depth */ +static int red_avg_pkt_size = 512; /* RED - default medium packet size */ +static int red_max_pkt_size = 1500; /* RED - default max packet size */ + +/* + * Three heaps contain queues and pipes that the scheduler handles: + * + * ready_heap contains all dn_flow_queue related to fixed-rate pipes. + * + * wfq_ready_heap contains the pipes associated with WF2Q flows + * + * extract_heap contains pipes associated with delay lines. + * + */ +static struct dn_heap ready_heap, extract_heap, wfq_ready_heap ; + +static int heap_init(struct dn_heap *h, int size) ; +static int heap_insert (struct dn_heap *h, dn_key key1, void *p); +static void heap_extract(struct dn_heap *h, void *obj); + +static void transmit_event(struct dn_pipe *pipe); +static void ready_event(struct dn_flow_queue *q); + +static struct dn_pipe *all_pipes = NULL ; /* list of all pipes */ +static struct dn_flow_set *all_flow_sets = NULL ;/* list of all flow_sets */ + +#ifdef SYSCTL_NODE +SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, + CTLFLAG_RW, 0, "Dummynet"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size, + CTLFLAG_RW, &dn_hash_size, 0, "Default hash table size"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, curr_time, + CTLFLAG_RD, &curr_time, 0, "Current tick"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, ready_heap, + CTLFLAG_RD, &ready_heap.size, 0, "Size of ready heap"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, extract_heap, + CTLFLAG_RD, &extract_heap.size, 0, "Size of extract heap"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, searches, + CTLFLAG_RD, &searches, 0, "Number of queue searches"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, search_steps, + CTLFLAG_RD, &search_steps, 0, "Number of queue search steps"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire, + CTLFLAG_RW, &pipe_expire, 0, "Expire queue if empty"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, max_chain_len, + CTLFLAG_RW, &dn_max_ratio, 0, + "Max ratio between dynamic queues and buckets"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth, + CTLFLAG_RD, &red_lookup_depth, 0, "Depth of RED lookup table"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size, + CTLFLAG_RD, &red_avg_pkt_size, 0, "RED Medium packet size"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size, + CTLFLAG_RD, &red_max_pkt_size, 0, "RED Max packet size"); +#endif + +static int config_pipe(struct dn_pipe *p); +static int ip_dn_ctl(struct sockopt *sopt); + +static void rt_unref(struct rtentry *); +static void dummynet(void *); +static void dummynet_flush(void); +void dummynet_drain(void); + +int if_tx_rdy(struct ifnet *ifp); + +/* + * ip_fw_chain is used when deleting a pipe, because ipfw rules can + * hold references to the pipe. + */ +extern LIST_HEAD (ip_fw_head, ip_fw_chain) ip_fw_chain_head; + +static void +rt_unref(struct rtentry *rt) +{ + if (rt == NULL) + return ; + if (rt->rt_refcnt <= 0) + printf("-- warning, refcnt now %ld, decreasing\n", rt->rt_refcnt); + RTFREE(rt); +} + +/* + * Heap management functions. + * + * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2. + * Some macros help finding parent/children so we can optimize them. + * + * heap_init() is called to expand the heap when needed. + * Increment size in blocks of 16 entries. + * XXX failure to allocate a new element is a pretty bad failure + * as we basically stall a whole queue forever!! + * Returns 1 on error, 0 on success + */ +#define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 ) +#define HEAP_LEFT(x) ( 2*(x) + 1 ) +#define HEAP_IS_LEFT(x) ( (x) & 1 ) +#define HEAP_RIGHT(x) ( 2*(x) + 2 ) +#define HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; } +#define HEAP_INCREMENT 15 + +static int +heap_init(struct dn_heap *h, int new_size) +{ + struct dn_heap_entry *p; + + if (h->size >= new_size ) { + printf("heap_init, Bogus call, have %d want %d\n", + h->size, new_size); + return 0 ; + } + new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT ; + p = malloc(new_size * sizeof(*p), M_IPFW, M_DONTWAIT ); + if (p == NULL) { + printf(" heap_init, resize %d failed\n", new_size ); + return 1 ; /* error */ + } + if (h->size > 0) { + bcopy(h->p, p, h->size * sizeof(*p) ); + free(h->p, M_IPFW); + } + h->p = p ; + h->size = new_size ; + return 0 ; +} + +/* + * Insert element in heap. Normally, p != NULL, we insert p in + * a new position and bubble up. If p == NULL, then the element is + * already in place, and key is the position where to start the + * bubble-up. + * Returns 1 on failure (cannot allocate new heap entry) + * + * If offset > 0 the position (index, int) of the element in the heap is + * also stored in the element itself at the given offset in bytes. + */ +#define SET_OFFSET(heap, node) \ + if (heap->offset > 0) \ + *((int *)((char *)(heap->p[node].object) + heap->offset)) = node ; +/* + * RESET_OFFSET is used for sanity checks. It sets offset to an invalid value. + */ +#define RESET_OFFSET(heap, node) \ + if (heap->offset > 0) \ + *((int *)((char *)(heap->p[node].object) + heap->offset)) = -1 ; +static int +heap_insert(struct dn_heap *h, dn_key key1, void *p) +{ + int son = h->elements ; + + if (p == NULL) /* data already there, set starting point */ + son = key1 ; + else { /* insert new element at the end, possibly resize */ + son = h->elements ; + if (son == h->size) /* need resize... */ + if (heap_init(h, h->elements+1) ) + return 1 ; /* failure... */ + h->p[son].object = p ; + h->p[son].key = key1 ; + h->elements++ ; + } + while (son > 0) { /* bubble up */ + int father = HEAP_FATHER(son) ; + struct dn_heap_entry tmp ; + + if (DN_KEY_LT( h->p[father].key, h->p[son].key ) ) + break ; /* found right position */ + /* son smaller than father, swap and repeat */ + HEAP_SWAP(h->p[son], h->p[father], tmp) ; + SET_OFFSET(h, son); + son = father ; + } + SET_OFFSET(h, son); + return 0 ; +} + +/* + * remove top element from heap, or obj if obj != NULL + */ +static void +heap_extract(struct dn_heap *h, void *obj) +{ + int child, father, max = h->elements - 1 ; + + if (max < 0) { + printf("warning, extract from empty heap 0x%p\n", h); + return ; + } + father = 0 ; /* default: move up smallest child */ + if (obj != NULL) { /* extract specific element, index is at offset */ + if (h->offset <= 0) + panic("*** heap_extract from middle not supported on this heap!!!\n"); + father = *((int *)((char *)obj + h->offset)) ; + if (father < 0 || father >= h->elements) { + printf("dummynet: heap_extract, father %d out of bound 0..%d\n", + father, h->elements); + panic("heap_extract"); + } + } + RESET_OFFSET(h, father); + child = HEAP_LEFT(father) ; /* left child */ + while (child <= max) { /* valid entry */ + if (child != max && DN_KEY_LT(h->p[child+1].key, h->p[child].key) ) + child = child+1 ; /* take right child, otherwise left */ + h->p[father] = h->p[child] ; + SET_OFFSET(h, father); + father = child ; + child = HEAP_LEFT(child) ; /* left child for next loop */ + } + h->elements-- ; + if (father != max) { + /* + * Fill hole with last entry and bubble up, reusing the insert code + */ + h->p[father] = h->p[max] ; + heap_insert(h, father, NULL); /* this one cannot fail */ + } +} + +#if 0 +/* + * change object position and update references + * XXX this one is never used! + */ +static void +heap_move(struct dn_heap *h, dn_key new_key, void *object) +{ + int temp; + int i ; + int max = h->elements-1 ; + struct dn_heap_entry buf ; + + if (h->offset <= 0) + panic("cannot move items on this heap"); + + i = *((int *)((char *)object + h->offset)); + if (DN_KEY_LT(new_key, h->p[i].key) ) { /* must move up */ + h->p[i].key = new_key ; + for (; i>0 && DN_KEY_LT(new_key, h->p[(temp = HEAP_FATHER(i))].key) ; + i = temp ) { /* bubble up */ + HEAP_SWAP(h->p[i], h->p[temp], buf) ; + SET_OFFSET(h, i); + } + } else { /* must move down */ + h->p[i].key = new_key ; + while ( (temp = HEAP_LEFT(i)) <= max ) { /* found left child */ + if ((temp != max) && DN_KEY_GT(h->p[temp].key, h->p[temp+1].key)) + temp++ ; /* select child with min key */ + if (DN_KEY_GT(new_key, h->p[temp].key)) { /* go down */ + HEAP_SWAP(h->p[i], h->p[temp], buf) ; + SET_OFFSET(h, i); + } else + break ; + i = temp ; + } + } + SET_OFFSET(h, i); +} +#endif /* heap_move, unused */ + +/* + * heapify() will reorganize data inside an array to maintain the + * heap property. It is needed when we delete a bunch of entries. + */ +static void +heapify(struct dn_heap *h) +{ + int i ; + + for (i = 0 ; i < h->elements ; i++ ) + heap_insert(h, i , NULL) ; +} + +/* + * cleanup the heap and free data structure + */ +static void +heap_free(struct dn_heap *h) +{ + if (h->size >0 ) + free(h->p, M_IPFW); + bzero(h, sizeof(*h) ); +} + +/* + * --- end of heap management functions --- + */ + +/* + * Scheduler functions: + * + * transmit_event() is called when the delay-line needs to enter + * the scheduler, either because of existing pkts getting ready, + * or new packets entering the queue. The event handled is the delivery + * time of the packet. + * + * ready_event() does something similar with fixed-rate queues, and the + * event handled is the finish time of the head pkt. + * + * wfq_ready_event() does something similar with WF2Q queues, and the + * event handled is the start time of the head pkt. + * + * In all cases, we make sure that the data structures are consistent + * before passing pkts out, because this might trigger recursive + * invocations of the procedures. + */ +static void +transmit_event(struct dn_pipe *pipe) +{ + struct dn_pkt *pkt ; + + while ( (pkt = pipe->head) && DN_KEY_LEQ(pkt->output_time, curr_time) ) { + /* + * first unlink, then call procedures, since ip_input() can invoke + * ip_output() and viceversa, thus causing nested calls + */ + pipe->head = DN_NEXT(pkt) ; + + /* + * The actual mbuf is preceded by a struct dn_pkt, resembling an mbuf + * (NOT A REAL one, just a small block of malloc'ed memory) with + * m_type = MT_DUMMYNET + * m_next = actual mbuf to be processed by ip_input/output + * m_data = the matching rule + * and some other fields. + * The block IS FREED HERE because it contains parameters passed + * to the called routine. + */ + switch (pkt->dn_dir) { + case DN_TO_IP_OUT: + (void)ip_output((struct mbuf *)pkt, NULL, NULL, 0, NULL); + rt_unref (pkt->ro.ro_rt) ; + break ; + + case DN_TO_IP_IN : + ip_input((struct mbuf *)pkt) ; + break ; + +#ifdef BRIDGE + case DN_TO_BDG_FWD : { + struct mbuf *m = (struct mbuf *)pkt; + struct ether_header *eh; + + if (pkt->dn_m->m_len < ETHER_HDR_LEN + && (pkt->dn_m = m_pullup(pkt->dn_m, ETHER_HDR_LEN)) == NULL) { + printf("dummynet/bridge: pullup fail, dropping pkt\n"); + break; + } + /* + * same as ether_input, make eh be a pointer into the mbuf + */ + eh = mtod(pkt->dn_m, struct ether_header *); + m_adj(pkt->dn_m, ETHER_HDR_LEN); + /* + * bdg_forward() wants a pointer to the pseudo-mbuf-header, but + * on return it will supply the pointer to the actual packet + * (originally pkt->dn_m, but could be something else now) if + * it has not consumed it. + */ + m = bdg_forward(m, eh, pkt->ifp); + if (m) + m_freem(m); + } + break ; +#endif + + default: + printf("dummynet: bad switch %d!\n", pkt->dn_dir); + m_freem(pkt->dn_m); + break ; + } + FREE(pkt, M_IPFW); + } + /* if there are leftover packets, put into the heap for next event */ + if ( (pkt = pipe->head) ) + heap_insert(&extract_heap, pkt->output_time, pipe ) ; + /* XXX should check errors on heap_insert, by draining the + * whole pipe p and hoping in the future we are more successful + */ +} + +/* + * the following macro computes how many ticks we have to wait + * before being able to transmit a packet. The credit is taken from + * either a pipe (WF2Q) or a flow_queue (per-flow queueing) + */ +#define SET_TICKS(pkt, q, p) \ + (pkt->dn_m->m_pkthdr.len*8*hz - (q)->numbytes + p->bandwidth - 1 ) / \ + p->bandwidth ; + +/* + * extract pkt from queue, compute output time (could be now) + * and put into delay line (p_queue) + */ +static void +move_pkt(struct dn_pkt *pkt, struct dn_flow_queue *q, + struct dn_pipe *p, int len) +{ + q->head = DN_NEXT(pkt) ; + q->len-- ; + q->len_bytes -= len ; + + pkt->output_time = curr_time + p->delay ; + + if (p->head == NULL) + p->head = pkt; + else + DN_NEXT(p->tail) = pkt; + p->tail = pkt; + DN_NEXT(p->tail) = NULL; +} + +/* + * ready_event() is invoked every time the queue must enter the + * scheduler, either because the first packet arrives, or because + * a previously scheduled event fired. + * On invokation, drain as many pkts as possible (could be 0) and then + * if there are leftover packets reinsert the pkt in the scheduler. + */ +static void +ready_event(struct dn_flow_queue *q) +{ + struct dn_pkt *pkt; + struct dn_pipe *p = q->fs->pipe ; + int p_was_empty ; + + if (p == NULL) { + printf("ready_event- pipe is gone\n"); + return ; + } + p_was_empty = (p->head == NULL) ; + + /* + * schedule fixed-rate queues linked to this pipe: + * Account for the bw accumulated since last scheduling, then + * drain as many pkts as allowed by q->numbytes and move to + * the delay line (in p) computing output time. + * bandwidth==0 (no limit) means we can drain the whole queue, + * setting len_scaled = 0 does the job. + */ + q->numbytes += ( curr_time - q->sched_time ) * p->bandwidth; + while ( (pkt = q->head) != NULL ) { + int len = pkt->dn_m->m_pkthdr.len; + int len_scaled = p->bandwidth ? len*8*hz : 0 ; + if (len_scaled > q->numbytes ) + break ; + q->numbytes -= len_scaled ; + move_pkt(pkt, q, p, len); + } + /* + * If we have more packets queued, schedule next ready event + * (can only occur when bandwidth != 0, otherwise we would have + * flushed the whole queue in the previous loop). + * To this purpose we record the current time and compute how many + * ticks to go for the finish time of the packet. + */ + if ( (pkt = q->head) != NULL ) { /* this implies bandwidth != 0 */ + dn_key t = SET_TICKS(pkt, q, p); /* ticks i have to wait */ + q->sched_time = curr_time ; + heap_insert(&ready_heap, curr_time + t, (void *)q ); + /* XXX should check errors on heap_insert, and drain the whole + * queue on error hoping next time we are luckier. + */ + } else /* RED needs to know when the queue becomes empty */ + q->q_time = curr_time; + /* + * If the delay line was empty call transmit_event(p) now. + * Otherwise, the scheduler will take care of it. + */ + if (p_was_empty) + transmit_event(p); +} + +/* + * Called when we can transmit packets on WF2Q queues. Take pkts out of + * the queues at their start time, and enqueue into the delay line. + * Packets are drained until p->numbytes < 0. As long as + * len_scaled >= p->numbytes, the packet goes into the delay line + * with a deadline p->delay. For the last packet, if p->numbytes<0, + * there is an additional delay. + */ +static void +ready_event_wfq(struct dn_pipe *p) +{ + int p_was_empty = (p->head == NULL) ; + struct dn_heap *sch = &(p->scheduler_heap); + struct dn_heap *neh = &(p->not_eligible_heap) ; + + if (p->if_name[0] == 0) /* tx clock is simulated */ + p->numbytes += ( curr_time - p->sched_time ) * p->bandwidth; + else { /* tx clock is for real, the ifq must be empty or this is a NOP */ + if (p->ifp && p->ifp->if_snd.ifq_head != NULL) + return ; + else { + DEB(printf("pipe %d ready from %s --\n", + p->pipe_nr, p->if_name);) + } + } + + /* + * While we have backlogged traffic AND credit, we need to do + * something on the queue. + */ + while ( p->numbytes >=0 && (sch->elements>0 || neh->elements >0) ) { + if (sch->elements > 0) { /* have some eligible pkts to send out */ + struct dn_flow_queue *q = sch->p[0].object ; + struct dn_pkt *pkt = q->head; + struct dn_flow_set *fs = q->fs; + u_int64_t len = pkt->dn_m->m_pkthdr.len; + int len_scaled = p->bandwidth ? len*8*hz : 0 ; + + heap_extract(sch, NULL); /* remove queue from heap */ + p->numbytes -= len_scaled ; + move_pkt(pkt, q, p, len); + + p->V += (len<<MY_M) / p->sum ; /* update V */ + q->S = q->F ; /* update start time */ + if (q->len == 0) { /* Flow not backlogged any more */ + fs->backlogged-- ; + heap_insert(&(p->idle_heap), q->F, q); + } else { /* still backlogged */ + /* + * update F and position in backlogged queue, then + * put flow in not_eligible_heap (we will fix this later). + */ + len = (q->head)->dn_m->m_pkthdr.len; + q->F += (len<<MY_M)/(u_int64_t) fs->weight ; + if (DN_KEY_LEQ(q->S, p->V)) + heap_insert(neh, q->S, q); + else + heap_insert(sch, q->F, q); + } + } + /* + * now compute V = max(V, min(S_i)). Remember that all elements in sch + * have by definition S_i <= V so if sch is not empty, V is surely + * the max and we must not update it. Conversely, if sch is empty + * we only need to look at neh. + */ + if (sch->elements == 0 && neh->elements > 0) + p->V = MAX64 ( p->V, neh->p[0].key ); + /* move from neh to sch any packets that have become eligible */ + while (neh->elements > 0 && DN_KEY_LEQ(neh->p[0].key, p->V) ) { + struct dn_flow_queue *q = neh->p[0].object ; + heap_extract(neh, NULL); + heap_insert(sch, q->F, q); + } + + if (p->if_name[0] != '\0') {/* tx clock is from a real thing */ + p->numbytes = -1 ; /* mark not ready for I/O */ + break ; + } + } + if (sch->elements == 0 && neh->elements == 0 && p->numbytes >= 0 + && p->idle_heap.elements > 0) { + /* + * no traffic and no events scheduled. We can get rid of idle-heap. + */ + int i ; + + for (i = 0 ; i < p->idle_heap.elements ; i++) { + struct dn_flow_queue *q = p->idle_heap.p[i].object ; + + q->F = 0 ; + q->S = q->F + 1 ; + } + p->sum = 0 ; + p->V = 0 ; + p->idle_heap.elements = 0 ; + } + /* + * If we are getting clocks from dummynet (not a real interface) and + * If we are under credit, schedule the next ready event. + * Also fix the delivery time of the last packet. + */ + if (p->if_name[0]==0 && p->numbytes < 0) { /* this implies bandwidth >0 */ + dn_key t=0 ; /* number of ticks i have to wait */ + + if (p->bandwidth > 0) + t = ( p->bandwidth -1 - p->numbytes) / p->bandwidth ; + p->tail->output_time += t ; + p->sched_time = curr_time ; + heap_insert(&wfq_ready_heap, curr_time + t, (void *)p); + /* XXX should check errors on heap_insert, and drain the whole + * queue on error hoping next time we are luckier. + */ + } + /* + * If the delay line was empty call transmit_event(p) now. + * Otherwise, the scheduler will take care of it. + */ + if (p_was_empty) + transmit_event(p); +} + +/* + * This is called once per tick, or HZ times per second. It is used to + * increment the current tick counter and schedule expired events. + */ +static void +dummynet(void * __unused unused) +{ + void *p ; /* generic parameter to handler */ + struct dn_heap *h ; + int s ; + struct dn_heap *heaps[3]; + int i; + struct dn_pipe *pe ; + + heaps[0] = &ready_heap ; /* fixed-rate queues */ + heaps[1] = &wfq_ready_heap ; /* wfq queues */ + heaps[2] = &extract_heap ; /* delay line */ + s = splimp(); /* see note on top, splnet() is not enough */ + curr_time++ ; + for (i=0; i < 3 ; i++) { + h = heaps[i]; + while (h->elements > 0 && DN_KEY_LEQ(h->p[0].key, curr_time) ) { + DDB(if (h->p[0].key > curr_time) + printf("-- dummynet: warning, heap %d is %d ticks late\n", + i, (int)(curr_time - h->p[0].key));) + p = h->p[0].object ; /* store a copy before heap_extract */ + heap_extract(h, NULL); /* need to extract before processing */ + if (i == 0) + ready_event(p) ; + else if (i == 1) { + struct dn_pipe *pipe = p; + if (pipe->if_name[0] != '\0') + printf("*** bad ready_event_wfq for pipe %s\n", + pipe->if_name); + else + ready_event_wfq(p) ; + } else + transmit_event(p); + } + } + /* sweep pipes trying to expire idle flow_queues */ + for (pe = all_pipes; pe ; pe = pe->next ) + if (pe->idle_heap.elements > 0 && + DN_KEY_LT(pe->idle_heap.p[0].key, pe->V) ) { + struct dn_flow_queue *q = pe->idle_heap.p[0].object ; + + heap_extract(&(pe->idle_heap), NULL); + q->S = q->F + 1 ; /* mark timestamp as invalid */ + pe->sum -= q->fs->weight ; + } + splx(s); + timeout(dummynet, NULL, 1); +} + +/* + * called by an interface when tx_rdy occurs. + */ +int +if_tx_rdy(struct ifnet *ifp) +{ + struct dn_pipe *p; + + for (p = all_pipes; p ; p = p->next ) + if (p->ifp == ifp) + break ; + if (p == NULL) { + char buf[32]; + sprintf(buf, "%s%d",ifp->if_name, ifp->if_unit); + for (p = all_pipes; p ; p = p->next ) + if (!strcmp(p->if_name, buf) ) { + p->ifp = ifp ; + DEB(printf("++ tx rdy from %s (now found)\n", buf);) + break ; + } + } + if (p != NULL) { + DEB(printf("++ tx rdy from %s%d - qlen %d\n", ifp->if_name, + ifp->if_unit, ifp->if_snd.ifq_len);) + p->numbytes = 0 ; /* mark ready for I/O */ + ready_event_wfq(p); + } + return 0; +} + +/* + * Unconditionally expire empty queues in case of shortage. + * Returns the number of queues freed. + */ +static int +expire_queues(struct dn_flow_set *fs) +{ + struct dn_flow_queue *q, *prev ; + int i, initial_elements = fs->rq_elements ; + + if (fs->last_expired == time_second) + return 0 ; + fs->last_expired = time_second ; + for (i = 0 ; i <= fs->rq_size ; i++) /* last one is overflow */ + for (prev=NULL, q = fs->rq[i] ; q != NULL ; ) + if (q->head != NULL || q->S != q->F+1) { + prev = q ; + q = q->next ; + } else { /* entry is idle, expire it */ + struct dn_flow_queue *old_q = q ; + + if (prev != NULL) + prev->next = q = q->next ; + else + fs->rq[i] = q = q->next ; + fs->rq_elements-- ; + free(old_q, M_IPFW); + } + return initial_elements - fs->rq_elements ; +} + +/* + * If room, create a new queue and put at head of slot i; + * otherwise, create or use the default queue. + */ +static struct dn_flow_queue * +create_queue(struct dn_flow_set *fs, int i) +{ + struct dn_flow_queue *q ; + + if (fs->rq_elements > fs->rq_size * dn_max_ratio && + expire_queues(fs) == 0) { + /* + * No way to get room, use or create overflow queue. + */ + i = fs->rq_size ; + if ( fs->rq[i] != NULL ) + return fs->rq[i] ; + } + q = malloc(sizeof(*q), M_IPFW, M_DONTWAIT | M_ZERO); /* M_ZERO needed */ + if (q == NULL) { + printf("sorry, cannot allocate queue for new flow\n"); + return NULL ; + } + q->fs = fs ; + q->hash_slot = i ; + q->next = fs->rq[i] ; + q->S = q->F + 1; /* hack - mark timestamp as invalid */ + fs->rq[i] = q ; + fs->rq_elements++ ; + return q ; +} + +/* + * Given a flow_set and a pkt in last_pkt, find a matching queue + * after appropriate masking. The queue is moved to front + * so that further searches take less time. + */ +static struct dn_flow_queue * +find_queue(struct dn_flow_set *fs) +{ + int i = 0 ; /* we need i and q for new allocations */ + struct dn_flow_queue *q, *prev; + + if ( !(fs->flags_fs & DN_HAVE_FLOW_MASK) ) + q = fs->rq[0] ; + else { + /* first, do the masking */ + last_pkt.dst_ip &= fs->flow_mask.dst_ip ; + last_pkt.src_ip &= fs->flow_mask.src_ip ; + last_pkt.dst_port &= fs->flow_mask.dst_port ; + last_pkt.src_port &= fs->flow_mask.src_port ; + last_pkt.proto &= fs->flow_mask.proto ; + last_pkt.flags = 0 ; /* we don't care about this one */ + /* then, hash function */ + i = ( (last_pkt.dst_ip) & 0xffff ) ^ + ( (last_pkt.dst_ip >> 15) & 0xffff ) ^ + ( (last_pkt.src_ip << 1) & 0xffff ) ^ + ( (last_pkt.src_ip >> 16 ) & 0xffff ) ^ + (last_pkt.dst_port << 1) ^ (last_pkt.src_port) ^ + (last_pkt.proto ); + i = i % fs->rq_size ; + /* finally, scan the current list for a match */ + searches++ ; + for (prev=NULL, q = fs->rq[i] ; q ; ) { + search_steps++; + if (bcmp(&last_pkt, &(q->id), sizeof(q->id) ) == 0) + break ; /* found */ + else if (pipe_expire && q->head == NULL && q->S == q->F+1 ) { + /* entry is idle and not in any heap, expire it */ + struct dn_flow_queue *old_q = q ; + + if (prev != NULL) + prev->next = q = q->next ; + else + fs->rq[i] = q = q->next ; + fs->rq_elements-- ; + free(old_q, M_IPFW); + continue ; + } + prev = q ; + q = q->next ; + } + if (q && prev != NULL) { /* found and not in front */ + prev->next = q->next ; + q->next = fs->rq[i] ; + fs->rq[i] = q ; + } + } + if (q == NULL) { /* no match, need to allocate a new entry */ + q = create_queue(fs, i); + if (q != NULL) + q->id = last_pkt ; + } + return q ; +} + +static int +red_drops(struct dn_flow_set *fs, struct dn_flow_queue *q, int len) +{ + /* + * RED algorithm + * + * RED calculates the average queue size (avg) using a low-pass filter + * with an exponential weighted (w_q) moving average: + * avg <- (1-w_q) * avg + w_q * q_size + * where q_size is the queue length (measured in bytes or * packets). + * + * If q_size == 0, we compute the idle time for the link, and set + * avg = (1 - w_q)^(idle/s) + * where s is the time needed for transmitting a medium-sized packet. + * + * Now, if avg < min_th the packet is enqueued. + * If avg > max_th the packet is dropped. Otherwise, the packet is + * dropped with probability P function of avg. + * + */ + + int64_t p_b = 0; + /* queue in bytes or packets ? */ + u_int q_size = (fs->flags_fs & DN_QSIZE_IS_BYTES) ? q->len_bytes : q->len; + + DEB(printf("\n%d q: %2u ", (int) curr_time, q_size);) + + /* average queue size estimation */ + if (q_size != 0) { + /* + * queue is not empty, avg <- avg + (q_size - avg) * w_q + */ + int diff = SCALE(q_size) - q->avg; + int64_t v = SCALE_MUL((int64_t) diff, (int64_t) fs->w_q); + + q->avg += (int) v; + } else { + /* + * queue is empty, find for how long the queue has been + * empty and use a lookup table for computing + * (1 - * w_q)^(idle_time/s) where s is the time to send a + * (small) packet. + * XXX check wraps... + */ + if (q->avg) { + u_int t = (curr_time - q->q_time) / fs->lookup_step; + + q->avg = (t < fs->lookup_depth) ? + SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0; + } + } + DEB(printf("avg: %u ", SCALE_VAL(q->avg));) + + /* should i drop ? */ + + if (q->avg < fs->min_th) { + q->count = -1; + return 0; /* accept packet ; */ + } + if (q->avg >= fs->max_th) { /* average queue >= max threshold */ + if (fs->flags_fs & DN_IS_GENTLE_RED) { + /* + * According to Gentle-RED, if avg is greater than max_th the + * packet is dropped with a probability + * p_b = c_3 * avg - c_4 + * where c_3 = (1 - max_p) / max_th, and c_4 = 1 - 2 * max_p + */ + p_b = SCALE_MUL((int64_t) fs->c_3, (int64_t) q->avg) - fs->c_4; + } else { + q->count = -1; + printf("- drop"); + return 1 ; + } + } else if (q->avg > fs->min_th) { + /* + * we compute p_b using the linear dropping function p_b = c_1 * + * avg - c_2, where c_1 = max_p / (max_th - min_th), and c_2 = + * max_p * min_th / (max_th - min_th) + */ + p_b = SCALE_MUL((int64_t) fs->c_1, (int64_t) q->avg) - fs->c_2; + } + if (fs->flags_fs & DN_QSIZE_IS_BYTES) + p_b = (p_b * len) / fs->max_pkt_size; + if (++q->count == 0) + q->random = random() & 0xffff; + else { + /* + * q->count counts packets arrived since last drop, so a greater + * value of q->count means a greater packet drop probability. + */ + if (SCALE_MUL(p_b, SCALE((int64_t) q->count)) > q->random) { + q->count = 0; + DEB(printf("- red drop");) + /* after a drop we calculate a new random value */ + q->random = random() & 0xffff; + return 1; /* drop */ + } + } + /* end of RED algorithm */ + return 0 ; /* accept */ +} + +static __inline +struct dn_flow_set * +locate_flowset(int pipe_nr, struct ip_fw_chain *rule) +{ + struct dn_flow_set *fs = NULL ; + + if ( (rule->rule->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_QUEUE ) + for (fs=all_flow_sets; fs && fs->fs_nr != pipe_nr; fs=fs->next) + ; + else { + struct dn_pipe *p1; + for (p1 = all_pipes; p1 && p1->pipe_nr != pipe_nr; p1 = p1->next) + ; + if (p1 != NULL) + fs = &(p1->fs) ; + } + if (fs != NULL) + rule->rule->pipe_ptr = fs ; /* record for the future */ + return fs ; +} + +/* + * dummynet hook for packets. Below 'pipe' is a pipe or a queue + * depending on whether WF2Q or fixed bw is used. + */ +int +dummynet_io(int pipe_nr, int dir, /* pipe_nr can also be a fs_nr */ + struct mbuf *m, struct ifnet *ifp, struct route *ro, + struct sockaddr_in *dst, + struct ip_fw_chain *rule, int flags) +{ + struct dn_pkt *pkt; + struct dn_flow_set *fs; + struct dn_pipe *pipe ; + u_int64_t len = m->m_pkthdr.len ; + struct dn_flow_queue *q = NULL ; + int s ; + + s = splimp(); + + pipe_nr &= 0xffff ; + + if ( (fs = rule->rule->pipe_ptr) == NULL ) { + fs = locate_flowset(pipe_nr, rule); + if (fs == NULL) + goto dropit ; /* this queue/pipe does not exist! */ + } + pipe = fs->pipe ; + if (pipe == NULL) { /* must be a queue, try find a matching pipe */ + for (pipe = all_pipes; pipe && pipe->pipe_nr != fs->parent_nr; + pipe = pipe->next) + ; + if (pipe != NULL) + fs->pipe = pipe ; + else { + printf("No pipe %d for queue %d, drop pkt\n", + fs->parent_nr, fs->fs_nr); + goto dropit ; + } + } + q = find_queue(fs); + if ( q == NULL ) + goto dropit ; /* cannot allocate queue */ + /* + * update statistics, then check reasons to drop pkt + */ + q->tot_bytes += len ; + q->tot_pkts++ ; + if ( fs->plr && random() < fs->plr ) + goto dropit ; /* random pkt drop */ + if ( fs->flags_fs & DN_QSIZE_IS_BYTES) { + if (q->len_bytes > fs->qsize) + goto dropit ; /* queue size overflow */ + } else { + if (q->len >= fs->qsize) + goto dropit ; /* queue count overflow */ + } + if ( fs->flags_fs & DN_IS_RED && red_drops(fs, q, len) ) + goto dropit ; + + /* XXX expensive to zero, see if we can remove it*/ + pkt = (struct dn_pkt *)malloc(sizeof (*pkt), M_IPFW, M_NOWAIT | M_ZERO); + if ( pkt == NULL ) + goto dropit ; /* cannot allocate packet header */ + /* ok, i can handle the pkt now... */ + /* build and enqueue packet + parameters */ + pkt->hdr.mh_type = MT_DUMMYNET ; + (struct ip_fw_chain *)pkt->hdr.mh_data = rule ; + DN_NEXT(pkt) = NULL; + pkt->dn_m = m; + pkt->dn_dir = dir ; + + pkt->ifp = ifp; + if (dir == DN_TO_IP_OUT) { + /* + * We need to copy *ro because for ICMP pkts (and maybe others) + * the caller passed a pointer into the stack; dst might also be + * a pointer into *ro so it needs to be updated. + */ + pkt->ro = *ro; + if (ro->ro_rt) + ro->ro_rt->rt_refcnt++ ; + if (dst == (struct sockaddr_in *)&ro->ro_dst) /* dst points into ro */ + dst = (struct sockaddr_in *)&(pkt->ro.ro_dst) ; + + pkt->dn_dst = dst; + pkt->flags = flags ; + } + if (q->head == NULL) + q->head = pkt; + else + DN_NEXT(q->tail) = pkt; + q->tail = pkt; + q->len++; + q->len_bytes += len ; + + if ( q->head != pkt ) /* flow was not idle, we are done */ + goto done; + /* + * If we reach this point the flow was previously idle, so we need + * to schedule it. This involves different actions for fixed-rate or + * WF2Q queues. + */ + if ( (rule->rule->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_PIPE ) { + /* + * Fixed-rate queue: just insert into the ready_heap. + */ + dn_key t = 0 ; + if (pipe->bandwidth) + t = SET_TICKS(pkt, q, pipe); + q->sched_time = curr_time ; + if (t == 0) /* must process it now */ + ready_event( q ); + else + heap_insert(&ready_heap, curr_time + t , q ); + } else { + /* + * WF2Q. First, compute start time S: if the flow was idle (S=F+1) + * set S to the virtual time V for the controlling pipe, and update + * the sum of weights for the pipe; otherwise, remove flow from + * idle_heap and set S to max(F,V). + * Second, compute finish time F = S + len/weight. + * Third, if pipe was idle, update V=max(S, V). + * Fourth, count one more backlogged flow. + */ + if (DN_KEY_GT(q->S, q->F)) { /* means timestamps are invalid */ + q->S = pipe->V ; + pipe->sum += fs->weight ; /* add weight of new queue */ + } else { + heap_extract(&(pipe->idle_heap), q); + q->S = MAX64(q->F, pipe->V ) ; + } + q->F = q->S + ( len<<MY_M )/(u_int64_t) fs->weight; + + if (pipe->not_eligible_heap.elements == 0 && + pipe->scheduler_heap.elements == 0) + pipe->V = MAX64 ( q->S, pipe->V ); + fs->backlogged++ ; + /* + * Look at eligibility. A flow is not eligibile if S>V (when + * this happens, it means that there is some other flow already + * scheduled for the same pipe, so the scheduler_heap cannot be + * empty). If the flow is not eligible we just store it in the + * not_eligible_heap. Otherwise, we store in the scheduler_heap + * and possibly invoke ready_event_wfq() right now if there is + * leftover credit. + * Note that for all flows in scheduler_heap (SCH), S_i <= V, + * and for all flows in not_eligible_heap (NEH), S_i > V . + * So when we need to compute max( V, min(S_i) ) forall i in SCH+NEH, + * we only need to look into NEH. + */ + if (DN_KEY_GT(q->S, pipe->V) ) { /* not eligible */ + if (pipe->scheduler_heap.elements == 0) + printf("++ ouch! not eligible but empty scheduler!\n"); + heap_insert(&(pipe->not_eligible_heap), q->S, q); + } else { + heap_insert(&(pipe->scheduler_heap), q->F, q); + if (pipe->numbytes >= 0) { /* pipe is idle */ + if (pipe->scheduler_heap.elements != 1) + printf("*** OUCH! pipe should have been idle!\n"); + DEB(printf("Waking up pipe %d at %d\n", + pipe->pipe_nr, (int)(q->F >> MY_M)); ) + pipe->sched_time = curr_time ; + ready_event_wfq(pipe); + } + } + } +done: + splx(s); + return 0; + +dropit: + splx(s); + if (q) + q->drops++ ; + m_freem(m); + return ENOBUFS ; +} + +/* + * Below, the rt_unref is only needed when (pkt->dn_dir == DN_TO_IP_OUT) + * Doing this would probably save us the initial bzero of dn_pkt + */ +#define DN_FREE_PKT(pkt) { \ + struct dn_pkt *n = pkt ; \ + rt_unref ( n->ro.ro_rt ) ; \ + m_freem(n->dn_m); \ + pkt = DN_NEXT(n) ; \ + free(n, M_IPFW) ; } + +/* + * Dispose all packets and flow_queues on a flow_set. + * If all=1, also remove red lookup table and other storage, + * including the descriptor itself. + * For the one in dn_pipe MUST also cleanup ready_heap... + */ +static void +purge_flow_set(struct dn_flow_set *fs, int all) +{ + struct dn_pkt *pkt ; + struct dn_flow_queue *q, *qn ; + int i ; + + for (i = 0 ; i <= fs->rq_size ; i++ ) { + for (q = fs->rq[i] ; q ; q = qn ) { + for (pkt = q->head ; pkt ; ) + DN_FREE_PKT(pkt) ; + qn = q->next ; + free(q, M_IPFW); + } + fs->rq[i] = NULL ; + } + fs->rq_elements = 0 ; + if (all) { + /* RED - free lookup table */ + if (fs->w_q_lookup) + free(fs->w_q_lookup, M_IPFW); + if (fs->rq) + free(fs->rq, M_IPFW); + /* if this fs is not part of a pipe, free it */ + if (fs->pipe && fs != &(fs->pipe->fs) ) + free(fs, M_IPFW); + } +} + +/* + * Dispose all packets queued on a pipe (not a flow_set). + * Also free all resources associated to a pipe, which is about + * to be deleted. + */ +static void +purge_pipe(struct dn_pipe *pipe) +{ + struct dn_pkt *pkt ; + + purge_flow_set( &(pipe->fs), 1 ); + + for (pkt = pipe->head ; pkt ; ) + DN_FREE_PKT(pkt) ; + + heap_free( &(pipe->scheduler_heap) ); + heap_free( &(pipe->not_eligible_heap) ); + heap_free( &(pipe->idle_heap) ); +} + +/* + * Delete all pipes and heaps returning memory. Must also + * remove references from all ipfw rules to all pipes. + */ +static void +dummynet_flush() +{ + struct dn_pipe *curr_p, *p ; + struct ip_fw_chain *chain ; + struct dn_flow_set *fs, *curr_fs; + int s ; + + s = splimp() ; + + /* remove all references to pipes ...*/ + LIST_FOREACH(chain, &ip_fw_chain_head, next) + chain->rule->pipe_ptr = NULL ; + /* prevent future matches... */ + p = all_pipes ; + all_pipes = NULL ; + fs = all_flow_sets ; + all_flow_sets = NULL ; + /* and free heaps so we don't have unwanted events */ + heap_free(&ready_heap); + heap_free(&wfq_ready_heap); + heap_free(&extract_heap); + splx(s) ; + /* + * Now purge all queued pkts and delete all pipes + */ + /* scan and purge all flow_sets. */ + for ( ; fs ; ) { + curr_fs = fs ; + fs = fs->next ; + purge_flow_set(curr_fs, 1); + } + for ( ; p ; ) { + purge_pipe(p); + curr_p = p ; + p = p->next ; + free(curr_p, M_IPFW); + } +} + + +extern struct ip_fw_chain *ip_fw_default_rule ; +static void +dn_rule_delete_fs(struct dn_flow_set *fs, void *r) +{ + int i ; + struct dn_flow_queue *q ; + struct dn_pkt *pkt ; + + for (i = 0 ; i <= fs->rq_size ; i++) /* last one is ovflow */ + for (q = fs->rq[i] ; q ; q = q->next ) + for (pkt = q->head ; pkt ; pkt = DN_NEXT(pkt) ) + if (pkt->hdr.mh_data == r) + pkt->hdr.mh_data = (void *)ip_fw_default_rule ; +} +/* + * when a firewall rule is deleted, scan all queues and remove the flow-id + * from packets matching this rule. + */ +void +dn_rule_delete(void *r) +{ + struct dn_pipe *p ; + struct dn_pkt *pkt ; + struct dn_flow_set *fs ; + + /* + * If the rule references a queue (dn_flow_set), then scan + * the flow set, otherwise scan pipes. Should do either, but doing + * both does not harm. + */ + for ( fs = all_flow_sets ; fs ; fs = fs->next ) + dn_rule_delete_fs(fs, r); + for ( p = all_pipes ; p ; p = p->next ) { + fs = &(p->fs) ; + dn_rule_delete_fs(fs, r); + for (pkt = p->head ; pkt ; pkt = DN_NEXT(pkt) ) + if (pkt->hdr.mh_data == r) + pkt->hdr.mh_data = (void *)ip_fw_default_rule ; + } +} + +/* + * setup RED parameters + */ +static int +config_red(struct dn_flow_set *p, struct dn_flow_set * x) +{ + int i; + + x->w_q = p->w_q; + x->min_th = SCALE(p->min_th); + x->max_th = SCALE(p->max_th); + x->max_p = p->max_p; + + x->c_1 = p->max_p / (p->max_th - p->min_th); + x->c_2 = SCALE_MUL(x->c_1, SCALE(p->min_th)); + if (x->flags_fs & DN_IS_GENTLE_RED) { + x->c_3 = (SCALE(1) - p->max_p) / p->max_th; + x->c_4 = (SCALE(1) - 2 * p->max_p); + } + + /* if the lookup table already exist, free and create it again */ + if (x->w_q_lookup) + free(x->w_q_lookup, M_IPFW); + if (red_lookup_depth == 0) { + printf("\nnet.inet.ip.dummynet.red_lookup_depth must be > 0"); + free(x, M_IPFW); + return EINVAL; + } + x->lookup_depth = red_lookup_depth; + x->w_q_lookup = (u_int *) malloc(x->lookup_depth * sizeof(int), + M_IPFW, M_DONTWAIT); + if (x->w_q_lookup == NULL) { + printf("sorry, cannot allocate red lookup table\n"); + free(x, M_IPFW); + return ENOSPC; + } + + /* fill the lookup table with (1 - w_q)^x */ + x->lookup_step = p->lookup_step ; + x->lookup_weight = p->lookup_weight ; + x->w_q_lookup[0] = SCALE(1) - x->w_q; + for (i = 1; i < x->lookup_depth; i++) + x->w_q_lookup[i] = SCALE_MUL(x->w_q_lookup[i - 1], x->lookup_weight); + if (red_avg_pkt_size < 1) + red_avg_pkt_size = 512 ; + x->avg_pkt_size = red_avg_pkt_size ; + if (red_max_pkt_size < 1) + red_max_pkt_size = 1500 ; + x->max_pkt_size = red_max_pkt_size ; + return 0 ; +} + +static int +alloc_hash(struct dn_flow_set *x, struct dn_flow_set *pfs) +{ + if (x->flags_fs & DN_HAVE_FLOW_MASK) { /* allocate some slots */ + int l = pfs->rq_size; + + if (l == 0) + l = dn_hash_size; + if (l < 4) + l = 4; + else if (l > 1024) + l = 1024; + x->rq_size = l; + } else /* one is enough for null mask */ + x->rq_size = 1; + x->rq = malloc((1 + x->rq_size) * sizeof(struct dn_flow_queue *), + M_IPFW, M_DONTWAIT | M_ZERO); + if (x->rq == NULL) { + printf("sorry, cannot allocate queue\n"); + return ENOSPC; + } + x->rq_elements = 0; + return 0 ; +} + +static void +set_fs_parms(struct dn_flow_set *x, struct dn_flow_set *src) +{ + x->flags_fs = src->flags_fs; + x->qsize = src->qsize; + x->plr = src->plr; + x->flow_mask = src->flow_mask; + if (x->flags_fs & DN_QSIZE_IS_BYTES) { + if (x->qsize > 1024*1024) + x->qsize = 1024*1024 ; + } else { + if (x->qsize == 0) + x->qsize = 50 ; + if (x->qsize > 100) + x->qsize = 50 ; + } + /* configuring RED */ + if ( x->flags_fs & DN_IS_RED ) + config_red(src, x) ; /* XXX should check errors */ +} + +/* + * setup pipe or queue parameters. + */ + +static int +config_pipe(struct dn_pipe *p) +{ + int s ; + struct dn_flow_set *pfs = &(p->fs); + + /* + * The config program passes parameters as follows: + * bw = bits/second (0 means no limits), + * delay = ms, must be translated into ticks. + * qsize = slots/bytes + */ + p->delay = ( p->delay * hz ) / 1000 ; + /* We need either a pipe number or a flow_set number */ + if (p->pipe_nr == 0 && pfs->fs_nr == 0) + return EINVAL ; + if (p->pipe_nr != 0 && pfs->fs_nr != 0) + return EINVAL ; + if (p->pipe_nr != 0) { /* this is a pipe */ + struct dn_pipe *x, *a, *b; + /* locate pipe */ + for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ; + a = b , b = b->next) ; + + if (b == NULL || b->pipe_nr != p->pipe_nr) { /* new pipe */ + x = malloc(sizeof(struct dn_pipe), M_IPFW, M_DONTWAIT | M_ZERO); + if (x == NULL) { + printf("ip_dummynet.c: no memory for new pipe\n"); + return ENOSPC; + } + x->pipe_nr = p->pipe_nr; + x->fs.pipe = x ; + /* idle_heap is the only one from which we extract from the middle. + */ + x->idle_heap.size = x->idle_heap.elements = 0 ; + x->idle_heap.offset=OFFSET_OF(struct dn_flow_queue, heap_pos); + } else + x = b; + + x->bandwidth = p->bandwidth ; + x->numbytes = 0; /* just in case... */ + bcopy(p->if_name, x->if_name, sizeof(p->if_name) ); + x->ifp = NULL ; /* reset interface ptr */ + x->delay = p->delay ; + set_fs_parms(&(x->fs), pfs); + + + if ( x->fs.rq == NULL ) { /* a new pipe */ + s = alloc_hash(&(x->fs), pfs) ; + if (s) { + free(x, M_IPFW); + return s ; + } + s = splimp() ; + x->next = b ; + if (a == NULL) + all_pipes = x ; + else + a->next = x ; + splx(s); + } + } else { /* config queue */ + struct dn_flow_set *x, *a, *b ; + + /* locate flow_set */ + for (a=NULL, b=all_flow_sets ; b && b->fs_nr < pfs->fs_nr ; + a = b , b = b->next) ; + + if (b == NULL || b->fs_nr != pfs->fs_nr) { /* new */ + if (pfs->parent_nr == 0) /* need link to a pipe */ + return EINVAL ; + x = malloc(sizeof(struct dn_flow_set), M_IPFW, M_DONTWAIT | M_ZERO); + if (x == NULL) { + printf("ip_dummynet.c: no memory for new flow_set\n"); + return ENOSPC; + } + x->fs_nr = pfs->fs_nr; + x->parent_nr = pfs->parent_nr; + x->weight = pfs->weight ; + if (x->weight == 0) + x->weight = 1 ; + else if (x->weight > 100) + x->weight = 100 ; + } else { + /* Change parent pipe not allowed; must delete and recreate */ + if (pfs->parent_nr != 0 && b->parent_nr != pfs->parent_nr) + return EINVAL ; + x = b; + } + set_fs_parms(x, pfs); + + if ( x->rq == NULL ) { /* a new flow_set */ + s = alloc_hash(x, pfs) ; + if (s) { + free(x, M_IPFW); + return s ; + } + s = splimp() ; + x->next = b; + if (a == NULL) + all_flow_sets = x; + else + a->next = x; + splx(s); + } + } + return 0 ; +} + +/* + * Helper function to remove from a heap queues which are linked to + * a flow_set about to be deleted. + */ +static void +fs_remove_from_heap(struct dn_heap *h, struct dn_flow_set *fs) +{ + int i = 0, found = 0 ; + for (; i < h->elements ;) + if ( ((struct dn_flow_queue *)h->p[i].object)->fs == fs) { + h->elements-- ; + h->p[i] = h->p[h->elements] ; + found++ ; + } else + i++ ; + if (found) + heapify(h); +} + +/* + * helper function to remove a pipe from a heap (can be there at most once) + */ +static void +pipe_remove_from_heap(struct dn_heap *h, struct dn_pipe *p) +{ + if (h->elements > 0) { + int i = 0 ; + for (i=0; i < h->elements ; i++ ) { + if (h->p[i].object == p) { /* found it */ + h->elements-- ; + h->p[i] = h->p[h->elements] ; + heapify(h); + break ; + } + } + } +} + +/* + * drain all queues. Called in case of severe mbuf shortage. + */ +void +dummynet_drain() +{ + struct dn_flow_set *fs; + struct dn_pipe *p; + struct dn_pkt *pkt; + + heap_free(&ready_heap); + heap_free(&wfq_ready_heap); + heap_free(&extract_heap); + /* remove all references to this pipe from flow_sets */ + for (fs = all_flow_sets; fs; fs= fs->next ) + purge_flow_set(fs, 0); + + for (p = all_pipes; p; p= p->next ) { + purge_flow_set(&(p->fs), 0); + for (pkt = p->head ; pkt ; ) + DN_FREE_PKT(pkt) ; + p->head = p->tail = NULL ; + } +} + +/* + * Fully delete a pipe or a queue, cleaning up associated info. + */ +static int +delete_pipe(struct dn_pipe *p) +{ + int s ; + struct ip_fw_chain *chain ; + + if (p->pipe_nr == 0 && p->fs.fs_nr == 0) + return EINVAL ; + if (p->pipe_nr != 0 && p->fs.fs_nr != 0) + return EINVAL ; + if (p->pipe_nr != 0) { /* this is an old-style pipe */ + struct dn_pipe *a, *b; + struct dn_flow_set *fs; + + /* locate pipe */ + for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ; + a = b , b = b->next) ; + if (b == NULL || (b->pipe_nr != p->pipe_nr) ) + return EINVAL ; /* not found */ + + s = splimp() ; + + /* unlink from list of pipes */ + if (a == NULL) + all_pipes = b->next ; + else + a->next = b->next ; + /* remove references to this pipe from the ip_fw rules. */ + LIST_FOREACH(chain, &ip_fw_chain_head, next) + if (chain->rule->pipe_ptr == &(b->fs)) + chain->rule->pipe_ptr = NULL ; + + /* remove all references to this pipe from flow_sets */ + for (fs = all_flow_sets; fs; fs= fs->next ) + if (fs->pipe == b) { + printf("++ ref to pipe %d from fs %d\n", + p->pipe_nr, fs->fs_nr); + fs->pipe = NULL ; + purge_flow_set(fs, 0); + } + fs_remove_from_heap(&ready_heap, &(b->fs)); + purge_pipe(b); /* remove all data associated to this pipe */ + /* remove reference to here from extract_heap and wfq_ready_heap */ + pipe_remove_from_heap(&extract_heap, b); + pipe_remove_from_heap(&wfq_ready_heap, b); + splx(s); + free(b, M_IPFW); + } else { /* this is a WF2Q queue (dn_flow_set) */ + struct dn_flow_set *a, *b; + + /* locate set */ + for (a = NULL, b = all_flow_sets ; b && b->fs_nr < p->fs.fs_nr ; + a = b , b = b->next) ; + if (b == NULL || (b->fs_nr != p->fs.fs_nr) ) + return EINVAL ; /* not found */ + + s = splimp() ; + if (a == NULL) + all_flow_sets = b->next ; + else + a->next = b->next ; + /* remove references to this flow_set from the ip_fw rules. */ + LIST_FOREACH(chain, &ip_fw_chain_head, next) + if (chain->rule->pipe_ptr == b) + chain->rule->pipe_ptr = NULL ; + + if (b->pipe != NULL) { + /* Update total weight on parent pipe and cleanup parent heaps */ + b->pipe->sum -= b->weight * b->backlogged ; + fs_remove_from_heap(&(b->pipe->not_eligible_heap), b); + fs_remove_from_heap(&(b->pipe->scheduler_heap), b); +#if 1 /* XXX should i remove from idle_heap as well ? */ + fs_remove_from_heap(&(b->pipe->idle_heap), b); +#endif + } + purge_flow_set(b, 1); + splx(s); + } + return 0 ; +} + +/* + * helper function used to copy data from kernel in DUMMYNET_GET + */ +static char * +dn_copy_set(struct dn_flow_set *set, char *bp) +{ + int i, copied = 0 ; + struct dn_flow_queue *q, *qp = (struct dn_flow_queue *)bp; + + for (i = 0 ; i <= set->rq_size ; i++) + for (q = set->rq[i] ; q ; q = q->next, qp++ ) { + if (q->hash_slot != i) + printf("++ at %d: wrong slot (have %d, " + "should be %d)\n", copied, q->hash_slot, i); + if (q->fs != set) + printf("++ at %d: wrong fs ptr (have %p, should be %p)\n", + i, q->fs, set); + copied++ ; + bcopy(q, qp, sizeof( *q ) ); + /* cleanup pointers */ + qp->next = NULL ; + qp->head = qp->tail = NULL ; + qp->fs = NULL ; + } + if (copied != set->rq_elements) + printf("++ wrong count, have %d should be %d\n", + copied, set->rq_elements); + return (char *)qp ; +} + +static int +dummynet_get(struct sockopt *sopt) +{ + char *buf, *bp ; /* bp is the "copy-pointer" */ + size_t size ; + struct dn_flow_set *set ; + struct dn_pipe *p ; + int s, error=0 ; + + s = splimp(); + /* + * compute size of data structures: list of pipes and flow_sets. + */ + for (p = all_pipes, size = 0 ; p ; p = p->next ) + size += sizeof( *p ) + + p->fs.rq_elements * sizeof(struct dn_flow_queue); + for (set = all_flow_sets ; set ; set = set->next ) + size += sizeof ( *set ) + + set->rq_elements * sizeof(struct dn_flow_queue); + buf = malloc(size, M_TEMP, M_DONTWAIT); + if (buf == 0) { + splx(s); + return ENOBUFS ; + } + for (p = all_pipes, bp = buf ; p ; p = p->next ) { + struct dn_pipe *pipe_bp = (struct dn_pipe *)bp ; + + /* + * copy pipe descriptor into *bp, convert delay back to ms, + * then copy the flow_set descriptor(s) one at a time. + * After each flow_set, copy the queue descriptor it owns. + */ + bcopy(p, bp, sizeof( *p ) ); + pipe_bp->delay = (pipe_bp->delay * 1000) / hz ; + /* + * XXX the following is a hack based on ->next being the + * first field in dn_pipe and dn_flow_set. The correct + * solution would be to move the dn_flow_set to the beginning + * of struct dn_pipe. + */ + pipe_bp->next = (struct dn_pipe *)DN_IS_PIPE ; + /* clean pointers */ + pipe_bp->head = pipe_bp->tail = NULL ; + pipe_bp->fs.next = NULL ; + pipe_bp->fs.pipe = NULL ; + pipe_bp->fs.rq = NULL ; + + bp += sizeof( *p ) ; + bp = dn_copy_set( &(p->fs), bp ); + } + for (set = all_flow_sets ; set ; set = set->next ) { + struct dn_flow_set *fs_bp = (struct dn_flow_set *)bp ; + bcopy(set, bp, sizeof( *set ) ); + /* XXX same hack as above */ + fs_bp->next = (struct dn_flow_set *)DN_IS_QUEUE ; + fs_bp->pipe = NULL ; + fs_bp->rq = NULL ; + bp += sizeof( *set ) ; + bp = dn_copy_set( set, bp ); + } + splx(s); + error = sooptcopyout(sopt, buf, size); + FREE(buf, M_TEMP); + return error ; +} + +/* + * Handler for the various dummynet socket options (get, flush, config, del) + */ +static int +ip_dn_ctl(struct sockopt *sopt) +{ + int error = 0 ; + struct dn_pipe *p, tmp_pipe; + + /* Disallow sets in really-really secure mode. */ + if (sopt->sopt_dir == SOPT_SET && securelevel >= 3) + return (EPERM); + + switch (sopt->sopt_name) { + default : + printf("ip_dn_ctl -- unknown option %d", sopt->sopt_name); + return EINVAL ; + + case IP_DUMMYNET_GET : + error = dummynet_get(sopt); + break ; + + case IP_DUMMYNET_FLUSH : + dummynet_flush() ; + break ; + + case IP_DUMMYNET_CONFIGURE : + p = &tmp_pipe ; + error = sooptcopyin(sopt, p, sizeof *p, sizeof *p); + if (error) + break ; + error = config_pipe(p); + break ; + + case IP_DUMMYNET_DEL : /* remove a pipe or queue */ + p = &tmp_pipe ; + error = sooptcopyin(sopt, p, sizeof *p, sizeof *p); + if (error) + break ; + + error = delete_pipe(p); + break ; + } + return error ; +} + +static void +ip_dn_init(void) +{ + printf("DUMMYNET initialized (010124)\n"); + all_pipes = NULL ; + all_flow_sets = NULL ; + ready_heap.size = ready_heap.elements = 0 ; + ready_heap.offset = 0 ; + + wfq_ready_heap.size = wfq_ready_heap.elements = 0 ; + wfq_ready_heap.offset = 0 ; + + extract_heap.size = extract_heap.elements = 0 ; + extract_heap.offset = 0 ; + ip_dn_ctl_ptr = ip_dn_ctl; + timeout(dummynet, NULL, 1); +} + +static ip_dn_ctl_t *old_dn_ctl_ptr ; + +static int +dummynet_modevent(module_t mod, int type, void *data) +{ + int s ; + switch (type) { + case MOD_LOAD: + s = splimp(); + old_dn_ctl_ptr = ip_dn_ctl_ptr; + ip_dn_init(); + splx(s); + break; + case MOD_UNLOAD: + s = splimp(); + ip_dn_ctl_ptr = old_dn_ctl_ptr; + splx(s); + dummynet_flush(); + break ; + default: + break ; + } + return 0 ; +} + +static moduledata_t dummynet_mod = { + "dummynet", + dummynet_modevent, + NULL +} ; +DECLARE_MODULE(dummynet, dummynet_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); diff --git a/sys/netinet/ip_dummynet.h b/sys/netinet/ip_dummynet.h new file mode 100644 index 0000000..fe80718 --- /dev/null +++ b/sys/netinet/ip_dummynet.h @@ -0,0 +1,356 @@ +/* + * Copyright (c) 1998-2000 Luigi Rizzo, Universita` di Pisa + * Portions Copyright (c) 2000 Akamba Corp. + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _IP_DUMMYNET_H +#define _IP_DUMMYNET_H + +/* + * Definition of dummynet data structures. In the structures, I decided + * not to use the macros in <sys/queue.h> in the hope of making the code + * easier to port to other architectures. The type of lists and queue we + * use here is pretty simple anyways. + */ + +/* + * We start with a heap, which is used in the scheduler to decide when + * to transmit packets etc. + * + * The key for the heap is used for two different values: + * + * 1. timer ticks- max 10K/second, so 32 bits are enough; + * + * 2. virtual times. These increase in steps of len/x, where len is the + * packet length, and x is either the weight of the flow, or the + * sum of all weights. + * If we limit to max 1000 flows and a max weight of 100, then + * x needs 17 bits. The packet size is 16 bits, so we can easily + * overflow if we do not allow errors. + * So we use a key "dn_key" which is 64 bits. Some macros are used to + * compare key values and handle wraparounds. + * MAX64 returns the largest of two key values. + * MY_M is used as a shift count when doing fixed point arithmetic + * (a better name would be useful...). + */ +typedef u_int64_t dn_key ; /* sorting key */ +#define DN_KEY_LT(a,b) ((int64_t)((a)-(b)) < 0) +#define DN_KEY_LEQ(a,b) ((int64_t)((a)-(b)) <= 0) +#define DN_KEY_GT(a,b) ((int64_t)((a)-(b)) > 0) +#define DN_KEY_GEQ(a,b) ((int64_t)((a)-(b)) >= 0) +#define MAX64(x,y) (( (int64_t) ( (y)-(x) )) > 0 ) ? (y) : (x) +#define MY_M 16 /* number of left shift to obtain a larger precision */ + +/* + * XXX With this scaling, max 1000 flows, max weight 100, 1Gbit/s, the + * virtual time wraps every 15 days. + */ + +/* + * The OFFSET_OF macro is used to return the offset of a field within + * a structure. It is used by the heap management routines. + */ +#define OFFSET_OF(type, field) ((int)&( ((type *)0)->field) ) + +/* + * A heap entry is made of a key and a pointer to the actual + * object stored in the heap. + * The heap is an array of dn_heap_entry entries, dynamically allocated. + * Current size is "size", with "elements" actually in use. + * The heap normally supports only ordered insert and extract from the top. + * If we want to extract an object from the middle of the heap, we + * have to know where the object itself is located in the heap (or we + * need to scan the whole array). To this purpose, an object has a + * field (int) which contains the index of the object itself into the + * heap. When the object is moved, the field must also be updated. + * The offset of the index in the object is stored in the 'offset' + * field in the heap descriptor. The assumption is that this offset + * is non-zero if we want to support extract from the middle. + */ +struct dn_heap_entry { + dn_key key ; /* sorting key. Topmost element is smallest one */ + void *object ; /* object pointer */ +} ; + +struct dn_heap { + int size ; + int elements ; + int offset ; /* XXX if > 0 this is the offset of direct ptr to obj */ + struct dn_heap_entry *p ; /* really an array of "size" entries */ +} ; + +/* + * MT_DUMMYNET is a new (fake) mbuf type that is prepended to the + * packet when it comes out of a pipe. The definition + * ought to go in /sys/sys/mbuf.h but here it is less intrusive. + */ + +#define MT_DUMMYNET MT_CONTROL + +/* + * struct dn_pkt identifies a packet in the dummynet queue. The + * first part is really an m_hdr for implementation purposes, and some + * fields are saved there. When passing the packet back to the ip_input/ + * ip_output()/bdg_forward, the struct is prepended to the mbuf chain with type + * MT_DUMMYNET, and contains the pointer to the matching rule. + * + * Note: there is no real need to make this structure contain an m_hdr, + * in the future this should be changed to a normal data structure. + */ +struct dn_pkt { + struct m_hdr hdr ; +#define dn_next hdr.mh_nextpkt /* next element in queue */ +#define DN_NEXT(x) (struct dn_pkt *)(x)->dn_next +#define dn_m hdr.mh_next /* packet to be forwarded */ +#define dn_dir hdr.mh_flags /* action when pkt extracted from a queue */ +#define DN_TO_IP_OUT 1 +#define DN_TO_IP_IN 2 +#define DN_TO_BDG_FWD 3 + + dn_key output_time; /* when the pkt is due for delivery */ + struct ifnet *ifp; /* interface, for ip_output */ + struct sockaddr_in *dn_dst ; + struct route ro; /* route, for ip_output. MUST COPY */ + int flags ; /* flags, for ip_output (IPv6 ?) */ +}; + +/* + * Overall structure of dummynet (with WF2Q+): + +In dummynet, packets are selected with the firewall rules, and passed +to two different objects: PIPE or QUEUE. + +A QUEUE is just a queue with configurable size and queue management +policy. It is also associated with a mask (to discriminate among +different flows), a weight (used to give different shares of the +bandwidth to different flows) and a "pipe", which essentially +supplies the transmit clock for all queues associated with that +pipe. + +A PIPE emulates a fixed-bandwidth link, whose bandwidth is +configurable. The "clock" for a pipe can come from either an +internal timer, or from the transmit interrupt of an interface. +A pipe is also associated with one (or more, if masks are used) +queue, where all packets for that pipe are stored. + +The bandwidth available on the pipe is shared by the queues +associated with that pipe (only one in case the packet is sent +to a PIPE) according to the WF2Q+ scheduling algorithm and the +configured weights. + +In general, incoming packets are stored in the appropriate queue, +which is then placed into one of a few heaps managed by a scheduler +to decide when the packet should be extracted. +The scheduler (a function called dummynet()) is run at every timer +tick, and grabs queues from the head of the heaps when they are +ready for processing. + +There are three data structures definining a pipe and associated queues: + + + dn_pipe, which contains the main configuration parameters related + to delay and bandwidth; + + dn_flow_set, which contains WF2Q+ configuration, flow + masks, plr and RED configuration; + + dn_flow_queue, which is the per-flow queue (containing the packets) + +Multiple dn_flow_set can be linked to the same pipe, and multiple +dn_flow_queue can be linked to the same dn_flow_set. +All data structures are linked in a linear list which is used for +housekeeping purposes. + +During configuration, we create and initialize the dn_flow_set +and dn_pipe structures (a dn_pipe also contains a dn_flow_set). + +At runtime: packets are sent to the appropriate dn_flow_set (either +WFQ ones, or the one embedded in the dn_pipe for fixed-rate flows), +which in turn dispatches them to the appropriate dn_flow_queue +(created dynamically according to the masks). + +The transmit clock for fixed rate flows (ready_event()) selects the +dn_flow_queue to be used to transmit the next packet. For WF2Q, +wfq_ready_event() extract a pipe which in turn selects the right +flow using a number of heaps defined into the pipe itself. + + * + */ + +/* + * per flow queue. This contains the flow identifier, the queue + * of packets, counters, and parameters used to support both RED and + * WF2Q+. + */ +struct dn_flow_queue { + struct dn_flow_queue *next ; + struct ipfw_flow_id id ; + struct dn_pkt *head, *tail ; /* queue of packets */ + u_int len ; + u_int len_bytes ; + long numbytes ; /* credit for transmission (dynamic queues) */ + + u_int64_t tot_pkts ; /* statistics counters */ + u_int64_t tot_bytes ; + u_int32_t drops ; + int hash_slot ; /* debugging/diagnostic */ + + /* RED parameters */ + int avg ; /* average queue length est. (scaled) */ + int count ; /* arrivals since last RED drop */ + int random ; /* random value (scaled) */ + u_int32_t q_time ; /* start of queue idle time */ + + /* WF2Q+ support */ + struct dn_flow_set *fs ; /* parent flow set */ + int heap_pos ; /* position (index) of struct in heap */ + dn_key sched_time ; /* current time when queue enters ready_heap */ + + dn_key S,F ; /* start-time, finishing time */ + /* setting F < S means the timestamp is invalid. We only need + * to test this when the queue is empty. + */ +} ; + +/* + * flow_set descriptor. Contains the "template" parameters for the + * queue configuration, and pointers to the hash table of dn_flow_queue's. + * + * The hash table is an array of lists -- we identify the slot by + * hashing the flow-id, then scan the list looking for a match. + * The size of the hash table (buckets) is configurable on a per-queue + * basis. + */ +struct dn_flow_set { + struct dn_flow_set *next; /* next flow set in all_flow_sets list */ + + u_short fs_nr ; /* flow_set number */ + u_short flags_fs; +#define DN_HAVE_FLOW_MASK 0x0001 +#define DN_IS_PIPE 0x4000 +#define DN_IS_QUEUE 0x8000 +#define DN_IS_RED 0x0002 +#define DN_IS_GENTLE_RED 0x0004 +#define DN_QSIZE_IS_BYTES 0x0008 /* queue measured in bytes */ + + struct dn_pipe *pipe ; /* pointer to parent pipe */ + u_short parent_nr ; /* parent pipe#, 0 if local to a pipe */ + + int weight ; /* WFQ queue weight */ + int qsize ; /* queue size in slots or bytes */ + int plr ; /* pkt loss rate (2^31-1 means 100%) */ + + struct ipfw_flow_id flow_mask ; + /* hash table of queues onto this flow_set */ + int rq_size ; /* number of slots */ + int rq_elements ; /* active elements */ + struct dn_flow_queue **rq; /* array of rq_size entries */ + u_int32_t last_expired ; /* do not expire too frequently */ + /* XXX some RED parameters as well ? */ + int backlogged ; /* #active queues for this flowset */ + + /* RED parameters */ +#define SCALE_RED 16 +#define SCALE(x) ( (x) << SCALE_RED ) +#define SCALE_VAL(x) ( (x) >> SCALE_RED ) +#define SCALE_MUL(x,y) ( ( (x) * (y) ) >> SCALE_RED ) + int w_q ; /* queue weight (scaled) */ + int max_th ; /* maximum threshold for queue (scaled) */ + int min_th ; /* minimum threshold for queue (scaled) */ + int max_p ; /* maximum value for p_b (scaled) */ + u_int c_1 ; /* max_p/(max_th-min_th) (scaled) */ + u_int c_2 ; /* max_p*min_th/(max_th-min_th) (scaled) */ + u_int c_3 ; /* for GRED, (1-max_p)/max_th (scaled) */ + u_int c_4 ; /* for GRED, 1 - 2*max_p (scaled) */ + u_int * w_q_lookup ; /* lookup table for computing (1-w_q)^t */ + u_int lookup_depth ; /* depth of lookup table */ + int lookup_step ; /* granularity inside the lookup table */ + int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */ + int avg_pkt_size ; /* medium packet size */ + int max_pkt_size ; /* max packet size */ +} ; + +/* + * Pipe descriptor. Contains global parameters, delay-line queue, + * and the flow_set used for fixed-rate queues. + * + * For WF2Q support it also has 4 heaps holding dn_flow_queue: + * not_eligible_heap, for queues whose start time is higher + * than the virtual time. Sorted by start time. + * scheduler_heap, for queues eligible for scheduling. Sorted by + * finish time. + * backlogged_heap, all flows in the two heaps above, sorted by + * start time. This is used to compute the virtual time. + * idle_heap, all flows that are idle and can be removed. We + * do that on each tick so we do not slow down too much + * operations during forwarding. + * + */ +struct dn_pipe { /* a pipe */ + struct dn_pipe *next ; + + int pipe_nr ; /* number */ + int bandwidth; /* really, bytes/tick. */ + int delay ; /* really, ticks */ + + struct dn_pkt *head, *tail ; /* packets in delay line */ + + /* WF2Q+ */ + struct dn_heap scheduler_heap ; /* top extract - key Finish time*/ + struct dn_heap not_eligible_heap; /* top extract- key Start time */ + struct dn_heap idle_heap ; /* random extract - key Start=Finish time */ + + dn_key V ; /* virtual time */ + int sum; /* sum of weights of all active sessions */ + int numbytes; /* bit i can transmit (more or less). */ + + dn_key sched_time ; /* first time pipe is scheduled in ready_heap */ + + /* the tx clock can come from an interface. In this case, the + * name is below, and the pointer is filled when the rule is + * configured. We identify this by setting the if_name to a + * non-empty string. + */ + char if_name[16]; + struct ifnet *ifp ; + int ready ; /* set if ifp != NULL and we got a signal from it */ + + struct dn_flow_set fs ; /* used with fixed-rate flows */ +}; + +#ifdef _KERNEL + +MALLOC_DECLARE(M_IPFW); + +typedef int ip_dn_ctl_t __P((struct sockopt *)) ; +extern ip_dn_ctl_t *ip_dn_ctl_ptr; + +void dn_rule_delete(void *r); /* used in ip_fw.c */ +int dummynet_io(int pipe, int dir, + struct mbuf *m, struct ifnet *ifp, struct route *ro, + struct sockaddr_in * dst, + struct ip_fw_chain *rule, int flags); +#endif + +#endif /* _IP_DUMMYNET_H */ diff --git a/sys/netinet/ip_ecn.c b/sys/netinet/ip_ecn.c new file mode 100644 index 0000000..047f82e --- /dev/null +++ b/sys/netinet/ip_ecn.c @@ -0,0 +1,149 @@ +/* $FreeBSD$ */ +/* $KAME: ip_ecn.c,v 1.7 2000/05/05 11:00:56 sumikawa Exp $ */ + +/* + * Copyright (C) 1999 WIDE Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +/* + * ECN consideration on tunnel ingress/egress operation. + * http://www.aciri.org/floyd/papers/draft-ipsec-ecn-00.txt + */ + +#include "opt_inet.h" +#include "opt_inet6.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/mbuf.h> +#include <sys/errno.h> + +#ifdef INET +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#endif + +#ifdef INET6 +#ifndef INET +#include <netinet/in.h> +#endif +#include <netinet/ip6.h> +#endif + +#include <netinet/ip_ecn.h> +#ifdef INET6 +#include <netinet6/ip6_ecn.h> +#endif + +/* + * modify outer ECN (TOS) field on ingress operation (tunnel encapsulation). + * call it after you've done the default initialization/copy for the outer. + */ +void +ip_ecn_ingress(mode, outer, inner) + int mode; + u_int8_t *outer; + u_int8_t *inner; +{ + if (!outer || !inner) + panic("NULL pointer passed to ip_ecn_ingress"); + + switch (mode) { + case ECN_ALLOWED: /* ECN allowed */ + *outer &= ~IPTOS_CE; + break; + case ECN_FORBIDDEN: /* ECN forbidden */ + *outer &= ~(IPTOS_ECT | IPTOS_CE); + break; + case ECN_NOCARE: /* no consideration to ECN */ + break; + } +} + +/* + * modify inner ECN (TOS) field on egress operation (tunnel decapsulation). + * call it after you've done the default initialization/copy for the inner. + */ +void +ip_ecn_egress(mode, outer, inner) + int mode; + u_int8_t *outer; + u_int8_t *inner; +{ + if (!outer || !inner) + panic("NULL pointer passed to ip_ecn_egress"); + + switch (mode) { + case ECN_ALLOWED: + if (*outer & IPTOS_CE) + *inner |= IPTOS_CE; + break; + case ECN_FORBIDDEN: /* ECN forbidden */ + case ECN_NOCARE: /* no consideration to ECN */ + break; + } +} + +#ifdef INET6 +void +ip6_ecn_ingress(mode, outer, inner) + int mode; + u_int32_t *outer; + u_int32_t *inner; +{ + u_int8_t outer8, inner8; + + if (!outer || !inner) + panic("NULL pointer passed to ip6_ecn_ingress"); + + outer8 = (ntohl(*outer) >> 20) & 0xff; + inner8 = (ntohl(*inner) >> 20) & 0xff; + ip_ecn_ingress(mode, &outer8, &inner8); + *outer &= ~htonl(0xff << 20); + *outer |= htonl((u_int32_t)outer8 << 20); +} + +void +ip6_ecn_egress(mode, outer, inner) + int mode; + u_int32_t *outer; + u_int32_t *inner; +{ + u_int8_t outer8, inner8; + + if (!outer || !inner) + panic("NULL pointer passed to ip6_ecn_egress"); + + outer8 = (ntohl(*outer) >> 20) & 0xff; + inner8 = (ntohl(*inner) >> 20) & 0xff; + ip_ecn_egress(mode, &outer8, &inner8); + *inner &= ~htonl(0xff << 20); + *inner |= htonl((u_int32_t)inner8 << 20); +} +#endif diff --git a/sys/netinet/ip_ecn.h b/sys/netinet/ip_ecn.h new file mode 100644 index 0000000..6445d0f --- /dev/null +++ b/sys/netinet/ip_ecn.h @@ -0,0 +1,45 @@ +/* $FreeBSD$ */ +/* $KAME: ip_ecn.h,v 1.5 2000/03/27 04:58:38 sumikawa Exp $ */ + +/* + * Copyright (C) 1999 WIDE Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +/* + * ECN consideration on tunnel ingress/egress operation. + * http://www.aciri.org/floyd/papers/draft-ipsec-ecn-00.txt + */ + +#define ECN_ALLOWED 1 /* ECN allowed */ +#define ECN_FORBIDDEN 0 /* ECN forbidden */ +#define ECN_NOCARE (-1) /* no consideration to ECN */ + +#ifdef _KERNEL +extern void ip_ecn_ingress __P((int, u_int8_t *, u_int8_t *)); +extern void ip_ecn_egress __P((int, u_int8_t *, u_int8_t *)); +#endif diff --git a/sys/netinet/ip_encap.c b/sys/netinet/ip_encap.c new file mode 100644 index 0000000..7d623ea --- /dev/null +++ b/sys/netinet/ip_encap.c @@ -0,0 +1,534 @@ +/* $FreeBSD$ */ +/* $KAME: ip_encap.c,v 1.36 2000/06/17 20:34:24 itojun Exp $ */ + +/* + * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * My grandfather said that there's a devil inside tunnelling technology... + * + * We have surprisingly many protocols that want packets with IP protocol + * #4 or #41. Here's a list of protocols that want protocol #41: + * RFC1933 configured tunnel + * RFC1933 automatic tunnel + * RFC2401 IPsec tunnel + * RFC2473 IPv6 generic packet tunnelling + * RFC2529 6over4 tunnel + * mobile-ip6 (uses RFC2473) + * 6to4 tunnel + * Here's a list of protocol that want protocol #4: + * RFC1853 IPv4-in-IPv4 tunnelling + * RFC2003 IPv4 encapsulation within IPv4 + * RFC2344 reverse tunnelling for mobile-ip4 + * RFC2401 IPsec tunnel + * Well, what can I say. They impose different en/decapsulation mechanism + * from each other, so they need separate protocol handler. The only one + * we can easily determine by protocol # is IPsec, which always has + * AH/ESP/IPComp header right after outer IP header. + * + * So, clearly good old protosw does not work for protocol #4 and #41. + * The code will let you match protocol via src/dst address pair. + */ +/* XXX is M_NETADDR correct? */ + +#include "opt_mrouting.h" +#include "opt_inet.h" +#include "opt_inet6.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/socket.h> +#include <sys/sockio.h> +#include <sys/mbuf.h> +#include <sys/errno.h> +#include <sys/protosw.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> +#include <netinet/ip_encap.h> +#ifdef MROUTING +#include <netinet/ip_mroute.h> +#endif /* MROUTING */ +#include <netinet/ipprotosw.h> + +#ifdef INET6 +#include <netinet/ip6.h> +#include <netinet6/ip6_var.h> +#include <netinet6/ip6protosw.h> +#endif + +#include <machine/stdarg.h> + +#include <net/net_osdep.h> + +#include <sys/kernel.h> +#include <sys/malloc.h> +static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure"); + +static void encap_add __P((struct encaptab *)); +static int mask_match __P((const struct encaptab *, const struct sockaddr *, + const struct sockaddr *)); +static void encap_fillarg __P((struct mbuf *, const struct encaptab *)); + +/* rely upon BSS initialization */ +LIST_HEAD(, encaptab) encaptab; + +void +encap_init() +{ +#if 0 + /* + * we cannot use LIST_INIT() here, since drivers may want to call + * encap_attach(), on driver attach. encap_init() will be called + * on AF_INET{,6} initialization, which happens after driver + * initialization - using LIST_INIT() here can nuke encap_attach() + * from drivers. + */ + LIST_INIT(&encaptab); +#endif +} + +void +#if __STDC__ +encap4_input(struct mbuf *m, ...) +#else +encap4_input(m, va_alist) + struct mbuf *m; + va_dcl +#endif +{ + int off, proto; + struct ip *ip; + struct sockaddr_in s, d; + const struct ipprotosw *psw; + struct encaptab *ep, *match; + va_list ap; + int prio, matchprio; + + va_start(ap, m); + off = va_arg(ap, int); + proto = va_arg(ap, int); + va_end(ap); + + ip = mtod(m, struct ip *); + + bzero(&s, sizeof(s)); + s.sin_family = AF_INET; + s.sin_len = sizeof(struct sockaddr_in); + s.sin_addr = ip->ip_src; + bzero(&d, sizeof(d)); + d.sin_family = AF_INET; + d.sin_len = sizeof(struct sockaddr_in); + d.sin_addr = ip->ip_dst; + + match = NULL; + matchprio = 0; + LIST_FOREACH(ep, &encaptab, chain) { + if (ep->af != AF_INET) + continue; + if (ep->proto >= 0 && ep->proto != proto) + continue; + if (ep->func) + prio = (*ep->func)(m, off, proto, ep->arg); + else { + /* + * it's inbound traffic, we need to match in reverse + * order + */ + prio = mask_match(ep, (struct sockaddr *)&d, + (struct sockaddr *)&s); + } + + /* + * We prioritize the matches by using bit length of the + * matches. mask_match() and user-supplied matching function + * should return the bit length of the matches (for example, + * if both src/dst are matched for IPv4, 64 should be returned). + * 0 or negative return value means "it did not match". + * + * The question is, since we have two "mask" portion, we + * cannot really define total order between entries. + * For example, which of these should be preferred? + * mask_match() returns 48 (32 + 16) for both of them. + * src=3ffe::/16, dst=3ffe:501::/32 + * src=3ffe:501::/32, dst=3ffe::/16 + * + * We need to loop through all the possible candidates + * to get the best match - the search takes O(n) for + * n attachments (i.e. interfaces). + */ + if (prio <= 0) + continue; + if (prio > matchprio) { + matchprio = prio; + match = ep; + } + } + + if (match) { + /* found a match, "match" has the best one */ + psw = (const struct ipprotosw *)match->psw; + if (psw && psw->pr_input) { + encap_fillarg(m, match); + (*psw->pr_input)(m, off, proto); + } else + m_freem(m); + return; + } + + /* for backward compatibility */ +# ifdef MROUTING +# define COMPATFUNC ipip_input +# endif /*MROUTING*/ + +#ifdef COMPATFUNC + if (proto == IPPROTO_IPV4) { + COMPATFUNC(m, off, proto); + return; + } +#endif + + /* last resort: inject to raw socket */ + rip_input(m, off, proto); +} + +#ifdef INET6 +int +encap6_input(mp, offp, proto) + struct mbuf **mp; + int *offp; + int proto; +{ + struct mbuf *m = *mp; + struct ip6_hdr *ip6; + struct sockaddr_in6 s, d; + const struct ip6protosw *psw; + struct encaptab *ep, *match; + int prio, matchprio; + + ip6 = mtod(m, struct ip6_hdr *); + + bzero(&s, sizeof(s)); + s.sin6_family = AF_INET6; + s.sin6_len = sizeof(struct sockaddr_in6); + s.sin6_addr = ip6->ip6_src; + bzero(&d, sizeof(d)); + d.sin6_family = AF_INET6; + d.sin6_len = sizeof(struct sockaddr_in6); + d.sin6_addr = ip6->ip6_dst; + + match = NULL; + matchprio = 0; + LIST_FOREACH(ep, &encaptab, chain) { + if (ep->af != AF_INET6) + continue; + if (ep->proto >= 0 && ep->proto != proto) + continue; + if (ep->func) + prio = (*ep->func)(m, *offp, proto, ep->arg); + else { + /* + * it's inbound traffic, we need to match in reverse + * order + */ + prio = mask_match(ep, (struct sockaddr *)&d, + (struct sockaddr *)&s); + } + + /* see encap4_input() for issues here */ + if (prio <= 0) + continue; + if (prio > matchprio) { + matchprio = prio; + match = ep; + } + } + + if (match) { + /* found a match */ + psw = (const struct ip6protosw *)match->psw; + if (psw && psw->pr_input) { + encap_fillarg(m, match); + return (*psw->pr_input)(mp, offp, proto); + } else { + m_freem(m); + return IPPROTO_DONE; + } + } + + /* last resort: inject to raw socket */ + return rip6_input(mp, offp, proto); +} +#endif + +static void +encap_add(ep) + struct encaptab *ep; +{ + + LIST_INSERT_HEAD(&encaptab, ep, chain); +} + +/* + * sp (src ptr) is always my side, and dp (dst ptr) is always remote side. + * length of mask (sm and dm) is assumed to be same as sp/dp. + * Return value will be necessary as input (cookie) for encap_detach(). + */ +const struct encaptab * +encap_attach(af, proto, sp, sm, dp, dm, psw, arg) + int af; + int proto; + const struct sockaddr *sp, *sm; + const struct sockaddr *dp, *dm; + const struct protosw *psw; + void *arg; +{ + struct encaptab *ep; + int error; + int s; + + s = splnet(); + /* sanity check on args */ + if (sp->sa_len > sizeof(ep->src) || dp->sa_len > sizeof(ep->dst)) { + error = EINVAL; + goto fail; + } + if (sp->sa_len != dp->sa_len) { + error = EINVAL; + goto fail; + } + if (af != sp->sa_family || af != dp->sa_family) { + error = EINVAL; + goto fail; + } + + /* check if anyone have already attached with exactly same config */ + LIST_FOREACH(ep, &encaptab, chain) { + if (ep->af != af) + continue; + if (ep->proto != proto) + continue; + if (ep->src.ss_len != sp->sa_len || + bcmp(&ep->src, sp, sp->sa_len) != 0 || + bcmp(&ep->srcmask, sm, sp->sa_len) != 0) + continue; + if (ep->dst.ss_len != dp->sa_len || + bcmp(&ep->dst, dp, dp->sa_len) != 0 || + bcmp(&ep->dstmask, dm, dp->sa_len) != 0) + continue; + + error = EEXIST; + goto fail; + } + + ep = malloc(sizeof(*ep), M_NETADDR, M_NOWAIT); /*XXX*/ + if (ep == NULL) { + error = ENOBUFS; + goto fail; + } + bzero(ep, sizeof(*ep)); + + ep->af = af; + ep->proto = proto; + bcopy(sp, &ep->src, sp->sa_len); + bcopy(sm, &ep->srcmask, sp->sa_len); + bcopy(dp, &ep->dst, dp->sa_len); + bcopy(dm, &ep->dstmask, dp->sa_len); + ep->psw = psw; + ep->arg = arg; + + encap_add(ep); + + error = 0; + splx(s); + return ep; + +fail: + splx(s); + return NULL; +} + +const struct encaptab * +encap_attach_func(af, proto, func, psw, arg) + int af; + int proto; + int (*func) __P((const struct mbuf *, int, int, void *)); + const struct protosw *psw; + void *arg; +{ + struct encaptab *ep; + int error; + int s; + + s = splnet(); + /* sanity check on args */ + if (!func) { + error = EINVAL; + goto fail; + } + + ep = malloc(sizeof(*ep), M_NETADDR, M_NOWAIT); /*XXX*/ + if (ep == NULL) { + error = ENOBUFS; + goto fail; + } + bzero(ep, sizeof(*ep)); + + ep->af = af; + ep->proto = proto; + ep->func = func; + ep->psw = psw; + ep->arg = arg; + + encap_add(ep); + + error = 0; + splx(s); + return ep; + +fail: + splx(s); + return NULL; +} + +int +encap_detach(cookie) + const struct encaptab *cookie; +{ + const struct encaptab *ep = cookie; + struct encaptab *p; + + LIST_FOREACH(p, &encaptab, chain) { + if (p == ep) { + LIST_REMOVE(p, chain); + free(p, M_NETADDR); /*XXX*/ + return 0; + } + } + + return EINVAL; +} + +static int +mask_match(ep, sp, dp) + const struct encaptab *ep; + const struct sockaddr *sp; + const struct sockaddr *dp; +{ + struct sockaddr_storage s; + struct sockaddr_storage d; + int i; + const u_int8_t *p, *q; + u_int8_t *r; + int matchlen; + + if (sp->sa_len > sizeof(s) || dp->sa_len > sizeof(d)) + return 0; + if (sp->sa_family != ep->af || dp->sa_family != ep->af) + return 0; + if (sp->sa_len != ep->src.ss_len || dp->sa_len != ep->dst.ss_len) + return 0; + + matchlen = 0; + + p = (const u_int8_t *)sp; + q = (const u_int8_t *)&ep->srcmask; + r = (u_int8_t *)&s; + for (i = 0 ; i < sp->sa_len; i++) { + r[i] = p[i] & q[i]; + /* XXX estimate */ + matchlen += (q[i] ? 8 : 0); + } + + p = (const u_int8_t *)dp; + q = (const u_int8_t *)&ep->dstmask; + r = (u_int8_t *)&d; + for (i = 0 ; i < dp->sa_len; i++) { + r[i] = p[i] & q[i]; + /* XXX rough estimate */ + matchlen += (q[i] ? 8 : 0); + } + + /* need to overwrite len/family portion as we don't compare them */ + s.ss_len = sp->sa_len; + s.ss_family = sp->sa_family; + d.ss_len = dp->sa_len; + d.ss_family = dp->sa_family; + + if (bcmp(&s, &ep->src, ep->src.ss_len) == 0 && + bcmp(&d, &ep->dst, ep->dst.ss_len) == 0) { + return matchlen; + } else + return 0; +} + +static void +encap_fillarg(m, ep) + struct mbuf *m; + const struct encaptab *ep; +{ +#if 0 + m->m_pkthdr.aux = ep->arg; +#else + struct mbuf *n; + + n = m_aux_add(m, AF_INET, IPPROTO_IPV4); + if (n) { + *mtod(n, void **) = ep->arg; + n->m_len = sizeof(void *); + } +#endif +} + +void * +encap_getarg(m) + struct mbuf *m; +{ + void *p; +#if 0 + p = m->m_pkthdr.aux; + m->m_pkthdr.aux = NULL; + return p; +#else + struct mbuf *n; + + p = NULL; + n = m_aux_find(m, AF_INET, IPPROTO_IPV4); + if (n) { + if (n->m_len == sizeof(void *)) + p = *mtod(n, void **); + m_aux_delete(m, n); + } + return p; +#endif +} diff --git a/sys/netinet/ip_encap.h b/sys/netinet/ip_encap.h new file mode 100644 index 0000000..38df6f9 --- /dev/null +++ b/sys/netinet/ip_encap.h @@ -0,0 +1,64 @@ +/* $FreeBSD$ */ +/* $KAME: ip_encap.h,v 1.7 2000/03/25 07:23:37 sumikawa Exp $ */ + +/* + * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _NETINET_IP_ENCAP_H_ +#define _NETINET_IP_ENCAP_H_ + +#ifdef _KERNEL + +struct encaptab { + LIST_ENTRY(encaptab) chain; + int af; + int proto; /* -1: don't care, I'll check myself */ + struct sockaddr_storage src; /* my addr */ + struct sockaddr_storage srcmask; + struct sockaddr_storage dst; /* remote addr */ + struct sockaddr_storage dstmask; + int (*func) __P((const struct mbuf *, int, int, void *)); + const struct protosw *psw; /* only pr_input will be used */ + void *arg; /* passed via m->m_pkthdr.aux */ +}; + +void encap_init __P((void)); +void encap4_input __P((struct mbuf *, ...)); +int encap6_input __P((struct mbuf **, int *, int)); +const struct encaptab *encap_attach __P((int, int, const struct sockaddr *, + const struct sockaddr *, const struct sockaddr *, + const struct sockaddr *, const struct protosw *, void *)); +const struct encaptab *encap_attach_func __P((int, int, + int (*) __P((const struct mbuf *, int, int, void *)), + const struct protosw *, void *)); +int encap_detach __P((const struct encaptab *)); +void *encap_getarg __P((struct mbuf *)); +#endif + +#endif /*_NETINET_IP_ENCAP_H_*/ diff --git a/sys/netinet/ip_fil.c b/sys/netinet/ip_fil.c new file mode 100644 index 0000000..ff80058 --- /dev/null +++ b/sys/netinet/ip_fil.c @@ -0,0 +1,1763 @@ +/* + * Copyright (C) 1993-2000 by Darren Reed. + * + * Redistribution and use in source and binary forms are permitted + * provided that this notice is preserved and due credit is given + * to the original author and the contributors. + */ +#if !defined(lint) +static const char sccsid[] = "@(#)ip_fil.c 2.41 6/5/96 (C) 1993-1995 Darren Reed"; +/*static const char rcsid[] = "@(#)$Id: ip_fil.c,v 2.42.2.14 2000/07/18 13:57:55 darrenr Exp $";*/ +static const char rcsid[] = "@(#)$FreeBSD$"; +#endif + +#ifndef SOLARIS +#define SOLARIS (defined(sun) && (defined(__svr4__) || defined(__SVR4))) +#endif + +#if defined(KERNEL) && !defined(_KERNEL) +# define _KERNEL +#endif +#if defined(_KERNEL) && defined(__FreeBSD_version) && \ + (__FreeBSD_version >= 400000) && !defined(KLD_MODULE) +#include "opt_inet6.h" +#endif +#include <sys/param.h> +#if defined(__NetBSD__) && (NetBSD >= 199905) && !defined(IPFILTER_LKM) && \ + defined(_KERNEL) +# include "opt_ipfilter_log.h" +#endif +#if defined(__FreeBSD__) && !defined(__FreeBSD_version) +# if !defined(_KERNEL) || defined(IPFILTER_LKM) +# include <osreldate.h> +# endif +#endif +#ifndef _KERNEL +# include <stdio.h> +# include <string.h> +# include <stdlib.h> +# include <ctype.h> +# include <fcntl.h> +#endif +#include <sys/errno.h> +#include <sys/types.h> +#include <sys/file.h> +#if __FreeBSD_version >= 220000 && defined(_KERNEL) +# include <sys/fcntl.h> +# include <sys/filio.h> +#else +# include <sys/ioctl.h> +#endif +#include <sys/time.h> +#ifdef _KERNEL +# include <sys/systm.h> +#endif +#include <sys/uio.h> +#if !SOLARIS +# if (NetBSD > 199609) || (OpenBSD > 199603) || (__FreeBSD_version >= 300000) +# include <sys/dirent.h> +# else +# include <sys/dir.h> +# endif +# include <sys/mbuf.h> +#else +# include <sys/filio.h> +#endif +#include <sys/protosw.h> +#include <sys/socket.h> + +#include <net/if.h> +#ifdef sun +# include <net/af.h> +#endif +#if __FreeBSD_version >= 300000 +# include <net/if_var.h> +# if defined(_KERNEL) && !defined(IPFILTER_LKM) +# include "opt_ipfilter.h" +# endif +#endif +#ifdef __sgi +#include <sys/debug.h> +# ifdef IFF_DRVRLOCK /* IRIX6 */ +#include <sys/hashing.h> +# endif +#endif +#include <net/route.h> +#include <netinet/in.h> +#if !(defined(__sgi) && !defined(IFF_DRVRLOCK)) /* IRIX < 6 */ +# include <netinet/in_var.h> +#endif +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> +#include <netinet/tcp.h> +#include <netinet/udp.h> +#include <netinet/tcpip.h> +#include <netinet/ip_icmp.h> +#ifndef _KERNEL +# include <unistd.h> +# include <syslog.h> +#endif +#include "netinet/ip_compat.h" +#ifdef USE_INET6 +# include <netinet/icmp6.h> +#endif +#include "netinet/ip_fil.h" +#include "netinet/ip_proxy.h" +#include "netinet/ip_nat.h" +#include "netinet/ip_frag.h" +#include "netinet/ip_state.h" +#include "netinet/ip_auth.h" +#if defined(__FreeBSD_version) && (__FreeBSD_version >= 300000) +# include <sys/malloc.h> +#endif +#ifndef MIN +# define MIN(a,b) (((a)<(b))?(a):(b)) +#endif +#if !SOLARIS && defined(_KERNEL) && !defined(__sgi) +# include <sys/kernel.h> +extern int ip_optcopy __P((struct ip *, struct ip *)); +#endif + +#include <machine/in_cksum.h> + +extern struct protosw inetsw[]; + +#ifndef _KERNEL +# include "ipt.h" +static struct ifnet **ifneta = NULL; +static int nifs = 0; +#else +# if (BSD < 199306) || defined(__sgi) +extern int tcp_ttl; +# endif +#endif + +int ipl_unreach = ICMP_UNREACH_FILTER; +u_long ipl_frouteok[2] = {0, 0}; + +static int frzerostats __P((caddr_t)); +#if defined(__NetBSD__) || defined(__OpenBSD__) || (__FreeBSD_version >= 300003) +static int frrequest __P((int, u_long, caddr_t, int)); +#else +static int frrequest __P((int, int, caddr_t, int)); +#endif +#ifdef _KERNEL +static int (*fr_savep) __P((ip_t *, int, void *, int, struct mbuf **)); +static int send_ip __P((ip_t *, fr_info_t *, struct mbuf *)); +# ifdef __sgi +extern kmutex_t ipf_rw; +extern KRWLOCK_T ipf_mutex; +# endif +#else +int ipllog __P((void)); +void init_ifp __P((void)); +# ifdef __sgi +static int no_output __P((struct ifnet *, struct mbuf *, + struct sockaddr *)); +static int write_output __P((struct ifnet *, struct mbuf *, + struct sockaddr *)); +# else +static int no_output __P((struct ifnet *, struct mbuf *, + struct sockaddr *, struct rtentry *)); +static int write_output __P((struct ifnet *, struct mbuf *, + struct sockaddr *, struct rtentry *)); +# endif +#endif +int fr_running = 0; + +#if (__FreeBSD_version >= 300000) && defined(_KERNEL) +struct callout_handle ipfr_slowtimer_ch; +#endif +#if defined(__NetBSD__) && (__NetBSD_Version__ >= 104230000) +# include <sys/callout.h> +struct callout ipfr_slowtimer_ch; +#endif +#if defined(__sgi) && defined(_KERNEL) +toid_t ipfr_slowtimer_ch; +#endif + +#if (_BSDI_VERSION >= 199510) && defined(_KERNEL) +# include <sys/device.h> +# include <sys/conf.h> + +struct cfdriver iplcd = { + NULL, "ipl", NULL, NULL, DV_DULL, 0 +}; + +struct devsw iplsw = { + &iplcd, + iplopen, iplclose, iplread, nowrite, iplioctl, noselect, nommap, + nostrat, nodump, nopsize, 0, + nostop +}; +#endif /* _BSDI_VERSION >= 199510 && _KERNEL */ + +#if defined(__NetBSD__) || defined(__OpenBSD__) || \ + (_BSDI_VERSION >= 199701) || (__FreeBSD_version >= 500011) +# include <sys/conf.h> +# if defined(NETBSD_PF) +# include <net/pfil.h> +/* + * We provide the fr_checkp name just to minimize changes later. + */ +int (*fr_checkp) __P((ip_t *ip, int hlen, void *ifp, int out, mb_t **mp)); +# endif /* NETBSD_PF */ +#endif /* __NetBSD__ */ + +#ifdef _KERNEL +# if defined(IPFILTER_LKM) && !defined(__sgi) +int iplidentify(s) +char *s; +{ + if (strcmp(s, "ipl") == 0) + return 1; + return 0; +} +# endif /* IPFILTER_LKM */ + + +/* + * Try to detect the case when compiling for NetBSD with pseudo-device + */ +# if defined(__NetBSD__) && defined(PFIL_HOOKS) +void +ipfilterattach(count) +int count; +{ + if (iplattach() != 0) + printf("IP Filter failed to attach\n"); +} +# endif + + +int iplattach() +{ + char *defpass; + int s; +# if defined(__sgi) || (defined(NETBSD_PF) && \ + ((__NetBSD_Version__ >= 104200000) || (__FreeBSD_version >= 500011))) + int error = 0; +# endif + + SPL_NET(s); + if (fr_running || (fr_checkp == fr_check)) { + printf("IP Filter: already initialized\n"); + SPL_X(s); + return EBUSY; + } + +# ifdef IPFILTER_LOG + ipflog_init(); +# endif + if (nat_init() == -1) + return -1; + if (fr_stateinit() == -1) + return -1; + if (appr_init() == -1) + return -1; + +# ifdef NETBSD_PF +# if (__NetBSD_Version__ >= 104200000) || (__FreeBSD_version >= 500011) + error = pfil_add_hook((void *)fr_check, PFIL_IN|PFIL_OUT, + &inetsw[ip_protox[IPPROTO_IP]].pr_pfh); + if (error) { +# ifdef USE_INET6 + goto pfil_error; +# else + appr_unload(); + ip_natunload(); + fr_stateunload(); + return error; +# endif + } +# else + pfil_add_hook((void *)fr_check, PFIL_IN|PFIL_OUT); +# endif +# ifdef USE_INET6 + error = pfil_add_hook((void *)fr_check, PFIL_IN|PFIL_OUT, + &inetsw[ip_protox[IPPROTO_IPV6]].pr_pfh); + if (error) { + pfil_remove_hook((void *)fr_check, PFIL_IN|PFIL_OUT, + &inetsw[ip_protox[IPPROTO_IP]].pr_pfh); +pfil_error: + appr_unload(); + ip_natunload(); + fr_stateunload(); + return error; + } +# endif +# endif + +# ifdef __sgi + error = ipfilter_sgi_attach(); + if (error) { + SPL_X(s); + appr_unload(); + ip_natunload(); + fr_stateunload(); + return error; + } +# endif + + bzero((char *)frcache, sizeof(frcache)); + fr_savep = fr_checkp; + fr_checkp = fr_check; + fr_running = 1; + + SPL_X(s); + if (fr_pass & FR_PASS) + defpass = "pass"; + else if (fr_pass & FR_BLOCK) + defpass = "block"; + else + defpass = "no-match -> block"; + + printf("%s initialized. Default = %s all, Logging = %s\n", + ipfilter_version, defpass, +# ifdef IPFILTER_LOG + "enabled"); +# else + "disabled"); +# endif +#ifdef _KERNEL +# if defined(__NetBSD__) && (__NetBSD_Version__ >= 104230000) + callout_init(&ipfr_slowtimer_ch); + callout_reset(&ipfr_slowtimer_ch, hz / 2, ipfr_slowtimer, NULL); +# else +# if (__FreeBSD_version >= 300000) || defined(__sgi) + ipfr_slowtimer_ch = timeout(ipfr_slowtimer, NULL, hz/2); +# else + timeout(ipfr_slowtimer, NULL, hz/2); +# endif +# endif +#endif + return 0; +} + + +/* + * Disable the filter by removing the hooks from the IP input/output + * stream. + */ +int ipldetach() +{ + int s, i = FR_INQUE|FR_OUTQUE; +#if defined(NETBSD_PF) && \ + ((__NetBSD_Version__ >= 104200000) || (__FreeBSD_version >= 500011)) + int error = 0; +#endif + +#ifdef _KERNEL +# if defined(__NetBSD__) && (__NetBSD_Version__ >= 104230000) + callout_stop(&ipfr_slowtimer_ch); +# else +# if (__FreeBSD_version >= 300000) + untimeout(ipfr_slowtimer, NULL, ipfr_slowtimer_ch); +# else +# ifdef __sgi + untimeout(ipfr_slowtimer_ch); +# else + untimeout(ipfr_slowtimer, NULL); +# endif +# endif /* FreeBSD */ +# endif /* NetBSD */ +#endif + SPL_NET(s); + if (!fr_running) + { + printf("IP Filter: not initialized\n"); + SPL_X(s); + return 0; + } + + printf("%s unloaded\n", ipfilter_version); + + fr_checkp = fr_savep; + i = frflush(IPL_LOGIPF, i); + fr_running = 0; + +# ifdef NETBSD_PF +# if ((__NetBSD_Version__ >= 104200000) || (__FreeBSD_version >= 500011)) + error = pfil_remove_hook((void *)fr_check, PFIL_IN|PFIL_OUT, + &inetsw[ip_protox[IPPROTO_IP]].pr_pfh); + if (error) + return error; +# else + pfil_remove_hook((void *)fr_check, PFIL_IN|PFIL_OUT); +# endif +# ifdef USE_INET6 + error = pfil_remove_hook((void *)fr_check, PFIL_IN|PFIL_OUT, + &inetsw[ip_protox[IPPROTO_IPV6]].pr_pfh); + if (error) + return error; +# endif +# endif + +# ifdef __sgi + ipfilter_sgi_detach(); +# endif + + appr_unload(); + ipfr_unload(); + ip_natunload(); + fr_stateunload(); + fr_authunload(); + + SPL_X(s); + return 0; +} +#endif /* _KERNEL */ + + +static int frzerostats(data) +caddr_t data; +{ + friostat_t fio; + int error; + + fr_getstat(&fio); + error = IWCOPYPTR((caddr_t)&fio, data, sizeof(fio)); + if (error) + return EFAULT; + + bzero((char *)frstats, sizeof(*frstats) * 2); + + return 0; +} + + +/* + * Filter ioctl interface. + */ +#ifdef __sgi +int IPL_EXTERN(ioctl)(dev_t dev, int cmd, caddr_t data, int mode +# ifdef _KERNEL + , cred_t *cp, int *rp +# endif +) +#else +int IPL_EXTERN(ioctl)(dev, cmd, data, mode +# if (defined(_KERNEL) && ((_BSDI_VERSION >= 199510) || (BSD >= 199506) || \ + (NetBSD >= 199511) || (__FreeBSD_version >= 220000) || \ + defined(__OpenBSD__))) +, p) +struct proc *p; +# else +) +# endif +dev_t dev; +# if defined(__NetBSD__) || defined(__OpenBSD__) || \ + (_BSDI_VERSION >= 199701) || (__FreeBSD_version >= 300000) +u_long cmd; +# else +int cmd; +# endif +caddr_t data; +int mode; +#endif /* __sgi */ +{ +#if defined(_KERNEL) && !SOLARIS + int s; +#endif + int error = 0, unit = 0, tmp; + +#if (BSD >= 199306) && defined(_KERNEL) + if ((securelevel >= 2) && (mode & FWRITE)) + return EPERM; +#endif +#ifdef _KERNEL + unit = GET_MINOR(dev); + if ((IPL_LOGMAX < unit) || (unit < 0)) + return ENXIO; +#else + unit = dev; +#endif + + SPL_NET(s); + + if (unit == IPL_LOGNAT) { + if (fr_running) + error = nat_ioctl(data, cmd, mode); + else + error = EIO; + SPL_X(s); + return error; + } + if (unit == IPL_LOGSTATE) { + if (fr_running) + error = fr_state_ioctl(data, cmd, mode); + else + error = EIO; + SPL_X(s); + return error; + } + if (unit == IPL_LOGAUTH) { + if (!fr_running) + return EIO; + error = fr_auth_ioctl(data, cmd, NULL, NULL); + SPL_X(s); + return error; + } + + switch (cmd) { + case FIONREAD : +#ifdef IPFILTER_LOG + error = IWCOPY((caddr_t)&iplused[IPL_LOGIPF], (caddr_t)data, + sizeof(iplused[IPL_LOGIPF])); +#endif + break; +#if !defined(IPFILTER_LKM) && defined(_KERNEL) + case SIOCFRENB : + { + u_int enable; + + if (!(mode & FWRITE)) + error = EPERM; + else { + error = IRCOPY(data, (caddr_t)&enable, sizeof(enable)); + if (error) + break; + if (enable) + error = iplattach(); + else + error = ipldetach(); + } + break; + } +#endif + case SIOCSETFF : + if (!(mode & FWRITE)) + error = EPERM; + else + error = IRCOPY(data, (caddr_t)&fr_flags, + sizeof(fr_flags)); + break; + case SIOCGETFF : + error = IWCOPY((caddr_t)&fr_flags, data, sizeof(fr_flags)); + break; + case SIOCINAFR : + case SIOCRMAFR : + case SIOCADAFR : + case SIOCZRLST : + if (!(mode & FWRITE)) + error = EPERM; + else + error = frrequest(unit, cmd, data, fr_active); + break; + case SIOCINIFR : + case SIOCRMIFR : + case SIOCADIFR : + if (!(mode & FWRITE)) + error = EPERM; + else + error = frrequest(unit, cmd, data, 1 - fr_active); + break; + case SIOCSWAPA : + if (!(mode & FWRITE)) + error = EPERM; + else { + bzero((char *)frcache, sizeof(frcache[0]) * 2); + *(u_int *)data = fr_active; + fr_active = 1 - fr_active; + } + break; + case SIOCGETFS : + { + friostat_t fio; + + fr_getstat(&fio); + error = IWCOPYPTR((caddr_t)&fio, data, sizeof(fio)); + if (error) + return EFAULT; + break; + } + case SIOCFRZST : + if (!(mode & FWRITE)) + error = EPERM; + else + error = frzerostats(data); + break; + case SIOCIPFFL : + if (!(mode & FWRITE)) + error = EPERM; + else { + error = IRCOPY(data, (caddr_t)&tmp, sizeof(tmp)); + if (!error) { + tmp = frflush(unit, tmp); + error = IWCOPY((caddr_t)&tmp, data, + sizeof(tmp)); + } + } + break; + case SIOCSTLCK : + error = IRCOPY(data, (caddr_t)&tmp, sizeof(tmp)); + if (!error) { + fr_state_lock = tmp; + fr_nat_lock = tmp; + fr_frag_lock = tmp; + fr_auth_lock = tmp; + } else + error = EFAULT; + break; +#ifdef IPFILTER_LOG + case SIOCIPFFB : + if (!(mode & FWRITE)) + error = EPERM; + else + *(int *)data = ipflog_clear(unit); + break; +#endif /* IPFILTER_LOG */ + case SIOCGFRST : + error = IWCOPYPTR((caddr_t)ipfr_fragstats(), data, + sizeof(ipfrstat_t)); + if (error) + return EFAULT; + break; + case SIOCAUTHW : + case SIOCAUTHR : + if (!(mode & FWRITE)) { + error = EPERM; + break; + } + case SIOCFRSYN : + if (!(mode & FWRITE)) + error = EPERM; + else { +#if defined(_KERNEL) && defined(__sgi) + ipfsync(); +#endif + frsync(); + } + break; + default : + error = EINVAL; + break; + } + SPL_X(s); + return error; +} + + +void fr_forgetifp(ifp) +void *ifp; +{ + register frentry_t *f; + + WRITE_ENTER(&ipf_mutex); + for (f = ipacct[0][fr_active]; (f != NULL); f = f->fr_next) + if (f->fr_ifa == ifp) + f->fr_ifa = (void *)-1; + for (f = ipacct[1][fr_active]; (f != NULL); f = f->fr_next) + if (f->fr_ifa == ifp) + f->fr_ifa = (void *)-1; + for (f = ipfilter[0][fr_active]; (f != NULL); f = f->fr_next) + if (f->fr_ifa == ifp) + f->fr_ifa = (void *)-1; + for (f = ipfilter[1][fr_active]; (f != NULL); f = f->fr_next) + if (f->fr_ifa == ifp) + f->fr_ifa = (void *)-1; +#ifdef USE_INET6 + for (f = ipacct6[0][fr_active]; (f != NULL); f = f->fr_next) + if (f->fr_ifa == ifp) + f->fr_ifa = (void *)-1; + for (f = ipacct6[1][fr_active]; (f != NULL); f = f->fr_next) + if (f->fr_ifa == ifp) + f->fr_ifa = (void *)-1; + for (f = ipfilter6[0][fr_active]; (f != NULL); f = f->fr_next) + if (f->fr_ifa == ifp) + f->fr_ifa = (void *)-1; + for (f = ipfilter6[1][fr_active]; (f != NULL); f = f->fr_next) + if (f->fr_ifa == ifp) + f->fr_ifa = (void *)-1; +#endif + RWLOCK_EXIT(&ipf_mutex); + ip_natsync(ifp); +} + + +static int frrequest(unit, req, data, set) +int unit; +#if defined(__NetBSD__) || defined(__OpenBSD__) || (__FreeBSD_version >= 300003) +u_long req; +#else +int req; +#endif +int set; +caddr_t data; +{ + register frentry_t *fp, *f, **fprev; + register frentry_t **ftail; + frentry_t frd; + frdest_t *fdp; + frgroup_t *fg = NULL; + u_int *p, *pp; + int error = 0, in; + u_int group; + + fp = &frd; + error = IRCOPYPTR(data, (caddr_t)fp, sizeof(*fp)); + if (error) + return EFAULT; + fp->fr_ref = 0; +#if (BSD >= 199306) && defined(_KERNEL) + if ((securelevel > 0) && (fp->fr_func != NULL)) + return EPERM; +#endif + + /* + * Check that the group number does exist and that if a head group + * has been specified, doesn't exist. + */ + if ((req != SIOCZRLST) && fp->fr_grhead && + fr_findgroup((u_int)fp->fr_grhead, fp->fr_flags, unit, set, NULL)) + return EEXIST; + if ((req != SIOCZRLST) && fp->fr_group && + !fr_findgroup((u_int)fp->fr_group, fp->fr_flags, unit, set, NULL)) + return ESRCH; + + in = (fp->fr_flags & FR_INQUE) ? 0 : 1; + + if (unit == IPL_LOGAUTH) + ftail = fprev = &ipauth; + else if ((fp->fr_flags & FR_ACCOUNT) && (fp->fr_v == 4)) + ftail = fprev = &ipacct[in][set]; + else if ((fp->fr_flags & (FR_OUTQUE|FR_INQUE)) && (fp->fr_v == 4)) + ftail = fprev = &ipfilter[in][set]; +#ifdef USE_INET6 + else if ((fp->fr_flags & FR_ACCOUNT) && (fp->fr_v == 6)) + ftail = fprev = &ipacct6[in][set]; + else if ((fp->fr_flags & (FR_OUTQUE|FR_INQUE)) && (fp->fr_v == 6)) + ftail = fprev = &ipfilter6[in][set]; +#endif + else + return ESRCH; + + if ((group = fp->fr_group)) { + if (!(fg = fr_findgroup(group, fp->fr_flags, unit, set, NULL))) + return ESRCH; + ftail = fprev = fg->fg_start; + } + + bzero((char *)frcache, sizeof(frcache[0]) * 2); + + if (*fp->fr_ifname) { + fp->fr_ifa = GETUNIT(fp->fr_ifname, fp->fr_v); + if (!fp->fr_ifa) + fp->fr_ifa = (void *)-1; + } +#if BSD >= 199306 + if (*fp->fr_oifname) { + fp->fr_oifa = GETUNIT(fp->fr_oifname, fp->fr_v); + if (!fp->fr_oifa) + fp->fr_oifa = (void *)-1; + } +#endif + + fdp = &fp->fr_dif; + fp->fr_flags &= ~FR_DUP; + if (*fdp->fd_ifname) { + fdp->fd_ifp = GETUNIT(fdp->fd_ifname, fp->fr_v); + if (!fdp->fd_ifp) + fdp->fd_ifp = (struct ifnet *)-1; + else + fp->fr_flags |= FR_DUP; + } + + fdp = &fp->fr_tif; + if (*fdp->fd_ifname) { + fdp->fd_ifp = GETUNIT(fdp->fd_ifname, fp->fr_v); + if (!fdp->fd_ifp) + fdp->fd_ifp = (struct ifnet *)-1; + } + + /* + * Look for a matching filter rule, but don't include the next or + * interface pointer in the comparison (fr_next, fr_ifa). + */ + for (fp->fr_cksum = 0, p = (u_int *)&fp->fr_ip, pp = &fp->fr_cksum; + p < pp; p++) + fp->fr_cksum += *p; + + for (; (f = *ftail); ftail = &f->fr_next) + if ((fp->fr_cksum == f->fr_cksum) && + !bcmp((char *)&f->fr_ip, (char *)&fp->fr_ip, FR_CMPSIZ)) + break; + + /* + * If zero'ing statistics, copy current to caller and zero. + */ + if (req == SIOCZRLST) { + if (!f) + return ESRCH; + error = IWCOPYPTR((caddr_t)f, data, sizeof(*f)); + if (error) + return EFAULT; + f->fr_hits = 0; + f->fr_bytes = 0; + return 0; + } + + if (!f) { + if (req != SIOCINAFR && req != SIOCINIFR) + while ((f = *ftail)) + ftail = &f->fr_next; + else { + if (fp->fr_hits) { + ftail = fprev; + while (--fp->fr_hits && (f = *ftail)) + ftail = &f->fr_next; + } + f = NULL; + } + } + + if (req == SIOCRMAFR || req == SIOCRMIFR) { + if (!f) + error = ESRCH; + else { + /* + * Only return EBUSY if there is a group list, else + * it's probably just state information referencing + * the rule. + */ + if ((f->fr_ref > 1) && f->fr_grp) + return EBUSY; + if (fg && fg->fg_head) + fg->fg_head->fr_ref--; + if (unit == IPL_LOGAUTH) + return fr_auth_ioctl(data, req, f, ftail); + if (f->fr_grhead) + fr_delgroup((u_int)f->fr_grhead, fp->fr_flags, + unit, set); + fixskip(fprev, f, -1); + *ftail = f->fr_next; + f->fr_next = NULL; + if (f->fr_ref == 0) + KFREE(f); + } + } else { + if (f) + error = EEXIST; + else { + if (unit == IPL_LOGAUTH) + return fr_auth_ioctl(data, req, fp, ftail); + KMALLOC(f, frentry_t *); + if (f != NULL) { + if (fg && fg->fg_head) + fg->fg_head->fr_ref++; + bcopy((char *)fp, (char *)f, sizeof(*f)); + f->fr_ref = 1; + f->fr_hits = 0; + f->fr_next = *ftail; + *ftail = f; + if (req == SIOCINIFR || req == SIOCINAFR) + fixskip(fprev, f, 1); + f->fr_grp = NULL; + if ((group = f->fr_grhead)) + fg = fr_addgroup(group, f, unit, set); + } else + error = ENOMEM; + } + } + return (error); +} + + +#ifdef _KERNEL +/* + * routines below for saving IP headers to buffer + */ +# ifdef __sgi +# ifdef _KERNEL +int IPL_EXTERN(open)(dev_t *pdev, int flags, int devtype, cred_t *cp) +# else +int IPL_EXTERN(open)(dev_t dev, int flags) +# endif +# else +int IPL_EXTERN(open)(dev, flags +# if ((_BSDI_VERSION >= 199510) || (BSD >= 199506) || (NetBSD >= 199511) || \ + (__FreeBSD_version >= 220000) || defined(__OpenBSD__)) && defined(_KERNEL) +, devtype, p) +int devtype; +struct proc *p; +# else +) +# endif +dev_t dev; +int flags; +# endif /* __sgi */ +{ +# if defined(__sgi) && defined(_KERNEL) + u_int min = geteminor(*pdev); +# else + u_int min = GET_MINOR(dev); +# endif + + if (IPL_LOGMAX < min) + min = ENXIO; + else + min = 0; + return min; +} + + +# ifdef __sgi +int IPL_EXTERN(close)(dev_t dev, int flags, int devtype, cred_t *cp) +#else +int IPL_EXTERN(close)(dev, flags +# if ((_BSDI_VERSION >= 199510) || (BSD >= 199506) || (NetBSD >= 199511) || \ + (__FreeBSD_version >= 220000) || defined(__OpenBSD__)) && defined(_KERNEL) +, devtype, p) +int devtype; +struct proc *p; +# else +) +# endif +dev_t dev; +int flags; +# endif /* __sgi */ +{ + u_int min = GET_MINOR(dev); + + if (IPL_LOGMAX < min) + min = ENXIO; + else + min = 0; + return min; +} + +/* + * iplread/ipllog + * both of these must operate with at least splnet() lest they be + * called during packet processing and cause an inconsistancy to appear in + * the filter lists. + */ +# ifdef __sgi +int IPL_EXTERN(read)(dev_t dev, uio_t *uio, cred_t *crp) +# else +# if BSD >= 199306 +int IPL_EXTERN(read)(dev, uio, ioflag) +int ioflag; +# else +int IPL_EXTERN(read)(dev, uio) +# endif +dev_t dev; +register struct uio *uio; +# endif /* __sgi */ +{ +# ifdef IPFILTER_LOG + return ipflog_read(GET_MINOR(dev), uio); +# else + return ENXIO; +# endif +} + + +/* + * send_reset - this could conceivably be a call to tcp_respond(), but that + * requires a large amount of setting up and isn't any more efficient. + */ +int send_reset(oip, fin) +struct ip *oip; +fr_info_t *fin; +{ + struct tcphdr *tcp, *tcp2; + int tlen = 0, hlen; + struct mbuf *m; +#ifdef USE_INET6 + ip6_t *ip6, *oip6 = (ip6_t *)oip; +#endif + ip_t *ip; + + tcp = (struct tcphdr *)fin->fin_dp; + if (tcp->th_flags & TH_RST) + return -1; /* feedback loop */ +# if (BSD < 199306) || defined(__sgi) + m = m_get(M_DONTWAIT, MT_HEADER); +# else + m = m_gethdr(M_DONTWAIT, MT_HEADER); +# endif + if (m == NULL) + return ENOBUFS; + if (m == NULL) + return -1; + + tlen = oip->ip_len - fin->fin_hlen - (tcp->th_off << 2) + + ((tcp->th_flags & TH_SYN) ? 1 : 0) + + ((tcp->th_flags & TH_FIN) ? 1 : 0); + +#ifdef USE_INET6 + hlen = (fin->fin_v == 6) ? sizeof(ip6_t) : sizeof(ip_t); +#else + hlen = sizeof(ip_t); +#endif + m->m_len = sizeof(*tcp2) + hlen; +# if BSD >= 199306 + m->m_data += max_linkhdr; + m->m_pkthdr.len = m->m_len; + m->m_pkthdr.rcvif = (struct ifnet *)0; +# endif + ip = mtod(m, struct ip *); +# ifdef USE_INET6 + ip6 = (ip6_t *)ip; +# endif + bzero((char *)ip, sizeof(*tcp2) + hlen); + tcp2 = (struct tcphdr *)((char *)ip + hlen); + + tcp2->th_sport = tcp->th_dport; + tcp2->th_dport = tcp->th_sport; + if (tcp->th_flags & TH_ACK) { + tcp2->th_seq = tcp->th_ack; + tcp2->th_flags = TH_RST; + } else { + tcp2->th_ack = ntohl(tcp->th_seq); + tcp2->th_ack += tlen; + tcp2->th_ack = htonl(tcp2->th_ack); + tcp2->th_flags = TH_RST|TH_ACK; + } + tcp2->th_off = sizeof(*tcp2) >> 2; +# ifdef USE_INET6 + if (fin->fin_v == 6) { + ip6->ip6_plen = htons(sizeof(struct tcphdr)); + ip6->ip6_nxt = IPPROTO_TCP; + ip6->ip6_src = oip6->ip6_dst; + ip6->ip6_dst = oip6->ip6_src; + tcp2->th_sum = in6_cksum(m, IPPROTO_TCP, + sizeof(*ip6), sizeof(*tcp2)); + return send_ip(oip, fin, m); + } +# endif + ip->ip_p = IPPROTO_TCP; + ip->ip_len = htons(sizeof(struct tcphdr)); + ip->ip_src.s_addr = oip->ip_dst.s_addr; + ip->ip_dst.s_addr = oip->ip_src.s_addr; + tcp2->th_sum = in_cksum(m, hlen + sizeof(*tcp2)); + ip->ip_len = hlen + sizeof(*tcp2); + return send_ip(oip, fin, m); +} + + +static int send_ip(oip, fin, m) +ip_t *oip; +fr_info_t *fin; +struct mbuf *m; +{ + ip_t *ip; + + ip = mtod(m, ip_t *); + + ip->ip_v = fin->fin_v; + if (ip->ip_v == 4) { + ip->ip_hl = (sizeof(*oip) >> 2); + ip->ip_v = IPVERSION; + ip->ip_tos = oip->ip_tos; + ip->ip_id = oip->ip_id; + ip->ip_off = 0; +# if (BSD < 199306) || defined(__sgi) + ip->ip_ttl = tcp_ttl; +# else + ip->ip_ttl = ip_defttl; +# endif + ip->ip_sum = 0; + } +# ifdef USE_INET6 + else if (ip->ip_v == 6) { + ip6_t *ip6 = (ip6_t *)ip; + + ip6->ip6_hlim = 127; + + return ip6_output(m, NULL, NULL, 0, NULL, NULL); + } +# endif +# ifdef IPSEC + m->m_pkthdr.rcvif = NULL; +# endif + return ipfr_fastroute(m, fin, NULL); +} + + +int send_icmp_err(oip, type, fin, dst) +ip_t *oip; +int type; +fr_info_t *fin; +int dst; +{ + int err, hlen = 0, xtra = 0, iclen, ohlen = 0, avail, code; + struct in_addr dst4; + struct icmp *icmp; + struct mbuf *m; + void *ifp; +#ifdef USE_INET6 + ip6_t *ip6, *oip6 = (ip6_t *)oip; + struct in6_addr dst6; +#endif + ip_t *ip; + + if ((type < 0) || (type > ICMP_MAXTYPE)) + return -1; + + code = fin->fin_icode; +#ifdef USE_INET6 + if ((code < 0) || (code > sizeof(icmptoicmp6unreach)/sizeof(int))) + return -1; +#endif + + avail = 0; + m = NULL; + ifp = fin->fin_ifp; + if (fin->fin_v == 4) { + if ((oip->ip_p == IPPROTO_ICMP) && + !(fin->fin_fi.fi_fl & FI_SHORT)) + switch (ntohs(fin->fin_data[0]) >> 8) + { + case ICMP_ECHO : + case ICMP_TSTAMP : + case ICMP_IREQ : + case ICMP_MASKREQ : + break; + default : + return 0; + } + +# if (BSD < 199306) || defined(__sgi) + avail = MLEN; + m = m_get(M_DONTWAIT, MT_HEADER); +# else + avail = MHLEN; + m = m_gethdr(M_DONTWAIT, MT_HEADER); +# endif + if (m == NULL) + return ENOBUFS; + + if (dst == 0) { + if (fr_ifpaddr(4, ifp, &dst4) == -1) + return -1; + } else + dst4.s_addr = oip->ip_dst.s_addr; + + hlen = sizeof(ip_t); + ohlen = oip->ip_hl << 2; + xtra = 8; + } + +#ifdef USE_INET6 + else if (fin->fin_v == 6) { + hlen = sizeof(ip6_t); + ohlen = sizeof(ip6_t); + type = icmptoicmp6types[type]; + if (type == ICMP6_DST_UNREACH) + code = icmptoicmp6unreach[code]; + + MGETHDR(m, M_DONTWAIT, MT_HEADER); + if (!m) + return ENOBUFS; + + MCLGET(m, M_DONTWAIT); + if ((m->m_flags & M_EXT) == 0) { + m_freem(m); + return ENOBUFS; + } +# ifdef M_TRAILINGSPACE + m->m_len = 0; + avail = M_TRAILINGSPACE(m); +# else + avail = MCLBYTES; +# endif + xtra = MIN(ntohs(oip6->ip6_plen) + sizeof(ip6_t), + avail - hlen - sizeof(*icmp) - max_linkhdr); + if (dst == 0) { + if (fr_ifpaddr(6, ifp, (struct in_addr *)&dst6) == -1) + return -1; + } else + dst6 = oip6->ip6_dst; + } +#endif + + iclen = hlen + sizeof(*icmp); +# if BSD >= 199306 + avail -= (max_linkhdr + iclen); + m->m_data += max_linkhdr; + m->m_pkthdr.rcvif = (struct ifnet *)0; + if (xtra > avail) + xtra = avail; + iclen += xtra; + m->m_pkthdr.len = iclen; +#else + avail -= (m->m_off + iclen); + if (xtra > avail) + xtra = avail; + iclen += xtra; +#endif + m->m_len = iclen; + ip = mtod(m, ip_t *); + icmp = (struct icmp *)((char *)ip + hlen); + bzero((char *)ip, iclen); + + icmp->icmp_type = type; + icmp->icmp_code = fin->fin_icode; + icmp->icmp_cksum = 0; +#ifdef icmp_nextmtu + if (type == ICMP_UNREACH && + fin->fin_icode == ICMP_UNREACH_NEEDFRAG && ifp) + icmp->icmp_nextmtu = htons(((struct ifnet *) ifp)->if_mtu); +#endif + + if (avail) { + bcopy((char *)oip, (char *)&icmp->icmp_ip, MIN(ohlen, avail)); + avail -= MIN(ohlen, avail); + } + +#ifdef USE_INET6 + ip6 = (ip6_t *)ip; + if (fin->fin_v == 6) { + ip6->ip6_flow = 0; + ip6->ip6_plen = htons(iclen - hlen); + ip6->ip6_nxt = IPPROTO_ICMPV6; + ip6->ip6_hlim = 0; + ip6->ip6_src = dst6; + ip6->ip6_dst = oip6->ip6_src; + if (avail) + bcopy((char *)oip + ohlen, + (char *)&icmp->icmp_ip + ohlen, avail); + icmp->icmp_cksum = in6_cksum(m, IPPROTO_ICMPV6, + sizeof(*ip6), iclen - hlen); + } else +#endif + { + ip->ip_src.s_addr = dst4.s_addr; + ip->ip_dst.s_addr = oip->ip_src.s_addr; + + if (avail > 8) + avail = 8; + if (avail) + bcopy((char *)oip + ohlen, + (char *)&icmp->icmp_ip + ohlen, avail); + icmp->icmp_cksum = ipf_cksum((u_short *)icmp, + sizeof(*icmp) + 8); + ip->ip_len = iclen; + ip->ip_p = IPPROTO_ICMP; + } + err = send_ip(oip, fin, m); + return err; +} + + +# if !defined(IPFILTER_LKM) && (__FreeBSD_version < 300000) && !defined(__sgi) +# if (BSD < 199306) +int iplinit __P((void)); + +int +# else +void iplinit __P((void)); + +void +# endif +iplinit() +{ + if (iplattach() != 0) + printf("IP Filter failed to attach\n"); + ip_init(); +} +# endif /* ! __NetBSD__ */ + + +size_t mbufchainlen(m0) +register struct mbuf *m0; +{ + register size_t len = 0; + + for (; m0; m0 = m0->m_next) + len += m0->m_len; + return len; +} + + +int ipfr_fastroute(m0, fin, fdp) +struct mbuf *m0; +fr_info_t *fin; +frdest_t *fdp; +{ + register struct ip *ip, *mhip; + register struct mbuf *m = m0; + register struct route *ro; + int len, off, error = 0, hlen, code; + struct ifnet *ifp, *sifp; + struct sockaddr_in *dst; + struct route iproute; + frentry_t *fr; + + hlen = fin->fin_hlen; + ip = mtod(m0, struct ip *); + +#ifdef USE_INET6 + if (ip->ip_v == 6) { + /* + * currently "to <if>" and "to <if>:ip#" are not supported + * for IPv6 + */ + return ip6_output(m0, NULL, NULL, 0, NULL, NULL); + } +#endif + /* + * Route packet. + */ + ro = &iproute; + bzero((caddr_t)ro, sizeof (*ro)); + dst = (struct sockaddr_in *)&ro->ro_dst; + dst->sin_family = AF_INET; + + fr = fin->fin_fr; + if (fdp) + ifp = fdp->fd_ifp; + else { + ifp = fin->fin_ifp; + dst->sin_addr = ip->ip_dst; + } + + /* + * In case we're here due to "to <if>" being used with "keep state", + * check that we're going in the correct direction. + */ + if ((fr != NULL) && (fin->fin_rev != 0)) { + if ((ifp != NULL) && (fdp == &fr->fr_tif)) + return -1; + dst->sin_addr = ip->ip_dst; + } else if (fdp) + dst->sin_addr = fdp->fd_ip.s_addr ? fdp->fd_ip : ip->ip_dst; + +# if BSD >= 199306 + dst->sin_len = sizeof(*dst); +# endif +# if (BSD >= 199306) && !defined(__NetBSD__) && !defined(__bsdi__) && \ + !defined(__OpenBSD__) +# ifdef RTF_CLONING + rtalloc_ign(ro, RTF_CLONING); +# else + rtalloc_ign(ro, RTF_PRCLONING); +# endif +# else + rtalloc(ro); +# endif + if (!ifp) { + if (!fr || !(fr->fr_flags & FR_FASTROUTE)) { + error = -2; + goto bad; + } + if (ro->ro_rt == 0 || (ifp = ro->ro_rt->rt_ifp) == 0) { + if (in_localaddr(ip->ip_dst)) + error = EHOSTUNREACH; + else + error = ENETUNREACH; + goto bad; + } + if (ro->ro_rt->rt_flags & RTF_GATEWAY) + dst = (struct sockaddr_in *)&ro->ro_rt->rt_gateway; + } + if (ro->ro_rt) + ro->ro_rt->rt_use++; + + /* + * For input packets which are being "fastrouted", they won't + * go back through output filtering and miss their chance to get + * NAT'd and counted. + */ + fin->fin_ifp = ifp; + if (fin->fin_out == 0) { + fin->fin_out = 1; + if ((fin->fin_fr = ipacct[1][fr_active]) && + (fr_scanlist(FR_NOMATCH, ip, fin, m) & FR_ACCOUNT)) { + ATOMIC_INCL(frstats[1].fr_acct); + } + fin->fin_fr = NULL; + if (!fr || !(fr->fr_flags & FR_RETMASK)) + (void) fr_checkstate(ip, fin); + (void) ip_natout(ip, fin); + } else + ip->ip_sum = 0; + /* + * If small enough for interface, can just send directly. + */ + if (ip->ip_len <= ifp->if_mtu) { +# if BSD >= 199306 + int i = 0; + + if ((m->m_flags & M_EXT) && MEXT_IS_REF(m)) + i = 1; +# endif +# ifndef sparc +# ifndef __FreeBSD__ + ip->ip_id = htons(ip->ip_id); +# endif + ip->ip_len = htons(ip->ip_len); + ip->ip_off = htons(ip->ip_off); +# endif + if (!ip->ip_sum) + ip->ip_sum = in_cksum(m, hlen); +# if BSD >= 199306 + error = (*ifp->if_output)(ifp, m, (struct sockaddr *)dst, + ro->ro_rt); + if (i) { +# ifndef __FreeBSD__ + ip->ip_id = ntohs(ip->ip_id); +# endif + ip->ip_len = ntohs(ip->ip_len); + ip->ip_off = ntohs(ip->ip_off); + } +# else + error = (*ifp->if_output)(ifp, m, (struct sockaddr *)dst); +# endif + goto done; + } + /* + * Too large for interface; fragment if possible. + * Must be able to put at least 8 bytes per fragment. + */ + if (ip->ip_off & IP_DF) { + error = EMSGSIZE; + goto bad; + } + len = (ifp->if_mtu - hlen) &~ 7; + if (len < 8) { + error = EMSGSIZE; + goto bad; + } + + { + int mhlen, firstlen = len; + struct mbuf **mnext = &m->m_act; + + /* + * Loop through length of segment after first fragment, + * make new header and copy data of each part and link onto chain. + */ + m0 = m; + mhlen = sizeof (struct ip); + for (off = hlen + len; off < ip->ip_len; off += len) { +# ifdef MGETHDR + MGETHDR(m, M_DONTWAIT, MT_HEADER); +# else + MGET(m, M_DONTWAIT, MT_HEADER); +# endif + if (m == 0) { + error = ENOBUFS; + goto bad; + } +# if BSD >= 199306 + m->m_data += max_linkhdr; +# else + m->m_off = MMAXOFF - hlen; +# endif + mhip = mtod(m, struct ip *); + bcopy((char *)ip, (char *)mhip, sizeof(*ip)); + if (hlen > sizeof (struct ip)) { + mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip); + mhip->ip_hl = mhlen >> 2; + } + m->m_len = mhlen; + mhip->ip_off = ((off - hlen) >> 3) + (ip->ip_off & ~IP_MF); + if (ip->ip_off & IP_MF) + mhip->ip_off |= IP_MF; + if (off + len >= ip->ip_len) + len = ip->ip_len - off; + else + mhip->ip_off |= IP_MF; + mhip->ip_len = htons((u_short)(len + mhlen)); + m->m_next = m_copy(m0, off, len); + if (m->m_next == 0) { + error = ENOBUFS; /* ??? */ + goto sendorfree; + } +# if BSD >= 199306 + m->m_pkthdr.len = mhlen + len; + m->m_pkthdr.rcvif = NULL; +# endif +# ifndef sparc + mhip->ip_off = htons((u_short)mhip->ip_off); +# endif + mhip->ip_sum = 0; + mhip->ip_sum = in_cksum(m, mhlen); + *mnext = m; + mnext = &m->m_act; + } + /* + * Update first fragment by trimming what's been copied out + * and updating header, then send each fragment (in order). + */ + m_adj(m0, hlen + firstlen - ip->ip_len); + ip->ip_len = htons((u_short)(hlen + firstlen)); + ip->ip_off = htons((u_short)(ip->ip_off | IP_MF)); + ip->ip_sum = 0; + ip->ip_sum = in_cksum(m0, hlen); +sendorfree: + for (m = m0; m; m = m0) { + m0 = m->m_act; + m->m_act = 0; + if (error == 0) +# if BSD >= 199306 + error = (*ifp->if_output)(ifp, m, + (struct sockaddr *)dst, ro->ro_rt); +# else + error = (*ifp->if_output)(ifp, m, + (struct sockaddr *)dst); +# endif + else + m_freem(m); + } + } +done: + if (!error) + ipl_frouteok[0]++; + else + ipl_frouteok[1]++; + + if (ro->ro_rt) + RTFREE(ro->ro_rt); + return 0; +bad: + if (error == EMSGSIZE) { + sifp = fin->fin_ifp; + code = fin->fin_icode; + fin->fin_icode = ICMP_UNREACH_NEEDFRAG; + fin->fin_ifp = ifp; + (void) send_icmp_err(ip, ICMP_UNREACH, fin, 1); + fin->fin_ifp = sifp; + fin->fin_icode = code; + } + m_freem(m); + goto done; +} + + +int fr_verifysrc(ipa, ifp) +struct in_addr ipa; +void *ifp; +{ + struct sockaddr_in *dst; + struct route iproute; + + bzero((char *)&iproute, sizeof(iproute)); + dst = (struct sockaddr_in *)&iproute.ro_dst; + dst->sin_family = AF_INET; + dst->sin_addr = ipa; +# if (BSD >= 199306) && !defined(__NetBSD__) && !defined(__bsdi__) && \ + !defined(__OpenBSD__) +# ifdef RTF_CLONING + rtalloc_ign(&iproute, RTF_CLONING); +# else + rtalloc_ign(&iproute, RTF_PRCLONING); +# endif +# else + rtalloc(&iproute); +# endif + if (iproute.ro_rt == NULL) + return 0; + return (ifp == iproute.ro_rt->rt_ifp); +} + +#else /* #ifdef _KERNEL */ + + +# ifdef __sgi +static int no_output __P((struct ifnet *ifp, struct mbuf *m, + struct sockaddr *s)) +# else +static int no_output __P((struct ifnet *ifp, struct mbuf *m, + struct sockaddr *s, struct rtentry *rt)) +# endif +{ + return 0; +} + + +# ifdef __STDC__ +# ifdef __sgi +static int write_output __P((struct ifnet *ifp, struct mbuf *m, + struct sockaddr *s)) +# else +static int write_output __P((struct ifnet *ifp, struct mbuf *m, + struct sockaddr *s, struct rtentry *rt)) +# endif +{ + ip_t *ip = (ip_t *)m; +# else +static int write_output(ifp, ip) +struct ifnet *ifp; +ip_t *ip; +{ +# endif + char fname[32]; + int fd; + +# if (defined(NetBSD) && (NetBSD <= 1991011) && (NetBSD >= 199606)) || \ + (defined(OpenBSD) && (OpenBSD >= 199603)) + sprintf(fname, "/tmp/%s", ifp->if_xname); +# else + sprintf(fname, "/tmp/%s%d", ifp->if_name, ifp->if_unit); +# endif + fd = open(fname, O_WRONLY|O_APPEND); + if (fd == -1) { + perror("open"); + return -1; + } + write(fd, (char *)ip, ntohs(ip->ip_len)); + close(fd); + return 0; +} + + +struct ifnet *get_unit(name, v) +char *name; +int v; +{ + struct ifnet *ifp, **ifa; +# if (defined(NetBSD) && (NetBSD <= 1991011) && (NetBSD >= 199606)) || \ + (defined(OpenBSD) && (OpenBSD >= 199603)) + for (ifa = ifneta; ifa && (ifp = *ifa); ifa++) { + if (!strcmp(name, ifp->if_xname)) + return ifp; + } +# else + char ifname[32], *s; + + for (ifa = ifneta; ifa && (ifp = *ifa); ifa++) { + (void) sprintf(ifname, "%s%d", ifp->if_name, ifp->if_unit); + if (!strcmp(name, ifname)) + return ifp; + } +# endif + + if (!ifneta) { + ifneta = (struct ifnet **)malloc(sizeof(ifp) * 2); + if (!ifneta) + return NULL; + ifneta[1] = NULL; + ifneta[0] = (struct ifnet *)calloc(1, sizeof(*ifp)); + if (!ifneta[0]) { + free(ifneta); + return NULL; + } + nifs = 1; + } else { + nifs++; + ifneta = (struct ifnet **)realloc(ifneta, + (nifs + 1) * sizeof(*ifa)); + if (!ifneta) { + nifs = 0; + return NULL; + } + ifneta[nifs] = NULL; + ifneta[nifs - 1] = (struct ifnet *)malloc(sizeof(*ifp)); + if (!ifneta[nifs - 1]) { + nifs--; + return NULL; + } + } + ifp = ifneta[nifs - 1]; + +# if (defined(NetBSD) && (NetBSD <= 1991011) && (NetBSD >= 199606)) || \ + (defined(OpenBSD) && (OpenBSD >= 199603)) + strncpy(ifp->if_xname, name, sizeof(ifp->if_xname)); +# else + for (s = name; *s && !isdigit(*s); s++) + ; + if (*s && isdigit(*s)) { + ifp->if_unit = atoi(s); + ifp->if_name = (char *)malloc(s - name + 1); + strncpy(ifp->if_name, name, s - name); + ifp->if_name[s - name] = '\0'; + } else { + ifp->if_name = strdup(name); + ifp->if_unit = -1; + } +# endif + ifp->if_output = no_output; + return ifp; +} + + + +void init_ifp() +{ + struct ifnet *ifp, **ifa; + char fname[32]; + int fd; + +# if (defined(NetBSD) && (NetBSD <= 1991011) && (NetBSD >= 199606)) || \ + (defined(OpenBSD) && (OpenBSD >= 199603)) + for (ifa = ifneta; ifa && (ifp = *ifa); ifa++) { + ifp->if_output = write_output; + sprintf(fname, "/tmp/%s", ifp->if_xname); + fd = open(fname, O_WRONLY|O_CREAT|O_EXCL|O_TRUNC, 0600); + if (fd == -1) + perror("open"); + else + close(fd); + } +# else + + for (ifa = ifneta; ifa && (ifp = *ifa); ifa++) { + ifp->if_output = write_output; + sprintf(fname, "/tmp/%s%d", ifp->if_name, ifp->if_unit); + fd = open(fname, O_WRONLY|O_CREAT|O_EXCL|O_TRUNC, 0600); + if (fd == -1) + perror("open"); + else + close(fd); + } +# endif +} + + +int ipfr_fastroute(ip, fin, fdp) +ip_t *ip; +fr_info_t *fin; +frdest_t *fdp; +{ + struct ifnet *ifp = fdp->fd_ifp; + + if (!ifp) + return 0; /* no routing table out here */ + + ip->ip_len = htons((u_short)ip->ip_len); + ip->ip_off = htons((u_short)(ip->ip_off | IP_MF)); + ip->ip_sum = 0; +#ifdef __sgi + (*ifp->if_output)(ifp, (void *)ip, NULL); +#else + (*ifp->if_output)(ifp, (void *)ip, NULL, 0); +#endif + return 0; +} + + +int ipllog __P((void)) +{ + verbose("l"); + return 0; +} + + +int send_reset(ip, ifp) +ip_t *ip; +struct ifnet *ifp; +{ + verbose("- TCP RST sent\n"); + return 0; +} + + +int icmp_error(ip, ifp) +ip_t *ip; +struct ifnet *ifp; +{ + verbose("- TCP RST sent\n"); + return 0; +} + + +void frsync() +{ + return; +} +#endif /* _KERNEL */ diff --git a/sys/netinet/ip_fil.h b/sys/netinet/ip_fil.h new file mode 100644 index 0000000..a960349 --- /dev/null +++ b/sys/netinet/ip_fil.h @@ -0,0 +1,634 @@ +/* + * Copyright (C) 1993-2000 by Darren Reed. + * + * Redistribution and use in source and binary forms are permitted + * provided that this notice is preserved and due credit is given + * to the original author and the contributors. + * + * @(#)ip_fil.h 1.35 6/5/96 + * $Id: ip_fil.h,v 2.29.2.4 2000/11/12 11:54:53 darrenr Exp $ + * $FreeBSD$ + */ + +#ifndef __IP_FIL_H__ +#define __IP_FIL_H__ + +/* + * Pathnames for various IP Filter control devices. Used by LKM + * and userland, so defined here. + */ +#define IPNAT_NAME "/dev/ipnat" +#define IPSTATE_NAME "/dev/ipstate" +#define IPAUTH_NAME "/dev/ipauth" + +#ifndef SOLARIS +# define SOLARIS (defined(sun) && (defined(__svr4__) || defined(__SVR4))) +#endif + +#if defined(KERNEL) && !defined(_KERNEL) +# define _KERNEL +#endif + +#ifndef __P +# ifdef __STDC__ +# define __P(x) x +# else +# define __P(x) () +# endif +#endif + +#if defined(__STDC__) || defined(__GNUC__) +# define SIOCADAFR _IOW('r', 60, struct frentry *) +# define SIOCRMAFR _IOW('r', 61, struct frentry *) +# define SIOCSETFF _IOW('r', 62, u_int) +# define SIOCGETFF _IOR('r', 63, u_int) +# define SIOCGETFS _IOWR('r', 64, struct friostat *) +# define SIOCIPFFL _IOWR('r', 65, int) +# define SIOCIPFFB _IOR('r', 66, int) +# define SIOCADIFR _IOW('r', 67, struct frentry *) +# define SIOCRMIFR _IOW('r', 68, struct frentry *) +# define SIOCSWAPA _IOR('r', 69, u_int) +# define SIOCINAFR _IOW('r', 70, struct frentry *) +# define SIOCINIFR _IOW('r', 71, struct frentry *) +# define SIOCFRENB _IOW('r', 72, u_int) +# define SIOCFRSYN _IOW('r', 73, u_int) +# define SIOCFRZST _IOWR('r', 74, struct friostat *) +# define SIOCZRLST _IOWR('r', 75, struct frentry *) +# define SIOCAUTHW _IOWR('r', 76, struct fr_info *) +# define SIOCAUTHR _IOWR('r', 77, struct fr_info *) +# define SIOCATHST _IOWR('r', 78, struct fr_authstat *) +# define SIOCSTLCK _IOWR('r', 79, u_int) +# define SIOCSTPUT _IOWR('r', 80, struct ipstate_save *) +# define SIOCSTGET _IOWR('r', 81, struct ipstate_save *) +# define SIOCSTGSZ _IOWR('r', 82, struct natget *) +# define SIOCGFRST _IOWR('r', 83, struct ipfrstat *) +#else +# define SIOCADAFR _IOW(r, 60, struct frentry *) +# define SIOCRMAFR _IOW(r, 61, struct frentry *) +# define SIOCSETFF _IOW(r, 62, u_int) +# define SIOCGETFF _IOR(r, 63, u_int) +# define SIOCGETFS _IOWR(r, 64, struct friostat *) +# define SIOCIPFFL _IOWR(r, 65, int) +# define SIOCIPFFB _IOR(r, 66, int) +# define SIOCADIFR _IOW(r, 67, struct frentry *) +# define SIOCRMIFR _IOW(r, 68, struct frentry *) +# define SIOCSWAPA _IOR(r, 69, u_int) +# define SIOCINAFR _IOW(r, 70, struct frentry *) +# define SIOCINIFR _IOW(r, 71, struct frentry *) +# define SIOCFRENB _IOW(r, 72, u_int) +# define SIOCFRSYN _IOW(r, 73, u_int) +# define SIOCFRZST _IOWR(r, 74, struct friostat *) +# define SIOCZRLST _IOWR(r, 75, struct frentry *) +# define SIOCAUTHW _IOWR(r, 76, struct fr_info *) +# define SIOCAUTHR _IOWR(r, 77, struct fr_info *) +# define SIOCATHST _IOWR(r, 78, struct fr_authstat *) +# define SIOCSTLCK _IOWR(r, 79, u_int) +# define SIOCSTPUT _IOWR(r, 80, struct ipstate_save *) +# define SIOCSTGET _IOWR(r, 81, struct ipstate_save *) +# define SIOCSTGSZ _IOWR(r, 82, struct natget *) +# define SIOCGFRST _IOWR(r, 83, struct ipfrstat *) +#endif +#define SIOCADDFR SIOCADAFR +#define SIOCDELFR SIOCRMAFR +#define SIOCINSFR SIOCINAFR + + +typedef struct fr_ip { + u_32_t fi_v:4; /* IP version */ + u_32_t fi_fl:4; /* packet flags */ + u_32_t fi_tos:8; /* IP packet TOS */ + u_32_t fi_ttl:8; /* IP packet TTL */ + u_32_t fi_p:8; /* IP packet protocol */ + union i6addr fi_src; /* source address from packet */ + union i6addr fi_dst; /* destination address from packet */ + u_32_t fi_optmsk; /* bitmask composed from IP options */ + u_short fi_secmsk; /* bitmask composed from IP security options */ + u_short fi_auth; /* authentication code from IP sec. options */ +} fr_ip_t; + +#define FI_OPTIONS (FF_OPTIONS >> 24) +#define FI_TCPUDP (FF_TCPUDP >> 24) /* TCP/UCP implied comparison*/ +#define FI_FRAG (FF_FRAG >> 24) +#define FI_SHORT (FF_SHORT >> 24) +#define FI_CMP (FI_OPTIONS|FI_TCPUDP|FI_SHORT) + +#define fi_saddr fi_src.in4.s_addr +#define fi_daddr fi_dst.in4.s_addr + + +/* + * These are both used by the state and NAT code to indicate that one port or + * the other should be treated as a wildcard. + */ +#define FI_W_SPORT 0x00000100 +#define FI_W_DPORT 0x00000200 +#define FI_WILDP (FI_W_SPORT|FI_W_DPORT) +#define FI_W_SADDR 0x00000400 +#define FI_W_DADDR 0x00000800 +#define FI_WILDA (FI_W_SADDR|FI_W_DADDR) +#define FI_NEWFR 0x00001000 + +typedef struct fr_info { + void *fin_ifp; /* interface packet is `on' */ + struct fr_ip fin_fi; /* IP Packet summary */ + u_short fin_data[2]; /* TCP/UDP ports, ICMP code/type */ + u_char fin_out; /* in or out ? 1 == out, 0 == in */ + u_char fin_rev; /* state only: 1 = reverse */ + u_short fin_hlen; /* length of IP header in bytes */ + u_char fin_tcpf; /* TCP header flags (SYN, ACK, etc) */ + /* From here on is packet specific */ + u_char fin_icode; /* ICMP error to return */ + u_short fin_rule; /* rule # last matched */ + u_32_t fin_group; /* group number, -1 for none */ + struct frentry *fin_fr; /* last matching rule */ + char *fin_dp; /* start of data past IP header */ + u_short fin_dlen; /* length of data portion of packet */ + u_short fin_id; /* IP packet id field */ + void *fin_mp; /* pointer to pointer to mbuf */ +#if SOLARIS + void *fin_qfm; /* pointer to mblk where pkt starts */ + void *fin_qif; +#endif + u_short fin_plen; + u_short fin_off; +} fr_info_t; + +#define fin_v fin_fi.fi_v + +/* + * Size for compares on fr_info structures + */ +#define FI_CSIZE offsetof(fr_info_t, fin_icode) + +/* + * Size for copying cache fr_info structure + */ +#define FI_COPYSIZE offsetof(fr_info_t, fin_dp) + +typedef struct frdest { + void *fd_ifp; + struct in_addr fd_ip; + char fd_ifname[IFNAMSIZ]; +} frdest_t; + +typedef struct frpcmp { + int frp_cmp; /* data for port comparisons */ + u_short frp_port; /* top port for <> and >< */ + u_short frp_top; /* top port for <> and >< */ +} frpcmp_t; + +typedef struct frtuc { + u_char ftu_tcpfm; /* tcp flags mask */ + u_char ftu_tcpf; /* tcp flags */ + frpcmp_t ftu_src; + frpcmp_t ftu_dst; +} frtuc_t; + +#define ftu_scmp ftu_src.frp_cmp +#define ftu_dcmp ftu_dst.frp_cmp +#define ftu_sport ftu_src.frp_port +#define ftu_dport ftu_dst.frp_port +#define ftu_stop ftu_src.frp_top +#define ftu_dtop ftu_dst.frp_top + +typedef struct frentry { + struct frentry *fr_next; + u_32_t fr_group; /* group to which this rule belongs */ + u_32_t fr_grhead; /* group # which this rule starts */ + struct frentry *fr_grp; + int fr_ref; /* reference count - for grouping */ + void *fr_ifa; +#if BSD >= 199306 + void *fr_oifa; +#endif + /* + * These are only incremented when a packet matches this rule and + * it is the last match + */ + U_QUAD_T fr_hits; + U_QUAD_T fr_bytes; + /* + * Fields after this may not change whilst in the kernel. + */ + struct fr_ip fr_ip; + struct fr_ip fr_mip; /* mask structure */ + + + u_short fr_icmpm; /* data for ICMP packets (mask) */ + u_short fr_icmp; + + frtuc_t fr_tuc; + u_32_t fr_flags; /* per-rule flags && options (see below) */ + u_int fr_skip; /* # of rules to skip */ + u_int fr_loglevel; /* syslog log facility + priority */ + int (*fr_func) __P((int, ip_t *, fr_info_t *)); /* call this function */ + int fr_sap; /* For solaris only */ + u_char fr_icode; /* return ICMP code */ + char fr_ifname[IFNAMSIZ]; +#if BSD >= 199306 + char fr_oifname[IFNAMSIZ]; +#endif + struct frdest fr_tif; /* "to" interface */ + struct frdest fr_dif; /* duplicate packet interfaces */ + u_int fr_cksum; /* checksum on filter rules for performance */ +} frentry_t; + +#define fr_v fr_ip.fi_v +#define fr_proto fr_ip.fi_p +#define fr_ttl fr_ip.fi_ttl +#define fr_tos fr_ip.fi_tos +#define fr_tcpfm fr_tuc.ftu_tcpfm +#define fr_tcpf fr_tuc.ftu_tcpf +#define fr_scmp fr_tuc.ftu_scmp +#define fr_dcmp fr_tuc.ftu_dcmp +#define fr_dport fr_tuc.ftu_dport +#define fr_sport fr_tuc.ftu_sport +#define fr_stop fr_tuc.ftu_stop +#define fr_dtop fr_tuc.ftu_dtop +#define fr_dst fr_ip.fi_dst.in4 +#define fr_src fr_ip.fi_src.in4 +#define fr_dmsk fr_mip.fi_dst.in4 +#define fr_smsk fr_mip.fi_src.in4 + +#ifndef offsetof +#define offsetof(t,m) (int)((&((t *)0L)->m)) +#endif +#define FR_CMPSIZ (sizeof(struct frentry) - offsetof(frentry_t, fr_ip)) + +/* + * fr_flags + */ +#define FR_BLOCK 0x00001 /* do not allow packet to pass */ +#define FR_PASS 0x00002 /* allow packet to pass */ +#define FR_OUTQUE 0x00004 /* outgoing packets */ +#define FR_INQUE 0x00008 /* ingoing packets */ +#define FR_LOG 0x00010 /* Log */ +#define FR_LOGB 0x00011 /* Log-fail */ +#define FR_LOGP 0x00012 /* Log-pass */ +#define FR_LOGBODY 0x00020 /* Log the body */ +#define FR_LOGFIRST 0x00040 /* Log the first byte if state held */ +#define FR_RETRST 0x00080 /* Return TCP RST packet - reset connection */ +#define FR_RETICMP 0x00100 /* Return ICMP unreachable packet */ +#define FR_FAKEICMP 0x00180 /* Return ICMP unreachable with fake source */ +#define FR_NOMATCH 0x00200 /* no match occured */ +#define FR_ACCOUNT 0x00400 /* count packet bytes */ +#define FR_KEEPFRAG 0x00800 /* keep fragment information */ +#define FR_KEEPSTATE 0x01000 /* keep `connection' state information */ +#define FR_INACTIVE 0x02000 +#define FR_QUICK 0x04000 /* match & stop processing list */ +#define FR_FASTROUTE 0x08000 /* bypass normal routing */ +#define FR_CALLNOW 0x10000 /* call another function (fr_func) if matches */ +#define FR_DUP 0x20000 /* duplicate packet */ +#define FR_LOGORBLOCK 0x40000 /* block the packet if it can't be logged */ +#define FR_NOTSRCIP 0x80000 /* not the src IP# */ +#define FR_NOTDSTIP 0x100000 /* not the dst IP# */ +#define FR_AUTH 0x200000 /* use authentication */ +#define FR_PREAUTH 0x400000 /* require preauthentication */ +#define FR_DONTCACHE 0x800000 /* don't cache the result */ + +#define FR_LOGMASK (FR_LOG|FR_LOGP|FR_LOGB) +#define FR_RETMASK (FR_RETICMP|FR_RETRST|FR_FAKEICMP) + +/* + * These correspond to #define's for FI_* and are stored in fr_flags + */ +#define FF_OPTIONS 0x01000000 +#define FF_TCPUDP 0x02000000 +#define FF_FRAG 0x04000000 +#define FF_SHORT 0x08000000 +/* + * recognized flags for SIOCGETFF and SIOCSETFF, and get put in fr_flags + */ +#define FF_LOGPASS 0x10000000 +#define FF_LOGBLOCK 0x20000000 +#define FF_LOGNOMATCH 0x40000000 +#define FF_LOGGING (FF_LOGPASS|FF_LOGBLOCK|FF_LOGNOMATCH) +#define FF_BLOCKNONIP 0x80000000 /* Solaris2 Only */ + +#define FR_NONE 0 +#define FR_EQUAL 1 +#define FR_NEQUAL 2 +#define FR_LESST 3 +#define FR_GREATERT 4 +#define FR_LESSTE 5 +#define FR_GREATERTE 6 +#define FR_OUTRANGE 7 +#define FR_INRANGE 8 + +typedef struct filterstats { + u_long fr_pass; /* packets allowed */ + u_long fr_block; /* packets denied */ + u_long fr_nom; /* packets which don't match any rule */ + u_long fr_short; /* packets which are short */ + u_long fr_ppkl; /* packets allowed and logged */ + u_long fr_bpkl; /* packets denied and logged */ + u_long fr_npkl; /* packets unmatched and logged */ + u_long fr_pkl; /* packets logged */ + u_long fr_skip; /* packets to be logged but buffer full */ + u_long fr_ret; /* packets for which a return is sent */ + u_long fr_acct; /* packets for which counting was performed */ + u_long fr_bnfr; /* bad attempts to allocate fragment state */ + u_long fr_nfr; /* new fragment state kept */ + u_long fr_cfr; /* add new fragment state but complete pkt */ + u_long fr_bads; /* bad attempts to allocate packet state */ + u_long fr_ads; /* new packet state kept */ + u_long fr_chit; /* cached hit */ + u_long fr_tcpbad; /* TCP checksum check failures */ + u_long fr_pull[2]; /* good and bad pullup attempts */ + u_long fr_badsrc; /* source received doesn't match route */ + u_long fr_badttl; /* TTL in packet doesn't reach minimum */ +#if SOLARIS + u_long fr_notdata; /* PROTO/PCPROTO that have no data */ + u_long fr_nodata; /* mblks that have no data */ + u_long fr_bad; /* bad IP packets to the filter */ + u_long fr_notip; /* packets passed through no on ip queue */ + u_long fr_drop; /* packets dropped - no info for them! */ + u_long fr_copy; /* messages copied due to db_ref > 1 */ +#endif + u_long fr_ipv6[2]; /* IPv6 packets in/out */ +} filterstats_t; + +/* + * For SIOCGETFS + */ +typedef struct friostat { + struct filterstats f_st[2]; + struct frentry *f_fin[2]; + struct frentry *f_fout[2]; + struct frentry *f_acctin[2]; + struct frentry *f_acctout[2]; + struct frentry *f_fin6[2]; + struct frentry *f_fout6[2]; + struct frentry *f_acctin6[2]; + struct frentry *f_acctout6[2]; + struct frentry *f_auth; + struct frgroup *f_groups[3][2]; + u_long f_froute[2]; + int f_defpass; /* default pass - from fr_pass */ + char f_active; /* 1 or 0 - active rule set */ + char f_running; /* 1 if running, else 0 */ + char f_logging; /* 1 if enabled, else 0 */ + char f_version[32]; /* version string */ + int f_locks[4]; +} friostat_t; + +typedef struct optlist { + u_short ol_val; + int ol_bit; +} optlist_t; + + +/* + * Group list structure. + */ +typedef struct frgroup { + u_32_t fg_num; + struct frgroup *fg_next; + struct frentry *fg_head; + struct frentry **fg_start; +} frgroup_t; + + +/* + * Log structure. Each packet header logged is prepended by one of these. + * Following this in the log records read from the device will be an ipflog + * structure which is then followed by any packet data. + */ +typedef struct iplog { + u_32_t ipl_magic; + u_int ipl_count; + u_long ipl_sec; + u_long ipl_usec; + size_t ipl_dsize; + struct iplog *ipl_next; +} iplog_t; + +#define IPL_MAGIC 0x49504c4d /* 'IPLM' */ + +typedef struct ipflog { +#if (defined(NetBSD) && (NetBSD <= 1991011) && (NetBSD >= 199603)) || \ + (defined(OpenBSD) && (OpenBSD >= 199603)) + u_char fl_ifname[IFNAMSIZ]; +#else + u_int fl_unit; + u_char fl_ifname[4]; +#endif + u_char fl_plen; /* extra data after hlen */ + u_char fl_hlen; /* length of IP headers saved */ + u_short fl_loglevel; /* syslog log level */ + u_32_t fl_rule; + u_32_t fl_group; + u_32_t fl_flags; + u_32_t fl_lflags; +} ipflog_t; + + +#ifndef ICMP_UNREACH_FILTER +# define ICMP_UNREACH_FILTER 13 +#endif + +#ifndef IPF_LOGGING +# define IPF_LOGGING 0 +#endif +#ifndef IPF_DEFAULT_PASS +# define IPF_DEFAULT_PASS FR_PASS +#endif + +#define IPMINLEN(i, h) ((i)->ip_len >= ((i)->ip_hl * 4 + sizeof(struct h))) +#define IPLLOGSIZE 8192 + +/* + * Device filenames for reading log information. Use ipf on Solaris2 because + * ipl is already a name used by something else. + */ +#ifndef IPL_NAME +# if SOLARIS +# define IPL_NAME "/dev/ipf" +# else +# define IPL_NAME "/dev/ipl" +# endif +#endif +#define IPL_NAT IPNAT_NAME +#define IPL_STATE IPSTATE_NAME +#define IPL_AUTH IPAUTH_NAME + +#define IPL_LOGIPF 0 /* Minor device #'s for accessing logs */ +#define IPL_LOGNAT 1 +#define IPL_LOGSTATE 2 +#define IPL_LOGAUTH 3 +#define IPL_LOGMAX 3 + +#if !defined(CDEV_MAJOR) && defined (__FreeBSD_version) && \ + (__FreeBSD_version >= 220000) +# define CDEV_MAJOR 79 +#endif + +/* + * Post NetBSD 1.2 has the PFIL interface for packet filters. This turns + * on those hooks. We don't need any special mods in non-IP Filter code + * with this! + */ +#if (defined(NetBSD) && (NetBSD > 199609) && (NetBSD <= 1991011)) || \ + (defined(NetBSD1_2) && NetBSD1_2 > 1) || (defined(__FreeBSD_version) && \ + (__FreeBSD_version >= 500011)) +# if (NetBSD >= 199905) +# define PFIL_HOOKS +# endif +# ifdef PFIL_HOOKS +# define NETBSD_PF +# endif +#endif + + +#ifndef _KERNEL +struct ifnet; +extern int fr_check __P((ip_t *, int, void *, int, mb_t **)); +extern int (*fr_checkp) __P((ip_t *, int, void *, int, mb_t **)); +extern int send_reset __P((ip_t *, struct ifnet *)); +extern int icmp_error __P((ip_t *, struct ifnet *)); +extern int ipf_log __P((void)); +extern int ipfr_fastroute __P((ip_t *, fr_info_t *, frdest_t *)); +extern struct ifnet *get_unit __P((char *, int)); +# if defined(__NetBSD__) || defined(__OpenBSD__) || \ + (_BSDI_VERSION >= 199701) || (__FreeBSD_version >= 300000) +extern int iplioctl __P((dev_t, u_long, caddr_t, int)); +# else +extern int iplioctl __P((dev_t, int, caddr_t, int)); +# endif +extern int iplopen __P((dev_t, int)); +extern int iplclose __P((dev_t, int)); +#else /* #ifndef _KERNEL */ +# if defined(__NetBSD__) && defined(PFIL_HOOKS) +extern void ipfilterattach __P((int)); +# endif +extern int iplattach __P((void)); +extern int ipl_enable __P((void)); +extern int ipl_disable __P((void)); +extern void ipflog_init __P((void)); +extern int ipflog_clear __P((minor_t)); +extern int ipflog_read __P((minor_t, struct uio *)); +extern int ipflog __P((u_int, ip_t *, fr_info_t *, mb_t *)); +extern int ipllog __P((int, fr_info_t *, void **, size_t *, int *, int)); +extern int send_icmp_err __P((ip_t *, int, fr_info_t *, int)); +extern int send_reset __P((ip_t *, fr_info_t *)); +# if SOLARIS +extern int fr_check __P((ip_t *, int, void *, int, qif_t *, mb_t **)); +extern int (*fr_checkp) __P((ip_t *, int, void *, + int, qif_t *, mb_t **)); +# if SOLARIS2 >= 7 +extern int iplioctl __P((dev_t, int, intptr_t, int, cred_t *, int *)); +# else +extern int iplioctl __P((dev_t, int, int *, int, cred_t *, int *)); +# endif +extern int iplopen __P((dev_t *, int, int, cred_t *)); +extern int iplclose __P((dev_t, int, int, cred_t *)); +extern int ipfsync __P((void)); +extern int ipfr_fastroute __P((ip_t *, mblk_t *, mblk_t **, + fr_info_t *, frdest_t *)); +extern void copyin_mblk __P((mblk_t *, size_t, size_t, char *)); +extern void copyout_mblk __P((mblk_t *, size_t, size_t, char *)); +extern int fr_qin __P((queue_t *, mblk_t *)); +extern int fr_qout __P((queue_t *, mblk_t *)); +extern int iplread __P((dev_t, struct uio *, cred_t *)); +# else /* SOLARIS */ +extern int fr_check __P((ip_t *, int, void *, int, mb_t **)); +extern int (*fr_checkp) __P((ip_t *, int, void *, int, mb_t **)); +extern int ipfr_fastroute __P((mb_t *, fr_info_t *, frdest_t *)); +extern size_t mbufchainlen __P((mb_t *)); +# ifdef __sgi +# include <sys/cred.h> +extern int iplioctl __P((dev_t, int, caddr_t, int, cred_t *, int *)); +extern int iplopen __P((dev_t *, int, int, cred_t *)); +extern int iplclose __P((dev_t, int, int, cred_t *)); +extern int iplread __P((dev_t, struct uio *, cred_t *)); +extern int ipfsync __P((void)); +extern int ipfilter_sgi_attach __P((void)); +extern void ipfilter_sgi_detach __P((void)); +extern void ipfilter_sgi_intfsync __P((void)); +# else +# ifdef IPFILTER_LKM +extern int iplidentify __P((char *)); +# endif +# if (_BSDI_VERSION >= 199510) || (__FreeBSD_version >= 220000) || \ + (NetBSD >= 199511) || defined(__OpenBSD__) +# if defined(__NetBSD__) || (_BSDI_VERSION >= 199701) || \ + defined(__OpenBSD__) || (__FreeBSD_version >= 300000) +extern int iplioctl __P((dev_t, u_long, caddr_t, int, struct proc *)); +# else +extern int iplioctl __P((dev_t, int, caddr_t, int, struct proc *)); +# endif +extern int iplopen __P((dev_t, int, int, struct proc *)); +extern int iplclose __P((dev_t, int, int, struct proc *)); +# else +# ifndef linux +extern int iplopen __P((dev_t, int)); +extern int iplclose __P((dev_t, int)); +extern int iplioctl __P((dev_t, int, caddr_t, int)); +# else +extern int iplioctl(struct inode *, struct file *, u_int, u_long); +extern int iplopen __P((struct inode *, struct file *)); +extern void iplclose __P((struct inode *, struct file *)); +# endif /* !linux */ +# endif /* (_BSDI_VERSION >= 199510) */ +# if BSD >= 199306 +extern int iplread __P((dev_t, struct uio *, int)); +# else +# ifndef linux +extern int iplread __P((dev_t, struct uio *)); +# else +extern int iplread(struct inode *, struct file *, char *, int); +# endif /* !linux */ +# endif /* BSD >= 199306 */ +# endif /* __ sgi */ +# endif /* SOLARIS */ +#endif /* #ifndef _KERNEL */ + +extern char *memstr __P((char *, char *, int, int)); +extern void fixskip __P((frentry_t **, frentry_t *, int)); +extern int countbits __P((u_32_t)); +extern int ipldetach __P((void)); +extern u_short ipf_cksum __P((u_short *, int)); +extern int ircopyptr __P((void *, void *, size_t)); +extern int iwcopyptr __P((void *, void *, size_t)); + +extern int frflush __P((minor_t, int)); +extern void frsync __P((void)); +extern frgroup_t *fr_addgroup __P((u_32_t, frentry_t *, minor_t, int)); +extern void fr_delgroup __P((u_32_t, u_32_t, minor_t, int)); +extern frgroup_t *fr_findgroup __P((u_32_t, u_32_t, minor_t, int, + frgroup_t ***)); + +extern int fr_copytolog __P((int, char *, int)); +extern void fr_forgetifp __P((void *)); +extern void fr_getstat __P((struct friostat *)); +extern int fr_ifpaddr __P((int, void *, struct in_addr *)); +extern int fr_lock __P((caddr_t, int *)); +extern void fr_makefrip __P((int, ip_t *, fr_info_t *)); +extern u_short fr_tcpsum __P((mb_t *, ip_t *, tcphdr_t *)); +extern int fr_scanlist __P((u_32_t, ip_t *, fr_info_t *, void *)); +extern int fr_tcpudpchk __P((frtuc_t *, fr_info_t *)); +extern int fr_verifysrc __P((struct in_addr, void *)); + +extern int ipl_unreach; +extern int fr_running; +extern u_long ipl_frouteok[2]; +extern int fr_pass; +extern int fr_flags; +extern int fr_active; +extern int fr_chksrc; +extern int fr_minttl; +extern int fr_minttllog; +extern fr_info_t frcache[2]; +extern char ipfilter_version[]; +extern iplog_t **iplh[IPL_LOGMAX+1], *iplt[IPL_LOGMAX+1]; +extern size_t iplused[IPL_LOGMAX + 1]; +extern struct frentry *ipfilter[2][2], *ipacct[2][2]; +#ifdef USE_INET6 +extern struct frentry *ipfilter6[2][2], *ipacct6[2][2]; +extern int icmptoicmp6types[ICMP_MAXTYPE+1]; +extern int icmptoicmp6unreach[ICMP_MAX_UNREACH]; +#endif +extern struct frgroup *ipfgroups[3][2]; +extern struct filterstats frstats[]; + +#endif /* __IP_FIL_H__ */ diff --git a/sys/netinet/ip_flow.c b/sys/netinet/ip_flow.c new file mode 100644 index 0000000..6c9119b --- /dev/null +++ b/sys/netinet/ip_flow.c @@ -0,0 +1,327 @@ +/*- + * Copyright (c) 1998 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by the 3am Software Foundry ("3am"). It was developed by Matt Thomas. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the NetBSD + * Foundation, Inc. and its contributors. + * 4. Neither the name of The NetBSD Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/kernel.h> + +#include <sys/sysctl.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/in_var.h> +#include <netinet/ip_var.h> +#include <netinet/ip_flow.h> + +#define IPFLOW_TIMER (5 * PR_SLOWHZ) +#define IPFLOW_HASHBITS 6 /* should not be a multiple of 8 */ +#define IPFLOW_HASHSIZE (1 << IPFLOW_HASHBITS) +static LIST_HEAD(ipflowhead, ipflow) ipflows[IPFLOW_HASHSIZE]; +static int ipflow_inuse; +#define IPFLOW_MAX 256 + +static int ipflow_active = 0; +SYSCTL_INT(_net_inet_ip, IPCTL_FASTFORWARDING, fastforwarding, CTLFLAG_RW, + &ipflow_active, 0, "Enable flow-based IP forwarding"); + +static MALLOC_DEFINE(M_IPFLOW, "ip_flow", "IP flow"); + +static unsigned +ipflow_hash( + struct in_addr dst, + struct in_addr src, + unsigned tos) +{ + unsigned hash = tos; + int idx; + for (idx = 0; idx < 32; idx += IPFLOW_HASHBITS) + hash += (dst.s_addr >> (32 - idx)) + (src.s_addr >> idx); + return hash & (IPFLOW_HASHSIZE-1); +} + +static struct ipflow * +ipflow_lookup( + const struct ip *ip) +{ + unsigned hash; + struct ipflow *ipf; + + hash = ipflow_hash(ip->ip_dst, ip->ip_src, ip->ip_tos); + + ipf = LIST_FIRST(&ipflows[hash]); + while (ipf != NULL) { + if (ip->ip_dst.s_addr == ipf->ipf_dst.s_addr + && ip->ip_src.s_addr == ipf->ipf_src.s_addr + && ip->ip_tos == ipf->ipf_tos) + break; + ipf = LIST_NEXT(ipf, ipf_next); + } + return ipf; +} + +int +ipflow_fastforward( + struct mbuf *m) +{ + struct ip *ip; + struct ipflow *ipf; + struct rtentry *rt; + int error; + + /* + * Are we forwarding packets? Big enough for an IP packet? + */ + if (!ipforwarding || !ipflow_active || m->m_len < sizeof(struct ip)) + return 0; + /* + * IP header with no option and valid version and length + */ + ip = mtod(m, struct ip *); + if (ip->ip_v != IPVERSION || ip->ip_hl != (sizeof(struct ip) >> 2) + || ntohs(ip->ip_len) > m->m_pkthdr.len) + return 0; + /* + * Find a flow. + */ + if ((ipf = ipflow_lookup(ip)) == NULL) + return 0; + + /* + * Route and interface still up? + */ + rt = ipf->ipf_ro.ro_rt; + if ((rt->rt_flags & RTF_UP) == 0 || (rt->rt_ifp->if_flags & IFF_UP) == 0) + return 0; + + /* + * Packet size OK? TTL? + */ + if (m->m_pkthdr.len > rt->rt_ifp->if_mtu || ip->ip_ttl <= IPTTLDEC) + return 0; + + /* + * Everything checks out and so we can forward this packet. + * Modify the TTL and incrementally change the checksum. + */ + ip->ip_ttl -= IPTTLDEC; + if (ip->ip_sum >= htons(0xffff - (IPTTLDEC << 8))) { + ip->ip_sum += htons(IPTTLDEC << 8) + 1; + } else { + ip->ip_sum += htons(IPTTLDEC << 8); + } + + /* + * Send the packet on its way. All we can get back is ENOBUFS + */ + ipf->ipf_uses++; + ipf->ipf_timer = IPFLOW_TIMER; + if ((error = (*rt->rt_ifp->if_output)(rt->rt_ifp, m, &ipf->ipf_ro.ro_dst, rt)) != 0) { + if (error == ENOBUFS) + ipf->ipf_dropped++; + else + ipf->ipf_errors++; + } + return 1; +} + +static void +ipflow_addstats( + struct ipflow *ipf) +{ + ipf->ipf_ro.ro_rt->rt_use += ipf->ipf_uses; + ipstat.ips_cantforward += ipf->ipf_errors + ipf->ipf_dropped; + ipstat.ips_forward += ipf->ipf_uses; + ipstat.ips_fastforward += ipf->ipf_uses; +} + +static void +ipflow_free( + struct ipflow *ipf) +{ + int s; + /* + * Remove the flow from the hash table (at elevated IPL). + * Once it's off the list, we can deal with it at normal + * network IPL. + */ + s = splimp(); + LIST_REMOVE(ipf, ipf_next); + splx(s); + ipflow_addstats(ipf); + RTFREE(ipf->ipf_ro.ro_rt); + ipflow_inuse--; + FREE(ipf, M_IPFLOW); +} + +static struct ipflow * +ipflow_reap( + void) +{ + struct ipflow *ipf, *maybe_ipf = NULL; + int idx; + int s; + + for (idx = 0; idx < IPFLOW_HASHSIZE; idx++) { + ipf = LIST_FIRST(&ipflows[idx]); + while (ipf != NULL) { + /* + * If this no longer points to a valid route + * reclaim it. + */ + if ((ipf->ipf_ro.ro_rt->rt_flags & RTF_UP) == 0) + goto done; + /* + * choose the one that's been least recently used + * or has had the least uses in the last 1.5 + * intervals. + */ + if (maybe_ipf == NULL + || ipf->ipf_timer < maybe_ipf->ipf_timer + || (ipf->ipf_timer == maybe_ipf->ipf_timer + && ipf->ipf_last_uses + ipf->ipf_uses < + maybe_ipf->ipf_last_uses + + maybe_ipf->ipf_uses)) + maybe_ipf = ipf; + ipf = LIST_NEXT(ipf, ipf_next); + } + } + ipf = maybe_ipf; + done: + /* + * Remove the entry from the flow table. + */ + s = splimp(); + LIST_REMOVE(ipf, ipf_next); + splx(s); + ipflow_addstats(ipf); + RTFREE(ipf->ipf_ro.ro_rt); + return ipf; +} + +void +ipflow_slowtimo( + void) +{ + struct ipflow *ipf; + int idx; + + for (idx = 0; idx < IPFLOW_HASHSIZE; idx++) { + ipf = LIST_FIRST(&ipflows[idx]); + while (ipf != NULL) { + struct ipflow *next_ipf = LIST_NEXT(ipf, ipf_next); + if (--ipf->ipf_timer == 0) { + ipflow_free(ipf); + } else { + ipf->ipf_last_uses = ipf->ipf_uses; + ipf->ipf_ro.ro_rt->rt_use += ipf->ipf_uses; + ipstat.ips_forward += ipf->ipf_uses; + ipstat.ips_fastforward += ipf->ipf_uses; + ipf->ipf_uses = 0; + } + ipf = next_ipf; + } + } +} + +void +ipflow_create( + const struct route *ro, + struct mbuf *m) +{ + const struct ip *const ip = mtod(m, struct ip *); + struct ipflow *ipf; + unsigned hash; + int s; + + /* + * Don't create cache entries for ICMP messages. + */ + if (!ipflow_active || ip->ip_p == IPPROTO_ICMP) + return; + /* + * See if an existing flow struct exists. If so remove it from it's + * list and free the old route. If not, try to malloc a new one + * (if we aren't at our limit). + */ + ipf = ipflow_lookup(ip); + if (ipf == NULL) { + if (ipflow_inuse == IPFLOW_MAX) { + ipf = ipflow_reap(); + } else { + ipf = (struct ipflow *) malloc(sizeof(*ipf), M_IPFLOW, + M_NOWAIT); + if (ipf == NULL) + return; + ipflow_inuse++; + } + bzero((caddr_t) ipf, sizeof(*ipf)); + } else { + s = splimp(); + LIST_REMOVE(ipf, ipf_next); + splx(s); + ipflow_addstats(ipf); + RTFREE(ipf->ipf_ro.ro_rt); + ipf->ipf_uses = ipf->ipf_last_uses = 0; + ipf->ipf_errors = ipf->ipf_dropped = 0; + } + + /* + * Fill in the updated information. + */ + ipf->ipf_ro = *ro; + ro->ro_rt->rt_refcnt++; + ipf->ipf_dst = ip->ip_dst; + ipf->ipf_src = ip->ip_src; + ipf->ipf_tos = ip->ip_tos; + ipf->ipf_timer = IPFLOW_TIMER; + /* + * Insert into the approriate bucket of the flow table. + */ + hash = ipflow_hash(ip->ip_dst, ip->ip_src, ip->ip_tos); + s = splimp(); + LIST_INSERT_HEAD(&ipflows[hash], ipf, ipf_next); + splx(s); +} diff --git a/sys/netinet/ip_flow.h b/sys/netinet/ip_flow.h new file mode 100644 index 0000000..4675996 --- /dev/null +++ b/sys/netinet/ip_flow.h @@ -0,0 +1,57 @@ +/*- + * Copyright (c) 1998 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by the 3am Software Foundry ("3am"). It was developed by Matt Thomas. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the NetBSD + * Foundation, Inc. and its contributors. + * 4. Neither the name of The NetBSD Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _NETINET_IP_FLOW_H +#define _NETINET_IP_FLOW_H + +struct ipflow { + LIST_ENTRY(ipflow) ipf_next; /* next ipflow in bucket */ + struct in_addr ipf_dst; /* destination address */ + struct in_addr ipf_src; /* source address */ + + u_int8_t ipf_tos; /* type-of-service */ + struct route ipf_ro; /* associated route entry */ + u_long ipf_uses; /* number of uses in this period */ + + int ipf_timer; /* remaining lifetime of this entry */ + u_long ipf_dropped; /* ENOBUFS returned by if_output */ + u_long ipf_errors; /* other errors returned by if_output */ + u_long ipf_last_uses; /* number of uses in last period */ +}; + +#endif diff --git a/sys/netinet/ip_frag.c b/sys/netinet/ip_frag.c new file mode 100644 index 0000000..f5548fc --- /dev/null +++ b/sys/netinet/ip_frag.c @@ -0,0 +1,576 @@ +/* + * Copyright (C) 1993-2000 by Darren Reed. + * + * Redistribution and use in source and binary forms are permitted + * provided that this notice is preserved and due credit is given + * to the original author and the contributors. + */ +#if !defined(lint) +static const char sccsid[] = "@(#)ip_frag.c 1.11 3/24/96 (C) 1993-1995 Darren Reed"; +static const char rcsid[] = "@(#)$FreeBSD$"; +#endif + +#if defined(KERNEL) && !defined(_KERNEL) +# define _KERNEL +#endif + +#include <sys/errno.h> +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/file.h> +#if !defined(_KERNEL) && !defined(KERNEL) +# include <stdio.h> +# include <string.h> +# include <stdlib.h> +#endif +#if (defined(KERNEL) || defined(_KERNEL)) && (__FreeBSD_version >= 220000) +# include <sys/filio.h> +# include <sys/fcntl.h> +#else +# include <sys/ioctl.h> +#endif +#include <sys/uio.h> +#ifndef linux +# include <sys/protosw.h> +#endif +#include <sys/socket.h> +#if defined(_KERNEL) && !defined(linux) +# include <sys/systm.h> +#endif +#if !defined(__SVR4) && !defined(__svr4__) +# if defined(_KERNEL) && !defined(__sgi) +# include <sys/kernel.h> +# endif +# ifndef linux +# include <sys/mbuf.h> +# endif +#else +# include <sys/byteorder.h> +# ifdef _KERNEL +# include <sys/dditypes.h> +# endif +# include <sys/stream.h> +# include <sys/kmem.h> +#endif +#include <net/if.h> +#ifdef sun +# include <net/af.h> +#endif +#include <net/route.h> +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#ifndef linux +# include <netinet/ip_var.h> +#endif +#include <netinet/tcp.h> +#include <netinet/udp.h> +#include <netinet/ip_icmp.h> +#include "netinet/ip_compat.h" +#include <netinet/tcpip.h> +#include "netinet/ip_fil.h" +#include "netinet/ip_proxy.h" +#include "netinet/ip_nat.h" +#include "netinet/ip_frag.h" +#include "netinet/ip_state.h" +#include "netinet/ip_auth.h" +#if (__FreeBSD_version >= 300000) +# include <sys/malloc.h> +# if (defined(KERNEL) || defined(_KERNEL)) +# ifndef IPFILTER_LKM +# include <sys/libkern.h> +# include <sys/systm.h> +# endif +extern struct callout_handle ipfr_slowtimer_ch; +# endif +#endif +#if defined(__NetBSD__) && (__NetBSD_Version__ >= 104230000) +# include <sys/callout.h> +extern struct callout ipfr_slowtimer_ch; +#endif + + +static ipfr_t *ipfr_heads[IPFT_SIZE]; +static ipfr_t *ipfr_nattab[IPFT_SIZE]; +static ipfrstat_t ipfr_stats; +static int ipfr_inuse = 0; + +int fr_ipfrttl = 120; /* 60 seconds */ +int fr_frag_lock = 0; + +#ifdef _KERNEL +# if SOLARIS2 >= 7 +extern timeout_id_t ipfr_timer_id; +# else +extern int ipfr_timer_id; +# endif +#endif +#if (SOLARIS || defined(__sgi)) && defined(_KERNEL) +extern KRWLOCK_T ipf_frag, ipf_natfrag, ipf_nat, ipf_mutex; +# if SOLARIS +extern KRWLOCK_T ipf_solaris; +# else +KRWLOCK_T ipf_solaris; +# endif +extern kmutex_t ipf_rw; +#endif + + +static ipfr_t *ipfr_new __P((ip_t *, fr_info_t *, u_int, ipfr_t **)); +static ipfr_t *ipfr_lookup __P((ip_t *, fr_info_t *, ipfr_t **)); +static void ipfr_delete __P((ipfr_t *)); + + +ipfrstat_t *ipfr_fragstats() +{ + ipfr_stats.ifs_table = ipfr_heads; + ipfr_stats.ifs_nattab = ipfr_nattab; + ipfr_stats.ifs_inuse = ipfr_inuse; + return &ipfr_stats; +} + + +/* + * add a new entry to the fragment cache, registering it as having come + * through this box, with the result of the filter operation. + */ +static ipfr_t *ipfr_new(ip, fin, pass, table) +ip_t *ip; +fr_info_t *fin; +u_int pass; +ipfr_t *table[]; +{ + ipfr_t **fp, *fra, frag; + u_int idx, off; + + if (ipfr_inuse >= IPFT_SIZE) + return NULL; + + if (!(fin->fin_fi.fi_fl & FI_FRAG)) + return NULL; + + frag.ipfr_p = ip->ip_p; + idx = ip->ip_p; + frag.ipfr_id = ip->ip_id; + idx += ip->ip_id; + frag.ipfr_tos = ip->ip_tos; + frag.ipfr_src.s_addr = ip->ip_src.s_addr; + idx += ip->ip_src.s_addr; + frag.ipfr_dst.s_addr = ip->ip_dst.s_addr; + idx += ip->ip_dst.s_addr; + frag.ipfr_ifp = fin->fin_ifp; + idx *= 127; + idx %= IPFT_SIZE; + + /* + * first, make sure it isn't already there... + */ + for (fp = &table[idx]; (fra = *fp); fp = &fra->ipfr_next) + if (!bcmp((char *)&frag.ipfr_src, (char *)&fra->ipfr_src, + IPFR_CMPSZ)) { + ATOMIC_INCL(ipfr_stats.ifs_exists); + return NULL; + } + + /* + * allocate some memory, if possible, if not, just record that we + * failed to do so. + */ + KMALLOC(fra, ipfr_t *); + if (fra == NULL) { + ATOMIC_INCL(ipfr_stats.ifs_nomem); + return NULL; + } + + if ((fra->ipfr_rule = fin->fin_fr) != NULL) { + ATOMIC_INC32(fin->fin_fr->fr_ref); + } + + + /* + * Instert the fragment into the fragment table, copy the struct used + * in the search using bcopy rather than reassign each field. + * Set the ttl to the default and mask out logging from "pass" + */ + if ((fra->ipfr_next = table[idx])) + table[idx]->ipfr_prev = fra; + fra->ipfr_prev = NULL; + fra->ipfr_data = NULL; + table[idx] = fra; + bcopy((char *)&frag.ipfr_src, (char *)&fra->ipfr_src, IPFR_CMPSZ); + fra->ipfr_ttl = fr_ipfrttl; + /* + * Compute the offset of the expected start of the next packet. + */ + off = ip->ip_off & IP_OFFMASK; + if (!off) + fra->ipfr_seen0 = 1; + fra->ipfr_off = off + (fin->fin_dlen >> 3); + ATOMIC_INCL(ipfr_stats.ifs_new); + ATOMIC_INC32(ipfr_inuse); + return fra; +} + + +int ipfr_newfrag(ip, fin, pass) +ip_t *ip; +fr_info_t *fin; +u_int pass; +{ + ipfr_t *ipf; + + if ((ip->ip_v != 4) || (fr_frag_lock)) + return -1; + WRITE_ENTER(&ipf_frag); + ipf = ipfr_new(ip, fin, pass, ipfr_heads); + RWLOCK_EXIT(&ipf_frag); + return ipf ? 0 : -1; +} + + +int ipfr_nat_newfrag(ip, fin, pass, nat) +ip_t *ip; +fr_info_t *fin; +u_int pass; +nat_t *nat; +{ + ipfr_t *ipf; + + if ((ip->ip_v != 4) || (fr_frag_lock)) + return -1; + WRITE_ENTER(&ipf_natfrag); + ipf = ipfr_new(ip, fin, pass, ipfr_nattab); + if (ipf != NULL) { + ipf->ipfr_data = nat; + nat->nat_data = ipf; + } + RWLOCK_EXIT(&ipf_natfrag); + return ipf ? 0 : -1; +} + + +/* + * check the fragment cache to see if there is already a record of this packet + * with its filter result known. + */ +static ipfr_t *ipfr_lookup(ip, fin, table) +ip_t *ip; +fr_info_t *fin; +ipfr_t *table[]; +{ + ipfr_t *f, frag; + u_int idx; + + if (!(fin->fin_fi.fi_fl & FI_FRAG)) + return NULL; + + /* + * For fragments, we record protocol, packet id, TOS and both IP#'s + * (these should all be the same for all fragments of a packet). + * + * build up a hash value to index the table with. + */ + frag.ipfr_p = ip->ip_p; + idx = ip->ip_p; + frag.ipfr_id = ip->ip_id; + idx += ip->ip_id; + frag.ipfr_tos = ip->ip_tos; + frag.ipfr_src.s_addr = ip->ip_src.s_addr; + idx += ip->ip_src.s_addr; + frag.ipfr_dst.s_addr = ip->ip_dst.s_addr; + idx += ip->ip_dst.s_addr; + frag.ipfr_ifp = fin->fin_ifp; + idx *= 127; + idx %= IPFT_SIZE; + + /* + * check the table, careful to only compare the right amount of data + */ + for (f = table[idx]; f; f = f->ipfr_next) + if (!bcmp((char *)&frag.ipfr_src, (char *)&f->ipfr_src, + IPFR_CMPSZ)) { + u_short atoff, off; + + /* + * XXX - We really need to be guarding against the + * retransmission of (src,dst,id,offset-range) here + * because a fragmented packet is never resent with + * the same IP ID#. + */ + off = ip->ip_off & IP_OFFMASK; + if (f->ipfr_seen0) { + if (!off || (fin->fin_fi.fi_fl & FI_SHORT)) + continue; + } else if (!off) + f->ipfr_seen0 = 1; + + if (f != table[idx]) { + /* + * move fragment info. to the top of the list + * to speed up searches. + */ + if ((f->ipfr_prev->ipfr_next = f->ipfr_next)) + f->ipfr_next->ipfr_prev = f->ipfr_prev; + f->ipfr_next = table[idx]; + table[idx]->ipfr_prev = f; + f->ipfr_prev = NULL; + table[idx] = f; + } + atoff = off + (fin->fin_dlen >> 3); + /* + * If we've follwed the fragments, and this is the + * last (in order), shrink expiration time. + */ + if (off == f->ipfr_off) { + if (!(ip->ip_off & IP_MF)) + f->ipfr_ttl = 1; + else + f->ipfr_off = atoff; + } + ATOMIC_INCL(ipfr_stats.ifs_hits); + return f; + } + return NULL; +} + + +/* + * functional interface for NAT lookups of the NAT fragment cache + */ +nat_t *ipfr_nat_knownfrag(ip, fin) +ip_t *ip; +fr_info_t *fin; +{ + nat_t *nat; + ipfr_t *ipf; + + if ((ip->ip_v != 4) || (fr_frag_lock)) + return NULL; + READ_ENTER(&ipf_natfrag); + ipf = ipfr_lookup(ip, fin, ipfr_nattab); + if (ipf != NULL) { + nat = ipf->ipfr_data; + /* + * This is the last fragment for this packet. + */ + if ((ipf->ipfr_ttl == 1) && (nat != NULL)) { + nat->nat_data = NULL; + ipf->ipfr_data = NULL; + } + } else + nat = NULL; + RWLOCK_EXIT(&ipf_natfrag); + return nat; +} + + +/* + * functional interface for normal lookups of the fragment cache + */ +frentry_t *ipfr_knownfrag(ip, fin) +ip_t *ip; +fr_info_t *fin; +{ + frentry_t *fr = NULL; + ipfr_t *fra; + + if ((ip->ip_v != 4) || (fr_frag_lock)) + return NULL; + READ_ENTER(&ipf_frag); + fra = ipfr_lookup(ip, fin, ipfr_heads); + if (fra != NULL) + fr = fra->ipfr_rule; + RWLOCK_EXIT(&ipf_frag); + return fr; +} + + +/* + * forget any references to this external object. + */ +void ipfr_forget(nat) +void *nat; +{ + ipfr_t *fr; + int idx; + + WRITE_ENTER(&ipf_natfrag); + for (idx = IPFT_SIZE - 1; idx >= 0; idx--) + for (fr = ipfr_heads[idx]; fr; fr = fr->ipfr_next) + if (fr->ipfr_data == nat) + fr->ipfr_data = NULL; + + RWLOCK_EXIT(&ipf_natfrag); +} + + +static void ipfr_delete(fra) +ipfr_t *fra; +{ + frentry_t *fr; + + fr = fra->ipfr_rule; + if (fr != NULL) { + ATOMIC_DEC32(fr->fr_ref); + if (fr->fr_ref == 0) + KFREE(fr); + } + if (fra->ipfr_prev) + fra->ipfr_prev->ipfr_next = fra->ipfr_next; + if (fra->ipfr_next) + fra->ipfr_next->ipfr_prev = fra->ipfr_prev; + KFREE(fra); +} + + +/* + * Free memory in use by fragment state info. kept. + */ +void ipfr_unload() +{ + ipfr_t **fp, *fra; + nat_t *nat; + int idx; + + WRITE_ENTER(&ipf_frag); + for (idx = IPFT_SIZE - 1; idx >= 0; idx--) + for (fp = &ipfr_heads[idx]; (fra = *fp); ) { + *fp = fra->ipfr_next; + ipfr_delete(fra); + } + RWLOCK_EXIT(&ipf_frag); + + WRITE_ENTER(&ipf_nat); + WRITE_ENTER(&ipf_natfrag); + for (idx = IPFT_SIZE - 1; idx >= 0; idx--) + for (fp = &ipfr_nattab[idx]; (fra = *fp); ) { + *fp = fra->ipfr_next; + nat = fra->ipfr_data; + if (nat != NULL) { + if (nat->nat_data == fra) + nat->nat_data = NULL; + } + ipfr_delete(fra); + } + RWLOCK_EXIT(&ipf_natfrag); + RWLOCK_EXIT(&ipf_nat); +} + + +#ifdef _KERNEL +void ipfr_fragexpire() +{ + ipfr_t **fp, *fra; + nat_t *nat; + int idx; +#if defined(_KERNEL) +# if !SOLARIS + int s; +# endif +#endif + + if (fr_frag_lock) + return; + + SPL_NET(s); + WRITE_ENTER(&ipf_frag); + + /* + * Go through the entire table, looking for entries to expire, + * decreasing the ttl by one for each entry. If it reaches 0, + * remove it from the chain and free it. + */ + for (idx = IPFT_SIZE - 1; idx >= 0; idx--) + for (fp = &ipfr_heads[idx]; (fra = *fp); ) { + --fra->ipfr_ttl; + if (fra->ipfr_ttl == 0) { + *fp = fra->ipfr_next; + ipfr_delete(fra); + ATOMIC_INCL(ipfr_stats.ifs_expire); + ATOMIC_DEC32(ipfr_inuse); + } else + fp = &fra->ipfr_next; + } + RWLOCK_EXIT(&ipf_frag); + + /* + * Same again for the NAT table, except that if the structure also + * still points to a NAT structure, and the NAT structure points back + * at the one to be free'd, NULL the reference from the NAT struct. + * NOTE: We need to grab both mutex's early, and in this order so as + * to prevent a deadlock if both try to expire at the same time. + */ + WRITE_ENTER(&ipf_nat); + WRITE_ENTER(&ipf_natfrag); + for (idx = IPFT_SIZE - 1; idx >= 0; idx--) + for (fp = &ipfr_nattab[idx]; (fra = *fp); ) { + --fra->ipfr_ttl; + if (fra->ipfr_ttl == 0) { + ATOMIC_INCL(ipfr_stats.ifs_expire); + ATOMIC_DEC32(ipfr_inuse); + nat = fra->ipfr_data; + if (nat != NULL) { + if (nat->nat_data == fra) + nat->nat_data = NULL; + } + *fp = fra->ipfr_next; + ipfr_delete(fra); + } else + fp = &fra->ipfr_next; + } + RWLOCK_EXIT(&ipf_natfrag); + RWLOCK_EXIT(&ipf_nat); + SPL_X(s); +} + + +/* + * Slowly expire held state for fragments. Timeouts are set * in expectation + * of this being called twice per second. + */ +# if (BSD >= 199306) || SOLARIS || defined(__sgi) +# if defined(SOLARIS2) && (SOLARIS2 < 7) +void ipfr_slowtimer() +# else +void ipfr_slowtimer __P((void *ptr)) +# endif +# else +int ipfr_slowtimer() +# endif +{ +#if defined(_KERNEL) && SOLARIS + extern int fr_running; + + if (fr_running <= 0) + return; +#endif + + READ_ENTER(&ipf_solaris); +#ifdef __sgi + ipfilter_sgi_intfsync(); +#endif + + ipfr_fragexpire(); + fr_timeoutstate(); + ip_natexpire(); + fr_authexpire(); +# if SOLARIS + ipfr_timer_id = timeout(ipfr_slowtimer, NULL, drv_usectohz(500000)); + RWLOCK_EXIT(&ipf_solaris); +# else +# if defined(__NetBSD__) && (__NetBSD_Version__ >= 104240000) + callout_reset(&ipfr_slowtimer_ch, hz / 2, ipfr_slowtimer, NULL); +# else +# if (__FreeBSD_version >= 300000) + ipfr_slowtimer_ch = timeout(ipfr_slowtimer, NULL, hz/2); +# else + timeout(ipfr_slowtimer, NULL, hz/2); +# endif +# if (BSD < 199306) && !defined(__sgi) + return 0; +# endif /* FreeBSD */ +# endif /* NetBSD */ +# endif /* SOLARIS */ +} +#endif /* defined(_KERNEL) */ diff --git a/sys/netinet/ip_frag.h b/sys/netinet/ip_frag.h new file mode 100644 index 0000000..2d0b9be --- /dev/null +++ b/sys/netinet/ip_frag.h @@ -0,0 +1,69 @@ +/* + * Copyright (C) 1993-2000 by Darren Reed. + * + * Redistribution and use in source and binary forms are permitted + * provided that this notice is preserved and due credit is given + * to the original author and the contributors. + * + * @(#)ip_frag.h 1.5 3/24/96 + * $Id: ip_frag.h,v 2.4.2.2 2000/11/10 13:10:54 darrenr Exp $ + * $FreeBSD$ + */ + +#ifndef __IP_FRAG_H__ +#define __IP_FRAG_H__ + +#define IPFT_SIZE 257 + +typedef struct ipfr { + struct ipfr *ipfr_next, *ipfr_prev; + void *ipfr_data; + struct in_addr ipfr_src; + struct in_addr ipfr_dst; + void *ipfr_ifp; + u_short ipfr_id; + u_char ipfr_p; + u_char ipfr_tos; + u_short ipfr_off; + u_char ipfr_ttl; + u_char ipfr_seen0; + frentry_t *ipfr_rule; +} ipfr_t; + + +typedef struct ipfrstat { + u_long ifs_exists; /* add & already exists */ + u_long ifs_nomem; + u_long ifs_new; + u_long ifs_hits; + u_long ifs_expire; + u_long ifs_inuse; + struct ipfr **ifs_table; + struct ipfr **ifs_nattab; +} ipfrstat_t; + +#define IPFR_CMPSZ (offsetof(ipfr_t, ipfr_off) - \ + offsetof(ipfr_t, ipfr_src)) + +extern int fr_ipfrttl; +extern int fr_frag_lock; +extern ipfrstat_t *ipfr_fragstats __P((void)); +extern int ipfr_newfrag __P((ip_t *, fr_info_t *, u_int)); +extern int ipfr_nat_newfrag __P((ip_t *, fr_info_t *, u_int, struct nat *)); +extern nat_t *ipfr_nat_knownfrag __P((ip_t *, fr_info_t *)); +extern frentry_t *ipfr_knownfrag __P((ip_t *, fr_info_t *)); +extern void ipfr_forget __P((void *)); +extern void ipfr_unload __P((void)); +extern void ipfr_fragexpire __P((void)); + +#if (BSD >= 199306) || SOLARIS || defined(__sgi) +# if defined(SOLARIS2) && (SOLARIS2 < 7) +extern void ipfr_slowtimer __P((void)); +# else +extern void ipfr_slowtimer __P((void *)); +# endif +#else +extern int ipfr_slowtimer __P((void)); +#endif /* (BSD >= 199306) || SOLARIS */ + +#endif /* __IP_FIL_H__ */ diff --git a/sys/netinet/ip_ftp_pxy.c b/sys/netinet/ip_ftp_pxy.c new file mode 100644 index 0000000..6e4fe53 --- /dev/null +++ b/sys/netinet/ip_ftp_pxy.c @@ -0,0 +1,786 @@ +/* + * Simple FTP transparent proxy for in-kernel use. For use with the NAT + * code. + * + * $FreeBSD$ + */ +#if SOLARIS && defined(_KERNEL) +extern kmutex_t ipf_rw; +#endif + +#define isdigit(x) ((x) >= '0' && (x) <= '9') +#define isupper(x) (((unsigned)(x) >= 'A') && ((unsigned)(x) <= 'Z')) +#define islower(x) (((unsigned)(x) >= 'a') && ((unsigned)(x) <= 'z')) +#define isalpha(x) (isupper(x) || islower(x)) +#define toupper(x) (isupper(x) ? (x) : (x) - 'a' + 'A') + +#define IPF_FTP_PROXY + +#define IPF_MINPORTLEN 18 +#define IPF_MAXPORTLEN 30 +#define IPF_MIN227LEN 39 +#define IPF_MAX227LEN 51 +#define IPF_FTPBUFSZ 96 /* This *MUST* be >= 53! */ + + +int ippr_ftp_client __P((fr_info_t *, ip_t *, nat_t *, ftpinfo_t *, int)); +int ippr_ftp_complete __P((char *, size_t)); +int ippr_ftp_in __P((fr_info_t *, ip_t *, ap_session_t *, nat_t *)); +int ippr_ftp_init __P((void)); +int ippr_ftp_new __P((fr_info_t *, ip_t *, ap_session_t *, nat_t *)); +int ippr_ftp_out __P((fr_info_t *, ip_t *, ap_session_t *, nat_t *)); +int ippr_ftp_pasv __P((fr_info_t *, ip_t *, nat_t *, ftpside_t *, int)); +int ippr_ftp_port __P((fr_info_t *, ip_t *, nat_t *, ftpside_t *, int)); +int ippr_ftp_process __P((fr_info_t *, ip_t *, nat_t *, ftpinfo_t *, int)); +int ippr_ftp_server __P((fr_info_t *, ip_t *, nat_t *, ftpinfo_t *, int)); +int ippr_ftp_valid __P((char *, size_t)); +u_short ippr_ftp_atoi __P((char **)); + +static frentry_t natfr; +int ippr_ftp_pasvonly = 0; +int ippr_ftp_insecure = 0; + + +/* + * Initialize local structures. + */ +int ippr_ftp_init() +{ + bzero((char *)&natfr, sizeof(natfr)); + natfr.fr_ref = 1; + natfr.fr_flags = FR_INQUE|FR_PASS|FR_QUICK|FR_KEEPSTATE; + return 0; +} + + +int ippr_ftp_new(fin, ip, aps, nat) +fr_info_t *fin; +ip_t *ip; +ap_session_t *aps; +nat_t *nat; +{ + ftpinfo_t *ftp; + ftpside_t *f; + + KMALLOC(ftp, ftpinfo_t *); + if (ftp == NULL) + return -1; + aps->aps_data = ftp; + aps->aps_psiz = sizeof(ftpinfo_t); + + bzero((char *)ftp, sizeof(*ftp)); + f = &ftp->ftp_side[0]; + f->ftps_rptr = f->ftps_buf; + f->ftps_wptr = f->ftps_buf; + f = &ftp->ftp_side[1]; + f->ftps_rptr = f->ftps_buf; + f->ftps_wptr = f->ftps_buf; + return 0; +} + + +int ippr_ftp_port(fin, ip, nat, f, dlen) +fr_info_t *fin; +ip_t *ip; +nat_t *nat; +ftpside_t *f; +int dlen; +{ + tcphdr_t *tcp, tcph, *tcp2 = &tcph; + char newbuf[IPF_FTPBUFSZ], *s; + u_short a5, a6, sp, dp; + u_int a1, a2, a3, a4; + struct in_addr swip; + size_t nlen, olen; + fr_info_t fi; + int inc, off; + nat_t *ipn; + mb_t *m; +#if SOLARIS + mb_t *m1; +#endif + + tcp = (tcphdr_t *)fin->fin_dp; + /* + * Check for client sending out PORT message. + */ + if (dlen < IPF_MINPORTLEN) + return 0; + off = fin->fin_hlen + (tcp->th_off << 2); + /* + * Skip the PORT command + space + */ + s = f->ftps_rptr + 5; + /* + * Pick out the address components, two at a time. + */ + a1 = ippr_ftp_atoi(&s); + if (!s) + return 0; + a2 = ippr_ftp_atoi(&s); + if (!s) + return 0; + /* + * check that IP address in the PORT/PASV reply is the same as the + * sender of the command - prevents using PORT for port scanning. + */ + a1 <<= 16; + a1 |= a2; + if (a1 != ntohl(nat->nat_inip.s_addr)) + return 0; + + a5 = ippr_ftp_atoi(&s); + if (!s) + return 0; + if (*s == ')') + s++; + + /* + * check for CR-LF at the end. + */ + if (*s == '\n') + s--; + if ((*s == '\r') && (*(s + 1) == '\n')) { + s += 2; + a6 = a5 & 0xff; + } else + return 0; + a5 >>= 8; + a5 &= 0xff; + /* + * Calculate new address parts for PORT command + */ + a1 = ntohl(ip->ip_src.s_addr); + a2 = (a1 >> 16) & 0xff; + a3 = (a1 >> 8) & 0xff; + a4 = a1 & 0xff; + a1 >>= 24; + olen = s - f->ftps_rptr; + /* DO NOT change this to sprintf! */ + (void) sprintf(newbuf, "%s %u,%u,%u,%u,%u,%u\r\n", + "PORT", a1, a2, a3, a4, a5, a6); + + nlen = strlen(newbuf); + inc = nlen - olen; + if ((inc + ip->ip_len) > 65535) + return 0; + +#if SOLARIS + m = fin->fin_qfm; + for (m1 = m; m1->b_cont; m1 = m1->b_cont) + ; + if ((inc > 0) && (m1->b_datap->db_lim - m1->b_wptr < inc)) { + mblk_t *nm; + + /* alloc enough to keep same trailer space for lower driver */ + nm = allocb(nlen, BPRI_MED); + PANIC((!nm),("ippr_ftp_out: allocb failed")); + + nm->b_band = m1->b_band; + nm->b_wptr += nlen; + + m1->b_wptr -= olen; + PANIC((m1->b_wptr < m1->b_rptr), + ("ippr_ftp_out: cannot handle fragmented data block")); + + linkb(m1, nm); + } else { + if (m1->b_datap->db_struiolim == m1->b_wptr) + m1->b_datap->db_struiolim += inc; + m1->b_datap->db_struioflag &= ~STRUIO_IP; + m1->b_wptr += inc; + } + copyin_mblk(m, off, nlen, newbuf); +#else + m = *((mb_t **)fin->fin_mp); + if (inc < 0) + m_adj(m, inc); + /* the mbuf chain will be extended if necessary by m_copyback() */ + m_copyback(m, off, nlen, newbuf); +# ifdef M_PKTHDR + if (!(m->m_flags & M_PKTHDR)) + m->m_pkthdr.len += inc; +# endif +#endif + if (inc != 0) { +#if SOLARIS || defined(__sgi) + register u_32_t sum1, sum2; + + sum1 = ip->ip_len; + sum2 = ip->ip_len + inc; + + /* Because ~1 == -2, We really need ~1 == -1 */ + if (sum1 > sum2) + sum2--; + sum2 -= sum1; + sum2 = (sum2 & 0xffff) + (sum2 >> 16); + + fix_outcksum(&ip->ip_sum, sum2); +#endif + ip->ip_len += inc; + } + + /* + * Add skeleton NAT entry for connection which will come back the + * other way. + */ + sp = htons(a5 << 8 | a6); + /* + * Don't allow the PORT command to specify a port < 1024 due to + * security crap. + */ + if (ntohs(sp) < 1024) + return 0; + /* + * The server may not make the connection back from port 20, but + * it is the most likely so use it here to check for a conflicting + * mapping. + */ + dp = htons(fin->fin_data[1] - 1); + ipn = nat_outlookup(fin->fin_ifp, IPN_TCP, nat->nat_p, nat->nat_inip, + ip->ip_dst, (dp << 16) | sp, 0); + if (ipn == NULL) { + int slen; + + slen = ip->ip_len; + ip->ip_len = fin->fin_hlen + sizeof(*tcp2); + bcopy((char *)fin, (char *)&fi, sizeof(fi)); + bzero((char *)tcp2, sizeof(*tcp2)); + tcp2->th_win = htons(8192); + tcp2->th_sport = sp; + tcp2->th_off = 5; + tcp2->th_dport = 0; /* XXX - don't specify remote port */ + fi.fin_data[0] = ntohs(sp); + fi.fin_data[1] = 0; + fi.fin_dlen = sizeof(*tcp2); + fi.fin_dp = (char *)tcp2; + fi.fin_fr = &natfr; + swip = ip->ip_src; + fi.fin_fi.fi_saddr = nat->nat_inip.s_addr; + ip->ip_src = nat->nat_inip; + ipn = nat_new(nat->nat_ptr, ip, &fi, IPN_TCP|FI_W_DPORT, + NAT_OUTBOUND); + if (ipn != NULL) { + ipn->nat_age = fr_defnatage; + (void) fr_addstate(ip, &fi, FI_W_DPORT); + } + ip->ip_len = slen; + ip->ip_src = swip; + } + return APR_INC(inc); +} + + +int ippr_ftp_client(fin, ip, nat, ftp, dlen) +fr_info_t *fin; +nat_t *nat; +ftpinfo_t *ftp; +ip_t *ip; +int dlen; +{ + char *rptr, *wptr, cmd[6], c; + ftpside_t *f; + int inc, i; + + inc = 0; + f = &ftp->ftp_side[0]; + rptr = f->ftps_rptr; + wptr = f->ftps_wptr; + + for (i = 0; (i < 5) && (i < dlen); i++) { + c = rptr[i]; + if (isalpha(c)) { + cmd[i] = toupper(c); + } else { + cmd[i] = c; + } + } + cmd[i] = '\0'; + + if ((ftp->ftp_passok == 0) && !strncmp(cmd, "USER ", 5)) + ftp->ftp_passok = 1; + else if ((ftp->ftp_passok == 2) && !strncmp(cmd, "PASS ", 5)) + ftp->ftp_passok = 3; + else if ((ftp->ftp_passok == 4) && !ippr_ftp_pasvonly && + !strncmp(cmd, "PORT ", 5)) { + inc = ippr_ftp_port(fin, ip, nat, f, dlen); + } else if (ippr_ftp_insecure && !ippr_ftp_pasvonly && + !strncmp(cmd, "PORT ", 5)) { + inc = ippr_ftp_port(fin, ip, nat, f, dlen); + } + + while ((*rptr++ != '\n') && (rptr < wptr)) + ; + f->ftps_rptr = rptr; + return inc; +} + + +int ippr_ftp_pasv(fin, ip, nat, f, dlen) +fr_info_t *fin; +ip_t *ip; +nat_t *nat; +ftpside_t *f; +int dlen; +{ + tcphdr_t *tcp, tcph, *tcp2 = &tcph; + struct in_addr swip, swip2; + u_short a5, a6, sp, dp; + u_int a1, a2, a3, a4; + fr_info_t fi; + nat_t *ipn; + int inc; + char *s; + + /* + * Check for PASV reply message. + */ + if (dlen < IPF_MIN227LEN) + return 0; + else if (strncmp(f->ftps_rptr, "227 Entering Passive Mode", 25)) + return 0; + + tcp = (tcphdr_t *)fin->fin_dp; + + /* + * Skip the PORT command + space + */ + s = f->ftps_rptr + 25; + while (*s && !isdigit(*s)) + s++; + /* + * Pick out the address components, two at a time. + */ + a1 = ippr_ftp_atoi(&s); + if (!s) + return 0; + a2 = ippr_ftp_atoi(&s); + if (!s) + return 0; + + /* + * check that IP address in the PORT/PASV reply is the same as the + * sender of the command - prevents using PORT for port scanning. + */ + a1 <<= 16; + a1 |= a2; + if (a1 != ntohl(nat->nat_oip.s_addr)) + return 0; + + a5 = ippr_ftp_atoi(&s); + if (!s) + return 0; + + if (*s == ')') + s++; + if (*s == '\n') + s--; + /* + * check for CR-LF at the end. + */ + if ((*s == '\r') && (*(s + 1) == '\n')) { + s += 2; + a6 = a5 & 0xff; + } else + return 0; + a5 >>= 8; + /* + * Calculate new address parts for 227 reply + */ + a1 = ntohl(ip->ip_src.s_addr); + a2 = (a1 >> 16) & 0xff; + a3 = (a1 >> 8) & 0xff; + a4 = a1 & 0xff; + a1 >>= 24; + inc = 0; +#if 0 + olen = s - f->ftps_rptr; + (void) sprintf(newbuf, "%s %u,%u,%u,%u,%u,%u\r\n", + "227 Entering Passive Mode", a1, a2, a3, a4, a5, a6); + nlen = strlen(newbuf); + inc = nlen - olen; + if ((inc + ip->ip_len) > 65535) + return 0; + +#if SOLARIS + m = fin->fin_qfm; + for (m1 = m; m1->b_cont; m1 = m1->b_cont) + ; + if ((inc > 0) && (m1->b_datap->db_lim - m1->b_wptr < inc)) { + mblk_t *nm; + + /* alloc enough to keep same trailer space for lower driver */ + nm = allocb(nlen, BPRI_MED); + PANIC((!nm),("ippr_ftp_out: allocb failed")); + + nm->b_band = m1->b_band; + nm->b_wptr += nlen; + + m1->b_wptr -= olen; + PANIC((m1->b_wptr < m1->b_rptr), + ("ippr_ftp_out: cannot handle fragmented data block")); + + linkb(m1, nm); + } else { + m1->b_wptr += inc; + } + /*copyin_mblk(m, off, nlen, newbuf);*/ +#else /* SOLARIS */ + m = *((mb_t **)fin->fin_mp); + if (inc < 0) + m_adj(m, inc); + /* the mbuf chain will be extended if necessary by m_copyback() */ + /*m_copyback(m, off, nlen, newbuf);*/ +#endif /* SOLARIS */ + if (inc != 0) { +#if SOLARIS || defined(__sgi) + register u_32_t sum1, sum2; + + sum1 = ip->ip_len; + sum2 = ip->ip_len + inc; + + /* Because ~1 == -2, We really need ~1 == -1 */ + if (sum1 > sum2) + sum2--; + sum2 -= sum1; + sum2 = (sum2 & 0xffff) + (sum2 >> 16); + + fix_outcksum(&ip->ip_sum, sum2); +#endif /* SOLARIS || defined(__sgi) */ + ip->ip_len += inc; + } +#endif /* 0 */ + + /* + * Add skeleton NAT entry for connection which will come back the + * other way. + */ + sp = 0; + dp = htons(fin->fin_data[1] - 1); + ipn = nat_outlookup(fin->fin_ifp, IPN_TCP, nat->nat_p, nat->nat_inip, + ip->ip_dst, (dp << 16) | sp, 0); + if (ipn == NULL) { + int slen; + + slen = ip->ip_len; + ip->ip_len = fin->fin_hlen + sizeof(*tcp2); + bcopy((char *)fin, (char *)&fi, sizeof(fi)); + bzero((char *)tcp2, sizeof(*tcp2)); + tcp2->th_win = htons(8192); + tcp2->th_sport = 0; /* XXX - fake it for nat_new */ + tcp2->th_off = 5; + fi.fin_data[1] = a5 << 8 | a6; + fi.fin_dlen = sizeof(*tcp2); + tcp2->th_dport = htons(fi.fin_data[1]); + fi.fin_data[0] = 0; + fi.fin_dp = (char *)tcp2; + fi.fin_fr = &natfr; + swip = ip->ip_src; + swip2 = ip->ip_dst; + fi.fin_fi.fi_daddr = ip->ip_src.s_addr; + fi.fin_fi.fi_saddr = nat->nat_inip.s_addr; + ip->ip_dst = ip->ip_src; + ip->ip_src = nat->nat_inip; + ipn = nat_new(nat->nat_ptr, ip, &fi, IPN_TCP|FI_W_SPORT, + NAT_OUTBOUND); + if (ipn != NULL) { + ipn->nat_age = fr_defnatage; + (void) fr_addstate(ip, &fi, FI_W_SPORT); + } + ip->ip_len = slen; + ip->ip_src = swip; + ip->ip_dst = swip2; + } + return inc; +} + + +int ippr_ftp_server(fin, ip, nat, ftp, dlen) +fr_info_t *fin; +ip_t *ip; +nat_t *nat; +ftpinfo_t *ftp; +int dlen; +{ + char *rptr, *wptr; + ftpside_t *f; + int inc; + + inc = 0; + f = &ftp->ftp_side[1]; + rptr = f->ftps_rptr; + wptr = f->ftps_wptr; + + if ((ftp->ftp_passok == 1) && !strncmp(rptr, "331", 3)) + ftp->ftp_passok = 2; + else if ((ftp->ftp_passok == 3) && !strncmp(rptr, "230", 3)) + ftp->ftp_passok = 4; + else if ((ftp->ftp_passok == 3) && !strncmp(rptr, "530", 3)) + ftp->ftp_passok = 0; + else if ((ftp->ftp_passok == 4) && !strncmp(rptr, "227 ", 4)) { + inc = ippr_ftp_pasv(fin, ip, nat, f, dlen); + } else if (ippr_ftp_insecure && !strncmp(rptr, "227 ", 4)) { + inc = ippr_ftp_pasv(fin, ip, nat, f, dlen); + } + while ((*rptr++ != '\n') && (rptr < wptr)) + ; + f->ftps_rptr = rptr; + return inc; +} + + +/* + * Look to see if the buffer starts with something which we recognise as + * being the correct syntax for the FTP protocol. + */ +int ippr_ftp_valid(buf, len) +char *buf; +size_t len; +{ + register char *s, c; + register size_t i = len; + + if (i < 5) + return 2; + s = buf; + c = *s++; + i--; + + if (isdigit(c)) { + c = *s++; + i--; + if (isdigit(c)) { + c = *s++; + i--; + if (isdigit(c)) { + c = *s++; + i--; + if ((c != '-') && (c != ' ')) + return 1; + } else + return 1; + } else + return 1; + } else if (isalpha(c)) { + c = *s++; + i--; + if (isalpha(c)) { + c = *s++; + i--; + if (isalpha(c)) { + c = *s++; + i--; + if (isalpha(c)) { + c = *s++; + i--; + if ((c != ' ') && (c != '\r')) + return 1; + } else if ((c != ' ') && (c != '\r')) + return 1; + } else + return 1; + } else + return 1; + } else + return 1; + for (; i; i--) { + c = *s++; + if (c == '\n') + return 0; + } + return 2; +} + + +int ippr_ftp_process(fin, ip, nat, ftp, rv) +fr_info_t *fin; +ip_t *ip; +nat_t *nat; +ftpinfo_t *ftp; +int rv; +{ + int mlen, len, off, inc, i, sel; + char *rptr, *wptr; + ftpside_t *f, *t; + tcphdr_t *tcp; + mb_t *m; + + tcp = (tcphdr_t *)fin->fin_dp; + off = fin->fin_hlen + (tcp->th_off << 2); + +#if SOLARIS + m = fin->fin_qfm; +#else + m = *((mb_t **)fin->fin_mp); +#endif + +#if SOLARIS + mlen = msgdsize(m) - off; +#else + mlen = mbufchainlen(m) - off; +#endif + + t = &ftp->ftp_side[1 - rv]; + f = &ftp->ftp_side[rv]; + if (!mlen) { + if (!t->ftps_seq || + (int)ntohl(tcp->th_ack) - (int)t->ftps_seq > 0) + t->ftps_seq = ntohl(tcp->th_ack); + f->ftps_len = 0; + return 0; + } + + inc = 0; + rptr = f->ftps_rptr; + wptr = f->ftps_wptr; + + sel = nat->nat_aps->aps_sel[1 - rv]; + if (rv) + i = nat->nat_aps->aps_ackoff[sel]; + else + i = nat->nat_aps->aps_seqoff[sel]; + /* + * XXX - Ideally, this packet should get dropped because we now know + * that it is out of order (and there is no real danger in doing so + * apart from causing packets to go through here ordered). + */ + if (f->ftps_len + f->ftps_seq == ntohl(tcp->th_seq)) + f->ftps_seq = ntohl(tcp->th_seq); + else if (ntohl(tcp->th_seq) + i != f->ftps_seq) { + return APR_ERR(-1); + } + f->ftps_len = mlen; + + while (mlen > 0) { + len = MIN(mlen, FTP_BUFSZ / 2); + +#if SOLARIS + copyout_mblk(m, off, len, wptr); +#else + m_copydata(m, off, len, wptr); +#endif + mlen -= len; + off += len; + wptr += len; + f->ftps_wptr = wptr; + if (f->ftps_junk == 2) + f->ftps_junk = ippr_ftp_valid(rptr, wptr - rptr); + + while ((f->ftps_junk == 0) && (wptr > rptr)) { + f->ftps_junk = ippr_ftp_valid(rptr, wptr - rptr); + if (f->ftps_junk == 0) { + len = wptr - rptr; + f->ftps_rptr = rptr; + if (rv) + inc += ippr_ftp_server(fin, ip, nat, + ftp, len); + else + inc += ippr_ftp_client(fin, ip, nat, + ftp, len); + rptr = f->ftps_rptr; + } + } + + while ((f->ftps_junk == 1) && (rptr < wptr)) { + while ((rptr < wptr) && (*rptr != '\r')) + rptr++; + + if (*rptr == '\r') { + if (rptr + 1 < wptr) { + if (*(rptr + 1) == '\n') { + rptr += 2; + f->ftps_junk = 0; + } else + rptr++; + } else + break; + } + } + f->ftps_rptr = rptr; + + if (rptr == wptr) { + rptr = wptr = f->ftps_buf; + } else { + if ((wptr > f->ftps_buf + FTP_BUFSZ / 2)) { + i = wptr - rptr; + if ((rptr == f->ftps_buf) || + (wptr - rptr > FTP_BUFSZ / 2)) { + f->ftps_junk = 1; + rptr = wptr = f->ftps_buf; + } else { + bcopy(rptr, f->ftps_buf, i); + wptr = f->ftps_buf + i; + rptr = f->ftps_buf; + } + } + f->ftps_rptr = rptr; + f->ftps_wptr = wptr; + } + } + + t->ftps_seq = ntohl(tcp->th_ack); + f->ftps_rptr = rptr; + f->ftps_wptr = wptr; + return APR_INC(inc); +} + + +int ippr_ftp_out(fin, ip, aps, nat) +fr_info_t *fin; +ip_t *ip; +ap_session_t *aps; +nat_t *nat; +{ + ftpinfo_t *ftp; + + ftp = aps->aps_data; + if (ftp == NULL) + return 0; + return ippr_ftp_process(fin, ip, nat, ftp, 0); +} + + +int ippr_ftp_in(fin, ip, aps, nat) +fr_info_t *fin; +ip_t *ip; +ap_session_t *aps; +nat_t *nat; +{ + ftpinfo_t *ftp; + + ftp = aps->aps_data; + if (ftp == NULL) + return 0; + return ippr_ftp_process(fin, ip, nat, ftp, 1); +} + + +/* + * ippr_ftp_atoi - implement a version of atoi which processes numbers in + * pairs separated by commas (which are expected to be in the range 0 - 255), + * returning a 16 bit number combining either side of the , as the MSB and + * LSB. + */ +u_short ippr_ftp_atoi(ptr) +char **ptr; +{ + register char *s = *ptr, c; + register u_char i = 0, j = 0; + + while ((c = *s++) && isdigit(c)) { + i *= 10; + i += c - '0'; + } + if (c != ',') { + *ptr = NULL; + return 0; + } + while ((c = *s++) && isdigit(c)) { + j *= 10; + j += c - '0'; + } + *ptr = s; + i &= 0xff; + j &= 0xff; + return (i << 8) | j; +} diff --git a/sys/netinet/ip_fw.c b/sys/netinet/ip_fw.c new file mode 100644 index 0000000..78c79cf --- /dev/null +++ b/sys/netinet/ip_fw.c @@ -0,0 +1,2082 @@ +/* + * Copyright (c) 1993 Daniel Boulet + * Copyright (c) 1994 Ugen J.S.Antsilevich + * Copyright (c) 1996 Alex Nash + * Copyright (c) 2000 Luigi Rizzo + * + * Redistribution and use in source forms, with and without modification, + * are permitted provided that this entire comment appears intact. + * + * Redistribution in binary form may occur without any restrictions. + * Obviously, it would be nice if you gave credit where credit is due + * but requiring it would be too onerous. + * + * This software is provided ``AS IS'' without any warranties of any kind. + * + * $FreeBSD$ + */ + +#define DEB(x) +#define DDB(x) x + +/* + * Implement IP packet firewall + */ + +#if !defined(KLD_MODULE) +#include "opt_ipfw.h" +#include "opt_ipdn.h" +#include "opt_ipdivert.h" +#include "opt_inet.h" +#ifndef INET +#error IPFIREWALL requires INET. +#endif /* INET */ +#endif + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/kernel.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <sys/ucred.h> +#include <net/if.h> +#include <net/route.h> +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/in_var.h> +#include <netinet/in_pcb.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> +#include <netinet/ip_icmp.h> +#include <netinet/ip_fw.h> +#ifdef DUMMYNET +#include <netinet/ip_dummynet.h> +#endif +#include <netinet/tcp.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#include <netinet/tcpip.h> +#include <netinet/udp.h> +#include <netinet/udp_var.h> + +#include <netinet/if_ether.h> /* XXX ethertype_ip */ + +static int fw_debug = 1; +#ifdef IPFIREWALL_VERBOSE +static int fw_verbose = 1; +#else +static int fw_verbose = 0; +#endif +int fw_one_pass = 1 ; +#ifdef IPFIREWALL_VERBOSE_LIMIT +static int fw_verbose_limit = IPFIREWALL_VERBOSE_LIMIT; +#else +static int fw_verbose_limit = 0; +#endif +static int fw_permanent_rules = 0; + +/* + * Right now, two fields in the IP header are changed to host format + * by the IP layer before calling the firewall. Ideally, we would like + * to have them in network format so that the packet can be + * used as it comes from the device driver (and is thus readonly). + */ + +static u_int64_t counter; /* counter for ipfw_report(NULL...) */ +struct ipfw_flow_id last_pkt ; + +#define IPFW_DEFAULT_RULE ((u_int)(u_short)~0) + +LIST_HEAD (ip_fw_head, ip_fw_chain) ip_fw_chain_head; + +MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's"); + +#ifdef SYSCTL_NODE +SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, enable, CTLFLAG_RW, + &fw_enable, 0, "Enable ipfw"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO,one_pass,CTLFLAG_RW, + &fw_one_pass, 0, + "Only do a single pass through ipfw when using dummynet(4)"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, debug, CTLFLAG_RW, + &fw_debug, 0, "Enable printing of debug ip_fw statements"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose, CTLFLAG_RW, + &fw_verbose, 0, "Log matches to ipfw rules"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW, + &fw_verbose_limit, 0, "Set upper limit of matches of ipfw rules logged"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, permanent_rules, CTLFLAG_RW, + &fw_permanent_rules, 0, "Set rule number, below which rules are permanent"); + +/* + * Extension for stateful ipfw. + * + * Dynamic rules are stored in lists accessed through a hash table + * (ipfw_dyn_v) whose size is curr_dyn_buckets. This value can + * be modified through the sysctl variable dyn_buckets which is + * updated when the table becomes empty. + * + * XXX currently there is only one list, ipfw_dyn. + * + * When a packet is received, it is first hashed, then matched + * against the entries in the corresponding list. + * Matching occurs according to the rule type. The default is to + * match the four fields and the protocol, and rules are bidirectional. + * + * For a busy proxy/web server we will have lots of connections to + * the server. We could decide for a rule type where we ignore + * ports (different hashing) and avoid special SYN/RST/FIN handling. + * + * XXX when we decide to support more than one rule type, we should + * repeat the hashing multiple times uing only the useful fields. + * Or, we could run the various tests in parallel, because the + * 'move to front' technique should shorten the average search. + * + * The lifetime of dynamic rules is regulated by dyn_*_lifetime, + * measured in seconds and depending on the flags. + * + * The total number of dynamic rules is stored in dyn_count. + * The max number of dynamic rules is dyn_max. When we reach + * the maximum number of rules we do not create anymore. This is + * done to avoid consuming too much memory, but also too much + * time when searching on each packet (ideally, we should try instead + * to put a limit on the length of the list on each bucket...). + * + * Each dynamic rules holds a pointer to the parent ipfw rule so + * we know what action to perform. Dynamic rules are removed when + * the parent rule is deleted. + * There are some limitations with dynamic rules -- we do not + * obey the 'randomized match', and we do not do multiple + * passes through the firewall. + * XXX check the latter!!! + */ +static struct ipfw_dyn_rule **ipfw_dyn_v = NULL ; +static u_int32_t dyn_buckets = 256 ; /* must be power of 2 */ +static u_int32_t curr_dyn_buckets = 256 ; /* must be power of 2 */ +static u_int32_t dyn_ack_lifetime = 300 ; +static u_int32_t dyn_syn_lifetime = 20 ; +static u_int32_t dyn_fin_lifetime = 20 ; +static u_int32_t dyn_rst_lifetime = 5 ; +static u_int32_t dyn_short_lifetime = 30 ; +static u_int32_t dyn_count = 0 ; +static u_int32_t dyn_max = 1000 ; +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_buckets, CTLFLAG_RW, + &dyn_buckets, 0, "Number of dyn. buckets"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets, CTLFLAG_RD, + &curr_dyn_buckets, 0, "Current Number of dyn. buckets"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_count, CTLFLAG_RD, + &dyn_count, 0, "Number of dyn. rules"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_max, CTLFLAG_RW, + &dyn_max, 0, "Max number of dyn. rules"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, CTLFLAG_RW, + &dyn_ack_lifetime, 0, "Lifetime of dyn. rules for acks"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, CTLFLAG_RW, + &dyn_syn_lifetime, 0, "Lifetime of dyn. rules for syn"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, CTLFLAG_RW, + &dyn_fin_lifetime, 0, "Lifetime of dyn. rules for fin"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, CTLFLAG_RW, + &dyn_rst_lifetime, 0, "Lifetime of dyn. rules for rst"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, CTLFLAG_RW, + &dyn_short_lifetime, 0, "Lifetime of dyn. rules for other situations"); + +#endif + +#define dprintf(a) do { \ + if (fw_debug) \ + printf a; \ + } while (0) +#define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0 + +static int add_entry __P((struct ip_fw_head *chainptr, struct ip_fw *frwl)); +static int del_entry __P((struct ip_fw_head *chainptr, u_short number)); +static int zero_entry __P((struct ip_fw *)); +static int resetlog_entry __P((struct ip_fw *)); +static int check_ipfw_struct __P((struct ip_fw *m)); +static __inline int + iface_match __P((struct ifnet *ifp, union ip_fw_if *ifu, + int byname)); +static int ipopts_match __P((struct ip *ip, struct ip_fw *f)); +static int iptos_match __P((struct ip *ip, struct ip_fw *f)); +static __inline int + port_match __P((u_short *portptr, int nports, u_short port, + int range_flag, int mask)); +static int tcpflg_match __P((struct tcphdr *tcp, struct ip_fw *f)); +static int icmptype_match __P((struct icmp * icmp, struct ip_fw * f)); +static void ipfw_report __P((struct ip_fw *f, struct ip *ip, int offset, + struct ifnet *rif, struct ifnet *oif)); + +static void flush_rule_ptrs(void); + +static int ip_fw_chk __P((struct ip **pip, int hlen, + struct ifnet *oif, u_int16_t *cookie, struct mbuf **m, + struct ip_fw_chain **flow_id, + struct sockaddr_in **next_hop)); +static int ip_fw_ctl __P((struct sockopt *sopt)); + +static char err_prefix[] = "ip_fw_ctl:"; + +/* + * Returns 1 if the port is matched by the vector, 0 otherwise + */ +static __inline int +port_match(u_short *portptr, int nports, u_short port, int range_flag, int mask) +{ + if (!nports) + return 1; + if (mask) { + if ( 0 == ((portptr[0] ^ port) & portptr[1]) ) + return 1; + nports -= 2; + portptr += 2; + } + if (range_flag) { + if (portptr[0] <= port && port <= portptr[1]) { + return 1; + } + nports -= 2; + portptr += 2; + } + while (nports-- > 0) { + if (*portptr++ == port) { + return 1; + } + } + return 0; +} + +static int +tcpflg_match(struct tcphdr *tcp, struct ip_fw *f) +{ + u_char flg_set, flg_clr; + + /* + * If an established connection is required, reject packets that + * have only SYN of RST|ACK|SYN set. Otherwise, fall through to + * other flag requirements. + */ + if ((f->fw_ipflg & IP_FW_IF_TCPEST) && + ((tcp->th_flags & (IP_FW_TCPF_RST | IP_FW_TCPF_ACK | + IP_FW_TCPF_SYN)) == IP_FW_TCPF_SYN)) + return 0; + + flg_set = tcp->th_flags & f->fw_tcpf; + flg_clr = tcp->th_flags & f->fw_tcpnf; + + if (flg_set != f->fw_tcpf) + return 0; + if (flg_clr) + return 0; + + return 1; +} + +static int +icmptype_match(struct icmp *icmp, struct ip_fw *f) +{ + int type; + + if (!(f->fw_flg & IP_FW_F_ICMPBIT)) + return(1); + + type = icmp->icmp_type; + + /* check for matching type in the bitmap */ + if (type < IP_FW_ICMPTYPES_MAX && + (f->fw_uar.fw_icmptypes[type / (sizeof(unsigned) * NBBY)] & + (1U << (type % (sizeof(unsigned) * NBBY))))) + return(1); + + return(0); /* no match */ +} + +static int +is_icmp_query(struct ip *ip) +{ + const struct icmp *icmp; + int icmp_type; + + icmp = (struct icmp *)((u_int32_t *)ip + ip->ip_hl); + icmp_type = icmp->icmp_type; + + if (icmp_type == ICMP_ECHO || icmp_type == ICMP_ROUTERSOLICIT || + icmp_type == ICMP_TSTAMP || icmp_type == ICMP_IREQ || + icmp_type == ICMP_MASKREQ) + return(1); + + return(0); +} + +static int +ipopts_match(struct ip *ip, struct ip_fw *f) +{ + register u_char *cp; + int opt, optlen, cnt; + u_char opts, nopts, nopts_sve; + + cp = (u_char *)(ip + 1); + cnt = (ip->ip_hl << 2) - sizeof (struct ip); + opts = f->fw_ipopt; + nopts = nopts_sve = f->fw_ipnopt; + + for (; cnt > 0; cnt -= optlen, cp += optlen) { + opt = cp[IPOPT_OPTVAL]; + if (opt == IPOPT_EOL) + break; + if (opt == IPOPT_NOP) + optlen = 1; + else { + optlen = cp[IPOPT_OLEN]; + if (optlen <= 0 || optlen > cnt) { + return 0; /*XXX*/ + } + } + switch (opt) { + + default: + break; + + case IPOPT_LSRR: + opts &= ~IP_FW_IPOPT_LSRR; + nopts &= ~IP_FW_IPOPT_LSRR; + break; + + case IPOPT_SSRR: + opts &= ~IP_FW_IPOPT_SSRR; + nopts &= ~IP_FW_IPOPT_SSRR; + break; + + case IPOPT_RR: + opts &= ~IP_FW_IPOPT_RR; + nopts &= ~IP_FW_IPOPT_RR; + break; + case IPOPT_TS: + opts &= ~IP_FW_IPOPT_TS; + nopts &= ~IP_FW_IPOPT_TS; + break; + } + if (opts == nopts) + break; + } + if (opts == 0 && nopts == nopts_sve) + return 1; + else + return 0; +} + +static int +iptos_match(struct ip *ip, struct ip_fw *f) +{ + + u_int flags = (ip->ip_tos & 0x1f); + u_char opts, nopts, nopts_sve; + + opts = f->fw_iptos; + nopts = nopts_sve = f->fw_ipntos; + + while (flags != 0) { + u_int flag; + + flag = 1 << (ffs(flags) -1); + opts &= ~flag; + nopts &= ~flag; + flags &= ~flag; + } + + if (opts == 0 && nopts == nopts_sve) + return 1; + else + return 0; + +} + + +static int +tcpopts_match(struct tcphdr *tcp, struct ip_fw *f) +{ + register u_char *cp; + int opt, optlen, cnt; + u_char opts, nopts, nopts_sve; + + cp = (u_char *)(tcp + 1); + cnt = (tcp->th_off << 2) - sizeof (struct tcphdr); + opts = f->fw_tcpopt; + nopts = nopts_sve = f->fw_tcpnopt; + + for (; cnt > 0; cnt -= optlen, cp += optlen) { + opt = cp[0]; + if (opt == TCPOPT_EOL) + break; + if (opt == TCPOPT_NOP) + optlen = 1; + else { + optlen = cp[1]; + if (optlen <= 0) + break; + } + + + switch (opt) { + + default: + break; + + case TCPOPT_MAXSEG: + opts &= ~IP_FW_TCPOPT_MSS; + nopts &= ~IP_FW_TCPOPT_MSS; + break; + + case TCPOPT_WINDOW: + opts &= ~IP_FW_TCPOPT_WINDOW; + nopts &= ~IP_FW_TCPOPT_WINDOW; + break; + + case TCPOPT_SACK_PERMITTED: + case TCPOPT_SACK: + opts &= ~IP_FW_TCPOPT_SACK; + nopts &= ~IP_FW_TCPOPT_SACK; + break; + + case TCPOPT_TIMESTAMP: + opts &= ~IP_FW_TCPOPT_TS; + nopts &= ~IP_FW_TCPOPT_TS; + break; + + case TCPOPT_CC: + case TCPOPT_CCNEW: + case TCPOPT_CCECHO: + opts &= ~IP_FW_TCPOPT_CC; + nopts &= ~IP_FW_TCPOPT_CC; + break; + } + if (opts == nopts) + break; + } + if (opts == 0 && nopts == nopts_sve) + return 1; + else + return 0; +} + +static __inline int +iface_match(struct ifnet *ifp, union ip_fw_if *ifu, int byname) +{ + /* Check by name or by IP address */ + if (byname) { + /* Check unit number (-1 is wildcard) */ + if (ifu->fu_via_if.unit != -1 + && ifp->if_unit != ifu->fu_via_if.unit) + return(0); + /* Check name */ + if (strncmp(ifp->if_name, ifu->fu_via_if.name, FW_IFNLEN)) + return(0); + return(1); + } else if (ifu->fu_via_ip.s_addr != 0) { /* Zero == wildcard */ + struct ifaddr *ia; + + TAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) { + if (ia->ifa_addr == NULL) + continue; + if (ia->ifa_addr->sa_family != AF_INET) + continue; + if (ifu->fu_via_ip.s_addr != ((struct sockaddr_in *) + (ia->ifa_addr))->sin_addr.s_addr) + continue; + return(1); + } + return(0); + } + return(1); +} + +static void +ipfw_report(struct ip_fw *f, struct ip *ip, int offset, + struct ifnet *rif, struct ifnet *oif) +{ + struct tcphdr *const tcp = (struct tcphdr *) ((u_int32_t *) ip+ ip->ip_hl); + struct udphdr *const udp = (struct udphdr *) ((u_int32_t *) ip+ ip->ip_hl); + struct icmp *const icmp = (struct icmp *) ((u_int32_t *) ip + ip->ip_hl); + u_int64_t count; + char *action; + char action2[32], proto[47], name[18], fragment[17]; + int len; + + count = f ? f->fw_pcnt : ++counter; + if ((f == NULL && fw_verbose_limit != 0 && count > fw_verbose_limit) || + (f && f->fw_logamount != 0 && count > f->fw_loghighest)) + return; + + /* Print command name */ + snprintf(SNPARGS(name, 0), "ipfw: %d", f ? f->fw_number : -1); + + action = action2; + if (!f) + action = "Refuse"; + else { + switch (f->fw_flg & IP_FW_F_COMMAND) { + case IP_FW_F_DENY: + action = "Deny"; + break; + case IP_FW_F_REJECT: + if (f->fw_reject_code == IP_FW_REJECT_RST) + action = "Reset"; + else + action = "Unreach"; + break; + case IP_FW_F_ACCEPT: + action = "Accept"; + break; + case IP_FW_F_COUNT: + action = "Count"; + break; +#ifdef IPDIVERT + case IP_FW_F_DIVERT: + snprintf(SNPARGS(action2, 0), "Divert %d", + f->fw_divert_port); + break; + case IP_FW_F_TEE: + snprintf(SNPARGS(action2, 0), "Tee %d", + f->fw_divert_port); + break; +#endif + case IP_FW_F_SKIPTO: + snprintf(SNPARGS(action2, 0), "SkipTo %d", + f->fw_skipto_rule); + break; +#ifdef DUMMYNET + case IP_FW_F_PIPE: + snprintf(SNPARGS(action2, 0), "Pipe %d", + f->fw_skipto_rule); + break; + case IP_FW_F_QUEUE: + snprintf(SNPARGS(action2, 0), "Queue %d", + f->fw_skipto_rule); + break; +#endif +#ifdef IPFIREWALL_FORWARD + case IP_FW_F_FWD: + if (f->fw_fwd_ip.sin_port) + snprintf(SNPARGS(action2, 0), + "Forward to %s:%d", + inet_ntoa(f->fw_fwd_ip.sin_addr), + f->fw_fwd_ip.sin_port); + else + snprintf(SNPARGS(action2, 0), "Forward to %s", + inet_ntoa(f->fw_fwd_ip.sin_addr)); + break; +#endif + default: + action = "UNKNOWN"; + break; + } + } + + switch (ip->ip_p) { + case IPPROTO_TCP: + len = snprintf(SNPARGS(proto, 0), "TCP %s", + inet_ntoa(ip->ip_src)); + if (offset == 0) + len += snprintf(SNPARGS(proto, len), ":%d ", + ntohs(tcp->th_sport)); + else + len += snprintf(SNPARGS(proto, len), " "); + len += snprintf(SNPARGS(proto, len), "%s", + inet_ntoa(ip->ip_dst)); + if (offset == 0) + snprintf(SNPARGS(proto, len), ":%d", + ntohs(tcp->th_dport)); + break; + case IPPROTO_UDP: + len = snprintf(SNPARGS(proto, 0), "UDP %s", + inet_ntoa(ip->ip_src)); + if (offset == 0) + len += snprintf(SNPARGS(proto, len), ":%d ", + ntohs(udp->uh_sport)); + else + len += snprintf(SNPARGS(proto, len), " "); + len += snprintf(SNPARGS(proto, len), "%s", + inet_ntoa(ip->ip_dst)); + if (offset == 0) + snprintf(SNPARGS(proto, len), ":%d", + ntohs(udp->uh_dport)); + break; + case IPPROTO_ICMP: + if (offset == 0) + len = snprintf(SNPARGS(proto, 0), "ICMP:%u.%u ", + icmp->icmp_type, icmp->icmp_code); + else + len = snprintf(SNPARGS(proto, 0), "ICMP "); + len += snprintf(SNPARGS(proto, len), "%s", + inet_ntoa(ip->ip_src)); + snprintf(SNPARGS(proto, len), " %s", inet_ntoa(ip->ip_dst)); + break; + default: + len = snprintf(SNPARGS(proto, 0), "P:%d %s", ip->ip_p, + inet_ntoa(ip->ip_src)); + snprintf(SNPARGS(proto, len), " %s", inet_ntoa(ip->ip_dst)); + break; + } + + if (offset != 0) + snprintf(SNPARGS(fragment, 0), " Fragment = %d", + offset); + else + fragment[0] = '\0'; + if (oif) + log(LOG_SECURITY | LOG_INFO, "%s %s %s out via %s%d%s\n", + name, action, proto, oif->if_name, oif->if_unit, fragment); + else if (rif) + log(LOG_SECURITY | LOG_INFO, "%s %s %s in via %s%d%s\n", name, + action, proto, rif->if_name, rif->if_unit, fragment); + else + log(LOG_SECURITY | LOG_INFO, "%s %s %s%s\n", name, action, + proto, fragment); + if ((f ? f->fw_logamount != 0 : 1) && + count == (f ? f->fw_loghighest : fw_verbose_limit)) + log(LOG_SECURITY | LOG_NOTICE, + "ipfw: limit %d reached on entry %d\n", + f ? f->fw_logamount : fw_verbose_limit, + f ? f->fw_number : -1); +} + +static __inline int +hash_packet(struct ipfw_flow_id *id) +{ + u_int32_t i ; + + i = (id->dst_ip) ^ (id->src_ip) ^ (id->dst_port) ^ (id->src_port); + i &= (curr_dyn_buckets - 1) ; + return i ; +} + +#define TIME_LEQ(a,b) ((int)((a)-(b)) <= 0) +/* + * Remove all dynamic rules pointing to a given chain, or all + * rules if chain == NULL. Second parameter is 1 if we want to + * delete unconditionally, otherwise only expired rules are removed. + */ +static void +remove_dyn_rule(struct ip_fw_chain *chain, int force) +{ + struct ipfw_dyn_rule *prev, *q, *old_q ; + int i ; + static u_int32_t last_remove = 0 ; + + if (ipfw_dyn_v == NULL || dyn_count == 0) + return ; + /* do not expire more than once per second, it is useless */ + if (force == 0 && last_remove == time_second) + return ; + last_remove = time_second ; + + for (i = 0 ; i < curr_dyn_buckets ; i++) { + for (prev=NULL, q = ipfw_dyn_v[i] ; q ; ) { + if ( (chain == NULL || chain == q->chain) && + (force || TIME_LEQ( q->expire , time_second ) ) ) { + DEB(printf("-- remove entry 0x%08x %d -> 0x%08x %d, %d left\n", + (q->id.src_ip), (q->id.src_port), + (q->id.dst_ip), (q->id.dst_port), dyn_count-1 ); ) + old_q = q ; + if (prev != NULL) + prev->next = q = q->next ; + else + ipfw_dyn_v[i] = q = q->next ; + dyn_count-- ; + free(old_q, M_IPFW); + continue ; + } else { + prev = q ; + q = q->next ; + } + } + } +} + +static struct ipfw_dyn_rule * +lookup_dyn_rule(struct ipfw_flow_id *pkt, int *match_direction) +{ + /* + * stateful ipfw extensions. + * Lookup into dynamic session queue + */ + struct ipfw_dyn_rule *prev, *q, *old_q ; + int i, dir = 0; +#define MATCH_FORWARD 1 + + if (ipfw_dyn_v == NULL) + return NULL ; + i = hash_packet( pkt ); + for (prev=NULL, q = ipfw_dyn_v[i] ; q != NULL ; ) { + if (TIME_LEQ( q->expire , time_second ) ) { /* expire entry */ + old_q = q ; + if (prev != NULL) + prev->next = q = q->next ; + else + ipfw_dyn_v[i] = q = q->next ; + dyn_count-- ; + free(old_q, M_IPFW); + continue ; + } + if ( pkt->proto == q->id.proto) { + switch (q->type) { + default: /* bidirectional rule, no masks */ + if (pkt->src_ip == q->id.src_ip && + pkt->dst_ip == q->id.dst_ip && + pkt->src_port == q->id.src_port && + pkt->dst_port == q->id.dst_port ) { + dir = MATCH_FORWARD ; + goto found ; + } + if (pkt->src_ip == q->id.dst_ip && + pkt->dst_ip == q->id.src_ip && + pkt->src_port == q->id.dst_port && + pkt->dst_port == q->id.src_port ) { + dir = 0 ; /* reverse match */ + goto found ; + } + break ; + } + } + prev = q ; + q = q->next ; + } + return NULL ; /* clearly not found */ +found: + if ( prev != NULL) { /* found and not in front */ + prev->next = q->next ; + q->next = ipfw_dyn_v[i] ; + ipfw_dyn_v[i] = q ; + } + if (pkt->proto == IPPROTO_TCP) { + /* update state according to flags */ + u_char flags = pkt->flags & (TH_FIN|TH_SYN|TH_RST); + q->state |= (dir == MATCH_FORWARD ) ? flags : (flags << 8); + switch (q->state) { + case TH_SYN : + /* opening */ + q->expire = time_second + dyn_syn_lifetime ; + break ; + case TH_SYN | (TH_SYN << 8) : + /* move to established */ + q->expire = time_second + dyn_ack_lifetime ; + break ; + case TH_SYN | (TH_SYN << 8) | TH_FIN : + case TH_SYN | (TH_SYN << 8) | (TH_FIN << 8) : + /* one side tries to close */ + q->expire = time_second + dyn_ack_lifetime ; + break ; + case TH_SYN | (TH_SYN << 8) | TH_FIN | (TH_FIN << 8) : + /* both sides closed */ + q->expire = time_second + dyn_fin_lifetime ; + break ; + default: +#if 0 + /* + * reset or some invalid combination, but can also + * occur if we use keep-state the wrong way. + */ + if ( (q->state & ((TH_RST << 8)|TH_RST)) == 0) + printf("invalid state: 0x%x\n", q->state); +#endif + q->expire = time_second + dyn_rst_lifetime ; + break ; + } + } else { + /* should do something for UDP and others... */ + q->expire = time_second + dyn_short_lifetime ; + } + if (match_direction) + *match_direction = dir ; + return q ; +} + +/* + * Install state for a dynamic session. + */ + +static void +add_dyn_rule(struct ipfw_flow_id *id, struct ipfw_flow_id *mask, + struct ip_fw_chain *chain) +{ + struct ipfw_dyn_rule *r ; + + int i ; + if (ipfw_dyn_v == NULL || + (dyn_count == 0 && dyn_buckets != curr_dyn_buckets)) { + /* try reallocation, make sure we have a power of 2 */ + u_int32_t i = dyn_buckets ; + while ( i > 0 && (i & 1) == 0 ) + i >>= 1 ; + if (i != 1) /* not a power of 2 */ + dyn_buckets = curr_dyn_buckets ; /* reset */ + else { + if (ipfw_dyn_v != NULL) + free(ipfw_dyn_v, M_IPFW); + ipfw_dyn_v = malloc(curr_dyn_buckets * sizeof r, + M_IPFW, M_DONTWAIT | M_ZERO); + if (ipfw_dyn_v == NULL) + return ; /* failed ! */ + } + } + i = hash_packet(id); + + r = malloc(sizeof *r, M_IPFW, M_DONTWAIT | M_ZERO); + if (r == NULL) { + printf ("sorry cannot allocate state\n"); + return ; + } + + if (mask) + r->mask = *mask ; + r->id = *id ; + r->expire = time_second + dyn_syn_lifetime ; + r->chain = chain ; + r->type = ((struct ip_fw_ext *)chain->rule)->dyn_type ; + + r->bucket = i ; + r->next = ipfw_dyn_v[i] ; + ipfw_dyn_v[i] = r ; + dyn_count++ ; + DEB(printf("-- add entry 0x%08x %d -> 0x%08x %d, %d left\n", + (r->id.src_ip), (r->id.src_port), + (r->id.dst_ip), (r->id.dst_port), + dyn_count ); ) +} + +/* + * Install dynamic state. + * There are different types of dynamic rules which can be installed. + * The type is in chain->dyn_type. + * Type 0 (default) is a bidirectional rule + */ +static void +install_state(struct ip_fw_chain *chain) +{ + struct ipfw_dyn_rule *q ; + static int last_log ; + + u_long type = ((struct ip_fw_ext *)chain->rule)->dyn_type ; + + DEB(printf("-- install state type %d 0x%08lx %u -> 0x%08lx %u\n", + type, + (last_pkt.src_ip), (last_pkt.src_port), + (last_pkt.dst_ip), (last_pkt.dst_port) );) + + q = lookup_dyn_rule(&last_pkt, NULL) ; + if (q != NULL) { + if (last_log == time_second) + return ; + last_log = time_second ; + printf(" entry already present, done\n"); + return ; + } + if (dyn_count >= dyn_max) /* try remove old ones... */ + remove_dyn_rule(NULL, 0 /* expire */); + if (dyn_count >= dyn_max) { + if (last_log == time_second) + return ; + last_log = time_second ; + printf(" Too many dynamic rules, sorry\n"); + return ; + } + switch (type) { + default: /* bidir rule */ + add_dyn_rule(&last_pkt, NULL, chain); + break ; + } + q = lookup_dyn_rule(&last_pkt, NULL) ; /* XXX this just sets the lifetime ... */ +} + +/* + * given an ip_fw_chain *, lookup_next_rule will return a pointer + * of the same type to the next one. This can be either the jump + * target (for skipto instructions) or the next one in the chain (in + * all other cases including a missing jump target). + * Backward jumps are not allowed, so start looking from the next + * rule... + */ +static struct ip_fw_chain * lookup_next_rule(struct ip_fw_chain *me); + +static struct ip_fw_chain * +lookup_next_rule(struct ip_fw_chain *me) +{ + struct ip_fw_chain *chain ; + int rule = me->rule->fw_skipto_rule ; /* guess... */ + + if ( (me->rule->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_SKIPTO ) + for (chain = LIST_NEXT(me,next); chain ; chain = LIST_NEXT(chain,next)) + if (chain->rule->fw_number >= rule) + return chain ; + return LIST_NEXT(me,next) ; /* failure or not a skipto */ +} + +/* + * Parameters: + * + * pip Pointer to packet header (struct ip **) + * hlen Packet header length + * oif Outgoing interface, or NULL if packet is incoming + * *cookie Skip up to the first rule past this rule number; + * upon return, non-zero port number for divert or tee. + * Special case: cookie == NULL on input for bridging. + * *m The packet; we set to NULL when/if we nuke it. + * *flow_id pointer to the last matching rule (in/out) + * *next_hop socket we are forwarding to (in/out). + * + * Return value: + * + * IP_FW_PORT_DENY_FLAG the packet must be dropped. + * 0 The packet is to be accepted and routed normally OR + * the packet was denied/rejected and has been dropped; + * in the latter case, *m is equal to NULL upon return. + * port Divert the packet to port, with these caveats: + * + * - If IP_FW_PORT_TEE_FLAG is set, tee the packet instead + * of diverting it (ie, 'ipfw tee'). + * + * - If IP_FW_PORT_DYNT_FLAG is set, interpret the lower + * 16 bits as a dummynet pipe number instead of diverting + */ + +static int +ip_fw_chk(struct ip **pip, int hlen, + struct ifnet *oif, u_int16_t *cookie, struct mbuf **m, + struct ip_fw_chain **flow_id, + struct sockaddr_in **next_hop) +{ + struct ip_fw_chain *chain; + struct ip_fw *f = NULL, *rule = NULL; + struct ip *ip = *pip; + struct ifnet *const rif = (*m)->m_pkthdr.rcvif; + struct ifnet *tif; + u_short offset = 0 ; + u_short src_port = 0, dst_port = 0; + struct in_addr src_ip, dst_ip; /* XXX */ + u_int8_t proto= 0, flags = 0 ; /* XXX */ + u_int16_t skipto, bridgeCookie; + u_int16_t ip_len; + + int dyn_checked = 0 ; /* set after dyn.rules have been checked. */ + int direction = MATCH_FORWARD ; /* dirty trick... */ + struct ipfw_dyn_rule *q = NULL ; + + /* Special hack for bridging (as usual) */ + if (cookie == NULL) { + bridgeCookie = 0; + cookie = &bridgeCookie; +#define BRIDGED (cookie == &bridgeCookie) + hlen = ip->ip_hl << 2; + } + + /* Grab and reset cookie */ + skipto = *cookie; + *cookie = 0; + +#define PULLUP_TO(len) do { \ + if ((*m)->m_len < (len)) { \ + ip = NULL ; \ + if ((*m = m_pullup(*m, (len))) == 0) \ + goto bogusfrag; \ + ip = mtod(*m, struct ip *); \ + *pip = ip; \ + } \ + } while (0) + + /* + * Collect parameters into local variables for faster matching. + */ + proto = ip->ip_p; + src_ip = ip->ip_src; + dst_ip = ip->ip_dst; + if (0 && BRIDGED) { /* not yet... */ + offset = (ntohs(ip->ip_off) & IP_OFFMASK); + ip_len = ntohs(ip->ip_len); + } else { + offset = (ip->ip_off & IP_OFFMASK); + ip_len = ip->ip_len; + } + if (offset == 0) { + struct tcphdr *tcp; + struct udphdr *udp; + + switch (proto) { + case IPPROTO_TCP : + PULLUP_TO(hlen + sizeof(struct tcphdr)); + tcp =(struct tcphdr *)((u_int32_t *)ip + ip->ip_hl); + dst_port = tcp->th_dport ; + src_port = tcp->th_sport ; + flags = tcp->th_flags ; + break ; + + case IPPROTO_UDP : + PULLUP_TO(hlen + sizeof(struct udphdr)); + udp =(struct udphdr *)((u_int32_t *)ip + ip->ip_hl); + dst_port = udp->uh_dport ; + src_port = udp->uh_sport ; + break; + + case IPPROTO_ICMP: + PULLUP_TO(hlen + 4); /* type, code and checksum. */ + flags = ((struct icmp *) + ((u_int32_t *)ip + ip->ip_hl))->icmp_type ; + break ; + + default : + break; + } + } +#undef PULLUP_TO + last_pkt.src_ip = ntohl(src_ip.s_addr); + last_pkt.dst_ip = ntohl(dst_ip.s_addr); + last_pkt.proto = proto; + last_pkt.src_port = ntohs(src_port); + last_pkt.dst_port = ntohs(dst_port); + last_pkt.flags = flags; + + if (*flow_id) { + /* Accept if passed first test */ + if (fw_one_pass) + return 0; + /* + * Packet has already been tagged. Look for the next rule + * to restart processing. + */ + chain = LIST_NEXT(*flow_id, next); + + if ((chain = (*flow_id)->rule->next_rule_ptr) == NULL) + chain = (*flow_id)->rule->next_rule_ptr = + lookup_next_rule(*flow_id); + if (chain == NULL) + goto dropit; + } else { + /* + * Go down the chain, looking for enlightment. + * If we've been asked to start at a given rule, do so. + */ + chain = LIST_FIRST(&ip_fw_chain_head); + if (skipto != 0) { + if (skipto >= IPFW_DEFAULT_RULE) + goto dropit; + while (chain && chain->rule->fw_number <= skipto) + chain = LIST_NEXT(chain, next); + if (chain == NULL) + goto dropit; + } + } + + + for (; chain; chain = LIST_NEXT(chain, next)) { +again: + f = chain->rule; + if (f->fw_number == IPFW_DEFAULT_RULE) + goto got_match ; + + /* + * dynamic rules are checked at the first keep-state or + * check-state occurrence. + */ + if (f->fw_flg & (IP_FW_F_KEEP_S|IP_FW_F_CHECK_S) && + dyn_checked == 0 ) { + dyn_checked = 1 ; + q = lookup_dyn_rule(&last_pkt, &direction); + if (q != NULL) { + DEB(printf("-- dynamic match 0x%08x %d %s 0x%08x %d\n", + (q->id.src_ip), (q->id.src_port), + (direction == MATCH_FORWARD ? "-->" : "<--"), + (q->id.dst_ip), (q->id.dst_port) ); ) + chain = q->chain ; + f = chain->rule ; + q->pcnt++ ; + q->bcnt += ip_len; + goto got_match ; /* random not allowed here */ + } + /* if this was a check-only rule, continue with next */ + if (f->fw_flg & IP_FW_F_CHECK_S) + continue ; + } + + /* Check if rule only valid for bridged packets */ + if ((f->fw_flg & IP_FW_BRIDGED) != 0 && !(BRIDGED)) + continue; + + if (oif) { + /* Check direction outbound */ + if (!(f->fw_flg & IP_FW_F_OUT)) + continue; + } else { + /* Check direction inbound */ + if (!(f->fw_flg & IP_FW_F_IN)) + continue; + } + + /* Fragments */ + if ((f->fw_flg & IP_FW_F_FRAG) && offset == 0 ) + continue; + + if (f->fw_flg & IP_FW_F_SME) { + INADDR_TO_IFP(src_ip, tif); + if (tif == NULL) + continue; + } + if (f->fw_flg & IP_FW_F_DME) { + INADDR_TO_IFP(dst_ip, tif); + if (tif == NULL) + continue; + } + /* If src-addr doesn't match, not this rule. */ + if (((f->fw_flg & IP_FW_F_INVSRC) != 0) ^ ((src_ip.s_addr + & f->fw_smsk.s_addr) != f->fw_src.s_addr)) + continue; + + /* If dest-addr doesn't match, not this rule. */ + if (((f->fw_flg & IP_FW_F_INVDST) != 0) ^ ((dst_ip.s_addr + & f->fw_dmsk.s_addr) != f->fw_dst.s_addr)) + continue; + + /* Interface check */ + if ((f->fw_flg & IF_FW_F_VIAHACK) == IF_FW_F_VIAHACK) { + struct ifnet *const iface = oif ? oif : rif; + + /* Backwards compatibility hack for "via" */ + if (!iface || !iface_match(iface, + &f->fw_in_if, f->fw_flg & IP_FW_F_OIFNAME)) + continue; + } else { + /* Check receive interface */ + if ((f->fw_flg & IP_FW_F_IIFACE) + && (!rif || !iface_match(rif, + &f->fw_in_if, f->fw_flg & IP_FW_F_IIFNAME))) + continue; + /* Check outgoing interface */ + if ((f->fw_flg & IP_FW_F_OIFACE) + && (!oif || !iface_match(oif, + &f->fw_out_if, f->fw_flg & IP_FW_F_OIFNAME))) + continue; + } + + /* Check IP header values */ + if (f->fw_ipflg & IP_FW_IF_IPOPT && !ipopts_match(ip, f)) + continue; + if (f->fw_ipflg & IP_FW_IF_IPLEN && f->fw_iplen != ip_len) + continue; + if (f->fw_ipflg & IP_FW_IF_IPID && f->fw_ipid != ntohs(ip->ip_id)) + continue; + if (f->fw_ipflg & IP_FW_IF_IPTOS && !iptos_match(ip, f)) + continue; + if (f->fw_ipflg & IP_FW_IF_IPTTL && f->fw_ipttl != ip->ip_ttl) + continue; + if (f->fw_ipflg & IP_FW_IF_IPVER && f->fw_ipver != ip->ip_v) + continue; + + /* Check protocol; if wildcard, and no [ug]id, match */ + if (f->fw_prot == IPPROTO_IP) { + if (!(f->fw_flg & (IP_FW_F_UID|IP_FW_F_GID))) + goto rnd_then_got_match; + } else + /* If different, don't match */ + if (proto != f->fw_prot) + continue; + + /* Protocol specific checks for uid only */ + if (f->fw_flg & (IP_FW_F_UID|IP_FW_F_GID)) { + switch (proto) { + case IPPROTO_TCP: + { + struct inpcb *P; + + if (offset == 1) /* cf. RFC 1858 */ + goto bogusfrag; + if (offset != 0) + continue; + + if (oif) + P = in_pcblookup_hash(&tcbinfo, dst_ip, + dst_port, src_ip, src_port, 0, + oif); + else + P = in_pcblookup_hash(&tcbinfo, src_ip, + src_port, dst_ip, dst_port, 0, + NULL); + + if (P && P->inp_socket) { + if (f->fw_flg & IP_FW_F_UID) { + if (P->inp_socket->so_cred->cr_uid != + f->fw_uid) + continue; + } else if (!groupmember(f->fw_gid, + P->inp_socket->so_cred)) + continue; + } else + continue; + break; + } + + case IPPROTO_UDP: + { + struct inpcb *P; + + if (offset != 0) + continue; + + if (oif) + P = in_pcblookup_hash(&udbinfo, dst_ip, + dst_port, src_ip, src_port, 1, + oif); + else + P = in_pcblookup_hash(&udbinfo, src_ip, + src_port, dst_ip, dst_port, 1, + NULL); + + if (P && P->inp_socket) { + if (f->fw_flg & IP_FW_F_UID) { + if (P->inp_socket->so_cred->cr_uid != + f->fw_uid) + continue; + } else if (!groupmember(f->fw_gid, + P->inp_socket->so_cred)) + continue; + } else + continue; + break; + } + + default: + continue; + } + } + + /* Protocol specific checks */ + switch (proto) { + case IPPROTO_TCP: + { + struct tcphdr *tcp; + + if (offset == 1) /* cf. RFC 1858 */ + goto bogusfrag; + if (offset != 0) { + /* + * TCP flags and ports aren't available in this + * packet -- if this rule specified either one, + * we consider the rule a non-match. + */ + if (f->fw_nports != 0 || + f->fw_ipflg & IP_FW_IF_TCPMSK) + continue; + + break; + } + tcp = (struct tcphdr *) ((u_int32_t *)ip + ip->ip_hl); + + if (f->fw_ipflg & IP_FW_IF_TCPOPT && !tcpopts_match(tcp, f)) + continue; + if (((f->fw_ipflg & IP_FW_IF_TCPFLG) || + (f->fw_ipflg & IP_FW_IF_TCPEST)) && + !tcpflg_match(tcp, f)) + continue; + if (f->fw_ipflg & IP_FW_IF_TCPSEQ && tcp->th_seq != f->fw_tcpseq) + continue; + if (f->fw_ipflg & IP_FW_IF_TCPACK && tcp->th_ack != f->fw_tcpack) + continue; + if (f->fw_ipflg & IP_FW_IF_TCPWIN && tcp->th_win != f->fw_tcpwin) + continue; + goto check_ports; + } + + case IPPROTO_UDP: + if (offset != 0) { + /* + * Port specification is unavailable -- if this + * rule specifies a port, we consider the rule + * a non-match. + */ + if (f->fw_nports != 0) + continue; + + break; + } +check_ports: + if (!port_match(&f->fw_uar.fw_pts[0], + IP_FW_GETNSRCP(f), ntohs(src_port), + f->fw_flg & IP_FW_F_SRNG, + f->fw_flg & IP_FW_F_SMSK)) + continue; + if (!port_match(&f->fw_uar.fw_pts[IP_FW_GETNSRCP(f)], + IP_FW_GETNDSTP(f), ntohs(dst_port), + f->fw_flg & IP_FW_F_DRNG, + f->fw_flg & IP_FW_F_DMSK)) + continue; + break; + + case IPPROTO_ICMP: + { + struct icmp *icmp; + + if (offset != 0) /* Type isn't valid */ + break; + icmp = (struct icmp *) ((u_int32_t *)ip + ip->ip_hl); + if (!icmptype_match(icmp, f)) + continue; + break; + } + + default: + break; + +bogusfrag: + if (fw_verbose && ip != NULL) + ipfw_report(NULL, ip, offset, rif, oif); + goto dropit; + + } + +rnd_then_got_match: + if ( ((struct ip_fw_ext *)f)->dont_match_prob && + random() < ((struct ip_fw_ext *)f)->dont_match_prob ) + continue ; +got_match: + /* + * If not a dynamic match (q == NULL) and keep-state, install + * a new dynamic entry. + */ + if (q == NULL && f->fw_flg & IP_FW_F_KEEP_S) + install_state(chain); + /* Update statistics */ + f->fw_pcnt += 1; + f->fw_bcnt += ip_len; + f->timestamp = time_second; + + /* Log to console if desired */ + if ((f->fw_flg & IP_FW_F_PRN) && fw_verbose) + ipfw_report(f, ip, offset, rif, oif); + + /* Take appropriate action */ + switch (f->fw_flg & IP_FW_F_COMMAND) { + case IP_FW_F_ACCEPT: + return(0); + case IP_FW_F_COUNT: + continue; +#ifdef IPDIVERT + case IP_FW_F_DIVERT: + *cookie = f->fw_number; + return(f->fw_divert_port); + case IP_FW_F_TEE: + *cookie = f->fw_number; + return(f->fw_divert_port | IP_FW_PORT_TEE_FLAG); +#endif + case IP_FW_F_SKIPTO: /* XXX check */ + if ( f->next_rule_ptr ) + chain = f->next_rule_ptr ; + else + chain = lookup_next_rule(chain) ; + if (! chain) goto dropit; + goto again ; +#ifdef DUMMYNET + case IP_FW_F_PIPE: + case IP_FW_F_QUEUE: + *flow_id = chain; + return(f->fw_pipe_nr | IP_FW_PORT_DYNT_FLAG); +#endif +#ifdef IPFIREWALL_FORWARD + case IP_FW_F_FWD: + /* Change the next-hop address for this packet. + * Initially we'll only worry about directly + * reachable next-hop's, but ultimately + * we will work out for next-hops that aren't + * direct the route we would take for it. We + * [cs]ould leave this latter problem to + * ip_output.c. We hope to high [name the abode of + * your favourite deity] that ip_output doesn't modify + * the new value of next_hop (which is dst there) + */ + if (next_hop != NULL /* Make sure, first... */ + && (q == NULL || direction == MATCH_FORWARD) ) + *next_hop = &(f->fw_fwd_ip); + return(0); /* Allow the packet */ +#endif + } + + /* Deny/reject this packet using this rule */ + rule = f; + break; + + } + + /* Rule IPFW_DEFAULT_RULE should always be there and match */ + KASSERT(chain != NULL, ("ip_fw: no chain")); + + /* + * At this point, we're going to drop the packet. + * Send a reject notice if all of the following are true: + * + * - The packet matched a reject rule + * - The packet is not an ICMP packet, or is an ICMP query packet + * - The packet is not a multicast or broadcast packet + */ + if ((rule->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_REJECT + && (ip->ip_p != IPPROTO_ICMP || is_icmp_query(ip)) + && !((*m)->m_flags & (M_BCAST|M_MCAST)) + && !IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { + switch (rule->fw_reject_code) { + case IP_FW_REJECT_RST: + { + /* XXX warning, this code writes into the mbuf */ + struct tcphdr *const tcp = + (struct tcphdr *) ((u_int32_t *)ip + ip->ip_hl); + struct tcpiphdr ti, *const tip = (struct tcpiphdr *) ip; + + if (offset != 0 || (tcp->th_flags & TH_RST)) + break; + ti.ti_i = *((struct ipovly *) ip); + ti.ti_t = *tcp; + bcopy(&ti, ip, sizeof(ti)); + NTOHL(tip->ti_seq); + NTOHL(tip->ti_ack); + tip->ti_len = ip_len - hlen - (tip->ti_off << 2); + if (tcp->th_flags & TH_ACK) { + tcp_respond(NULL, (void *)ip, tcp, *m, + (tcp_seq)0, tcp->th_ack, TH_RST); + } else { + if (tcp->th_flags & TH_SYN) + tip->ti_len++; + tcp_respond(NULL, (void *)ip, tcp, *m, + tip->ti_seq + tip->ti_len, + (tcp_seq)0, TH_RST|TH_ACK); + } + *m = NULL; + break; + } + default: /* Send an ICMP unreachable using code */ + icmp_error(*m, ICMP_UNREACH, + rule->fw_reject_code, 0L, 0); + *m = NULL; + break; + } + } + +dropit: + /* + * Finally, drop the packet. + */ + return(IP_FW_PORT_DENY_FLAG); +#undef BRIDGED +} + +/* + * when a rule is added/deleted, zero the direct pointers within + * all firewall rules. These will be reconstructed on the fly + * as packets are matched. + * Must be called at splnet(). + */ +static void +flush_rule_ptrs() +{ + struct ip_fw_chain *fcp ; + + LIST_FOREACH(fcp, &ip_fw_chain_head, next) { + fcp->rule->next_rule_ptr = NULL ; + } +} + +static int +add_entry(struct ip_fw_head *chainptr, struct ip_fw *frwl) +{ + struct ip_fw *ftmp = 0; + struct ip_fw_ext *ftmp_ext = 0 ; + struct ip_fw_chain *fwc = 0, *fcp, *fcpl = 0; + u_short nbr = 0; + int s; + + fwc = malloc(sizeof *fwc, M_IPFW, M_DONTWAIT); + ftmp_ext = malloc(sizeof *ftmp_ext, M_IPFW, M_DONTWAIT | M_ZERO); + ftmp = &ftmp_ext->rule ; + if (!fwc || !ftmp) { + dprintf(("%s malloc said no\n", err_prefix)); + if (fwc) free(fwc, M_IPFW); + if (ftmp) free(ftmp, M_IPFW); + return (ENOSPC); + } + + bcopy(frwl, ftmp, sizeof(*ftmp)); + if (ftmp->fw_flg & IP_FW_F_RND_MATCH) + ftmp_ext->dont_match_prob = (intptr_t)ftmp->pipe_ptr; + if (ftmp->fw_flg & IP_FW_F_KEEP_S) + ftmp_ext->dyn_type = (u_long)(ftmp->next_rule_ptr) ; + + ftmp->fw_in_if.fu_via_if.name[FW_IFNLEN - 1] = '\0'; + ftmp->fw_pcnt = 0L; + ftmp->fw_bcnt = 0L; + ftmp->next_rule_ptr = NULL ; + ftmp->pipe_ptr = NULL ; + fwc->rule = ftmp; + + s = splnet(); + + if (LIST_FIRST(chainptr) == 0) { + LIST_INSERT_HEAD(chainptr, fwc, next); + splx(s); + return(0); + } + + /* If entry number is 0, find highest numbered rule and add 100 */ + if (ftmp->fw_number == 0) { + LIST_FOREACH(fcp, chainptr, next) { + if (fcp->rule->fw_number != (u_short)-1) + nbr = fcp->rule->fw_number; + else + break; + } + if (nbr < IPFW_DEFAULT_RULE - 100) + nbr += 100; + ftmp->fw_number = frwl->fw_number = nbr; + } + + /* Got a valid number; now insert it, keeping the list ordered */ + LIST_FOREACH(fcp, chainptr, next) { + if (fcp->rule->fw_number > ftmp->fw_number) { + if (fcpl) { + LIST_INSERT_AFTER(fcpl, fwc, next); + } else { + LIST_INSERT_HEAD(chainptr, fwc, next); + } + break; + } else { + fcpl = fcp; + } + } + flush_rule_ptrs(); + + splx(s); + return (0); +} + +static int +del_entry(struct ip_fw_head *chainptr, u_short number) +{ + struct ip_fw_chain *fcp; + + fcp = LIST_FIRST(chainptr); + if (number != (u_short)-1) { + for (; fcp; fcp = LIST_NEXT(fcp, next)) { + if (fcp->rule->fw_number == number) { + int s; + + /* prevent access to rules while removing them */ + s = splnet(); + while (fcp && fcp->rule->fw_number == number) { + struct ip_fw_chain *next; + + remove_dyn_rule(fcp, 1 /* delete */); + next = LIST_NEXT(fcp, next); + LIST_REMOVE(fcp, next); +#ifdef DUMMYNET + dn_rule_delete(fcp) ; +#endif + flush_rule_ptrs(); + free(fcp->rule, M_IPFW); + free(fcp, M_IPFW); + fcp = next; + } + splx(s); + return 0; + } + } + } + + return (EINVAL); +} + +static int +zero_entry(struct ip_fw *frwl) +{ + struct ip_fw_chain *fcp; + int s, cleared; + + if (frwl == 0) { + s = splnet(); + LIST_FOREACH(fcp, &ip_fw_chain_head, next) { + fcp->rule->fw_bcnt = fcp->rule->fw_pcnt = 0; + fcp->rule->fw_loghighest = fcp->rule->fw_logamount; + fcp->rule->timestamp = 0; + } + splx(s); + } + else { + cleared = 0; + + /* + * It's possible to insert multiple chain entries with the + * same number, so we don't stop after finding the first + * match if zeroing a specific entry. + */ + LIST_FOREACH(fcp, &ip_fw_chain_head, next) + if (frwl->fw_number == fcp->rule->fw_number) { + s = splnet(); + while (fcp && frwl->fw_number == fcp->rule->fw_number) { + fcp->rule->fw_bcnt = fcp->rule->fw_pcnt = 0; + fcp->rule->fw_loghighest = + fcp->rule->fw_logamount; + fcp->rule->timestamp = 0; + fcp = LIST_NEXT(fcp, next); + } + splx(s); + cleared = 1; + break; + } + if (!cleared) /* we didn't find any matching rules */ + return (EINVAL); + } + + if (fw_verbose) { + if (frwl) + log(LOG_SECURITY | LOG_NOTICE, + "ipfw: Entry %d cleared.\n", frwl->fw_number); + else + log(LOG_SECURITY | LOG_NOTICE, + "ipfw: Accounting cleared.\n"); + } + + return (0); +} + +static int +resetlog_entry(struct ip_fw *frwl) +{ + struct ip_fw_chain *fcp; + int s, cleared; + + if (frwl == 0) { + s = splnet(); + counter = 0; + LIST_FOREACH(fcp, &ip_fw_chain_head, next) + fcp->rule->fw_loghighest = fcp->rule->fw_pcnt + + fcp->rule->fw_logamount; + splx(s); + } + else { + cleared = 0; + + /* + * It's possible to insert multiple chain entries with the + * same number, so we don't stop after finding the first + * match if zeroing a specific entry. + */ + LIST_FOREACH(fcp, &ip_fw_chain_head, next) + if (frwl->fw_number == fcp->rule->fw_number) { + s = splnet(); + while (fcp && frwl->fw_number == fcp->rule->fw_number) { + fcp->rule->fw_loghighest = + fcp->rule->fw_pcnt + + fcp->rule->fw_logamount; + fcp = LIST_NEXT(fcp, next); + } + splx(s); + cleared = 1; + break; + } + if (!cleared) /* we didn't find any matching rules */ + return (EINVAL); + } + + if (fw_verbose) { + if (frwl) + log(LOG_SECURITY | LOG_NOTICE, + "ipfw: Entry %d logging count reset.\n", + frwl->fw_number); + else + log(LOG_SECURITY | LOG_NOTICE, " + ipfw: All logging counts cleared.\n"); + } + + return (0); +} + +static int +check_ipfw_struct(struct ip_fw *frwl) +{ + /* Check for invalid flag bits */ + if ((frwl->fw_flg & ~IP_FW_F_MASK) != 0) { + dprintf(("%s undefined flag bits set (flags=%x)\n", + err_prefix, frwl->fw_flg)); + return (EINVAL); + } + if (frwl->fw_flg == IP_FW_F_CHECK_S) { + /* check-state */ + return 0 ; + } + /* Must apply to incoming or outgoing (or both) */ + if (!(frwl->fw_flg & (IP_FW_F_IN | IP_FW_F_OUT))) { + dprintf(("%s neither in nor out\n", err_prefix)); + return (EINVAL); + } + /* Empty interface name is no good */ + if (((frwl->fw_flg & IP_FW_F_IIFNAME) + && !*frwl->fw_in_if.fu_via_if.name) + || ((frwl->fw_flg & IP_FW_F_OIFNAME) + && !*frwl->fw_out_if.fu_via_if.name)) { + dprintf(("%s empty interface name\n", err_prefix)); + return (EINVAL); + } + /* Sanity check interface matching */ + if ((frwl->fw_flg & IF_FW_F_VIAHACK) == IF_FW_F_VIAHACK) { + ; /* allow "via" backwards compatibility */ + } else if ((frwl->fw_flg & IP_FW_F_IN) + && (frwl->fw_flg & IP_FW_F_OIFACE)) { + dprintf(("%s outgoing interface check on incoming\n", + err_prefix)); + return (EINVAL); + } + /* Sanity check port ranges */ + if ((frwl->fw_flg & IP_FW_F_SRNG) && IP_FW_GETNSRCP(frwl) < 2) { + dprintf(("%s src range set but n_src_p=%d\n", + err_prefix, IP_FW_GETNSRCP(frwl))); + return (EINVAL); + } + if ((frwl->fw_flg & IP_FW_F_DRNG) && IP_FW_GETNDSTP(frwl) < 2) { + dprintf(("%s dst range set but n_dst_p=%d\n", + err_prefix, IP_FW_GETNDSTP(frwl))); + return (EINVAL); + } + if (IP_FW_GETNSRCP(frwl) + IP_FW_GETNDSTP(frwl) > IP_FW_MAX_PORTS) { + dprintf(("%s too many ports (%d+%d)\n", + err_prefix, IP_FW_GETNSRCP(frwl), IP_FW_GETNDSTP(frwl))); + return (EINVAL); + } + /* + * Protocols other than TCP/UDP don't use port range + */ + if ((frwl->fw_prot != IPPROTO_TCP) && + (frwl->fw_prot != IPPROTO_UDP) && + (IP_FW_GETNSRCP(frwl) || IP_FW_GETNDSTP(frwl))) { + dprintf(("%s port(s) specified for non TCP/UDP rule\n", + err_prefix)); + return (EINVAL); + } + + /* + * Rather than modify the entry to make such entries work, + * we reject this rule and require user level utilities + * to enforce whatever policy they deem appropriate. + */ + if ((frwl->fw_src.s_addr & (~frwl->fw_smsk.s_addr)) || + (frwl->fw_dst.s_addr & (~frwl->fw_dmsk.s_addr))) { + dprintf(("%s rule never matches\n", err_prefix)); + return (EINVAL); + } + + if ((frwl->fw_flg & IP_FW_F_FRAG) && + (frwl->fw_prot == IPPROTO_UDP || frwl->fw_prot == IPPROTO_TCP)) { + if (frwl->fw_nports) { + dprintf(("%s cannot mix 'frag' and ports\n", err_prefix)); + return (EINVAL); + } + if (frwl->fw_prot == IPPROTO_TCP && + frwl->fw_tcpf != frwl->fw_tcpnf) { + dprintf(("%s cannot mix 'frag' and TCP flags\n", err_prefix)); + return (EINVAL); + } + } + + if (frwl->fw_flg & (IP_FW_F_UID | IP_FW_F_GID)) { + if ((frwl->fw_prot != IPPROTO_TCP) && + (frwl->fw_prot != IPPROTO_UDP) && + (frwl->fw_prot != IPPROTO_IP)) { + dprintf(("%s cannot use uid/gid logic on non-TCP/UDP\n", err_prefix)); + return (EINVAL); + } + } + + /* Check command specific stuff */ + switch (frwl->fw_flg & IP_FW_F_COMMAND) + { + case IP_FW_F_REJECT: + if (frwl->fw_reject_code >= 0x100 + && !(frwl->fw_prot == IPPROTO_TCP + && frwl->fw_reject_code == IP_FW_REJECT_RST)) { + dprintf(("%s unknown reject code\n", err_prefix)); + return (EINVAL); + } + break; +#if defined(IPDIVERT) || defined(DUMMYNET) +#ifdef IPDIVERT + case IP_FW_F_DIVERT: /* Diverting to port zero is invalid */ + case IP_FW_F_TEE: +#endif +#ifdef DUMMYNET + case IP_FW_F_PIPE: /* piping through 0 is invalid */ + case IP_FW_F_QUEUE: /* piping through 0 is invalid */ +#endif + if (frwl->fw_divert_port == 0) { + dprintf(("%s can't divert to port 0\n", err_prefix)); + return (EINVAL); + } + break; +#endif /* IPDIVERT || DUMMYNET */ + case IP_FW_F_DENY: + case IP_FW_F_ACCEPT: + case IP_FW_F_COUNT: + case IP_FW_F_SKIPTO: +#ifdef IPFIREWALL_FORWARD + case IP_FW_F_FWD: +#endif + break; + default: + dprintf(("%s invalid command\n", err_prefix)); + return (EINVAL); + } + + return 0; +} + +static int +ip_fw_ctl(struct sockopt *sopt) +{ + int error, s; + size_t size; + struct ip_fw_chain *fcp; + struct ip_fw frwl, *bp , *buf; + + /* + * Disallow modifications in really-really secure mode, but still allow + * the logging counters to be reset. + */ + if (securelevel >= 3 && (sopt->sopt_name == IP_FW_ADD || + (sopt->sopt_dir == SOPT_SET && sopt->sopt_name != IP_FW_RESETLOG))) + return (EPERM); + error = 0; + + switch (sopt->sopt_name) { + case IP_FW_GET: + size = 0 ; + LIST_FOREACH(fcp, &ip_fw_chain_head, next) + size += sizeof(struct ip_fw) ; + if (ipfw_dyn_v) { + int i ; + struct ipfw_dyn_rule *p ; + + for (i = 0 ; i < curr_dyn_buckets ; i++ ) + for ( p = ipfw_dyn_v[i] ; p != NULL ; p = p->next ) + size += sizeof(*p) ; + } + buf = malloc(size, M_TEMP, M_WAITOK); + if (buf == 0) { + error = ENOBUFS; + break; + } + + bp = buf ; + LIST_FOREACH(fcp, &ip_fw_chain_head, next) { + bcopy(fcp->rule, bp, sizeof *fcp->rule); + bp->pipe_ptr = (void *)(intptr_t) + ((struct ip_fw_ext *)fcp->rule)->dont_match_prob; + bp->next_rule_ptr = (void *)(intptr_t) + ((struct ip_fw_ext *)fcp->rule)->dyn_type; + bp++; + } + if (ipfw_dyn_v) { + int i ; + struct ipfw_dyn_rule *p, *dst, *last = NULL ; + + dst = (struct ipfw_dyn_rule *)bp ; + for (i = 0 ; i < curr_dyn_buckets ; i++ ) + for ( p = ipfw_dyn_v[i] ; p != NULL ; p = p->next, dst++ ) { + bcopy(p, dst, sizeof *p); + (int)dst->chain = p->chain->rule->fw_number ; + dst->next = dst ; /* fake non-null pointer... */ + last = dst ; + if (TIME_LEQ(dst->expire, time_second) ) + dst->expire = 0 ; + else + dst->expire -= time_second ; + } + if (last != NULL) + last->next = NULL ; + } + error = sooptcopyout(sopt, buf, size); + FREE(buf, M_TEMP); + break; + + case IP_FW_FLUSH: + s = splnet(); + remove_dyn_rule(NULL, 1 /* force delete */); + splx(s); + fcp = LIST_FIRST(&ip_fw_chain_head); + while (fcp) { + struct ip_fw_chain *next; + next = LIST_NEXT(fcp, next); + if (fcp->rule->fw_number > fw_permanent_rules && + fcp->rule->fw_number != IPFW_DEFAULT_RULE ) { + s = splnet(); + LIST_REMOVE(fcp, next); +#ifdef DUMMYNET + dn_rule_delete(fcp); +#endif + FREE(fcp->rule, M_IPFW); + FREE(fcp, M_IPFW); + splx(s); + } + fcp = next; + } + break; + + case IP_FW_ZERO: + if (sopt->sopt_val != 0) { + error = sooptcopyin(sopt, &frwl, sizeof frwl, + sizeof frwl); + if (error || (error = zero_entry(&frwl))) + break; + } else { + error = zero_entry(0); + } + break; + + case IP_FW_ADD: + error = sooptcopyin(sopt, &frwl, sizeof frwl, sizeof frwl); + if (error || (error = check_ipfw_struct(&frwl))) + break; + + if (frwl.fw_number == IPFW_DEFAULT_RULE) { + dprintf(("%s can't add rule %u\n", err_prefix, + (unsigned)IPFW_DEFAULT_RULE)); + error = EINVAL; + } else { + error = add_entry(&ip_fw_chain_head, &frwl); + if (!error && sopt->sopt_dir == SOPT_GET) + error = sooptcopyout(sopt, &frwl, sizeof frwl); + } + break; + + case IP_FW_DEL: + error = sooptcopyin(sopt, &frwl, sizeof frwl, sizeof frwl); + if (error) + break; + + if (frwl.fw_number == IPFW_DEFAULT_RULE) { + dprintf(("%s can't delete rule %u\n", err_prefix, + (unsigned)IPFW_DEFAULT_RULE)); + error = EINVAL; + } else { + error = del_entry(&ip_fw_chain_head, frwl.fw_number); + } + break; + + case IP_FW_RESETLOG: + if (sopt->sopt_val != 0) { + error = sooptcopyin(sopt, &frwl, sizeof frwl, + sizeof frwl); + if (error || (error = resetlog_entry(&frwl))) + break; + } else { + error = resetlog_entry(0); + } + break; + + default: + printf("ip_fw_ctl invalid option %d\n", sopt->sopt_name); + error = EINVAL ; + } + + return (error); +} + +struct ip_fw_chain *ip_fw_default_rule ; + +void +ip_fw_init(void) +{ + struct ip_fw default_rule; + + ip_fw_chk_ptr = ip_fw_chk; + ip_fw_ctl_ptr = ip_fw_ctl; + LIST_INIT(&ip_fw_chain_head); + + bzero(&default_rule, sizeof default_rule); + default_rule.fw_prot = IPPROTO_IP; + default_rule.fw_number = IPFW_DEFAULT_RULE; +#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT + default_rule.fw_flg |= IP_FW_F_ACCEPT; +#else + default_rule.fw_flg |= IP_FW_F_DENY; +#endif + default_rule.fw_flg |= IP_FW_F_IN | IP_FW_F_OUT; + if (check_ipfw_struct(&default_rule) != 0 || + add_entry(&ip_fw_chain_head, &default_rule)) + panic("ip_fw_init"); + + ip_fw_default_rule = LIST_FIRST(&ip_fw_chain_head) ; + printf("IP packet filtering initialized, " +#ifdef IPDIVERT + "divert enabled, " +#else + "divert disabled, " +#endif +#ifdef IPFIREWALL_FORWARD + "rule-based forwarding enabled, " +#else + "rule-based forwarding disabled, " +#endif +#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT + "default to accept, "); +#else + "default to deny, " ); +#endif +#ifndef IPFIREWALL_VERBOSE + printf("logging disabled\n"); +#else + if (fw_verbose_limit == 0) + printf("unlimited logging\n"); + else + printf("logging limited to %d packets/entry by default\n", + fw_verbose_limit); +#endif +} + +static ip_fw_chk_t *old_chk_ptr; +static ip_fw_ctl_t *old_ctl_ptr; + +static int +ipfw_modevent(module_t mod, int type, void *unused) +{ + int s; + struct ip_fw_chain *fcp; + + switch (type) { + case MOD_LOAD: + s = splnet(); + + old_chk_ptr = ip_fw_chk_ptr; + old_ctl_ptr = ip_fw_ctl_ptr; + + ip_fw_init(); + splx(s); + return 0; + case MOD_UNLOAD: + s = splnet(); + ip_fw_chk_ptr = old_chk_ptr; + ip_fw_ctl_ptr = old_ctl_ptr; + remove_dyn_rule(NULL, 1 /* force delete */); + while ( (fcp = LIST_FIRST(&ip_fw_chain_head)) != NULL) { + LIST_REMOVE(fcp, next); +#ifdef DUMMYNET + dn_rule_delete(fcp); +#endif + free(fcp->rule, M_IPFW); + free(fcp, M_IPFW); + } + + splx(s); + printf("IP firewall unloaded\n"); + return 0; + default: + break; + } + return 0; +} + +static moduledata_t ipfwmod = { + "ipfw", + ipfw_modevent, + 0 +}; +DECLARE_MODULE(ipfw, ipfwmod, SI_SUB_PSEUDO, SI_ORDER_ANY); diff --git a/sys/netinet/ip_fw.h b/sys/netinet/ip_fw.h new file mode 100644 index 0000000..7abae15 --- /dev/null +++ b/sys/netinet/ip_fw.h @@ -0,0 +1,309 @@ +/* + * Copyright (c) 1993 Daniel Boulet + * Copyright (c) 1994 Ugen J.S.Antsilevich + * + * Redistribution and use in source forms, with and without modification, + * are permitted provided that this entire comment appears intact. + * + * Redistribution in binary form may occur without any restrictions. + * Obviously, it would be nice if you gave credit where credit is due + * but requiring it would be too onerous. + * + * This software is provided ``AS IS'' without any warranties of any kind. + * + * $FreeBSD$ + */ + +#ifndef _IP_FW_H +#define _IP_FW_H + +#include <sys/queue.h> + +/* + * This union structure identifies an interface, either explicitly + * by name or implicitly by IP address. The flags IP_FW_F_IIFNAME + * and IP_FW_F_OIFNAME say how to interpret this structure. An + * interface unit number of -1 matches any unit number, while an + * IP address of 0.0.0.0 indicates matches any interface. + * + * The receive and transmit interfaces are only compared against the + * the packet if the corresponding bit (IP_FW_F_IIFACE or IP_FW_F_OIFACE) + * is set. Note some packets lack a receive or transmit interface + * (in which case the missing "interface" never matches). + */ + +union ip_fw_if { + struct in_addr fu_via_ip; /* Specified by IP address */ + struct { /* Specified by interface name */ +#define FW_IFNLEN 10 /* need room ! was IFNAMSIZ */ + char name[FW_IFNLEN]; + short unit; /* -1 means match any unit */ + } fu_via_if; +}; + +/* + * Format of an IP firewall descriptor + * + * fw_src, fw_dst, fw_smsk, fw_dmsk are always stored in network byte order. + * fw_flg and fw_n*p are stored in host byte order (of course). + * Port numbers are stored in HOST byte order. + */ + +struct ip_fw { + u_int64_t fw_pcnt,fw_bcnt; /* Packet and byte counters */ + struct in_addr fw_src, fw_dst; /* Source and destination IP addr */ + struct in_addr fw_smsk, fw_dmsk; /* Mask for src and dest IP addr */ + u_short fw_number; /* Rule number */ + u_int fw_flg; /* Operational Flags word */ +#define IP_FW_MAX_PORTS 10 /* A reasonable maximum */ + union { + u_short fw_pts[IP_FW_MAX_PORTS]; /* Array of port numbers to match */ +#define IP_FW_ICMPTYPES_MAX 128 +#define IP_FW_ICMPTYPES_DIM (IP_FW_ICMPTYPES_MAX / (sizeof(unsigned) * 8)) + unsigned fw_icmptypes[IP_FW_ICMPTYPES_DIM]; /* ICMP types bitmap */ + } fw_uar; + u_int fw_ipflg; /* IP flags word */ + u_char fw_ipopt,fw_ipnopt; /* IP options set/unset */ + u_short fw_iplen, fw_ipid; /* IP length, identification */ + u_char fw_iptos, fw_ipntos; /* IP type of service set/unset */ + u_char fw_ipttl; /* IP time to live */ + u_int fw_ipver:4; /* IP version */ + u_char fw_tcpopt,fw_tcpnopt; /* TCP options set/unset */ + u_char fw_tcpf,fw_tcpnf; /* TCP flags set/unset */ + u_int32_t fw_tcpseq, fw_tcpack; /* TCP sequence and acknowledgement */ + u_short fw_tcpwin; /* TCP window size */ + long timestamp; /* timestamp (tv_sec) of last match */ + union ip_fw_if fw_in_if, fw_out_if; /* Incoming and outgoing interfaces */ + union { + u_short fu_divert_port; /* Divert/tee port (options IPDIVERT) */ + u_short fu_pipe_nr; /* queue number (option DUMMYNET) */ + u_short fu_skipto_rule; /* SKIPTO command rule number */ + u_short fu_reject_code; /* REJECT response code */ + struct sockaddr_in fu_fwd_ip; + } fw_un; + u_char fw_prot; /* IP protocol */ + /* + * N'of src ports and # of dst ports in ports array (dst ports + * follow src ports; max of 10 ports in all; count of 0 means + * match all ports) + */ + u_char fw_nports; + void *pipe_ptr; /* flow_set ptr for dummynet pipe */ + void *next_rule_ptr ; /* next rule in case of match */ + uid_t fw_uid; /* uid to match */ + gid_t fw_gid; /* gid to match */ + int fw_logamount; /* amount to log */ + u_int64_t fw_loghighest; /* highest number packet to log */ +}; + +/* + * extended ipfw structure... some fields in the original struct + * can be used to pass parameters up/down, namely pointers + * void *pipe_ptr + * void *next_rule_ptr + * some others can be used to pass parameters down, namely counters etc. + * u_int64_t fw_pcnt,fw_bcnt; + * long timestamp; + */ + +struct ip_fw_ext { /* extended structure */ + struct ip_fw rule; /* must be at offset 0 */ + long dont_match_prob; /* 0x7fffffff means 1.0, always fail */ + u_int dyn_type; /* type for dynamic rule */ +}; + +#define IP_FW_GETNSRCP(rule) ((rule)->fw_nports & 0x0f) +#define IP_FW_SETNSRCP(rule, n) do { \ + (rule)->fw_nports &= ~0x0f; \ + (rule)->fw_nports |= (n); \ + } while (0) +#define IP_FW_GETNDSTP(rule) ((rule)->fw_nports >> 4) +#define IP_FW_SETNDSTP(rule, n) do { \ + (rule)->fw_nports &= ~0xf0; \ + (rule)->fw_nports |= (n) << 4;\ + } while (0) + +#define fw_divert_port fw_un.fu_divert_port +#define fw_skipto_rule fw_un.fu_skipto_rule +#define fw_reject_code fw_un.fu_reject_code +#define fw_pipe_nr fw_un.fu_pipe_nr +#define fw_fwd_ip fw_un.fu_fwd_ip + +struct ip_fw_chain { + LIST_ENTRY(ip_fw_chain) next; + struct ip_fw *rule; +}; + +/* + * Flow mask/flow id for each queue. + */ +struct ipfw_flow_id { + u_int32_t dst_ip, src_ip ; + u_int16_t dst_port, src_port ; + u_int8_t proto ; + u_int8_t flags ; /* protocol-specific flags */ +} ; + +/* + * dynamic ipfw rule + */ +struct ipfw_dyn_rule { + struct ipfw_dyn_rule *next ; + + struct ipfw_flow_id id ; + struct ipfw_flow_id mask ; + struct ip_fw_chain *chain ; /* pointer to parent rule */ + u_int32_t type ; /* rule type */ + u_int32_t expire ; /* expire time */ + u_int64_t pcnt, bcnt; /* match counters */ + u_int32_t bucket ; /* which bucket in hash table */ + u_int32_t state ; /* state of this rule (typ. a */ + /* combination of TCP flags) */ +} ; + +/* + * Values for "flags" field . + */ +#define IP_FW_F_COMMAND 0x000000ff /* Mask for type of chain entry: */ +#define IP_FW_F_DENY 0x00000000 /* This is a deny rule */ +#define IP_FW_F_REJECT 0x00000001 /* Deny and send a response packet */ +#define IP_FW_F_ACCEPT 0x00000002 /* This is an accept rule */ +#define IP_FW_F_COUNT 0x00000003 /* This is a count rule */ +#define IP_FW_F_DIVERT 0x00000004 /* This is a divert rule */ +#define IP_FW_F_TEE 0x00000005 /* This is a tee rule */ +#define IP_FW_F_SKIPTO 0x00000006 /* This is a skipto rule */ +#define IP_FW_F_FWD 0x00000007 /* This is a "change forwarding address" rule */ +#define IP_FW_F_PIPE 0x00000008 /* This is a dummynet rule */ +#define IP_FW_F_QUEUE 0x00000009 /* This is a dummynet queue */ + +#define IP_FW_F_IN 0x00000100 /* Check inbound packets */ +#define IP_FW_F_OUT 0x00000200 /* Check outbound packets */ +#define IP_FW_F_IIFACE 0x00000400 /* Apply inbound interface test */ +#define IP_FW_F_OIFACE 0x00000800 /* Apply outbound interface test */ + +#define IP_FW_F_PRN 0x00001000 /* Print if this rule matches */ + +#define IP_FW_F_SRNG 0x00002000 /* The first two src ports are a min * + * and max range (stored in host byte * + * order). */ + +#define IP_FW_F_DRNG 0x00004000 /* The first two dst ports are a min * + * and max range (stored in host byte * + * order). */ + +#define IP_FW_F_FRAG 0x00008000 /* Fragment */ + +#define IP_FW_F_IIFNAME 0x00010000 /* In interface by name/unit (not IP) */ +#define IP_FW_F_OIFNAME 0x00020000 /* Out interface by name/unit (not IP) */ + +#define IP_FW_F_INVSRC 0x00040000 /* Invert sense of src check */ +#define IP_FW_F_INVDST 0x00080000 /* Invert sense of dst check */ + +#define IP_FW_F_ICMPBIT 0x00100000 /* ICMP type bitmap is valid */ + +#define IP_FW_F_UID 0x00200000 /* filter by uid */ + +#define IP_FW_F_GID 0x00400000 /* filter by gid */ + +#define IP_FW_F_RND_MATCH 0x00800000 /* probabilistic rule match */ +#define IP_FW_F_SMSK 0x01000000 /* src-port + mask */ +#define IP_FW_F_DMSK 0x02000000 /* dst-port + mask */ +#define IP_FW_BRIDGED 0x04000000 /* only match bridged packets */ +#define IP_FW_F_KEEP_S 0x08000000 /* keep state */ +#define IP_FW_F_CHECK_S 0x10000000 /* check state */ + +#define IP_FW_F_SME 0x20000000 /* source = me */ +#define IP_FW_F_DME 0x40000000 /* destination = me */ + +#define IP_FW_F_MASK 0x7FFFFFFF /* All possible flag bits mask */ + +/* + * Flags for the 'fw_ipflg' field, for comparing values of ip and its protocols. + */ +#define IP_FW_IF_TCPOPT 0x00000001 /* tcp options */ +#define IP_FW_IF_TCPFLG 0x00000002 /* tcp flags */ +#define IP_FW_IF_TCPSEQ 0x00000004 /* tcp sequence number */ +#define IP_FW_IF_TCPACK 0x00000008 /* tcp acknowledgement number */ +#define IP_FW_IF_TCPWIN 0x00000010 /* tcp window size */ +#define IP_FW_IF_TCPEST 0x00000020 /* established TCP connection */ +#define IP_FW_IF_TCPMSK 0x0000003f /* mask of all tcp values */ + +#define IP_FW_IF_IPOPT 0x00000100 /* ip options */ +#define IP_FW_IF_IPLEN 0x00000200 /* ip length */ +#define IP_FW_IF_IPID 0x00000400 /* ip identification */ +#define IP_FW_IF_IPTOS 0x00000800 /* ip type of service */ +#define IP_FW_IF_IPTTL 0x00001000 /* ip time to live */ +#define IP_FW_IF_IPVER 0x00002000 /* ip version */ +#define IP_FW_IF_IPMSK 0x00003f00 /* mask of all ip values */ + +#define IP_FW_IF_MSK 0x0000ffff /* All possible bits mask */ + +/* + * For backwards compatibility with rules specifying "via iface" but + * not restricted to only "in" or "out" packets, we define this combination + * of bits to represent this configuration. + */ + +#define IF_FW_F_VIAHACK (IP_FW_F_IN|IP_FW_F_OUT|IP_FW_F_IIFACE|IP_FW_F_OIFACE) + +/* + * Definitions for REJECT response codes. + * Values less than 256 correspond to ICMP unreachable codes. + */ +#define IP_FW_REJECT_RST 0x0100 /* TCP packets: send RST */ + +/* + * Definitions for IP option names. + */ +#define IP_FW_IPOPT_LSRR 0x01 +#define IP_FW_IPOPT_SSRR 0x02 +#define IP_FW_IPOPT_RR 0x04 +#define IP_FW_IPOPT_TS 0x08 + +/* + * Definitions for TCP option names. + */ +#define IP_FW_TCPOPT_MSS 0x01 +#define IP_FW_TCPOPT_WINDOW 0x02 +#define IP_FW_TCPOPT_SACK 0x04 +#define IP_FW_TCPOPT_TS 0x08 +#define IP_FW_TCPOPT_CC 0x10 + +/* + * Definitions for TCP flags. + */ +#define IP_FW_TCPF_FIN TH_FIN +#define IP_FW_TCPF_SYN TH_SYN +#define IP_FW_TCPF_RST TH_RST +#define IP_FW_TCPF_PSH TH_PUSH +#define IP_FW_TCPF_ACK TH_ACK +#define IP_FW_TCPF_URG TH_URG + +/* + * Main firewall chains definitions and global var's definitions. + */ +#ifdef _KERNEL + +#define IP_FW_PORT_DYNT_FLAG 0x10000 +#define IP_FW_PORT_TEE_FLAG 0x20000 +#define IP_FW_PORT_DENY_FLAG 0x40000 + +/* + * Function definitions. + */ +void ip_fw_init __P((void)); + +/* Firewall hooks */ +struct ip; +struct sockopt; +typedef int ip_fw_chk_t __P((struct ip **, int, struct ifnet *, u_int16_t *, + struct mbuf **, struct ip_fw_chain **, struct sockaddr_in **)); +typedef int ip_fw_ctl_t __P((struct sockopt *)); +extern ip_fw_chk_t *ip_fw_chk_ptr; +extern ip_fw_ctl_t *ip_fw_ctl_ptr; +extern int fw_one_pass; +extern int fw_enable; +extern struct ipfw_flow_id last_pkt ; +#endif /* _KERNEL */ + +#endif /* _IP_FW_H */ diff --git a/sys/netinet/ip_icmp.c b/sys/netinet/ip_icmp.c new file mode 100644 index 0000000..ddb95f0 --- /dev/null +++ b/sys/netinet/ip_icmp.c @@ -0,0 +1,878 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ip_icmp.c 8.2 (Berkeley) 1/4/94 + * $FreeBSD$ + */ + +#include "opt_ipsec.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/mbuf.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> + +#include <net/if.h> +#include <net/route.h> + +#define _IP_VHL +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/in_var.h> +#include <netinet/ip.h> +#include <netinet/ip_icmp.h> +#include <netinet/ip_var.h> +#include <netinet/icmp_var.h> + +#ifdef IPSEC +#include <netinet6/ipsec.h> +#include <netkey/key.h> +#endif + +#include "faith.h" +#if defined(NFAITH) && NFAITH > 0 +#include <net/if_types.h> +#endif + +#include <machine/in_cksum.h> + +/* + * ICMP routines: error generation, receive packet processing, and + * routines to turnaround packets back to the originator, and + * host table maintenance routines. + */ + +static struct icmpstat icmpstat; +SYSCTL_STRUCT(_net_inet_icmp, ICMPCTL_STATS, stats, CTLFLAG_RD, + &icmpstat, icmpstat, ""); + +static int icmpmaskrepl = 0; +SYSCTL_INT(_net_inet_icmp, ICMPCTL_MASKREPL, maskrepl, CTLFLAG_RW, + &icmpmaskrepl, 0, ""); + +static int drop_redirect = 0; +SYSCTL_INT(_net_inet_icmp, OID_AUTO, drop_redirect, CTLFLAG_RW, + &drop_redirect, 0, ""); + +static int log_redirect = 0; +SYSCTL_INT(_net_inet_icmp, OID_AUTO, log_redirect, CTLFLAG_RW, + &log_redirect, 0, ""); + +static int icmplim = 200; +SYSCTL_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_RW, + &icmplim, 0, ""); + +static int icmplim_output = 1; +SYSCTL_INT(_net_inet_icmp, OID_AUTO, icmplim_output, CTLFLAG_RW, + &icmplim_output, 0, ""); + +/* + * ICMP broadcast echo sysctl + */ + +static int icmpbmcastecho = 0; +SYSCTL_INT(_net_inet_icmp, OID_AUTO, bmcastecho, CTLFLAG_RW, + &icmpbmcastecho, 0, ""); + + +#ifdef ICMPPRINTFS +int icmpprintfs = 0; +#endif + +static void icmp_reflect __P((struct mbuf *)); +static void icmp_send __P((struct mbuf *, struct mbuf *)); +static int ip_next_mtu __P((int, int)); + +extern struct protosw inetsw[]; + +/* + * Generate an error packet of type error + * in response to bad packet ip. + */ +void +icmp_error(n, type, code, dest, destifp) + struct mbuf *n; + int type, code; + n_long dest; + struct ifnet *destifp; +{ + register struct ip *oip = mtod(n, struct ip *), *nip; + register unsigned oiplen = IP_VHL_HL(oip->ip_vhl) << 2; + register struct icmp *icp; + register struct mbuf *m; + unsigned icmplen; + +#ifdef ICMPPRINTFS + if (icmpprintfs) + printf("icmp_error(%p, %x, %d)\n", oip, type, code); +#endif + if (type != ICMP_REDIRECT) + icmpstat.icps_error++; + /* + * Don't send error if not the first fragment of message. + * Don't error if the old packet protocol was ICMP + * error message, only known informational types. + */ + if (oip->ip_off &~ (IP_MF|IP_DF)) + goto freeit; + if (oip->ip_p == IPPROTO_ICMP && type != ICMP_REDIRECT && + n->m_len >= oiplen + ICMP_MINLEN && + !ICMP_INFOTYPE(((struct icmp *)((caddr_t)oip + oiplen))->icmp_type)) { + icmpstat.icps_oldicmp++; + goto freeit; + } + /* Don't send error in response to a multicast or broadcast packet */ + if (n->m_flags & (M_BCAST|M_MCAST)) + goto freeit; + /* + * First, formulate icmp message + */ + m = m_gethdr(M_DONTWAIT, MT_HEADER); + if (m == NULL) + goto freeit; + icmplen = min(oiplen + 8, oip->ip_len); + if (icmplen < sizeof(struct ip)) + panic("icmp_error: bad length"); + m->m_len = icmplen + ICMP_MINLEN; + MH_ALIGN(m, m->m_len); + icp = mtod(m, struct icmp *); + if ((u_int)type > ICMP_MAXTYPE) + panic("icmp_error"); + icmpstat.icps_outhist[type]++; + icp->icmp_type = type; + if (type == ICMP_REDIRECT) + icp->icmp_gwaddr.s_addr = dest; + else { + icp->icmp_void = 0; + /* + * The following assignments assume an overlay with the + * zeroed icmp_void field. + */ + if (type == ICMP_PARAMPROB) { + icp->icmp_pptr = code; + code = 0; + } else if (type == ICMP_UNREACH && + code == ICMP_UNREACH_NEEDFRAG && destifp) { + icp->icmp_nextmtu = htons(destifp->if_mtu); + } + } + + icp->icmp_code = code; + m_copydata(n, 0, icmplen, (caddr_t)&icp->icmp_ip); + nip = &icp->icmp_ip; + + /* + * Convert fields to network representation. + */ + HTONS(nip->ip_len); + HTONS(nip->ip_off); + + /* + * Now, copy old ip header (without options) + * in front of icmp message. + */ + if (m->m_data - sizeof(struct ip) < m->m_pktdat) + panic("icmp len"); + m->m_data -= sizeof(struct ip); + m->m_len += sizeof(struct ip); + m->m_pkthdr.len = m->m_len; + m->m_pkthdr.rcvif = n->m_pkthdr.rcvif; + nip = mtod(m, struct ip *); + bcopy((caddr_t)oip, (caddr_t)nip, sizeof(struct ip)); + nip->ip_len = m->m_len; + nip->ip_vhl = IP_VHL_BORING; + nip->ip_p = IPPROTO_ICMP; + nip->ip_tos = 0; + icmp_reflect(m); + +freeit: + m_freem(n); +} + +static struct sockaddr_in icmpsrc = { sizeof (struct sockaddr_in), AF_INET }; +static struct sockaddr_in icmpdst = { sizeof (struct sockaddr_in), AF_INET }; +static struct sockaddr_in icmpgw = { sizeof (struct sockaddr_in), AF_INET }; + +/* + * Process a received ICMP message. + */ +void +icmp_input(m, off, proto) + register struct mbuf *m; + int off, proto; +{ + int hlen = off; + register struct icmp *icp; + register struct ip *ip = mtod(m, struct ip *); + int icmplen = ip->ip_len; + register int i; + struct in_ifaddr *ia; + void (*ctlfunc) __P((int, struct sockaddr *, void *)); + int code; + + /* + * Locate icmp structure in mbuf, and check + * that not corrupted and of at least minimum length. + */ +#ifdef ICMPPRINTFS + if (icmpprintfs) { + char buf[4 * sizeof "123"]; + strcpy(buf, inet_ntoa(ip->ip_src)); + printf("icmp_input from %s to %s, len %d\n", + buf, inet_ntoa(ip->ip_dst), icmplen); + } +#endif + if (icmplen < ICMP_MINLEN) { + icmpstat.icps_tooshort++; + goto freeit; + } + i = hlen + min(icmplen, ICMP_ADVLENMIN); + if (m->m_len < i && (m = m_pullup(m, i)) == 0) { + icmpstat.icps_tooshort++; + return; + } + ip = mtod(m, struct ip *); + m->m_len -= hlen; + m->m_data += hlen; + icp = mtod(m, struct icmp *); + if (in_cksum(m, icmplen)) { + icmpstat.icps_checksum++; + goto freeit; + } + m->m_len += hlen; + m->m_data -= hlen; + +#if defined(NFAITH) && 0 < NFAITH + if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type == IFT_FAITH) { + /* + * Deliver very specific ICMP type only. + */ + switch (icp->icmp_type) { + case ICMP_UNREACH: + case ICMP_TIMXCEED: + break; + default: + goto freeit; + } + } +#endif + +#ifdef ICMPPRINTFS + if (icmpprintfs) + printf("icmp_input, type %d code %d\n", icp->icmp_type, + icp->icmp_code); +#endif + +#ifdef IPSEC + /* drop it if it does not match the policy */ + /* XXX Is there meaning of check in here ? */ + if (ipsec4_in_reject(m, NULL)) { + ipsecstat.in_polvio++; + goto freeit; + } +#endif + + /* + * Message type specific processing. + */ + if (icp->icmp_type > ICMP_MAXTYPE) + goto raw; + icmpstat.icps_inhist[icp->icmp_type]++; + code = icp->icmp_code; + switch (icp->icmp_type) { + + case ICMP_UNREACH: + switch (code) { + case ICMP_UNREACH_NET: + case ICMP_UNREACH_HOST: + case ICMP_UNREACH_SRCFAIL: + case ICMP_UNREACH_NET_UNKNOWN: + case ICMP_UNREACH_HOST_UNKNOWN: + case ICMP_UNREACH_ISOLATED: + case ICMP_UNREACH_TOSNET: + case ICMP_UNREACH_TOSHOST: + case ICMP_UNREACH_HOST_PRECEDENCE: + case ICMP_UNREACH_PRECEDENCE_CUTOFF: + code = PRC_UNREACH_NET; + break; + + case ICMP_UNREACH_NEEDFRAG: + code = PRC_MSGSIZE; + break; + + /* + * RFC 1122, Sections 3.2.2.1 and 4.2.3.9. + * Treat subcodes 2,3 as immediate RST + */ + case ICMP_UNREACH_PROTOCOL: + case ICMP_UNREACH_PORT: + code = PRC_UNREACH_PORT; + break; + + case ICMP_UNREACH_NET_PROHIB: + case ICMP_UNREACH_HOST_PROHIB: + case ICMP_UNREACH_FILTER_PROHIB: + code = PRC_UNREACH_ADMIN_PROHIB; + break; + + default: + goto badcode; + } + goto deliver; + + case ICMP_TIMXCEED: + if (code > 1) + goto badcode; + code += PRC_TIMXCEED_INTRANS; + goto deliver; + + case ICMP_PARAMPROB: + if (code > 1) + goto badcode; + code = PRC_PARAMPROB; + goto deliver; + + case ICMP_SOURCEQUENCH: + if (code) + goto badcode; + code = PRC_QUENCH; + deliver: + /* + * Problem with datagram; advise higher level routines. + */ + if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) || + IP_VHL_HL(icp->icmp_ip.ip_vhl) < (sizeof(struct ip) >> 2)) { + icmpstat.icps_badlen++; + goto freeit; + } + NTOHS(icp->icmp_ip.ip_len); + /* Discard ICMP's in response to multicast packets */ + if (IN_MULTICAST(ntohl(icp->icmp_ip.ip_dst.s_addr))) + goto badcode; +#ifdef ICMPPRINTFS + if (icmpprintfs) + printf("deliver to protocol %d\n", icp->icmp_ip.ip_p); +#endif + icmpsrc.sin_addr = icp->icmp_ip.ip_dst; +#if 1 + /* + * MTU discovery: + * If we got a needfrag and there is a host route to the + * original destination, and the MTU is not locked, then + * set the MTU in the route to the suggested new value + * (if given) and then notify as usual. The ULPs will + * notice that the MTU has changed and adapt accordingly. + * If no new MTU was suggested, then we guess a new one + * less than the current value. If the new MTU is + * unreasonably small (arbitrarily set at 296), then + * we reset the MTU to the interface value and enable the + * lock bit, indicating that we are no longer doing MTU + * discovery. + */ + if (code == PRC_MSGSIZE) { + struct rtentry *rt; + int mtu; + + rt = rtalloc1((struct sockaddr *)&icmpsrc, 0, + RTF_CLONING | RTF_PRCLONING); + if (rt && (rt->rt_flags & RTF_HOST) + && !(rt->rt_rmx.rmx_locks & RTV_MTU)) { + mtu = ntohs(icp->icmp_nextmtu); + if (!mtu) + mtu = ip_next_mtu(rt->rt_rmx.rmx_mtu, + 1); +#ifdef DEBUG_MTUDISC + printf("MTU for %s reduced to %d\n", + inet_ntoa(icmpsrc.sin_addr), mtu); +#endif + if (mtu < 296) { + /* rt->rt_rmx.rmx_mtu = + rt->rt_ifp->if_mtu; */ + rt->rt_rmx.rmx_locks |= RTV_MTU; + } else if (rt->rt_rmx.rmx_mtu > mtu) { + rt->rt_rmx.rmx_mtu = mtu; + } + } + if (rt) + RTFREE(rt); + } + +#endif + /* + * XXX if the packet contains [IPv4 AH TCP], we can't make a + * notification to TCP layer. + */ + ctlfunc = inetsw[ip_protox[icp->icmp_ip.ip_p]].pr_ctlinput; + if (ctlfunc) + (*ctlfunc)(code, (struct sockaddr *)&icmpsrc, + (void *)&icp->icmp_ip); + break; + + badcode: + icmpstat.icps_badcode++; + break; + + case ICMP_ECHO: + if (!icmpbmcastecho + && (m->m_flags & (M_MCAST | M_BCAST)) != 0) { + icmpstat.icps_bmcastecho++; + break; + } + icp->icmp_type = ICMP_ECHOREPLY; + if (badport_bandlim(BANDLIM_ICMP_ECHO) < 0) + goto freeit; + else + goto reflect; + + case ICMP_TSTAMP: + if (!icmpbmcastecho + && (m->m_flags & (M_MCAST | M_BCAST)) != 0) { + icmpstat.icps_bmcasttstamp++; + break; + } + if (icmplen < ICMP_TSLEN) { + icmpstat.icps_badlen++; + break; + } + icp->icmp_type = ICMP_TSTAMPREPLY; + icp->icmp_rtime = iptime(); + icp->icmp_ttime = icp->icmp_rtime; /* bogus, do later! */ + if (badport_bandlim(BANDLIM_ICMP_TSTAMP) < 0) + goto freeit; + else + goto reflect; + + case ICMP_MASKREQ: +#define satosin(sa) ((struct sockaddr_in *)(sa)) + if (icmpmaskrepl == 0) + break; + /* + * We are not able to respond with all ones broadcast + * unless we receive it over a point-to-point interface. + */ + if (icmplen < ICMP_MASKLEN) + break; + switch (ip->ip_dst.s_addr) { + + case INADDR_BROADCAST: + case INADDR_ANY: + icmpdst.sin_addr = ip->ip_src; + break; + + default: + icmpdst.sin_addr = ip->ip_dst; + } + ia = (struct in_ifaddr *)ifaof_ifpforaddr( + (struct sockaddr *)&icmpdst, m->m_pkthdr.rcvif); + if (ia == 0) + break; + if (ia->ia_ifp == 0) + break; + icp->icmp_type = ICMP_MASKREPLY; + icp->icmp_mask = ia->ia_sockmask.sin_addr.s_addr; + if (ip->ip_src.s_addr == 0) { + if (ia->ia_ifp->if_flags & IFF_BROADCAST) + ip->ip_src = satosin(&ia->ia_broadaddr)->sin_addr; + else if (ia->ia_ifp->if_flags & IFF_POINTOPOINT) + ip->ip_src = satosin(&ia->ia_dstaddr)->sin_addr; + } +reflect: + ip->ip_len += hlen; /* since ip_input deducts this */ + icmpstat.icps_reflect++; + icmpstat.icps_outhist[icp->icmp_type]++; + icmp_reflect(m); + return; + + case ICMP_REDIRECT: + if (log_redirect) { + u_long src, dst, gw; + + src = ntohl(ip->ip_src.s_addr); + dst = ntohl(icp->icmp_ip.ip_dst.s_addr); + gw = ntohl(icp->icmp_gwaddr.s_addr); + printf("icmp redirect from %d.%d.%d.%d: " + "%d.%d.%d.%d => %d.%d.%d.%d\n", + (int)(src >> 24), (int)((src >> 16) & 0xff), + (int)((src >> 8) & 0xff), (int)(src & 0xff), + (int)(dst >> 24), (int)((dst >> 16) & 0xff), + (int)((dst >> 8) & 0xff), (int)(dst & 0xff), + (int)(gw >> 24), (int)((gw >> 16) & 0xff), + (int)((gw >> 8) & 0xff), (int)(gw & 0xff)); + } + if (drop_redirect) + break; + if (code > 3) + goto badcode; + if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) || + IP_VHL_HL(icp->icmp_ip.ip_vhl) < (sizeof(struct ip) >> 2)) { + icmpstat.icps_badlen++; + break; + } + /* + * Short circuit routing redirects to force + * immediate change in the kernel's routing + * tables. The message is also handed to anyone + * listening on a raw socket (e.g. the routing + * daemon for use in updating its tables). + */ + icmpgw.sin_addr = ip->ip_src; + icmpdst.sin_addr = icp->icmp_gwaddr; +#ifdef ICMPPRINTFS + if (icmpprintfs) { + char buf[4 * sizeof "123"]; + strcpy(buf, inet_ntoa(icp->icmp_ip.ip_dst)); + + printf("redirect dst %s to %s\n", + buf, inet_ntoa(icp->icmp_gwaddr)); + } +#endif + icmpsrc.sin_addr = icp->icmp_ip.ip_dst; + rtredirect((struct sockaddr *)&icmpsrc, + (struct sockaddr *)&icmpdst, + (struct sockaddr *)0, RTF_GATEWAY | RTF_HOST, + (struct sockaddr *)&icmpgw, (struct rtentry **)0); + pfctlinput(PRC_REDIRECT_HOST, (struct sockaddr *)&icmpsrc); +#ifdef IPSEC + key_sa_routechange((struct sockaddr *)&icmpsrc); +#endif + break; + + /* + * No kernel processing for the following; + * just fall through to send to raw listener. + */ + case ICMP_ECHOREPLY: + case ICMP_ROUTERADVERT: + case ICMP_ROUTERSOLICIT: + case ICMP_TSTAMPREPLY: + case ICMP_IREQREPLY: + case ICMP_MASKREPLY: + default: + break; + } + +raw: + rip_input(m, off, proto); + return; + +freeit: + m_freem(m); +} + +/* + * Reflect the ip packet back to the source + */ +static void +icmp_reflect(m) + struct mbuf *m; +{ + register struct ip *ip = mtod(m, struct ip *); + register struct in_ifaddr *ia; + struct in_addr t; + struct mbuf *opts = 0; + int optlen = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof(struct ip); + + if (!in_canforward(ip->ip_src) && + ((ntohl(ip->ip_src.s_addr) & IN_CLASSA_NET) != + (IN_LOOPBACKNET << IN_CLASSA_NSHIFT))) { + m_freem(m); /* Bad return address */ + goto done; /* Ip_output() will check for broadcast */ + } + t = ip->ip_dst; + ip->ip_dst = ip->ip_src; + /* + * If the incoming packet was addressed directly to us, + * use dst as the src for the reply. Otherwise (broadcast + * or anonymous), use the address which corresponds + * to the incoming interface. + */ + TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { + if (t.s_addr == IA_SIN(ia)->sin_addr.s_addr) + break; + if (ia->ia_ifp && (ia->ia_ifp->if_flags & IFF_BROADCAST) && + t.s_addr == satosin(&ia->ia_broadaddr)->sin_addr.s_addr) + break; + } + icmpdst.sin_addr = t; + if ((ia == (struct in_ifaddr *)0) && m->m_pkthdr.rcvif) + ia = (struct in_ifaddr *)ifaof_ifpforaddr( + (struct sockaddr *)&icmpdst, m->m_pkthdr.rcvif); + /* + * The following happens if the packet was not addressed to us, + * and was received on an interface with no IP address. + */ + if (ia == (struct in_ifaddr *)0) + ia = TAILQ_FIRST(&in_ifaddrhead); + t = IA_SIN(ia)->sin_addr; + ip->ip_src = t; + ip->ip_ttl = ip_defttl; + + if (optlen > 0) { + register u_char *cp; + int opt, cnt; + u_int len; + + /* + * Retrieve any source routing from the incoming packet; + * add on any record-route or timestamp options. + */ + cp = (u_char *) (ip + 1); + if ((opts = ip_srcroute()) == 0 && + (opts = m_gethdr(M_DONTWAIT, MT_HEADER))) { + opts->m_len = sizeof(struct in_addr); + mtod(opts, struct in_addr *)->s_addr = 0; + } + if (opts) { +#ifdef ICMPPRINTFS + if (icmpprintfs) + printf("icmp_reflect optlen %d rt %d => ", + optlen, opts->m_len); +#endif + for (cnt = optlen; cnt > 0; cnt -= len, cp += len) { + opt = cp[IPOPT_OPTVAL]; + if (opt == IPOPT_EOL) + break; + if (opt == IPOPT_NOP) + len = 1; + else { + if (cnt < IPOPT_OLEN + sizeof(*cp)) + break; + len = cp[IPOPT_OLEN]; + if (len < IPOPT_OLEN + sizeof(*cp) || + len > cnt) + break; + } + /* + * Should check for overflow, but it "can't happen" + */ + if (opt == IPOPT_RR || opt == IPOPT_TS || + opt == IPOPT_SECURITY) { + bcopy((caddr_t)cp, + mtod(opts, caddr_t) + opts->m_len, len); + opts->m_len += len; + } + } + /* Terminate & pad, if necessary */ + cnt = opts->m_len % 4; + if (cnt) { + for (; cnt < 4; cnt++) { + *(mtod(opts, caddr_t) + opts->m_len) = + IPOPT_EOL; + opts->m_len++; + } + } +#ifdef ICMPPRINTFS + if (icmpprintfs) + printf("%d\n", opts->m_len); +#endif + } + /* + * Now strip out original options by copying rest of first + * mbuf's data back, and adjust the IP length. + */ + ip->ip_len -= optlen; + ip->ip_vhl = IP_VHL_BORING; + m->m_len -= optlen; + if (m->m_flags & M_PKTHDR) + m->m_pkthdr.len -= optlen; + optlen += sizeof(struct ip); + bcopy((caddr_t)ip + optlen, (caddr_t)(ip + 1), + (unsigned)(m->m_len - sizeof(struct ip))); + } + m->m_flags &= ~(M_BCAST|M_MCAST); + icmp_send(m, opts); +done: + if (opts) + (void)m_free(opts); +} + +/* + * Send an icmp packet back to the ip level, + * after supplying a checksum. + */ +static void +icmp_send(m, opts) + register struct mbuf *m; + struct mbuf *opts; +{ + register struct ip *ip = mtod(m, struct ip *); + register int hlen; + register struct icmp *icp; + struct route ro; + + hlen = IP_VHL_HL(ip->ip_vhl) << 2; + m->m_data += hlen; + m->m_len -= hlen; + icp = mtod(m, struct icmp *); + icp->icmp_cksum = 0; + icp->icmp_cksum = in_cksum(m, ip->ip_len - hlen); + m->m_data -= hlen; + m->m_len += hlen; + m->m_pkthdr.rcvif = (struct ifnet *)0; +#ifdef ICMPPRINTFS + if (icmpprintfs) { + char buf[4 * sizeof "123"]; + strcpy(buf, inet_ntoa(ip->ip_dst)); + printf("icmp_send dst %s src %s\n", + buf, inet_ntoa(ip->ip_src)); + } +#endif + bzero(&ro, sizeof ro); + (void) ip_output(m, opts, &ro, 0, NULL); + if (ro.ro_rt) + RTFREE(ro.ro_rt); +} + +n_time +iptime() +{ + struct timeval atv; + u_long t; + + getmicrotime(&atv); + t = (atv.tv_sec % (24*60*60)) * 1000 + atv.tv_usec / 1000; + return (htonl(t)); +} + +#if 1 +/* + * Return the next larger or smaller MTU plateau (table from RFC 1191) + * given current value MTU. If DIR is less than zero, a larger plateau + * is returned; otherwise, a smaller value is returned. + */ +static int +ip_next_mtu(mtu, dir) + int mtu; + int dir; +{ + static int mtutab[] = { + 65535, 32000, 17914, 8166, 4352, 2002, 1492, 1006, 508, 296, + 68, 0 + }; + int i; + + for (i = 0; i < (sizeof mtutab) / (sizeof mtutab[0]); i++) { + if (mtu >= mtutab[i]) + break; + } + + if (dir < 0) { + if (i == 0) { + return 0; + } else { + return mtutab[i - 1]; + } + } else { + if (mtutab[i] == 0) { + return 0; + } else if(mtu > mtutab[i]) { + return mtutab[i]; + } else { + return mtutab[i + 1]; + } + } +} +#endif + + +/* + * badport_bandlim() - check for ICMP bandwidth limit + * + * Return 0 if it is ok to send an ICMP error response, -1 if we have + * hit our bandwidth limit and it is not ok. + * + * If icmplim is <= 0, the feature is disabled and 0 is returned. + * + * For now we separate the TCP and UDP subsystems w/ different 'which' + * values. We may eventually remove this separation (and simplify the + * code further). + * + * Note that the printing of the error message is delayed so we can + * properly print the icmp error rate that the system was trying to do + * (i.e. 22000/100 pps, etc...). This can cause long delays in printing + * the 'final' error, but it doesn't make sense to solve the printing + * delay with more complex code. + */ + +int +badport_bandlim(int which) +{ + static int lticks[BANDLIM_MAX + 1]; + static int lpackets[BANDLIM_MAX + 1]; + int dticks; + const char *bandlimittype[] = { + "Limiting icmp unreach response", + "Limiting icmp ping response", + "Limiting icmp tstamp response", + "Limiting closed port RST response", + "Limiting open port RST response" + }; + + /* + * Return ok status if feature disabled or argument out of + * ranage. + */ + + if (icmplim <= 0 || which > BANDLIM_MAX || which < 0) + return(0); + dticks = ticks - lticks[which]; + + /* + * reset stats when cumulative dt exceeds one second. + */ + + if ((unsigned int)dticks > hz) { + if (lpackets[which] > icmplim && icmplim_output) { + printf("%s from %d to %d packets per second\n", + bandlimittype[which], + lpackets[which], + icmplim + ); + } + lticks[which] = ticks; + lpackets[which] = 0; + } + + /* + * bump packet count + */ + + if (++lpackets[which] > icmplim) { + return(-1); + } + return(0); +} + diff --git a/sys/netinet/ip_icmp.h b/sys/netinet/ip_icmp.h new file mode 100644 index 0000000..b7d5400 --- /dev/null +++ b/sys/netinet/ip_icmp.h @@ -0,0 +1,192 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ip_icmp.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_IP_ICMP_H_ +#define _NETINET_IP_ICMP_H_ + +/* + * Interface Control Message Protocol Definitions. + * Per RFC 792, September 1981. + */ + +/* + * Internal of an ICMP Router Advertisement + */ +struct icmp_ra_addr { + u_int32_t ira_addr; + u_int32_t ira_preference; +}; + +/* + * Structure of an icmp header. + */ +struct icmp { + u_char icmp_type; /* type of message, see below */ + u_char icmp_code; /* type sub code */ + u_short icmp_cksum; /* ones complement cksum of struct */ + union { + u_char ih_pptr; /* ICMP_PARAMPROB */ + struct in_addr ih_gwaddr; /* ICMP_REDIRECT */ + struct ih_idseq { + n_short icd_id; + n_short icd_seq; + } ih_idseq; + int ih_void; + + /* ICMP_UNREACH_NEEDFRAG -- Path MTU Discovery (RFC1191) */ + struct ih_pmtu { + n_short ipm_void; + n_short ipm_nextmtu; + } ih_pmtu; + + struct ih_rtradv { + u_char irt_num_addrs; + u_char irt_wpa; + u_int16_t irt_lifetime; + } ih_rtradv; + } icmp_hun; +#define icmp_pptr icmp_hun.ih_pptr +#define icmp_gwaddr icmp_hun.ih_gwaddr +#define icmp_id icmp_hun.ih_idseq.icd_id +#define icmp_seq icmp_hun.ih_idseq.icd_seq +#define icmp_void icmp_hun.ih_void +#define icmp_pmvoid icmp_hun.ih_pmtu.ipm_void +#define icmp_nextmtu icmp_hun.ih_pmtu.ipm_nextmtu +#define icmp_num_addrs icmp_hun.ih_rtradv.irt_num_addrs +#define icmp_wpa icmp_hun.ih_rtradv.irt_wpa +#define icmp_lifetime icmp_hun.ih_rtradv.irt_lifetime + union { + struct id_ts { + n_time its_otime; + n_time its_rtime; + n_time its_ttime; + } id_ts; + struct id_ip { + struct ip idi_ip; + /* options and then 64 bits of data */ + } id_ip; + struct icmp_ra_addr id_radv; + u_int32_t id_mask; + char id_data[1]; + } icmp_dun; +#define icmp_otime icmp_dun.id_ts.its_otime +#define icmp_rtime icmp_dun.id_ts.its_rtime +#define icmp_ttime icmp_dun.id_ts.its_ttime +#define icmp_ip icmp_dun.id_ip.idi_ip +#define icmp_radv icmp_dun.id_radv +#define icmp_mask icmp_dun.id_mask +#define icmp_data icmp_dun.id_data +}; + +/* + * Lower bounds on packet lengths for various types. + * For the error advice packets must first insure that the + * packet is large enough to contain the returned ip header. + * Only then can we do the check to see if 64 bits of packet + * data have been returned, since we need to check the returned + * ip header length. + */ +#define ICMP_MINLEN 8 /* abs minimum */ +#define ICMP_TSLEN (8 + 3 * sizeof (n_time)) /* timestamp */ +#define ICMP_MASKLEN 12 /* address mask */ +#define ICMP_ADVLENMIN (8 + sizeof (struct ip) + 8) /* min */ +#ifndef _IP_VHL +#define ICMP_ADVLEN(p) (8 + ((p)->icmp_ip.ip_hl << 2) + 8) + /* N.B.: must separately check that ip_hl >= 5 */ +#else +#define ICMP_ADVLEN(p) (8 + (IP_VHL_HL((p)->icmp_ip.ip_vhl) << 2) + 8) + /* N.B.: must separately check that header length >= 5 */ +#endif + +/* + * Definition of type and code field values. + */ +#define ICMP_ECHOREPLY 0 /* echo reply */ +#define ICMP_UNREACH 3 /* dest unreachable, codes: */ +#define ICMP_UNREACH_NET 0 /* bad net */ +#define ICMP_UNREACH_HOST 1 /* bad host */ +#define ICMP_UNREACH_PROTOCOL 2 /* bad protocol */ +#define ICMP_UNREACH_PORT 3 /* bad port */ +#define ICMP_UNREACH_NEEDFRAG 4 /* IP_DF caused drop */ +#define ICMP_UNREACH_SRCFAIL 5 /* src route failed */ +#define ICMP_UNREACH_NET_UNKNOWN 6 /* unknown net */ +#define ICMP_UNREACH_HOST_UNKNOWN 7 /* unknown host */ +#define ICMP_UNREACH_ISOLATED 8 /* src host isolated */ +#define ICMP_UNREACH_NET_PROHIB 9 /* prohibited access */ +#define ICMP_UNREACH_HOST_PROHIB 10 /* ditto */ +#define ICMP_UNREACH_TOSNET 11 /* bad tos for net */ +#define ICMP_UNREACH_TOSHOST 12 /* bad tos for host */ +#define ICMP_UNREACH_FILTER_PROHIB 13 /* admin prohib */ +#define ICMP_UNREACH_HOST_PRECEDENCE 14 /* host prec vio. */ +#define ICMP_UNREACH_PRECEDENCE_CUTOFF 15 /* prec cutoff */ +#define ICMP_SOURCEQUENCH 4 /* packet lost, slow down */ +#define ICMP_REDIRECT 5 /* shorter route, codes: */ +#define ICMP_REDIRECT_NET 0 /* for network */ +#define ICMP_REDIRECT_HOST 1 /* for host */ +#define ICMP_REDIRECT_TOSNET 2 /* for tos and net */ +#define ICMP_REDIRECT_TOSHOST 3 /* for tos and host */ +#define ICMP_ECHO 8 /* echo service */ +#define ICMP_ROUTERADVERT 9 /* router advertisement */ +#define ICMP_ROUTERSOLICIT 10 /* router solicitation */ +#define ICMP_TIMXCEED 11 /* time exceeded, code: */ +#define ICMP_TIMXCEED_INTRANS 0 /* ttl==0 in transit */ +#define ICMP_TIMXCEED_REASS 1 /* ttl==0 in reass */ +#define ICMP_PARAMPROB 12 /* ip header bad */ +#define ICMP_PARAMPROB_ERRATPTR 0 /* error at param ptr */ +#define ICMP_PARAMPROB_OPTABSENT 1 /* req. opt. absent */ +#define ICMP_PARAMPROB_LENGTH 2 /* bad length */ +#define ICMP_TSTAMP 13 /* timestamp request */ +#define ICMP_TSTAMPREPLY 14 /* timestamp reply */ +#define ICMP_IREQ 15 /* information request */ +#define ICMP_IREQREPLY 16 /* information reply */ +#define ICMP_MASKREQ 17 /* address mask request */ +#define ICMP_MASKREPLY 18 /* address mask reply */ + +#define ICMP_MAXTYPE 18 + +#define ICMP_INFOTYPE(type) \ + ((type) == ICMP_ECHOREPLY || (type) == ICMP_ECHO || \ + (type) == ICMP_ROUTERADVERT || (type) == ICMP_ROUTERSOLICIT || \ + (type) == ICMP_TSTAMP || (type) == ICMP_TSTAMPREPLY || \ + (type) == ICMP_IREQ || (type) == ICMP_IREQREPLY || \ + (type) == ICMP_MASKREQ || (type) == ICMP_MASKREPLY) + +#ifdef _KERNEL +void icmp_error __P((struct mbuf *, int, int, n_long, struct ifnet *)); +void icmp_input __P((struct mbuf *, int, int)); +#endif + +#endif diff --git a/sys/netinet/ip_input.c b/sys/netinet/ip_input.c new file mode 100644 index 0000000..0963a0a --- /dev/null +++ b/sys/netinet/ip_input.c @@ -0,0 +1,1833 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 + * $FreeBSD$ + */ + +#define _IP_VHL + +#include "opt_bootp.h" +#include "opt_ipfw.h" +#include "opt_ipdn.h" +#include "opt_ipdivert.h" +#include "opt_ipfilter.h" +#include "opt_ipstealth.h" +#include "opt_ipsec.h" +#include "opt_pfil_hooks.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/mbuf.h> +#include <sys/malloc.h> +#include <sys/domain.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <sys/kernel.h> +#include <sys/syslog.h> +#include <sys/sysctl.h> + +#include <net/pfil.h> +#include <net/if.h> +#include <net/if_var.h> +#include <net/if_dl.h> +#include <net/route.h> +#include <net/netisr.h> +#include <net/intrq.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/in_var.h> +#include <netinet/ip.h> +#include <netinet/in_pcb.h> +#include <netinet/ip_var.h> +#include <netinet/ip_icmp.h> +#include <machine/in_cksum.h> + +#include <netinet/ipprotosw.h> + +#include <sys/socketvar.h> + +#include <netinet/ip_fw.h> + +#ifdef IPSEC +#include <netinet6/ipsec.h> +#include <netkey/key.h> +#endif + +#include "faith.h" +#if defined(NFAITH) && NFAITH > 0 +#include <net/if_types.h> +#endif + +#ifdef DUMMYNET +#include <netinet/ip_dummynet.h> +#endif + +int rsvp_on = 0; +static int ip_rsvp_on; +struct socket *ip_rsvpd; + +int ipforwarding = 0; +SYSCTL_INT(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_RW, + &ipforwarding, 0, "Enable IP forwarding between interfaces"); + +static int ipsendredirects = 1; /* XXX */ +SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_RW, + &ipsendredirects, 0, "Enable sending IP redirects"); + +int ip_defttl = IPDEFTTL; +SYSCTL_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_RW, + &ip_defttl, 0, "Maximum TTL on IP packets"); + +static int ip_dosourceroute = 0; +SYSCTL_INT(_net_inet_ip, IPCTL_SOURCEROUTE, sourceroute, CTLFLAG_RW, + &ip_dosourceroute, 0, "Enable forwarding source routed IP packets"); + +static int ip_acceptsourceroute = 0; +SYSCTL_INT(_net_inet_ip, IPCTL_ACCEPTSOURCEROUTE, accept_sourceroute, + CTLFLAG_RW, &ip_acceptsourceroute, 0, + "Enable accepting source routed IP packets"); + +static int ip_keepfaith = 0; +SYSCTL_INT(_net_inet_ip, IPCTL_KEEPFAITH, keepfaith, CTLFLAG_RW, + &ip_keepfaith, 0, + "Enable packet capture for FAITH IPv4->IPv6 translater daemon"); + +/* + * XXX - Setting ip_checkinterface mostly implements the receive side of + * the Strong ES model described in RFC 1122, but since the routing table + * and transmit implementation do not implement the Strong ES model, + * setting this to 1 results in an odd hybrid. + * + * XXX - ip_checkinterface currently must be disabled if you use ipnat + * to translate the destination address to another local interface. + * + * XXX - ip_checkinterface must be disabled if you add IP aliases + * to the loopback interface instead of the interface where the + * packets for those addresses are received. + */ +static int ip_checkinterface = 1; +SYSCTL_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_RW, + &ip_checkinterface, 0, "Verify packet arrives on correct interface"); + +#ifdef DIAGNOSTIC +static int ipprintfs = 0; +#endif + +extern struct domain inetdomain; +extern struct ipprotosw inetsw[]; +u_char ip_protox[IPPROTO_MAX]; +static int ipqmaxlen = IFQ_MAXLEN; +struct in_ifaddrhead in_ifaddrhead; /* first inet address */ +SYSCTL_INT(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen, CTLFLAG_RW, + &ipintrq.ifq_maxlen, 0, "Maximum size of the IP input queue"); +SYSCTL_INT(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops, CTLFLAG_RD, + &ipintrq.ifq_drops, 0, "Number of packets dropped from the IP input queue"); + +struct ipstat ipstat; +SYSCTL_STRUCT(_net_inet_ip, IPCTL_STATS, stats, CTLFLAG_RD, + &ipstat, ipstat, "IP statistics (struct ipstat, netinet/ip_var.h)"); + +/* Packet reassembly stuff */ +#define IPREASS_NHASH_LOG2 6 +#define IPREASS_NHASH (1 << IPREASS_NHASH_LOG2) +#define IPREASS_HMASK (IPREASS_NHASH - 1) +#define IPREASS_HASH(x,y) \ + (((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK) + +static TAILQ_HEAD(ipqhead, ipq) ipq[IPREASS_NHASH]; +static int nipq = 0; /* total # of reass queues */ +static int maxnipq; +const int ipintrq_present = 1; + +#ifdef IPCTL_DEFMTU +SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW, + &ip_mtu, 0, "Default MTU"); +#endif + +#ifdef IPSTEALTH +static int ipstealth = 0; +SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW, + &ipstealth, 0, ""); +#endif + + +/* Firewall hooks */ +ip_fw_chk_t *ip_fw_chk_ptr; +ip_fw_ctl_t *ip_fw_ctl_ptr; +int fw_enable = 1 ; + +#ifdef DUMMYNET +ip_dn_ctl_t *ip_dn_ctl_ptr; +#endif + + +/* + * We need to save the IP options in case a protocol wants to respond + * to an incoming packet over the same route if the packet got here + * using IP source routing. This allows connection establishment and + * maintenance when the remote end is on a network that is not known + * to us. + */ +static int ip_nhops = 0; +static struct ip_srcrt { + struct in_addr dst; /* final destination */ + char nop; /* one NOP to align */ + char srcopt[IPOPT_OFFSET + 1]; /* OPTVAL, OLEN and OFFSET */ + struct in_addr route[MAX_IPOPTLEN/sizeof(struct in_addr)]; +} ip_srcrt; + +struct sockaddr_in *ip_fw_fwd_addr; + +static void save_rte __P((u_char *, struct in_addr)); +static int ip_dooptions __P((struct mbuf *)); +static void ip_forward __P((struct mbuf *, int)); +static void ip_freef __P((struct ipqhead *, struct ipq *)); +#ifdef IPDIVERT +static struct mbuf *ip_reass __P((struct mbuf *, struct ipqhead *, struct ipq *, u_int32_t *, u_int16_t *)); +#else +static struct mbuf *ip_reass __P((struct mbuf *, struct ipqhead *, struct ipq *)); +#endif +static struct in_ifaddr *ip_rtaddr __P((struct in_addr)); +static void ipintr __P((void)); + +/* + * IP initialization: fill in IP protocol switch table. + * All protocols not implemented in kernel go to raw IP protocol handler. + */ +void +ip_init() +{ + register struct ipprotosw *pr; + register int i; + + TAILQ_INIT(&in_ifaddrhead); + pr = (struct ipprotosw *)pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); + if (pr == 0) + panic("ip_init"); + for (i = 0; i < IPPROTO_MAX; i++) + ip_protox[i] = pr - inetsw; + for (pr = (struct ipprotosw *)inetdomain.dom_protosw; + pr < (struct ipprotosw *)inetdomain.dom_protoswNPROTOSW; pr++) + if (pr->pr_domain->dom_family == PF_INET && + pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) + ip_protox[pr->pr_protocol] = pr - inetsw; + + for (i = 0; i < IPREASS_NHASH; i++) + TAILQ_INIT(&ipq[i]); + + maxnipq = nmbclusters/4; + + ip_id = time_second & 0xffff; + ipintrq.ifq_maxlen = ipqmaxlen; + mtx_init(&ipintrq.ifq_mtx, "ip_inq", MTX_DEF); + + register_netisr(NETISR_IP, ipintr); +} + +static struct sockaddr_in ipaddr = { sizeof(ipaddr), AF_INET }; +struct route ipforward_rt; + +/* + * Ip input routine. Checksum and byte swap header. If fragmented + * try to reassemble. Process options. Pass to next level. + */ +void +ip_input(struct mbuf *m) +{ + struct ip *ip; + struct ipq *fp; + struct in_ifaddr *ia = NULL; + int i, hlen, checkif; + u_short sum; + u_int16_t divert_cookie; /* firewall cookie */ + struct in_addr pkt_dst; +#ifdef IPDIVERT + u_int32_t divert_info = 0; /* packet divert/tee info */ +#endif + struct ip_fw_chain *rule = NULL; +#ifdef PFIL_HOOKS + struct packet_filter_hook *pfh; + struct mbuf *m0; + int rv; +#endif /* PFIL_HOOKS */ + +#ifdef IPDIVERT + /* Get and reset firewall cookie */ + divert_cookie = ip_divert_cookie; + ip_divert_cookie = 0; +#else + divert_cookie = 0; +#endif + +#if defined(IPFIREWALL) && defined(DUMMYNET) + /* + * dummynet packet are prepended a vestigial mbuf with + * m_type = MT_DUMMYNET and m_data pointing to the matching + * rule. + */ + if (m->m_type == MT_DUMMYNET) { + rule = (struct ip_fw_chain *)(m->m_data) ; + m = m->m_next ; + ip = mtod(m, struct ip *); + hlen = IP_VHL_HL(ip->ip_vhl) << 2; + goto iphack ; + } else + rule = NULL ; +#endif + +#ifdef DIAGNOSTIC + if (m == NULL || (m->m_flags & M_PKTHDR) == 0) + panic("ip_input no HDR"); +#endif + ipstat.ips_total++; + + if (m->m_pkthdr.len < sizeof(struct ip)) + goto tooshort; + + if (m->m_len < sizeof (struct ip) && + (m = m_pullup(m, sizeof (struct ip))) == 0) { + ipstat.ips_toosmall++; + return; + } + ip = mtod(m, struct ip *); + + if (IP_VHL_V(ip->ip_vhl) != IPVERSION) { + ipstat.ips_badvers++; + goto bad; + } + + hlen = IP_VHL_HL(ip->ip_vhl) << 2; + if (hlen < sizeof(struct ip)) { /* minimum header length */ + ipstat.ips_badhlen++; + goto bad; + } + if (hlen > m->m_len) { + if ((m = m_pullup(m, hlen)) == 0) { + ipstat.ips_badhlen++; + return; + } + ip = mtod(m, struct ip *); + } + if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) { + sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID); + } else { + if (hlen == sizeof(struct ip)) { + sum = in_cksum_hdr(ip); + } else { + sum = in_cksum(m, hlen); + } + } + if (sum) { + ipstat.ips_badsum++; + goto bad; + } + + /* + * Convert fields to host representation. + */ + NTOHS(ip->ip_len); + if (ip->ip_len < hlen) { + ipstat.ips_badlen++; + goto bad; + } + NTOHS(ip->ip_off); + + /* + * Check that the amount of data in the buffers + * is as at least much as the IP header would have us expect. + * Trim mbufs if longer than we expect. + * Drop packet if shorter than we expect. + */ + if (m->m_pkthdr.len < ip->ip_len) { +tooshort: + ipstat.ips_tooshort++; + goto bad; + } + if (m->m_pkthdr.len > ip->ip_len) { + if (m->m_len == m->m_pkthdr.len) { + m->m_len = ip->ip_len; + m->m_pkthdr.len = ip->ip_len; + } else + m_adj(m, ip->ip_len - m->m_pkthdr.len); + } + + /* + * Don't accept packets with a loopback destination address + * unless they arrived via the loopback interface. + */ + if ((ntohl(ip->ip_dst.s_addr) & IN_CLASSA_NET) == + (IN_LOOPBACKNET << IN_CLASSA_NSHIFT) && + (m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0) { + goto bad; + } + + /* + * IpHack's section. + * Right now when no processing on packet has done + * and it is still fresh out of network we do our black + * deals with it. + * - Firewall: deny/allow/divert + * - Xlate: translate packet's addr/port (NAT). + * - Pipe: pass pkt through dummynet. + * - Wrap: fake packet's addr/port <unimpl.> + * - Encapsulate: put it in another IP and send out. <unimp.> + */ + +#if defined(IPFIREWALL) && defined(DUMMYNET) +iphack: +#endif + +#ifdef PFIL_HOOKS + /* + * Run through list of hooks for input packets. If there are any + * filters which require that additional packets in the flow are + * not fast-forwarded, they must clear the M_CANFASTFWD flag. + * Note that filters must _never_ set this flag, as another filter + * in the list may have previously cleared it. + */ + m0 = m; + pfh = pfil_hook_get(PFIL_IN, &inetsw[ip_protox[IPPROTO_IP]].pr_pfh); + for (; pfh; pfh = TAILQ_NEXT(pfh, pfil_link)) + if (pfh->pfil_func) { + rv = pfh->pfil_func(ip, hlen, + m->m_pkthdr.rcvif, 0, &m0); + if (rv) + return; + m = m0; + if (m == NULL) + return; + ip = mtod(m, struct ip *); + } +#endif /* PFIL_HOOKS */ + + if (fw_enable && ip_fw_chk_ptr) { +#ifdef IPFIREWALL_FORWARD + /* + * If we've been forwarded from the output side, then + * skip the firewall a second time + */ + if (ip_fw_fwd_addr) + goto ours; +#endif /* IPFIREWALL_FORWARD */ + /* + * See the comment in ip_output for the return values + * produced by the firewall. + */ + i = (*ip_fw_chk_ptr)(&ip, + hlen, NULL, &divert_cookie, &m, &rule, &ip_fw_fwd_addr); + if (i & IP_FW_PORT_DENY_FLAG) { /* XXX new interface-denied */ + if (m) + m_freem(m); + return ; + } + if (m == NULL) { /* Packet discarded by firewall */ + static int __debug=10; + if (__debug >0) { + printf("firewall returns NULL, please update!\n"); + __debug-- ; + } + return; + } + if (i == 0 && ip_fw_fwd_addr == NULL) /* common case */ + goto pass; +#ifdef DUMMYNET + if ((i & IP_FW_PORT_DYNT_FLAG) != 0) { + /* Send packet to the appropriate pipe */ + dummynet_io(i&0xffff,DN_TO_IP_IN,m,NULL,NULL,0, rule, + 0); + return; + } +#endif +#ifdef IPDIVERT + if (i != 0 && (i & IP_FW_PORT_DYNT_FLAG) == 0) { + /* Divert or tee packet */ + divert_info = i; + goto ours; + } +#endif +#ifdef IPFIREWALL_FORWARD + if (i == 0 && ip_fw_fwd_addr != NULL) + goto pass; +#endif + /* + * if we get here, the packet must be dropped + */ + m_freem(m); + return; + } +pass: + + /* + * Process options and, if not destined for us, + * ship it on. ip_dooptions returns 1 when an + * error was detected (causing an icmp message + * to be sent and the original packet to be freed). + */ + ip_nhops = 0; /* for source routed packets */ + if (hlen > sizeof (struct ip) && ip_dooptions(m)) { +#ifdef IPFIREWALL_FORWARD + ip_fw_fwd_addr = NULL; +#endif + return; + } + + /* greedy RSVP, snatches any PATH packet of the RSVP protocol and no + * matter if it is destined to another node, or whether it is + * a multicast one, RSVP wants it! and prevents it from being forwarded + * anywhere else. Also checks if the rsvp daemon is running before + * grabbing the packet. + */ + if (rsvp_on && ip->ip_p==IPPROTO_RSVP) + goto ours; + + /* + * Check our list of addresses, to see if the packet is for us. + * If we don't have any addresses, assume any unicast packet + * we receive might be for us (and let the upper layers deal + * with it). + */ + if (TAILQ_EMPTY(&in_ifaddrhead) && + (m->m_flags & (M_MCAST|M_BCAST)) == 0) + goto ours; + + /* + * Cache the destination address of the packet; this may be + * changed by use of 'ipfw fwd'. + */ + pkt_dst = ip_fw_fwd_addr == NULL ? + ip->ip_dst : ip_fw_fwd_addr->sin_addr; + + /* + * Enable a consistency check between the destination address + * and the arrival interface for a unicast packet (the RFC 1122 + * strong ES model) if IP forwarding is disabled and the packet + * is not locally generated and the packet is not subject to + * 'ipfw fwd'. + * + * XXX - Checking also should be disabled if the destination + * address is ipnat'ed to a different interface. + * + * XXX - Checking is incompatible with IP aliases added + * to the loopback interface instead of the interface where + * the packets are received. + */ + checkif = ip_checkinterface && (ipforwarding == 0) && + ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0) && + (ip_fw_fwd_addr == NULL); + + TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { +#define satosin(sa) ((struct sockaddr_in *)(sa)) + +#ifdef BOOTP_COMPAT + if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY) + goto ours; +#endif + /* + * If the address matches, verify that the packet + * arrived via the correct interface if checking is + * enabled. + */ + if (IA_SIN(ia)->sin_addr.s_addr == pkt_dst.s_addr && + (!checkif || ia->ia_ifp == m->m_pkthdr.rcvif)) + goto ours; + /* + * Only accept broadcast packets that arrive via the + * matching interface. Reception of forwarded directed + * broadcasts would be handled via ip_forward() and + * ether_output() with the loopback into the stack for + * SIMPLEX interfaces handled by ether_output(). + */ + if (ia->ia_ifp == m->m_pkthdr.rcvif && + ia->ia_ifp && ia->ia_ifp->if_flags & IFF_BROADCAST) { + if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == + pkt_dst.s_addr) + goto ours; + if (ia->ia_netbroadcast.s_addr == pkt_dst.s_addr) + goto ours; + } + } + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { + struct in_multi *inm; + if (ip_mrouter) { + /* + * If we are acting as a multicast router, all + * incoming multicast packets are passed to the + * kernel-level multicast forwarding function. + * The packet is returned (relatively) intact; if + * ip_mforward() returns a non-zero value, the packet + * must be discarded, else it may be accepted below. + */ + if (ip_mforward(ip, m->m_pkthdr.rcvif, m, 0) != 0) { + ipstat.ips_cantforward++; + m_freem(m); + return; + } + + /* + * The process-level routing demon needs to receive + * all multicast IGMP packets, whether or not this + * host belongs to their destination groups. + */ + if (ip->ip_p == IPPROTO_IGMP) + goto ours; + ipstat.ips_forward++; + } + /* + * See if we belong to the destination multicast group on the + * arrival interface. + */ + IN_LOOKUP_MULTI(ip->ip_dst, m->m_pkthdr.rcvif, inm); + if (inm == NULL) { + ipstat.ips_notmember++; + m_freem(m); + return; + } + goto ours; + } + if (ip->ip_dst.s_addr == (u_long)INADDR_BROADCAST) + goto ours; + if (ip->ip_dst.s_addr == INADDR_ANY) + goto ours; + +#if defined(NFAITH) && 0 < NFAITH + /* + * FAITH(Firewall Aided Internet Translator) + */ + if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type == IFT_FAITH) { + if (ip_keepfaith) { + if (ip->ip_p == IPPROTO_TCP || ip->ip_p == IPPROTO_ICMP) + goto ours; + } + m_freem(m); + return; + } +#endif + /* + * Not for us; forward if possible and desirable. + */ + if (ipforwarding == 0) { + ipstat.ips_cantforward++; + m_freem(m); + } else + ip_forward(m, 0); +#ifdef IPFIREWALL_FORWARD + ip_fw_fwd_addr = NULL; +#endif + return; + +ours: + /* Count the packet in the ip address stats */ + if (ia != NULL) { + ia->ia_ifa.if_ipackets++; + ia->ia_ifa.if_ibytes += m->m_pkthdr.len; + } + + /* + * If offset or IP_MF are set, must reassemble. + * Otherwise, nothing need be done. + * (We could look in the reassembly queue to see + * if the packet was previously fragmented, + * but it's not worth the time; just let them time out.) + */ + if (ip->ip_off & (IP_MF | IP_OFFMASK)) { + + sum = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id); + /* + * Look for queue of fragments + * of this datagram. + */ + TAILQ_FOREACH(fp, &ipq[sum], ipq_list) + if (ip->ip_id == fp->ipq_id && + ip->ip_src.s_addr == fp->ipq_src.s_addr && + ip->ip_dst.s_addr == fp->ipq_dst.s_addr && + ip->ip_p == fp->ipq_p) + goto found; + + fp = 0; + + /* check if there's a place for the new queue */ + if (nipq > maxnipq) { + /* + * drop something from the tail of the current queue + * before proceeding further + */ + struct ipq *q = TAILQ_LAST(&ipq[sum], ipqhead); + if (q == NULL) { /* gak */ + for (i = 0; i < IPREASS_NHASH; i++) { + struct ipq *r = TAILQ_LAST(&ipq[i], ipqhead); + if (r) { + ip_freef(&ipq[i], r); + break; + } + } + } else + ip_freef(&ipq[sum], q); + } +found: + /* + * Adjust ip_len to not reflect header, + * convert offset of this to bytes. + */ + ip->ip_len -= hlen; + if (ip->ip_off & IP_MF) { + /* + * Make sure that fragments have a data length + * that's a non-zero multiple of 8 bytes. + */ + if (ip->ip_len == 0 || (ip->ip_len & 0x7) != 0) { + ipstat.ips_toosmall++; /* XXX */ + goto bad; + } + m->m_flags |= M_FRAG; + } + ip->ip_off <<= 3; + + /* + * Attempt reassembly; if it succeeds, proceed. + */ + ipstat.ips_fragments++; + m->m_pkthdr.header = ip; +#ifdef IPDIVERT + m = ip_reass(m, + &ipq[sum], fp, &divert_info, &divert_cookie); +#else + m = ip_reass(m, &ipq[sum], fp); +#endif + if (m == 0) { +#ifdef IPFIREWALL_FORWARD + ip_fw_fwd_addr = NULL; +#endif + return; + } + ipstat.ips_reassembled++; + ip = mtod(m, struct ip *); + /* Get the header length of the reassembled packet */ + hlen = IP_VHL_HL(ip->ip_vhl) << 2; +#ifdef IPDIVERT + /* Restore original checksum before diverting packet */ + if (divert_info != 0) { + ip->ip_len += hlen; + HTONS(ip->ip_len); + HTONS(ip->ip_off); + ip->ip_sum = 0; + if (hlen == sizeof(struct ip)) + ip->ip_sum = in_cksum_hdr(ip); + else + ip->ip_sum = in_cksum(m, hlen); + NTOHS(ip->ip_off); + NTOHS(ip->ip_len); + ip->ip_len -= hlen; + } +#endif + } else + ip->ip_len -= hlen; + +#ifdef IPDIVERT + /* + * Divert or tee packet to the divert protocol if required. + * + * If divert_info is zero then cookie should be too, so we shouldn't + * need to clear them here. Assume divert_packet() does so also. + */ + if (divert_info != 0) { + struct mbuf *clone = NULL; + + /* Clone packet if we're doing a 'tee' */ + if ((divert_info & IP_FW_PORT_TEE_FLAG) != 0) + clone = m_dup(m, M_DONTWAIT); + + /* Restore packet header fields to original values */ + ip->ip_len += hlen; + HTONS(ip->ip_len); + HTONS(ip->ip_off); + + /* Deliver packet to divert input routine */ + ip_divert_cookie = divert_cookie; + divert_packet(m, 1, divert_info & 0xffff); + ipstat.ips_delivered++; + + /* If 'tee', continue with original packet */ + if (clone == NULL) + return; + m = clone; + ip = mtod(m, struct ip *); + } +#endif + + /* + * Switch out to protocol's input routine. + */ + ipstat.ips_delivered++; + { + int off = hlen, nh = ip->ip_p; + + (*inetsw[ip_protox[ip->ip_p]].pr_input)(m, off, nh); +#ifdef IPFIREWALL_FORWARD + ip_fw_fwd_addr = NULL; /* tcp needed it */ +#endif + return; + } +bad: +#ifdef IPFIREWALL_FORWARD + ip_fw_fwd_addr = NULL; +#endif + m_freem(m); +} + +/* + * IP software interrupt routine - to go away sometime soon + */ +static void +ipintr(void) +{ + struct mbuf *m; + + while (1) { + IF_DEQUEUE(&ipintrq, m); + if (m == 0) + return; + ip_input(m); + } +} + +/* + * Take incoming datagram fragment and try to reassemble it into + * whole datagram. If a chain for reassembly of this datagram already + * exists, then it is given as fp; otherwise have to make a chain. + * + * When IPDIVERT enabled, keep additional state with each packet that + * tells us if we need to divert or tee the packet we're building. + */ + +static struct mbuf * +#ifdef IPDIVERT +ip_reass(m, head, fp, divinfo, divcookie) +#else +ip_reass(m, head, fp) +#endif + struct mbuf *m; + struct ipqhead *head; + struct ipq *fp; +#ifdef IPDIVERT + u_int32_t *divinfo; + u_int16_t *divcookie; +#endif +{ + struct ip *ip = mtod(m, struct ip *); + register struct mbuf *p, *q, *nq; + struct mbuf *t; + int hlen = IP_VHL_HL(ip->ip_vhl) << 2; + int i, next; + + /* + * Presence of header sizes in mbufs + * would confuse code below. + */ + m->m_data += hlen; + m->m_len -= hlen; + + /* + * If first fragment to arrive, create a reassembly queue. + */ + if (fp == 0) { + if ((t = m_get(M_DONTWAIT, MT_FTABLE)) == NULL) + goto dropfrag; + fp = mtod(t, struct ipq *); + TAILQ_INSERT_HEAD(head, fp, ipq_list); + nipq++; + fp->ipq_ttl = IPFRAGTTL; + fp->ipq_p = ip->ip_p; + fp->ipq_id = ip->ip_id; + fp->ipq_src = ip->ip_src; + fp->ipq_dst = ip->ip_dst; + fp->ipq_frags = m; + m->m_nextpkt = NULL; +#ifdef IPDIVERT + fp->ipq_div_info = 0; + fp->ipq_div_cookie = 0; +#endif + goto inserted; + } + +#define GETIP(m) ((struct ip*)((m)->m_pkthdr.header)) + + /* + * Find a segment which begins after this one does. + */ + for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) + if (GETIP(q)->ip_off > ip->ip_off) + break; + + /* + * If there is a preceding segment, it may provide some of + * our data already. If so, drop the data from the incoming + * segment. If it provides all of our data, drop us, otherwise + * stick new segment in the proper place. + * + * If some of the data is dropped from the the preceding + * segment, then it's checksum is invalidated. + */ + if (p) { + i = GETIP(p)->ip_off + GETIP(p)->ip_len - ip->ip_off; + if (i > 0) { + if (i >= ip->ip_len) + goto dropfrag; + m_adj(m, i); + m->m_pkthdr.csum_flags = 0; + ip->ip_off += i; + ip->ip_len -= i; + } + m->m_nextpkt = p->m_nextpkt; + p->m_nextpkt = m; + } else { + m->m_nextpkt = fp->ipq_frags; + fp->ipq_frags = m; + } + + /* + * While we overlap succeeding segments trim them or, + * if they are completely covered, dequeue them. + */ + for (; q != NULL && ip->ip_off + ip->ip_len > GETIP(q)->ip_off; + q = nq) { + i = (ip->ip_off + ip->ip_len) - + GETIP(q)->ip_off; + if (i < GETIP(q)->ip_len) { + GETIP(q)->ip_len -= i; + GETIP(q)->ip_off += i; + m_adj(q, i); + q->m_pkthdr.csum_flags = 0; + break; + } + nq = q->m_nextpkt; + m->m_nextpkt = nq; + m_freem(q); + } + +inserted: + +#ifdef IPDIVERT + /* + * Transfer firewall instructions to the fragment structure. + * Any fragment diverting causes the whole packet to divert. + */ + fp->ipq_div_info = *divinfo; + fp->ipq_div_cookie = *divcookie; + *divinfo = 0; + *divcookie = 0; +#endif + + /* + * Check for complete reassembly. + */ + next = 0; + for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) { + if (GETIP(q)->ip_off != next) + return (0); + next += GETIP(q)->ip_len; + } + /* Make sure the last packet didn't have the IP_MF flag */ + if (p->m_flags & M_FRAG) + return (0); + + /* + * Reassembly is complete. Make sure the packet is a sane size. + */ + q = fp->ipq_frags; + ip = GETIP(q); + if (next + (IP_VHL_HL(ip->ip_vhl) << 2) > IP_MAXPACKET) { + ipstat.ips_toolong++; + ip_freef(head, fp); + return (0); + } + + /* + * Concatenate fragments. + */ + m = q; + t = m->m_next; + m->m_next = 0; + m_cat(m, t); + nq = q->m_nextpkt; + q->m_nextpkt = 0; + for (q = nq; q != NULL; q = nq) { + nq = q->m_nextpkt; + q->m_nextpkt = NULL; + m->m_pkthdr.csum_flags &= q->m_pkthdr.csum_flags; + m->m_pkthdr.csum_data += q->m_pkthdr.csum_data; + m_cat(m, q); + } + +#ifdef IPDIVERT + /* + * Extract firewall instructions from the fragment structure. + */ + *divinfo = fp->ipq_div_info; + *divcookie = fp->ipq_div_cookie; +#endif + + /* + * Create header for new ip packet by + * modifying header of first packet; + * dequeue and discard fragment reassembly header. + * Make header visible. + */ + ip->ip_len = next; + ip->ip_src = fp->ipq_src; + ip->ip_dst = fp->ipq_dst; + TAILQ_REMOVE(head, fp, ipq_list); + nipq--; + (void) m_free(dtom(fp)); + m->m_len += (IP_VHL_HL(ip->ip_vhl) << 2); + m->m_data -= (IP_VHL_HL(ip->ip_vhl) << 2); + /* some debugging cruft by sklower, below, will go away soon */ + if (m->m_flags & M_PKTHDR) { /* XXX this should be done elsewhere */ + register int plen = 0; + for (t = m; t; t = t->m_next) + plen += t->m_len; + m->m_pkthdr.len = plen; + } + return (m); + +dropfrag: +#ifdef IPDIVERT + *divinfo = 0; + *divcookie = 0; +#endif + ipstat.ips_fragdropped++; + m_freem(m); + return (0); + +#undef GETIP +} + +/* + * Free a fragment reassembly header and all + * associated datagrams. + */ +static void +ip_freef(fhp, fp) + struct ipqhead *fhp; + struct ipq *fp; +{ + register struct mbuf *q; + + while (fp->ipq_frags) { + q = fp->ipq_frags; + fp->ipq_frags = q->m_nextpkt; + m_freem(q); + } + TAILQ_REMOVE(fhp, fp, ipq_list); + (void) m_free(dtom(fp)); + nipq--; +} + +/* + * IP timer processing; + * if a timer expires on a reassembly + * queue, discard it. + */ +void +ip_slowtimo() +{ + register struct ipq *fp; + int s = splnet(); + int i; + + for (i = 0; i < IPREASS_NHASH; i++) { + for(fp = TAILQ_FIRST(&ipq[i]); fp;) { + struct ipq *fpp; + + fpp = fp; + fp = TAILQ_NEXT(fp, ipq_list); + if(--fpp->ipq_ttl == 0) { + ipstat.ips_fragtimeout++; + ip_freef(&ipq[i], fpp); + } + } + } + ipflow_slowtimo(); + splx(s); +} + +/* + * Drain off all datagram fragments. + */ +void +ip_drain() +{ + int i; + + for (i = 0; i < IPREASS_NHASH; i++) { + while(!TAILQ_EMPTY(&ipq[i])) { + ipstat.ips_fragdropped++; + ip_freef(&ipq[i], TAILQ_FIRST(&ipq[i])); + } + } + in_rtqdrain(); +} + +/* + * Do option processing on a datagram, + * possibly discarding it if bad options are encountered, + * or forwarding it if source-routed. + * Returns 1 if packet has been forwarded/freed, + * 0 if the packet should be processed further. + */ +static int +ip_dooptions(m) + struct mbuf *m; +{ + register struct ip *ip = mtod(m, struct ip *); + register u_char *cp; + register struct ip_timestamp *ipt; + register struct in_ifaddr *ia; + int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0; + struct in_addr *sin, dst; + n_time ntime; + + dst = ip->ip_dst; + cp = (u_char *)(ip + 1); + cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip); + for (; cnt > 0; cnt -= optlen, cp += optlen) { + opt = cp[IPOPT_OPTVAL]; + if (opt == IPOPT_EOL) + break; + if (opt == IPOPT_NOP) + optlen = 1; + else { + if (cnt < IPOPT_OLEN + sizeof(*cp)) { + code = &cp[IPOPT_OLEN] - (u_char *)ip; + goto bad; + } + optlen = cp[IPOPT_OLEN]; + if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) { + code = &cp[IPOPT_OLEN] - (u_char *)ip; + goto bad; + } + } + switch (opt) { + + default: + break; + + /* + * Source routing with record. + * Find interface with current destination address. + * If none on this machine then drop if strictly routed, + * or do nothing if loosely routed. + * Record interface address and bring up next address + * component. If strictly routed make sure next + * address is on directly accessible net. + */ + case IPOPT_LSRR: + case IPOPT_SSRR: + if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) { + code = &cp[IPOPT_OFFSET] - (u_char *)ip; + goto bad; + } + ipaddr.sin_addr = ip->ip_dst; + ia = (struct in_ifaddr *) + ifa_ifwithaddr((struct sockaddr *)&ipaddr); + if (ia == 0) { + if (opt == IPOPT_SSRR) { + type = ICMP_UNREACH; + code = ICMP_UNREACH_SRCFAIL; + goto bad; + } + if (!ip_dosourceroute) + goto nosourcerouting; + /* + * Loose routing, and not at next destination + * yet; nothing to do except forward. + */ + break; + } + off--; /* 0 origin */ + if (off > optlen - (int)sizeof(struct in_addr)) { + /* + * End of source route. Should be for us. + */ + if (!ip_acceptsourceroute) + goto nosourcerouting; + save_rte(cp, ip->ip_src); + break; + } + + if (!ip_dosourceroute) { + if (ipforwarding) { + char buf[16]; /* aaa.bbb.ccc.ddd\0 */ + /* + * Acting as a router, so generate ICMP + */ +nosourcerouting: + strcpy(buf, inet_ntoa(ip->ip_dst)); + log(LOG_WARNING, + "attempted source route from %s to %s\n", + inet_ntoa(ip->ip_src), buf); + type = ICMP_UNREACH; + code = ICMP_UNREACH_SRCFAIL; + goto bad; + } else { + /* + * Not acting as a router, so silently drop. + */ + ipstat.ips_cantforward++; + m_freem(m); + return (1); + } + } + + /* + * locate outgoing interface + */ + (void)memcpy(&ipaddr.sin_addr, cp + off, + sizeof(ipaddr.sin_addr)); + + if (opt == IPOPT_SSRR) { +#define INA struct in_ifaddr * +#define SA struct sockaddr * + if ((ia = (INA)ifa_ifwithdstaddr((SA)&ipaddr)) == 0) + ia = (INA)ifa_ifwithnet((SA)&ipaddr); + } else + ia = ip_rtaddr(ipaddr.sin_addr); + if (ia == 0) { + type = ICMP_UNREACH; + code = ICMP_UNREACH_SRCFAIL; + goto bad; + } + ip->ip_dst = ipaddr.sin_addr; + (void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr), + sizeof(struct in_addr)); + cp[IPOPT_OFFSET] += sizeof(struct in_addr); + /* + * Let ip_intr's mcast routing check handle mcast pkts + */ + forward = !IN_MULTICAST(ntohl(ip->ip_dst.s_addr)); + break; + + case IPOPT_RR: + if (optlen < IPOPT_OFFSET + sizeof(*cp)) { + code = &cp[IPOPT_OFFSET] - (u_char *)ip; + goto bad; + } + if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) { + code = &cp[IPOPT_OFFSET] - (u_char *)ip; + goto bad; + } + /* + * If no space remains, ignore. + */ + off--; /* 0 origin */ + if (off > optlen - (int)sizeof(struct in_addr)) + break; + (void)memcpy(&ipaddr.sin_addr, &ip->ip_dst, + sizeof(ipaddr.sin_addr)); + /* + * locate outgoing interface; if we're the destination, + * use the incoming interface (should be same). + */ + if ((ia = (INA)ifa_ifwithaddr((SA)&ipaddr)) == 0 && + (ia = ip_rtaddr(ipaddr.sin_addr)) == 0) { + type = ICMP_UNREACH; + code = ICMP_UNREACH_HOST; + goto bad; + } + (void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr), + sizeof(struct in_addr)); + cp[IPOPT_OFFSET] += sizeof(struct in_addr); + break; + + case IPOPT_TS: + code = cp - (u_char *)ip; + ipt = (struct ip_timestamp *)cp; + if (ipt->ipt_len < 5) + goto bad; + if (ipt->ipt_ptr > + ipt->ipt_len - (int)sizeof(int32_t)) { + if (++ipt->ipt_oflw == 0) + goto bad; + break; + } + sin = (struct in_addr *)(cp + ipt->ipt_ptr - 1); + switch (ipt->ipt_flg) { + + case IPOPT_TS_TSONLY: + break; + + case IPOPT_TS_TSANDADDR: + if (ipt->ipt_ptr - 1 + sizeof(n_time) + + sizeof(struct in_addr) > ipt->ipt_len) + goto bad; + ipaddr.sin_addr = dst; + ia = (INA)ifaof_ifpforaddr((SA)&ipaddr, + m->m_pkthdr.rcvif); + if (ia == 0) + continue; + (void)memcpy(sin, &IA_SIN(ia)->sin_addr, + sizeof(struct in_addr)); + ipt->ipt_ptr += sizeof(struct in_addr); + break; + + case IPOPT_TS_PRESPEC: + if (ipt->ipt_ptr - 1 + sizeof(n_time) + + sizeof(struct in_addr) > ipt->ipt_len) + goto bad; + (void)memcpy(&ipaddr.sin_addr, sin, + sizeof(struct in_addr)); + if (ifa_ifwithaddr((SA)&ipaddr) == 0) + continue; + ipt->ipt_ptr += sizeof(struct in_addr); + break; + + default: + goto bad; + } + ntime = iptime(); + (void)memcpy(cp + ipt->ipt_ptr - 1, &ntime, + sizeof(n_time)); + ipt->ipt_ptr += sizeof(n_time); + } + } + if (forward && ipforwarding) { + ip_forward(m, 1); + return (1); + } + return (0); +bad: + icmp_error(m, type, code, 0, 0); + ipstat.ips_badoptions++; + return (1); +} + +/* + * Given address of next destination (final or next hop), + * return internet address info of interface to be used to get there. + */ +static struct in_ifaddr * +ip_rtaddr(dst) + struct in_addr dst; +{ + register struct sockaddr_in *sin; + + sin = (struct sockaddr_in *) &ipforward_rt.ro_dst; + + if (ipforward_rt.ro_rt == 0 || + !(ipforward_rt.ro_rt->rt_flags & RTF_UP) || + dst.s_addr != sin->sin_addr.s_addr) { + if (ipforward_rt.ro_rt) { + RTFREE(ipforward_rt.ro_rt); + ipforward_rt.ro_rt = 0; + } + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + sin->sin_addr = dst; + + rtalloc_ign(&ipforward_rt, RTF_PRCLONING); + } + if (ipforward_rt.ro_rt == 0) + return ((struct in_ifaddr *)0); + return ((struct in_ifaddr *) ipforward_rt.ro_rt->rt_ifa); +} + +/* + * Save incoming source route for use in replies, + * to be picked up later by ip_srcroute if the receiver is interested. + */ +void +save_rte(option, dst) + u_char *option; + struct in_addr dst; +{ + unsigned olen; + + olen = option[IPOPT_OLEN]; +#ifdef DIAGNOSTIC + if (ipprintfs) + printf("save_rte: olen %d\n", olen); +#endif + if (olen > sizeof(ip_srcrt) - (1 + sizeof(dst))) + return; + bcopy(option, ip_srcrt.srcopt, olen); + ip_nhops = (olen - IPOPT_OFFSET - 1) / sizeof(struct in_addr); + ip_srcrt.dst = dst; +} + +/* + * Retrieve incoming source route for use in replies, + * in the same form used by setsockopt. + * The first hop is placed before the options, will be removed later. + */ +struct mbuf * +ip_srcroute() +{ + register struct in_addr *p, *q; + register struct mbuf *m; + + if (ip_nhops == 0) + return ((struct mbuf *)0); + m = m_get(M_DONTWAIT, MT_HEADER); + if (m == 0) + return ((struct mbuf *)0); + +#define OPTSIZ (sizeof(ip_srcrt.nop) + sizeof(ip_srcrt.srcopt)) + + /* length is (nhops+1)*sizeof(addr) + sizeof(nop + srcrt header) */ + m->m_len = ip_nhops * sizeof(struct in_addr) + sizeof(struct in_addr) + + OPTSIZ; +#ifdef DIAGNOSTIC + if (ipprintfs) + printf("ip_srcroute: nhops %d mlen %d", ip_nhops, m->m_len); +#endif + + /* + * First save first hop for return route + */ + p = &ip_srcrt.route[ip_nhops - 1]; + *(mtod(m, struct in_addr *)) = *p--; +#ifdef DIAGNOSTIC + if (ipprintfs) + printf(" hops %lx", (u_long)ntohl(mtod(m, struct in_addr *)->s_addr)); +#endif + + /* + * Copy option fields and padding (nop) to mbuf. + */ + ip_srcrt.nop = IPOPT_NOP; + ip_srcrt.srcopt[IPOPT_OFFSET] = IPOPT_MINOFF; + (void)memcpy(mtod(m, caddr_t) + sizeof(struct in_addr), + &ip_srcrt.nop, OPTSIZ); + q = (struct in_addr *)(mtod(m, caddr_t) + + sizeof(struct in_addr) + OPTSIZ); +#undef OPTSIZ + /* + * Record return path as an IP source route, + * reversing the path (pointers are now aligned). + */ + while (p >= ip_srcrt.route) { +#ifdef DIAGNOSTIC + if (ipprintfs) + printf(" %lx", (u_long)ntohl(q->s_addr)); +#endif + *q++ = *p--; + } + /* + * Last hop goes to final destination. + */ + *q = ip_srcrt.dst; +#ifdef DIAGNOSTIC + if (ipprintfs) + printf(" %lx\n", (u_long)ntohl(q->s_addr)); +#endif + return (m); +} + +/* + * Strip out IP options, at higher + * level protocol in the kernel. + * Second argument is buffer to which options + * will be moved, and return value is their length. + * XXX should be deleted; last arg currently ignored. + */ +void +ip_stripoptions(m, mopt) + register struct mbuf *m; + struct mbuf *mopt; +{ + register int i; + struct ip *ip = mtod(m, struct ip *); + register caddr_t opts; + int olen; + + olen = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip); + opts = (caddr_t)(ip + 1); + i = m->m_len - (sizeof (struct ip) + olen); + bcopy(opts + olen, opts, (unsigned)i); + m->m_len -= olen; + if (m->m_flags & M_PKTHDR) + m->m_pkthdr.len -= olen; + ip->ip_vhl = IP_MAKE_VHL(IPVERSION, sizeof(struct ip) >> 2); +} + +u_char inetctlerrmap[PRC_NCMDS] = { + 0, 0, 0, 0, + 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH, + EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED, + EMSGSIZE, EHOSTUNREACH, 0, 0, + 0, 0, 0, 0, + ENOPROTOOPT, ENETRESET +}; + +/* + * Forward a packet. If some error occurs return the sender + * an icmp packet. Note we can't always generate a meaningful + * icmp message because icmp doesn't have a large enough repertoire + * of codes and types. + * + * If not forwarding, just drop the packet. This could be confusing + * if ipforwarding was zero but some routing protocol was advancing + * us as a gateway to somewhere. However, we must let the routing + * protocol deal with that. + * + * The srcrt parameter indicates whether the packet is being forwarded + * via a source route. + */ +static void +ip_forward(m, srcrt) + struct mbuf *m; + int srcrt; +{ + register struct ip *ip = mtod(m, struct ip *); + register struct rtentry *rt; + int error, type = 0, code = 0; + struct mbuf *mcopy; + n_long dest; + struct ifnet *destifp; +#ifdef IPSEC + struct ifnet dummyifp; +#endif + + dest = 0; +#ifdef DIAGNOSTIC + if (ipprintfs) + printf("forward: src %lx dst %lx ttl %x\n", + (u_long)ip->ip_src.s_addr, (u_long)ip->ip_dst.s_addr, + ip->ip_ttl); +#endif + + + if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) { + ipstat.ips_cantforward++; + m_freem(m); + return; + } +#ifdef IPSTEALTH + if (!ipstealth) { +#endif + if (ip->ip_ttl <= IPTTLDEC) { + icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, + dest, 0); + return; + } +#ifdef IPSTEALTH + } +#endif + + if (ip_rtaddr(ip->ip_dst) == 0) { + icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, dest, 0); + return; + } else + rt = ipforward_rt.ro_rt; + + /* + * Save the IP header and at most 8 bytes of the payload, + * in case we need to generate an ICMP message to the src. + * + * We don't use m_copy() because it might return a reference + * to a shared cluster. Both this function and ip_output() + * assume exclusive access to the IP header in `m', so any + * data in a cluster may change before we reach icmp_error(). + */ + MGET(mcopy, M_DONTWAIT, m->m_type); + if (mcopy != NULL) { + M_COPY_PKTHDR(mcopy, m); + mcopy->m_len = imin((IP_VHL_HL(ip->ip_vhl) << 2) + 8, + (int)ip->ip_len); + m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t)); + } + +#ifdef IPSTEALTH + if (!ipstealth) { +#endif + ip->ip_ttl -= IPTTLDEC; +#ifdef IPSTEALTH + } +#endif + + /* + * If forwarding packet using same interface that it came in on, + * perhaps should send a redirect to sender to shortcut a hop. + * Only send redirect if source is sending directly to us, + * and if packet was not source routed (or has any options). + * Also, don't send redirect if forwarding using a default route + * or a route modified by a redirect. + */ +#define satosin(sa) ((struct sockaddr_in *)(sa)) + if (rt->rt_ifp == m->m_pkthdr.rcvif && + (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 && + satosin(rt_key(rt))->sin_addr.s_addr != 0 && + ipsendredirects && !srcrt) { +#define RTA(rt) ((struct in_ifaddr *)(rt->rt_ifa)) + u_long src = ntohl(ip->ip_src.s_addr); + + if (RTA(rt) && + (src & RTA(rt)->ia_subnetmask) == RTA(rt)->ia_subnet) { + if (rt->rt_flags & RTF_GATEWAY) + dest = satosin(rt->rt_gateway)->sin_addr.s_addr; + else + dest = ip->ip_dst.s_addr; + /* Router requirements says to only send host redirects */ + type = ICMP_REDIRECT; + code = ICMP_REDIRECT_HOST; +#ifdef DIAGNOSTIC + if (ipprintfs) + printf("redirect (%d) to %lx\n", code, (u_long)dest); +#endif + } + } + + error = ip_output(m, (struct mbuf *)0, &ipforward_rt, + IP_FORWARDING, 0); + if (error) + ipstat.ips_cantforward++; + else { + ipstat.ips_forward++; + if (type) + ipstat.ips_redirectsent++; + else { + if (mcopy) { + ipflow_create(&ipforward_rt, mcopy); + m_freem(mcopy); + } + return; + } + } + if (mcopy == NULL) + return; + destifp = NULL; + + switch (error) { + + case 0: /* forwarded, but need redirect */ + /* type, code set above */ + break; + + case ENETUNREACH: /* shouldn't happen, checked above */ + case EHOSTUNREACH: + case ENETDOWN: + case EHOSTDOWN: + default: + type = ICMP_UNREACH; + code = ICMP_UNREACH_HOST; + break; + + case EMSGSIZE: + type = ICMP_UNREACH; + code = ICMP_UNREACH_NEEDFRAG; +#ifndef IPSEC + if (ipforward_rt.ro_rt) + destifp = ipforward_rt.ro_rt->rt_ifp; +#else + /* + * If the packet is routed over IPsec tunnel, tell the + * originator the tunnel MTU. + * tunnel MTU = if MTU - sizeof(IP) - ESP/AH hdrsiz + * XXX quickhack!!! + */ + if (ipforward_rt.ro_rt) { + struct secpolicy *sp = NULL; + int ipsecerror; + int ipsechdr; + struct route *ro; + + sp = ipsec4_getpolicybyaddr(mcopy, + IPSEC_DIR_OUTBOUND, + IP_FORWARDING, + &ipsecerror); + + if (sp == NULL) + destifp = ipforward_rt.ro_rt->rt_ifp; + else { + /* count IPsec header size */ + ipsechdr = ipsec4_hdrsiz(mcopy, + IPSEC_DIR_OUTBOUND, + NULL); + + /* + * find the correct route for outer IPv4 + * header, compute tunnel MTU. + * + * XXX BUG ALERT + * The "dummyifp" code relies upon the fact + * that icmp_error() touches only ifp->if_mtu. + */ + /*XXX*/ + destifp = NULL; + if (sp->req != NULL + && sp->req->sav != NULL + && sp->req->sav->sah != NULL) { + ro = &sp->req->sav->sah->sa_route; + if (ro->ro_rt && ro->ro_rt->rt_ifp) { + dummyifp.if_mtu = + ro->ro_rt->rt_ifp->if_mtu; + dummyifp.if_mtu -= ipsechdr; + destifp = &dummyifp; + } + } + + key_freesp(sp); + } + } +#endif /*IPSEC*/ + ipstat.ips_cantfrag++; + break; + + case ENOBUFS: + type = ICMP_SOURCEQUENCH; + code = 0; + break; + + case EACCES: /* ipfw denied packet */ + m_freem(mcopy); + return; + } + icmp_error(mcopy, type, code, dest, destifp); +} + +void +ip_savecontrol(inp, mp, ip, m) + register struct inpcb *inp; + register struct mbuf **mp; + register struct ip *ip; + register struct mbuf *m; +{ + if (inp->inp_socket->so_options & SO_TIMESTAMP) { + struct timeval tv; + + microtime(&tv); + *mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv), + SCM_TIMESTAMP, SOL_SOCKET); + if (*mp) + mp = &(*mp)->m_next; + } + if (inp->inp_flags & INP_RECVDSTADDR) { + *mp = sbcreatecontrol((caddr_t) &ip->ip_dst, + sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP); + if (*mp) + mp = &(*mp)->m_next; + } +#ifdef notyet + /* XXX + * Moving these out of udp_input() made them even more broken + * than they already were. + */ + /* options were tossed already */ + if (inp->inp_flags & INP_RECVOPTS) { + *mp = sbcreatecontrol((caddr_t) opts_deleted_above, + sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP); + if (*mp) + mp = &(*mp)->m_next; + } + /* ip_srcroute doesn't do what we want here, need to fix */ + if (inp->inp_flags & INP_RECVRETOPTS) { + *mp = sbcreatecontrol((caddr_t) ip_srcroute(), + sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP); + if (*mp) + mp = &(*mp)->m_next; + } +#endif + if (inp->inp_flags & INP_RECVIF) { + struct ifnet *ifp; + struct sdlbuf { + struct sockaddr_dl sdl; + u_char pad[32]; + } sdlbuf; + struct sockaddr_dl *sdp; + struct sockaddr_dl *sdl2 = &sdlbuf.sdl; + + if (((ifp = m->m_pkthdr.rcvif)) + && ( ifp->if_index && (ifp->if_index <= if_index))) { + sdp = (struct sockaddr_dl *)(ifnet_addrs + [ifp->if_index - 1]->ifa_addr); + /* + * Change our mind and don't try copy. + */ + if ((sdp->sdl_family != AF_LINK) + || (sdp->sdl_len > sizeof(sdlbuf))) { + goto makedummy; + } + bcopy(sdp, sdl2, sdp->sdl_len); + } else { +makedummy: + sdl2->sdl_len + = offsetof(struct sockaddr_dl, sdl_data[0]); + sdl2->sdl_family = AF_LINK; + sdl2->sdl_index = 0; + sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0; + } + *mp = sbcreatecontrol((caddr_t) sdl2, sdl2->sdl_len, + IP_RECVIF, IPPROTO_IP); + if (*mp) + mp = &(*mp)->m_next; + } +} + +int +ip_rsvp_init(struct socket *so) +{ + if (so->so_type != SOCK_RAW || + so->so_proto->pr_protocol != IPPROTO_RSVP) + return EOPNOTSUPP; + + if (ip_rsvpd != NULL) + return EADDRINUSE; + + ip_rsvpd = so; + /* + * This may seem silly, but we need to be sure we don't over-increment + * the RSVP counter, in case something slips up. + */ + if (!ip_rsvp_on) { + ip_rsvp_on = 1; + rsvp_on++; + } + + return 0; +} + +int +ip_rsvp_done(void) +{ + ip_rsvpd = NULL; + /* + * This may seem silly, but we need to be sure we don't over-decrement + * the RSVP counter, in case something slips up. + */ + if (ip_rsvp_on) { + ip_rsvp_on = 0; + rsvp_on--; + } + return 0; +} diff --git a/sys/netinet/ip_log.c b/sys/netinet/ip_log.c new file mode 100644 index 0000000..082a0a3 --- /dev/null +++ b/sys/netinet/ip_log.c @@ -0,0 +1,507 @@ +/* + * Copyright (C) 1997-2000 by Darren Reed. + * + * Redistribution and use in source and binary forms are permitted + * provided that this notice is preserved and due credit is given + * to the original author and the contributors. + * + * $Id: ip_log.c,v 2.5.2.1 2000/07/19 13:11:47 darrenr Exp $ + * $FreeBSD$ + */ +#include <sys/param.h> +#if defined(KERNEL) && !defined(_KERNEL) +# define _KERNEL +#endif +#if defined(__NetBSD__) && (NetBSD >= 199905) && !defined(IPFILTER_LKM) +# include "opt_ipfilter_log.h" +#endif +#ifdef __FreeBSD__ +# if defined(IPFILTER_LKM) || defined(_KERNEL) +# if !defined(__FreeBSD_version) +# include <sys/osreldate.h> +# endif +# if !defined(IPFILTER_LKM) +# if defined(__FreeBSD_version) && (__FreeBSD_version >= 300000) +# include "opt_ipfilter.h" +# endif +# endif +# else +# ifdef KLD_MODULE +# include <osreldate.h> +# endif +# endif +#endif +#ifdef IPFILTER_LOG +# ifndef SOLARIS +# define SOLARIS (defined(sun) && (defined(__svr4__) || defined(__SVR4))) +# endif +# ifndef _KERNEL +# include <stdio.h> +# include <string.h> +# include <stdlib.h> +# include <ctype.h> +# endif +# include <sys/errno.h> +# include <sys/types.h> +# include <sys/file.h> +# if __FreeBSD_version >= 220000 && defined(_KERNEL) +# include <sys/fcntl.h> +# include <sys/filio.h> +# else +# include <sys/ioctl.h> +# endif +# include <sys/time.h> +# if defined(_KERNEL) && !defined(linux) +# include <sys/systm.h> +# endif +# include <sys/uio.h> +# if !SOLARIS +# if (NetBSD > 199609) || (OpenBSD > 199603) || (__FreeBSD_version >= 300000) +# include <sys/dirent.h> +# else +# include <sys/dir.h> +# endif +# ifndef linux +# include <sys/mbuf.h> +# endif +# else +# include <sys/filio.h> +# include <sys/cred.h> +# include <sys/ddi.h> +# include <sys/sunddi.h> +# include <sys/ksynch.h> +# include <sys/kmem.h> +# include <sys/mkdev.h> +# include <sys/dditypes.h> +# include <sys/cmn_err.h> +# endif +# ifndef linux +# include <sys/protosw.h> +# endif +# include <sys/socket.h> + +# include <net/if.h> +# ifdef sun +# include <net/af.h> +# endif +# if __FreeBSD_version >= 300000 +# include <net/if_var.h> +# endif +# include <net/route.h> +# include <netinet/in.h> +# ifdef __sgi +# include <sys/ddi.h> +# ifdef IFF_DRVRLOCK /* IRIX6 */ +# include <sys/hashing.h> +# endif +# endif +# if !defined(linux) && !(defined(__sgi) && !defined(IFF_DRVRLOCK)) /*IRIX<6*/ +# include <netinet/in_var.h> +# endif +# include <netinet/in_systm.h> +# include <netinet/ip.h> +# include <netinet/tcp.h> +# include <netinet/udp.h> +# include <netinet/ip_icmp.h> +# ifndef linux +# include <netinet/ip_var.h> +# endif +# ifndef _KERNEL +# include <syslog.h> +# endif +# include "netinet/ip_compat.h" +# include <netinet/tcpip.h> +# include "netinet/ip_fil.h" +# include "netinet/ip_proxy.h" +# include "netinet/ip_nat.h" +# include "netinet/ip_frag.h" +# include "netinet/ip_state.h" +# include "netinet/ip_auth.h" +# if (__FreeBSD_version >= 300000) +# include <sys/malloc.h> +# endif + +# ifndef MIN +# define MIN(a,b) (((a)<(b))?(a):(b)) +# endif + + +# if SOLARIS || defined(__sgi) +extern kmutex_t ipl_mutex; +# if SOLARIS +extern kcondvar_t iplwait; +# endif +# endif + +iplog_t **iplh[IPL_LOGMAX+1], *iplt[IPL_LOGMAX+1], *ipll[IPL_LOGMAX+1]; +size_t iplused[IPL_LOGMAX+1]; +static fr_info_t iplcrc[IPL_LOGMAX+1]; +# ifdef linux +static struct wait_queue *iplwait[IPL_LOGMAX+1]; +# endif + + +/* + * Initialise log buffers & pointers. Also iniialised the CRC to a local + * secret for use in calculating the "last log checksum". + */ +void ipflog_init() +{ + int i; + + for (i = IPL_LOGMAX; i >= 0; i--) { + iplt[i] = NULL; + ipll[i] = NULL; + iplh[i] = &iplt[i]; + iplused[i] = 0; + bzero((char *)&iplcrc[i], sizeof(iplcrc[i])); + } +} + + +/* + * ipflog + * Create a log record for a packet given that it has been triggered by a + * rule (or the default setting). Calculate the transport protocol header + * size using predetermined size of a couple of popular protocols and thus + * how much data to copy into the log, including part of the data body if + * requested. + */ +int ipflog(flags, ip, fin, m) +u_int flags; +ip_t *ip; +fr_info_t *fin; +mb_t *m; +{ + ipflog_t ipfl; + register size_t mlen, hlen; + size_t sizes[2]; + void *ptrs[2]; + int types[2]; + u_char p; +# if SOLARIS + ill_t *ifp = fin->fin_ifp; +# else + struct ifnet *ifp = fin->fin_ifp; +# endif + + /* + * calculate header size. + */ + hlen = fin->fin_hlen; + if (fin->fin_off == 0) { + p = fin->fin_fi.fi_p; + if (p == IPPROTO_TCP) + hlen += MIN(sizeof(tcphdr_t), fin->fin_dlen); + else if (p == IPPROTO_UDP) + hlen += MIN(sizeof(udphdr_t), fin->fin_dlen); + else if (p == IPPROTO_ICMP) { + struct icmp *icmp; + + icmp = (struct icmp *)fin->fin_dp; + + /* + * For ICMP, if the packet is an error packet, also + * include the information about the packet which + * caused the error. + */ + switch (icmp->icmp_type) + { + case ICMP_UNREACH : + case ICMP_SOURCEQUENCH : + case ICMP_REDIRECT : + case ICMP_TIMXCEED : + case ICMP_PARAMPROB : + hlen += MIN(sizeof(struct icmp) + 8, + fin->fin_dlen); + break; + default : + hlen += MIN(sizeof(struct icmp), + fin->fin_dlen); + break; + } + } + } + /* + * Get the interface number and name to which this packet is + * currently associated. + */ +# if SOLARIS + ipfl.fl_unit = (u_char)ifp->ill_ppa; + bcopy(ifp->ill_name, ipfl.fl_ifname, MIN(ifp->ill_name_length, 4)); + mlen = (flags & FR_LOGBODY) ? MIN(msgdsize(m) - hlen, 128) : 0; +# else +# if (defined(NetBSD) && (NetBSD <= 1991011) && (NetBSD >= 199603)) || \ + (defined(OpenBSD) && (OpenBSD >= 199603)) + strncpy(ipfl.fl_ifname, ifp->if_xname, IFNAMSIZ); +# else +# ifndef linux + ipfl.fl_unit = (u_char)ifp->if_unit; +# endif + if ((ipfl.fl_ifname[0] = ifp->if_name[0])) + if ((ipfl.fl_ifname[1] = ifp->if_name[1])) + if ((ipfl.fl_ifname[2] = ifp->if_name[2])) + ipfl.fl_ifname[3] = ifp->if_name[3]; +# endif + mlen = (flags & FR_LOGBODY) ? MIN(fin->fin_plen - hlen, 128) : 0; +# endif + ipfl.fl_plen = (u_char)mlen; + ipfl.fl_hlen = (u_char)hlen; + ipfl.fl_rule = fin->fin_rule; + ipfl.fl_group = fin->fin_group; + if (fin->fin_fr != NULL) + ipfl.fl_loglevel = fin->fin_fr->fr_loglevel; + else + ipfl.fl_loglevel = 0xffff; + ipfl.fl_flags = flags; + ptrs[0] = (void *)&ipfl; + sizes[0] = sizeof(ipfl); + types[0] = 0; +# if SOLARIS + /* + * Are we copied from the mblk or an aligned array ? + */ + if (ip == (ip_t *)m->b_rptr) { + ptrs[1] = m; + sizes[1] = hlen + mlen; + types[1] = 1; + } else { + ptrs[1] = ip; + sizes[1] = hlen + mlen; + types[1] = 0; + } +# else + ptrs[1] = m; + sizes[1] = hlen + mlen; + types[1] = 1; +# endif + return ipllog(IPL_LOGIPF, fin, ptrs, sizes, types, 2); +} + + +/* + * ipllog + */ +int ipllog(dev, fin, items, itemsz, types, cnt) +int dev; +fr_info_t *fin; +void **items; +size_t *itemsz; +int *types, cnt; +{ + caddr_t buf, s; + iplog_t *ipl; + size_t len; + int i; + + /* + * Check to see if this log record has a CRC which matches the last + * record logged. If it does, just up the count on the previous one + * rather than create a new one. + */ + MUTEX_ENTER(&ipl_mutex); + if (fin != NULL) { + if ((ipll[dev] != NULL) && + bcmp((char *)fin, (char *)&iplcrc[dev], FI_CSIZE) == 0) { + ipll[dev]->ipl_count++; + MUTEX_EXIT(&ipl_mutex); + return 1; + } + bcopy((char *)fin, (char *)&iplcrc[dev], FI_CSIZE); + } else + bzero((char *)&iplcrc[dev], FI_CSIZE); + MUTEX_EXIT(&ipl_mutex); + + /* + * Get the total amount of data to be logged. + */ + for (i = 0, len = sizeof(iplog_t); i < cnt; i++) + len += itemsz[i]; + + /* + * check that we have space to record this information and can + * allocate that much. + */ + KMALLOCS(buf, caddr_t, len); + if (!buf) + return 0; + MUTEX_ENTER(&ipl_mutex); + if ((iplused[dev] + len) > IPLLOGSIZE) { + MUTEX_EXIT(&ipl_mutex); + KFREES(buf, len); + return 0; + } + iplused[dev] += len; + MUTEX_EXIT(&ipl_mutex); + + /* + * advance the log pointer to the next empty record and deduct the + * amount of space we're going to use. + */ + ipl = (iplog_t *)buf; + ipl->ipl_magic = IPL_MAGIC; + ipl->ipl_count = 1; + ipl->ipl_next = NULL; + ipl->ipl_dsize = len; +# if SOLARIS || defined(sun) || defined(linux) + uniqtime((struct timeval *)&ipl->ipl_sec); +# else +# if BSD >= 199306 || defined(__FreeBSD__) || defined(__sgi) + microtime((struct timeval *)&ipl->ipl_sec); +# endif +# endif + + /* + * Loop through all the items to be logged, copying each one to the + * buffer. Use bcopy for normal data or the mb_t copyout routine. + */ + for (i = 0, s = buf + sizeof(*ipl); i < cnt; i++) { + if (types[i] == 0) + bcopy(items[i], s, itemsz[i]); + else if (types[i] == 1) { +# if SOLARIS + copyout_mblk(items[i], 0, itemsz[i], s); +# else + m_copydata(items[i], 0, itemsz[i], s); +# endif + } + s += itemsz[i]; + } + MUTEX_ENTER(&ipl_mutex); + ipll[dev] = ipl; + *iplh[dev] = ipl; + iplh[dev] = &ipl->ipl_next; +# if SOLARIS + cv_signal(&iplwait); + mutex_exit(&ipl_mutex); +# else + MUTEX_EXIT(&ipl_mutex); +# ifdef linux + wake_up_interruptible(&iplwait[dev]); +# else + wakeup(&iplh[dev]); +# endif +# endif + return 1; +} + + +int ipflog_read(unit, uio) +minor_t unit; +struct uio *uio; +{ + size_t dlen, copied; + int error = 0; + iplog_t *ipl; +# if defined(_KERNEL) && !SOLARIS + int s; +# endif + + /* + * Sanity checks. Make sure the minor # is valid and we're copying + * a valid chunk of data. + */ + if (IPL_LOGMAX < unit) + return ENXIO; + if (!uio->uio_resid) + return 0; + if ((uio->uio_resid < sizeof(iplog_t)) || + (uio->uio_resid > IPLLOGSIZE)) + return EINVAL; + + /* + * Lock the log so we can snapshot the variables. Wait for a signal + * if the log is empty. + */ + SPL_NET(s); + MUTEX_ENTER(&ipl_mutex); + + while (!iplused[unit] || !iplt[unit]) { +# if SOLARIS && defined(_KERNEL) + if (!cv_wait_sig(&iplwait, &ipl_mutex)) { + MUTEX_EXIT(&ipl_mutex); + return EINTR; + } +# else +# ifdef linux + interruptible_sleep_on(&iplwait[unit]); + if (current->signal & ~current->blocked) + return -EINTR; +# else + MUTEX_EXIT(&ipl_mutex); + SPL_X(s); + error = SLEEP(&iplh[unit], "ipl sleep"); + if (error) + return error; + SPL_NET(s); + MUTEX_ENTER(&ipl_mutex); +# endif /* linux */ +# endif /* SOLARIS */ + } + +# if BSD >= 199306 || defined(__FreeBSD__) + uio->uio_rw = UIO_READ; +# endif + + for (copied = 0; (ipl = iplt[unit]); copied += dlen) { + dlen = ipl->ipl_dsize; + if (dlen > uio->uio_resid) + break; + /* + * Don't hold the mutex over the uiomove call. + */ + iplt[unit] = ipl->ipl_next; + iplused[unit] -= dlen; + MUTEX_EXIT(&ipl_mutex); + SPL_X(s); + error = UIOMOVE((caddr_t)ipl, dlen, UIO_READ, uio); + if (error) { + SPL_NET(s); + MUTEX_ENTER(&ipl_mutex); + ipl->ipl_next = iplt[unit]; + iplt[unit] = ipl; + iplused[unit] += dlen; + break; + } + KFREES((caddr_t)ipl, dlen); + SPL_NET(s); + MUTEX_ENTER(&ipl_mutex); + } + if (!iplt[unit]) { + iplused[unit] = 0; + iplh[unit] = &iplt[unit]; + ipll[unit] = NULL; + } + + MUTEX_EXIT(&ipl_mutex); + SPL_X(s); +# ifdef linux + if (!error) + return (int)copied; + return -error; +# else + return error; +# endif +} + + +int ipflog_clear(unit) +minor_t unit; +{ + iplog_t *ipl; + int used; + + MUTEX_ENTER(&ipl_mutex); + while ((ipl = iplt[unit])) { + iplt[unit] = ipl->ipl_next; + KFREES((caddr_t)ipl, ipl->ipl_dsize); + } + iplh[unit] = &iplt[unit]; + ipll[unit] = NULL; + used = iplused[unit]; + iplused[unit] = 0; + bzero((char *)&iplcrc[unit], FI_CSIZE); + MUTEX_EXIT(&ipl_mutex); + return used; +} +#endif /* IPFILTER_LOG */ diff --git a/sys/netinet/ip_mroute.c b/sys/netinet/ip_mroute.c new file mode 100644 index 0000000..b6a9fca --- /dev/null +++ b/sys/netinet/ip_mroute.c @@ -0,0 +1,2263 @@ +/* + * IP multicast forwarding procedures + * + * Written by David Waitzman, BBN Labs, August 1988. + * Modified by Steve Deering, Stanford, February 1989. + * Modified by Mark J. Steiglitz, Stanford, May, 1991 + * Modified by Van Jacobson, LBL, January 1993 + * Modified by Ajit Thyagarajan, PARC, August 1993 + * Modified by Bill Fenner, PARC, April 1995 + * + * MROUTING Revision: 3.5 + * $FreeBSD$ + */ + +#include "opt_mrouting.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/protosw.h> +#include <sys/time.h> +#include <sys/kernel.h> +#include <sys/sockio.h> +#include <sys/syslog.h> +#include <net/if.h> +#include <net/route.h> +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> +#include <netinet/in_var.h> +#include <netinet/igmp.h> +#include <netinet/ip_mroute.h> +#include <netinet/udp.h> +#include <machine/in_cksum.h> + +#ifndef NTOHL +#if BYTE_ORDER != BIG_ENDIAN +#define NTOHL(d) ((d) = ntohl((d))) +#define NTOHS(d) ((d) = ntohs((u_short)(d))) +#define HTONL(d) ((d) = htonl((d))) +#define HTONS(d) ((d) = htons((u_short)(d))) +#else +#define NTOHL(d) +#define NTOHS(d) +#define HTONL(d) +#define HTONS(d) +#endif +#endif + +#ifndef MROUTING +extern u_long _ip_mcast_src __P((int vifi)); +extern int _ip_mforward __P((struct ip *ip, struct ifnet *ifp, + struct mbuf *m, struct ip_moptions *imo)); +extern int _ip_mrouter_done __P((void)); +extern int _ip_mrouter_get __P((struct socket *so, struct sockopt *sopt)); +extern int _ip_mrouter_set __P((struct socket *so, struct sockopt *sopt)); +extern int _mrt_ioctl __P((int req, caddr_t data, struct proc *p)); + +/* + * Dummy routines and globals used when multicast routing is not compiled in. + */ + +struct socket *ip_mrouter = NULL; +u_int rsvpdebug = 0; + +int +_ip_mrouter_set(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + return(EOPNOTSUPP); +} + +int (*ip_mrouter_set)(struct socket *, struct sockopt *) = _ip_mrouter_set; + + +int +_ip_mrouter_get(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + return(EOPNOTSUPP); +} + +int (*ip_mrouter_get)(struct socket *, struct sockopt *) = _ip_mrouter_get; + +int +_ip_mrouter_done() +{ + return(0); +} + +int (*ip_mrouter_done)(void) = _ip_mrouter_done; + +int +_ip_mforward(ip, ifp, m, imo) + struct ip *ip; + struct ifnet *ifp; + struct mbuf *m; + struct ip_moptions *imo; +{ + return(0); +} + +int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, + struct ip_moptions *) = _ip_mforward; + +int +_mrt_ioctl(int req, caddr_t data, struct proc *p) +{ + return EOPNOTSUPP; +} + +int (*mrt_ioctl)(int, caddr_t, struct proc *) = _mrt_ioctl; + +void +rsvp_input(m, off, proto) /* XXX must fixup manually */ + struct mbuf *m; + int off; + int proto; +{ + /* Can still get packets with rsvp_on = 0 if there is a local member + * of the group to which the RSVP packet is addressed. But in this + * case we want to throw the packet away. + */ + if (!rsvp_on) { + m_freem(m); + return; + } + + if (ip_rsvpd != NULL) { + if (rsvpdebug) + printf("rsvp_input: Sending packet up old-style socket\n"); + rip_input(m, off, proto); + return; + } + /* Drop the packet */ + m_freem(m); +} + +void ipip_input(struct mbuf *m, int off, int proto) { /* XXX must fixup manually */ + rip_input(m, off, proto); +} + +int (*legal_vif_num)(int) = 0; + +/* + * This should never be called, since IP_MULTICAST_VIF should fail, but + * just in case it does get called, the code a little lower in ip_output + * will assign the packet a local address. + */ +u_long +_ip_mcast_src(int vifi) { return INADDR_ANY; } +u_long (*ip_mcast_src)(int) = _ip_mcast_src; + +int +ip_rsvp_vif_init(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + return(EINVAL); +} + +int +ip_rsvp_vif_done(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + return(EINVAL); +} + +void +ip_rsvp_force_done(so) + struct socket *so; +{ + return; +} + +#else /* MROUTING */ + +#define M_HASCL(m) ((m)->m_flags & M_EXT) + +#define INSIZ sizeof(struct in_addr) +#define same(a1, a2) \ + (bcmp((caddr_t)(a1), (caddr_t)(a2), INSIZ) == 0) + +static MALLOC_DEFINE(M_MRTABLE, "mroutetbl", "multicast routing tables"); + +/* + * Globals. All but ip_mrouter and ip_mrtproto could be static, + * except for netstat or debugging purposes. + */ +#ifndef MROUTE_LKM +struct socket *ip_mrouter = NULL; +static struct mrtstat mrtstat; +#else /* MROUTE_LKM */ +extern void X_ipip_input __P((struct mbuf *m, int iphlen)); +extern struct mrtstat mrtstat; +static int ip_mrtproto; +#endif + +#define NO_RTE_FOUND 0x1 +#define RTE_FOUND 0x2 + +static struct mfc *mfctable[MFCTBLSIZ]; +static u_char nexpire[MFCTBLSIZ]; +static struct vif viftable[MAXVIFS]; +static u_int mrtdebug = 0; /* debug level */ +#define DEBUG_MFC 0x02 +#define DEBUG_FORWARD 0x04 +#define DEBUG_EXPIRE 0x08 +#define DEBUG_XMIT 0x10 +static u_int tbfdebug = 0; /* tbf debug level */ +static u_int rsvpdebug = 0; /* rsvp debug level */ + +static struct callout_handle expire_upcalls_ch; + +#define EXPIRE_TIMEOUT (hz / 4) /* 4x / second */ +#define UPCALL_EXPIRE 6 /* number of timeouts */ + +/* + * Define the token bucket filter structures + * tbftable -> each vif has one of these for storing info + */ + +static struct tbf tbftable[MAXVIFS]; +#define TBF_REPROCESS (hz / 100) /* 100x / second */ + +/* + * 'Interfaces' associated with decapsulator (so we can tell + * packets that went through it from ones that get reflected + * by a broken gateway). These interfaces are never linked into + * the system ifnet list & no routes point to them. I.e., packets + * can't be sent this way. They only exist as a placeholder for + * multicast source verification. + */ +static struct ifnet multicast_decap_if[MAXVIFS]; + +#define ENCAP_TTL 64 +#define ENCAP_PROTO IPPROTO_IPIP /* 4 */ + +/* prototype IP hdr for encapsulated packets */ +static struct ip multicast_encap_iphdr = { +#if BYTE_ORDER == LITTLE_ENDIAN + sizeof(struct ip) >> 2, IPVERSION, +#else + IPVERSION, sizeof(struct ip) >> 2, +#endif + 0, /* tos */ + sizeof(struct ip), /* total length */ + 0, /* id */ + 0, /* frag offset */ + ENCAP_TTL, ENCAP_PROTO, + 0, /* checksum */ +}; + +/* + * Private variables. + */ +static vifi_t numvifs = 0; +static int have_encap_tunnel = 0; + +/* + * one-back cache used by ipip_input to locate a tunnel's vif + * given a datagram's src ip address. + */ +static u_long last_encap_src; +static struct vif *last_encap_vif; + +static u_long X_ip_mcast_src __P((int vifi)); +static int X_ip_mforward __P((struct ip *ip, struct ifnet *ifp, struct mbuf *m, struct ip_moptions *imo)); +static int X_ip_mrouter_done __P((void)); +static int X_ip_mrouter_get __P((struct socket *so, struct sockopt *m)); +static int X_ip_mrouter_set __P((struct socket *so, struct sockopt *m)); +static int X_legal_vif_num __P((int vif)); +static int X_mrt_ioctl __P((int cmd, caddr_t data)); + +static int get_sg_cnt(struct sioc_sg_req *); +static int get_vif_cnt(struct sioc_vif_req *); +static int ip_mrouter_init(struct socket *, int); +static int add_vif(struct vifctl *); +static int del_vif(vifi_t); +static int add_mfc(struct mfcctl *); +static int del_mfc(struct mfcctl *); +static int socket_send(struct socket *, struct mbuf *, struct sockaddr_in *); +static int set_assert(int); +static void expire_upcalls(void *); +static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *, + vifi_t); +static void phyint_send(struct ip *, struct vif *, struct mbuf *); +static void encap_send(struct ip *, struct vif *, struct mbuf *); +static void tbf_control(struct vif *, struct mbuf *, struct ip *, u_long); +static void tbf_queue(struct vif *, struct mbuf *); +static void tbf_process_q(struct vif *); +static void tbf_reprocess_q(void *); +static int tbf_dq_sel(struct vif *, struct ip *); +static void tbf_send_packet(struct vif *, struct mbuf *); +static void tbf_update_tokens(struct vif *); +static int priority(struct vif *, struct ip *); +void multiencap_decap(struct mbuf *); + +/* + * whether or not special PIM assert processing is enabled. + */ +static int pim_assert; +/* + * Rate limit for assert notification messages, in usec + */ +#define ASSERT_MSG_TIME 3000000 + +/* + * Hash function for a source, group entry + */ +#define MFCHASH(a, g) MFCHASHMOD(((a) >> 20) ^ ((a) >> 10) ^ (a) ^ \ + ((g) >> 20) ^ ((g) >> 10) ^ (g)) + +/* + * Find a route for a given origin IP address and Multicast group address + * Type of service parameter to be added in the future!!! + */ + +#define MFCFIND(o, g, rt) { \ + register struct mfc *_rt = mfctable[MFCHASH(o,g)]; \ + rt = NULL; \ + ++mrtstat.mrts_mfc_lookups; \ + while (_rt) { \ + if ((_rt->mfc_origin.s_addr == o) && \ + (_rt->mfc_mcastgrp.s_addr == g) && \ + (_rt->mfc_stall == NULL)) { \ + rt = _rt; \ + break; \ + } \ + _rt = _rt->mfc_next; \ + } \ + if (rt == NULL) { \ + ++mrtstat.mrts_mfc_misses; \ + } \ +} + + +/* + * Macros to compute elapsed time efficiently + * Borrowed from Van Jacobson's scheduling code + */ +#define TV_DELTA(a, b, delta) { \ + register int xxs; \ + \ + delta = (a).tv_usec - (b).tv_usec; \ + if ((xxs = (a).tv_sec - (b).tv_sec)) { \ + switch (xxs) { \ + case 2: \ + delta += 1000000; \ + /* fall through */ \ + case 1: \ + delta += 1000000; \ + break; \ + default: \ + delta += (1000000 * xxs); \ + } \ + } \ +} + +#define TV_LT(a, b) (((a).tv_usec < (b).tv_usec && \ + (a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec) + +#ifdef UPCALL_TIMING +u_long upcall_data[51]; +static void collate(struct timeval *); +#endif /* UPCALL_TIMING */ + + +/* + * Handle MRT setsockopt commands to modify the multicast routing tables. + */ +static int +X_ip_mrouter_set(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + int error, optval; + vifi_t vifi; + struct vifctl vifc; + struct mfcctl mfc; + + if (so != ip_mrouter && sopt->sopt_name != MRT_INIT) + return (EPERM); + + error = 0; + switch (sopt->sopt_name) { + case MRT_INIT: + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + break; + error = ip_mrouter_init(so, optval); + break; + + case MRT_DONE: + error = ip_mrouter_done(); + break; + + case MRT_ADD_VIF: + error = sooptcopyin(sopt, &vifc, sizeof vifc, sizeof vifc); + if (error) + break; + error = add_vif(&vifc); + break; + + case MRT_DEL_VIF: + error = sooptcopyin(sopt, &vifi, sizeof vifi, sizeof vifi); + if (error) + break; + error = del_vif(vifi); + break; + + case MRT_ADD_MFC: + case MRT_DEL_MFC: + error = sooptcopyin(sopt, &mfc, sizeof mfc, sizeof mfc); + if (error) + break; + if (sopt->sopt_name == MRT_ADD_MFC) + error = add_mfc(&mfc); + else + error = del_mfc(&mfc); + break; + + case MRT_ASSERT: + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + break; + set_assert(optval); + break; + + default: + error = EOPNOTSUPP; + break; + } + return (error); +} + +#ifndef MROUTE_LKM +int (*ip_mrouter_set)(struct socket *, struct sockopt *) = X_ip_mrouter_set; +#endif + +/* + * Handle MRT getsockopt commands + */ +static int +X_ip_mrouter_get(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + int error; + static int version = 0x0305; /* !!! why is this here? XXX */ + + switch (sopt->sopt_name) { + case MRT_VERSION: + error = sooptcopyout(sopt, &version, sizeof version); + break; + + case MRT_ASSERT: + error = sooptcopyout(sopt, &pim_assert, sizeof pim_assert); + break; + default: + error = EOPNOTSUPP; + break; + } + return (error); +} + +#ifndef MROUTE_LKM +int (*ip_mrouter_get)(struct socket *, struct sockopt *) = X_ip_mrouter_get; +#endif + +/* + * Handle ioctl commands to obtain information from the cache + */ +static int +X_mrt_ioctl(cmd, data) + int cmd; + caddr_t data; +{ + int error = 0; + + switch (cmd) { + case (SIOCGETVIFCNT): + return (get_vif_cnt((struct sioc_vif_req *)data)); + break; + case (SIOCGETSGCNT): + return (get_sg_cnt((struct sioc_sg_req *)data)); + break; + default: + return (EINVAL); + break; + } + return error; +} + +#ifndef MROUTE_LKM +int (*mrt_ioctl)(int, caddr_t) = X_mrt_ioctl; +#endif + +/* + * returns the packet, byte, rpf-failure count for the source group provided + */ +static int +get_sg_cnt(req) + register struct sioc_sg_req *req; +{ + register struct mfc *rt; + int s; + + s = splnet(); + MFCFIND(req->src.s_addr, req->grp.s_addr, rt); + splx(s); + if (rt != NULL) { + req->pktcnt = rt->mfc_pkt_cnt; + req->bytecnt = rt->mfc_byte_cnt; + req->wrong_if = rt->mfc_wrong_if; + } else + req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff; + + return 0; +} + +/* + * returns the input and output packet and byte counts on the vif provided + */ +static int +get_vif_cnt(req) + register struct sioc_vif_req *req; +{ + register vifi_t vifi = req->vifi; + + if (vifi >= numvifs) return EINVAL; + + req->icount = viftable[vifi].v_pkt_in; + req->ocount = viftable[vifi].v_pkt_out; + req->ibytes = viftable[vifi].v_bytes_in; + req->obytes = viftable[vifi].v_bytes_out; + + return 0; +} + +/* + * Enable multicast routing + */ +static int +ip_mrouter_init(so, version) + struct socket *so; + int version; +{ + if (mrtdebug) + log(LOG_DEBUG,"ip_mrouter_init: so_type = %d, pr_protocol = %d\n", + so->so_type, so->so_proto->pr_protocol); + + if (so->so_type != SOCK_RAW || + so->so_proto->pr_protocol != IPPROTO_IGMP) return EOPNOTSUPP; + + if (version != 1) + return ENOPROTOOPT; + + if (ip_mrouter != NULL) return EADDRINUSE; + + ip_mrouter = so; + + bzero((caddr_t)mfctable, sizeof(mfctable)); + bzero((caddr_t)nexpire, sizeof(nexpire)); + + pim_assert = 0; + + expire_upcalls_ch = timeout(expire_upcalls, (caddr_t)NULL, EXPIRE_TIMEOUT); + + if (mrtdebug) + log(LOG_DEBUG, "ip_mrouter_init\n"); + + return 0; +} + +/* + * Disable multicast routing + */ +static int +X_ip_mrouter_done() +{ + vifi_t vifi; + int i; + struct ifnet *ifp; + struct ifreq ifr; + struct mfc *rt; + struct rtdetq *rte; + int s; + + s = splnet(); + + /* + * For each phyint in use, disable promiscuous reception of all IP + * multicasts. + */ + for (vifi = 0; vifi < numvifs; vifi++) { + if (viftable[vifi].v_lcl_addr.s_addr != 0 && + !(viftable[vifi].v_flags & VIFF_TUNNEL)) { + ((struct sockaddr_in *)&(ifr.ifr_addr))->sin_family = AF_INET; + ((struct sockaddr_in *)&(ifr.ifr_addr))->sin_addr.s_addr + = INADDR_ANY; + ifp = viftable[vifi].v_ifp; + if_allmulti(ifp, 0); + } + } + bzero((caddr_t)tbftable, sizeof(tbftable)); + bzero((caddr_t)viftable, sizeof(viftable)); + numvifs = 0; + pim_assert = 0; + + untimeout(expire_upcalls, (caddr_t)NULL, expire_upcalls_ch); + + /* + * Free all multicast forwarding cache entries. + */ + for (i = 0; i < MFCTBLSIZ; i++) { + for (rt = mfctable[i]; rt != NULL; ) { + struct mfc *nr = rt->mfc_next; + + for (rte = rt->mfc_stall; rte != NULL; ) { + struct rtdetq *n = rte->next; + + m_freem(rte->m); + free(rte, M_MRTABLE); + rte = n; + } + free(rt, M_MRTABLE); + rt = nr; + } + } + + bzero((caddr_t)mfctable, sizeof(mfctable)); + + /* + * Reset de-encapsulation cache + */ + last_encap_src = 0; + last_encap_vif = NULL; + have_encap_tunnel = 0; + + ip_mrouter = NULL; + + splx(s); + + if (mrtdebug) + log(LOG_DEBUG, "ip_mrouter_done\n"); + + return 0; +} + +#ifndef MROUTE_LKM +int (*ip_mrouter_done)(void) = X_ip_mrouter_done; +#endif + +/* + * Set PIM assert processing global + */ +static int +set_assert(i) + int i; +{ + if ((i != 1) && (i != 0)) + return EINVAL; + + pim_assert = i; + + return 0; +} + +/* + * Add a vif to the vif table + */ +static int +add_vif(vifcp) + register struct vifctl *vifcp; +{ + register struct vif *vifp = viftable + vifcp->vifc_vifi; + static struct sockaddr_in sin = {sizeof sin, AF_INET}; + struct ifaddr *ifa; + struct ifnet *ifp; + int error, s; + struct tbf *v_tbf = tbftable + vifcp->vifc_vifi; + + if (vifcp->vifc_vifi >= MAXVIFS) return EINVAL; + if (vifp->v_lcl_addr.s_addr != 0) return EADDRINUSE; + + /* Find the interface with an address in AF_INET family */ + sin.sin_addr = vifcp->vifc_lcl_addr; + ifa = ifa_ifwithaddr((struct sockaddr *)&sin); + if (ifa == 0) return EADDRNOTAVAIL; + ifp = ifa->ifa_ifp; + + if (vifcp->vifc_flags & VIFF_TUNNEL) { + if ((vifcp->vifc_flags & VIFF_SRCRT) == 0) { + /* + * An encapsulating tunnel is wanted. Tell ipip_input() to + * start paying attention to encapsulated packets. + */ + if (have_encap_tunnel == 0) { + have_encap_tunnel = 1; + for (s = 0; s < MAXVIFS; ++s) { + multicast_decap_if[s].if_name = "mdecap"; + multicast_decap_if[s].if_unit = s; + } + } + /* + * Set interface to fake encapsulator interface + */ + ifp = &multicast_decap_if[vifcp->vifc_vifi]; + /* + * Prepare cached route entry + */ + bzero(&vifp->v_route, sizeof(vifp->v_route)); + } else { + log(LOG_ERR, "source routed tunnels not supported\n"); + return EOPNOTSUPP; + } + } else { + /* Make sure the interface supports multicast */ + if ((ifp->if_flags & IFF_MULTICAST) == 0) + return EOPNOTSUPP; + + /* Enable promiscuous reception of all IP multicasts from the if */ + s = splnet(); + error = if_allmulti(ifp, 1); + splx(s); + if (error) + return error; + } + + s = splnet(); + /* define parameters for the tbf structure */ + vifp->v_tbf = v_tbf; + GET_TIME(vifp->v_tbf->tbf_last_pkt_t); + vifp->v_tbf->tbf_n_tok = 0; + vifp->v_tbf->tbf_q_len = 0; + vifp->v_tbf->tbf_max_q_len = MAXQSIZE; + vifp->v_tbf->tbf_q = vifp->v_tbf->tbf_t = NULL; + + vifp->v_flags = vifcp->vifc_flags; + vifp->v_threshold = vifcp->vifc_threshold; + vifp->v_lcl_addr = vifcp->vifc_lcl_addr; + vifp->v_rmt_addr = vifcp->vifc_rmt_addr; + vifp->v_ifp = ifp; + /* scaling up here allows division by 1024 in critical code */ + vifp->v_rate_limit= vifcp->vifc_rate_limit * 1024 / 1000; + vifp->v_rsvp_on = 0; + vifp->v_rsvpd = NULL; + /* initialize per vif pkt counters */ + vifp->v_pkt_in = 0; + vifp->v_pkt_out = 0; + vifp->v_bytes_in = 0; + vifp->v_bytes_out = 0; + splx(s); + + /* Adjust numvifs up if the vifi is higher than numvifs */ + if (numvifs <= vifcp->vifc_vifi) numvifs = vifcp->vifc_vifi + 1; + + if (mrtdebug) + log(LOG_DEBUG, "add_vif #%d, lcladdr %lx, %s %lx, thresh %x, rate %d\n", + vifcp->vifc_vifi, + (u_long)ntohl(vifcp->vifc_lcl_addr.s_addr), + (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask", + (u_long)ntohl(vifcp->vifc_rmt_addr.s_addr), + vifcp->vifc_threshold, + vifcp->vifc_rate_limit); + + return 0; +} + +/* + * Delete a vif from the vif table + */ +static int +del_vif(vifi) + vifi_t vifi; +{ + register struct vif *vifp = &viftable[vifi]; + register struct mbuf *m; + struct ifnet *ifp; + struct ifreq ifr; + int s; + + if (vifi >= numvifs) return EINVAL; + if (vifp->v_lcl_addr.s_addr == 0) return EADDRNOTAVAIL; + + s = splnet(); + + if (!(vifp->v_flags & VIFF_TUNNEL)) { + ((struct sockaddr_in *)&(ifr.ifr_addr))->sin_family = AF_INET; + ((struct sockaddr_in *)&(ifr.ifr_addr))->sin_addr.s_addr = INADDR_ANY; + ifp = vifp->v_ifp; + if_allmulti(ifp, 0); + } + + if (vifp == last_encap_vif) { + last_encap_vif = 0; + last_encap_src = 0; + } + + /* + * Free packets queued at the interface + */ + while (vifp->v_tbf->tbf_q) { + m = vifp->v_tbf->tbf_q; + vifp->v_tbf->tbf_q = m->m_act; + m_freem(m); + } + + bzero((caddr_t)vifp->v_tbf, sizeof(*(vifp->v_tbf))); + bzero((caddr_t)vifp, sizeof (*vifp)); + + if (mrtdebug) + log(LOG_DEBUG, "del_vif %d, numvifs %d\n", vifi, numvifs); + + /* Adjust numvifs down */ + for (vifi = numvifs; vifi > 0; vifi--) + if (viftable[vifi-1].v_lcl_addr.s_addr != 0) break; + numvifs = vifi; + + splx(s); + + return 0; +} + +/* + * Add an mfc entry + */ +static int +add_mfc(mfccp) + struct mfcctl *mfccp; +{ + struct mfc *rt; + u_long hash; + struct rtdetq *rte; + register u_short nstl; + int s; + int i; + + MFCFIND(mfccp->mfcc_origin.s_addr, mfccp->mfcc_mcastgrp.s_addr, rt); + + /* If an entry already exists, just update the fields */ + if (rt) { + if (mrtdebug & DEBUG_MFC) + log(LOG_DEBUG,"add_mfc update o %lx g %lx p %x\n", + (u_long)ntohl(mfccp->mfcc_origin.s_addr), + (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), + mfccp->mfcc_parent); + + s = splnet(); + rt->mfc_parent = mfccp->mfcc_parent; + for (i = 0; i < numvifs; i++) + rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; + splx(s); + return 0; + } + + /* + * Find the entry for which the upcall was made and update + */ + s = splnet(); + hash = MFCHASH(mfccp->mfcc_origin.s_addr, mfccp->mfcc_mcastgrp.s_addr); + for (rt = mfctable[hash], nstl = 0; rt; rt = rt->mfc_next) { + + if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) && + (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) && + (rt->mfc_stall != NULL)) { + + if (nstl++) + log(LOG_ERR, "add_mfc %s o %lx g %lx p %x dbx %p\n", + "multiple kernel entries", + (u_long)ntohl(mfccp->mfcc_origin.s_addr), + (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), + mfccp->mfcc_parent, (void *)rt->mfc_stall); + + if (mrtdebug & DEBUG_MFC) + log(LOG_DEBUG,"add_mfc o %lx g %lx p %x dbg %p\n", + (u_long)ntohl(mfccp->mfcc_origin.s_addr), + (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), + mfccp->mfcc_parent, (void *)rt->mfc_stall); + + rt->mfc_origin = mfccp->mfcc_origin; + rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp; + rt->mfc_parent = mfccp->mfcc_parent; + for (i = 0; i < numvifs; i++) + rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; + /* initialize pkt counters per src-grp */ + rt->mfc_pkt_cnt = 0; + rt->mfc_byte_cnt = 0; + rt->mfc_wrong_if = 0; + rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_usec = 0; + + rt->mfc_expire = 0; /* Don't clean this guy up */ + nexpire[hash]--; + + /* free packets Qed at the end of this entry */ + for (rte = rt->mfc_stall; rte != NULL; ) { + struct rtdetq *n = rte->next; + + ip_mdq(rte->m, rte->ifp, rt, -1); + m_freem(rte->m); +#ifdef UPCALL_TIMING + collate(&(rte->t)); +#endif /* UPCALL_TIMING */ + free(rte, M_MRTABLE); + rte = n; + } + rt->mfc_stall = NULL; + } + } + + /* + * It is possible that an entry is being inserted without an upcall + */ + if (nstl == 0) { + if (mrtdebug & DEBUG_MFC) + log(LOG_DEBUG,"add_mfc no upcall h %lu o %lx g %lx p %x\n", + hash, (u_long)ntohl(mfccp->mfcc_origin.s_addr), + (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), + mfccp->mfcc_parent); + + for (rt = mfctable[hash]; rt != NULL; rt = rt->mfc_next) { + + if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) && + (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr)) { + + rt->mfc_origin = mfccp->mfcc_origin; + rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp; + rt->mfc_parent = mfccp->mfcc_parent; + for (i = 0; i < numvifs; i++) + rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; + /* initialize pkt counters per src-grp */ + rt->mfc_pkt_cnt = 0; + rt->mfc_byte_cnt = 0; + rt->mfc_wrong_if = 0; + rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_usec = 0; + if (rt->mfc_expire) + nexpire[hash]--; + rt->mfc_expire = 0; + } + } + if (rt == NULL) { + /* no upcall, so make a new entry */ + rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); + if (rt == NULL) { + splx(s); + return ENOBUFS; + } + + /* insert new entry at head of hash chain */ + rt->mfc_origin = mfccp->mfcc_origin; + rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp; + rt->mfc_parent = mfccp->mfcc_parent; + for (i = 0; i < numvifs; i++) + rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; + /* initialize pkt counters per src-grp */ + rt->mfc_pkt_cnt = 0; + rt->mfc_byte_cnt = 0; + rt->mfc_wrong_if = 0; + rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_usec = 0; + rt->mfc_expire = 0; + rt->mfc_stall = NULL; + + /* link into table */ + rt->mfc_next = mfctable[hash]; + mfctable[hash] = rt; + } + } + splx(s); + return 0; +} + +#ifdef UPCALL_TIMING +/* + * collect delay statistics on the upcalls + */ +static void collate(t) +register struct timeval *t; +{ + register u_long d; + register struct timeval tp; + register u_long delta; + + GET_TIME(tp); + + if (TV_LT(*t, tp)) + { + TV_DELTA(tp, *t, delta); + + d = delta >> 10; + if (d > 50) + d = 50; + + ++upcall_data[d]; + } +} +#endif /* UPCALL_TIMING */ + +/* + * Delete an mfc entry + */ +static int +del_mfc(mfccp) + struct mfcctl *mfccp; +{ + struct in_addr origin; + struct in_addr mcastgrp; + struct mfc *rt; + struct mfc **nptr; + u_long hash; + int s; + + origin = mfccp->mfcc_origin; + mcastgrp = mfccp->mfcc_mcastgrp; + hash = MFCHASH(origin.s_addr, mcastgrp.s_addr); + + if (mrtdebug & DEBUG_MFC) + log(LOG_DEBUG,"del_mfc orig %lx mcastgrp %lx\n", + (u_long)ntohl(origin.s_addr), (u_long)ntohl(mcastgrp.s_addr)); + + s = splnet(); + + nptr = &mfctable[hash]; + while ((rt = *nptr) != NULL) { + if (origin.s_addr == rt->mfc_origin.s_addr && + mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr && + rt->mfc_stall == NULL) + break; + + nptr = &rt->mfc_next; + } + if (rt == NULL) { + splx(s); + return EADDRNOTAVAIL; + } + + *nptr = rt->mfc_next; + free(rt, M_MRTABLE); + + splx(s); + + return 0; +} + +/* + * Send a message to mrouted on the multicast routing socket + */ +static int +socket_send(s, mm, src) + struct socket *s; + struct mbuf *mm; + struct sockaddr_in *src; +{ + if (s) { + if (sbappendaddr(&s->so_rcv, + (struct sockaddr *)src, + mm, (struct mbuf *)0) != 0) { + sorwakeup(s); + return 0; + } + } + m_freem(mm); + return -1; +} + +/* + * IP multicast forwarding function. This function assumes that the packet + * pointed to by "ip" has arrived on (or is about to be sent to) the interface + * pointed to by "ifp", and the packet is to be relayed to other networks + * that have members of the packet's destination IP multicast group. + * + * The packet is returned unscathed to the caller, unless it is + * erroneous, in which case a non-zero return value tells the caller to + * discard it. + */ + +#define IP_HDR_LEN 20 /* # bytes of fixed IP header (excluding options) */ +#define TUNNEL_LEN 12 /* # bytes of IP option for tunnel encapsulation */ + +static int +X_ip_mforward(ip, ifp, m, imo) + register struct ip *ip; + struct ifnet *ifp; + struct mbuf *m; + struct ip_moptions *imo; +{ + register struct mfc *rt; + register u_char *ipoptions; + static struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; + static int srctun = 0; + register struct mbuf *mm; + int s; + vifi_t vifi; + struct vif *vifp; + + if (mrtdebug & DEBUG_FORWARD) + log(LOG_DEBUG, "ip_mforward: src %lx, dst %lx, ifp %p\n", + (u_long)ntohl(ip->ip_src.s_addr), (u_long)ntohl(ip->ip_dst.s_addr), + (void *)ifp); + + if (ip->ip_hl < (IP_HDR_LEN + TUNNEL_LEN) >> 2 || + (ipoptions = (u_char *)(ip + 1))[1] != IPOPT_LSRR ) { + /* + * Packet arrived via a physical interface or + * an encapsulated tunnel. + */ + } else { + /* + * Packet arrived through a source-route tunnel. + * Source-route tunnels are no longer supported. + */ + if ((srctun++ % 1000) == 0) + log(LOG_ERR, + "ip_mforward: received source-routed packet from %lx\n", + (u_long)ntohl(ip->ip_src.s_addr)); + + return 1; + } + + if ((imo) && ((vifi = imo->imo_multicast_vif) < numvifs)) { + if (ip->ip_ttl < 255) + ip->ip_ttl++; /* compensate for -1 in *_send routines */ + if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) { + vifp = viftable + vifi; + printf("Sending IPPROTO_RSVP from %lx to %lx on vif %d (%s%s%d)\n", + ntohl(ip->ip_src.s_addr), ntohl(ip->ip_dst.s_addr), vifi, + (vifp->v_flags & VIFF_TUNNEL) ? "tunnel on " : "", + vifp->v_ifp->if_name, vifp->v_ifp->if_unit); + } + return (ip_mdq(m, ifp, NULL, vifi)); + } + if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) { + printf("Warning: IPPROTO_RSVP from %lx to %lx without vif option\n", + ntohl(ip->ip_src.s_addr), ntohl(ip->ip_dst.s_addr)); + if(!imo) + printf("In fact, no options were specified at all\n"); + } + + /* + * Don't forward a packet with time-to-live of zero or one, + * or a packet destined to a local-only group. + */ + if (ip->ip_ttl <= 1 || + ntohl(ip->ip_dst.s_addr) <= INADDR_MAX_LOCAL_GROUP) + return 0; + + /* + * Determine forwarding vifs from the forwarding cache table + */ + s = splnet(); + MFCFIND(ip->ip_src.s_addr, ip->ip_dst.s_addr, rt); + + /* Entry exists, so forward if necessary */ + if (rt != NULL) { + splx(s); + return (ip_mdq(m, ifp, rt, -1)); + } else { + /* + * If we don't have a route for packet's origin, + * Make a copy of the packet & + * send message to routing daemon + */ + + register struct mbuf *mb0; + register struct rtdetq *rte; + register u_long hash; + int hlen = ip->ip_hl << 2; +#ifdef UPCALL_TIMING + struct timeval tp; + + GET_TIME(tp); +#endif + + mrtstat.mrts_no_route++; + if (mrtdebug & (DEBUG_FORWARD | DEBUG_MFC)) + log(LOG_DEBUG, "ip_mforward: no rte s %lx g %lx\n", + (u_long)ntohl(ip->ip_src.s_addr), + (u_long)ntohl(ip->ip_dst.s_addr)); + + /* + * Allocate mbufs early so that we don't do extra work if we are + * just going to fail anyway. Make sure to pullup the header so + * that other people can't step on it. + */ + rte = (struct rtdetq *)malloc((sizeof *rte), M_MRTABLE, M_NOWAIT); + if (rte == NULL) { + splx(s); + return ENOBUFS; + } + mb0 = m_copy(m, 0, M_COPYALL); + if (mb0 && (M_HASCL(mb0) || mb0->m_len < hlen)) + mb0 = m_pullup(mb0, hlen); + if (mb0 == NULL) { + free(rte, M_MRTABLE); + splx(s); + return ENOBUFS; + } + + /* is there an upcall waiting for this packet? */ + hash = MFCHASH(ip->ip_src.s_addr, ip->ip_dst.s_addr); + for (rt = mfctable[hash]; rt; rt = rt->mfc_next) { + if ((ip->ip_src.s_addr == rt->mfc_origin.s_addr) && + (ip->ip_dst.s_addr == rt->mfc_mcastgrp.s_addr) && + (rt->mfc_stall != NULL)) + break; + } + + if (rt == NULL) { + int i; + struct igmpmsg *im; + + /* no upcall, so make a new entry */ + rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); + if (rt == NULL) { + free(rte, M_MRTABLE); + m_freem(mb0); + splx(s); + return ENOBUFS; + } + /* Make a copy of the header to send to the user level process */ + mm = m_copy(mb0, 0, hlen); + if (mm == NULL) { + free(rte, M_MRTABLE); + m_freem(mb0); + free(rt, M_MRTABLE); + splx(s); + return ENOBUFS; + } + + /* + * Send message to routing daemon to install + * a route into the kernel table + */ + k_igmpsrc.sin_addr = ip->ip_src; + + im = mtod(mm, struct igmpmsg *); + im->im_msgtype = IGMPMSG_NOCACHE; + im->im_mbz = 0; + + mrtstat.mrts_upcalls++; + + if (socket_send(ip_mrouter, mm, &k_igmpsrc) < 0) { + log(LOG_WARNING, "ip_mforward: ip_mrouter socket queue full\n"); + ++mrtstat.mrts_upq_sockfull; + free(rte, M_MRTABLE); + m_freem(mb0); + free(rt, M_MRTABLE); + splx(s); + return ENOBUFS; + } + + /* insert new entry at head of hash chain */ + rt->mfc_origin.s_addr = ip->ip_src.s_addr; + rt->mfc_mcastgrp.s_addr = ip->ip_dst.s_addr; + rt->mfc_expire = UPCALL_EXPIRE; + nexpire[hash]++; + for (i = 0; i < numvifs; i++) + rt->mfc_ttls[i] = 0; + rt->mfc_parent = -1; + + /* link into table */ + rt->mfc_next = mfctable[hash]; + mfctable[hash] = rt; + rt->mfc_stall = rte; + + } else { + /* determine if q has overflowed */ + int npkts = 0; + struct rtdetq **p; + + for (p = &rt->mfc_stall; *p != NULL; p = &(*p)->next) + npkts++; + + if (npkts > MAX_UPQ) { + mrtstat.mrts_upq_ovflw++; + free(rte, M_MRTABLE); + m_freem(mb0); + splx(s); + return 0; + } + + /* Add this entry to the end of the queue */ + *p = rte; + } + + rte->m = mb0; + rte->ifp = ifp; +#ifdef UPCALL_TIMING + rte->t = tp; +#endif + rte->next = NULL; + + splx(s); + + return 0; + } +} + +#ifndef MROUTE_LKM +int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, + struct ip_moptions *) = X_ip_mforward; +#endif + +/* + * Clean up the cache entry if upcall is not serviced + */ +static void +expire_upcalls(void *unused) +{ + struct rtdetq *rte; + struct mfc *mfc, **nptr; + int i; + int s; + + s = splnet(); + for (i = 0; i < MFCTBLSIZ; i++) { + if (nexpire[i] == 0) + continue; + nptr = &mfctable[i]; + for (mfc = *nptr; mfc != NULL; mfc = *nptr) { + /* + * Skip real cache entries + * Make sure it wasn't marked to not expire (shouldn't happen) + * If it expires now + */ + if (mfc->mfc_stall != NULL && + mfc->mfc_expire != 0 && + --mfc->mfc_expire == 0) { + if (mrtdebug & DEBUG_EXPIRE) + log(LOG_DEBUG, "expire_upcalls: expiring (%lx %lx)\n", + (u_long)ntohl(mfc->mfc_origin.s_addr), + (u_long)ntohl(mfc->mfc_mcastgrp.s_addr)); + /* + * drop all the packets + * free the mbuf with the pkt, if, timing info + */ + for (rte = mfc->mfc_stall; rte; ) { + struct rtdetq *n = rte->next; + + m_freem(rte->m); + free(rte, M_MRTABLE); + rte = n; + } + ++mrtstat.mrts_cache_cleanups; + nexpire[i]--; + + *nptr = mfc->mfc_next; + free(mfc, M_MRTABLE); + } else { + nptr = &mfc->mfc_next; + } + } + } + splx(s); + expire_upcalls_ch = timeout(expire_upcalls, (caddr_t)NULL, EXPIRE_TIMEOUT); +} + +/* + * Packet forwarding routine once entry in the cache is made + */ +static int +ip_mdq(m, ifp, rt, xmt_vif) + register struct mbuf *m; + register struct ifnet *ifp; + register struct mfc *rt; + register vifi_t xmt_vif; +{ + register struct ip *ip = mtod(m, struct ip *); + register vifi_t vifi; + register struct vif *vifp; + register int plen = ip->ip_len; + +/* + * Macro to send packet on vif. Since RSVP packets don't get counted on + * input, they shouldn't get counted on output, so statistics keeping is + * separate. + */ +#define MC_SEND(ip,vifp,m) { \ + if ((vifp)->v_flags & VIFF_TUNNEL) \ + encap_send((ip), (vifp), (m)); \ + else \ + phyint_send((ip), (vifp), (m)); \ +} + + /* + * If xmt_vif is not -1, send on only the requested vif. + * + * (since vifi_t is u_short, -1 becomes MAXUSHORT, which > numvifs.) + */ + if (xmt_vif < numvifs) { + MC_SEND(ip, viftable + xmt_vif, m); + return 1; + } + + /* + * Don't forward if it didn't arrive from the parent vif for its origin. + */ + vifi = rt->mfc_parent; + if ((vifi >= numvifs) || (viftable[vifi].v_ifp != ifp)) { + /* came in the wrong interface */ + if (mrtdebug & DEBUG_FORWARD) + log(LOG_DEBUG, "wrong if: ifp %p vifi %d vififp %p\n", + (void *)ifp, vifi, (void *)viftable[vifi].v_ifp); + ++mrtstat.mrts_wrong_if; + ++rt->mfc_wrong_if; + /* + * If we are doing PIM assert processing, and we are forwarding + * packets on this interface, and it is a broadcast medium + * interface (and not a tunnel), send a message to the routing daemon. + */ + if (pim_assert && rt->mfc_ttls[vifi] && + (ifp->if_flags & IFF_BROADCAST) && + !(viftable[vifi].v_flags & VIFF_TUNNEL)) { + struct sockaddr_in k_igmpsrc; + struct mbuf *mm; + struct igmpmsg *im; + int hlen = ip->ip_hl << 2; + struct timeval now; + register u_long delta; + + GET_TIME(now); + + TV_DELTA(rt->mfc_last_assert, now, delta); + + if (delta > ASSERT_MSG_TIME) { + mm = m_copy(m, 0, hlen); + if (mm && (M_HASCL(mm) || mm->m_len < hlen)) + mm = m_pullup(mm, hlen); + if (mm == NULL) { + return ENOBUFS; + } + + rt->mfc_last_assert = now; + + im = mtod(mm, struct igmpmsg *); + im->im_msgtype = IGMPMSG_WRONGVIF; + im->im_mbz = 0; + im->im_vif = vifi; + + k_igmpsrc.sin_addr = im->im_src; + + socket_send(ip_mrouter, mm, &k_igmpsrc); + } + } + return 0; + } + + /* If I sourced this packet, it counts as output, else it was input. */ + if (ip->ip_src.s_addr == viftable[vifi].v_lcl_addr.s_addr) { + viftable[vifi].v_pkt_out++; + viftable[vifi].v_bytes_out += plen; + } else { + viftable[vifi].v_pkt_in++; + viftable[vifi].v_bytes_in += plen; + } + rt->mfc_pkt_cnt++; + rt->mfc_byte_cnt += plen; + + /* + * For each vif, decide if a copy of the packet should be forwarded. + * Forward if: + * - the ttl exceeds the vif's threshold + * - there are group members downstream on interface + */ + for (vifp = viftable, vifi = 0; vifi < numvifs; vifp++, vifi++) + if ((rt->mfc_ttls[vifi] > 0) && + (ip->ip_ttl > rt->mfc_ttls[vifi])) { + vifp->v_pkt_out++; + vifp->v_bytes_out += plen; + MC_SEND(ip, vifp, m); + } + + return 0; +} + +/* + * check if a vif number is legal/ok. This is used by ip_output, to export + * numvifs there, + */ +static int +X_legal_vif_num(vif) + int vif; +{ + if (vif >= 0 && vif < numvifs) + return(1); + else + return(0); +} + +#ifndef MROUTE_LKM +int (*legal_vif_num)(int) = X_legal_vif_num; +#endif + +/* + * Return the local address used by this vif + */ +static u_long +X_ip_mcast_src(vifi) + int vifi; +{ + if (vifi >= 0 && vifi < numvifs) + return viftable[vifi].v_lcl_addr.s_addr; + else + return INADDR_ANY; +} + +#ifndef MROUTE_LKM +u_long (*ip_mcast_src)(int) = X_ip_mcast_src; +#endif + +static void +phyint_send(ip, vifp, m) + struct ip *ip; + struct vif *vifp; + struct mbuf *m; +{ + register struct mbuf *mb_copy; + register int hlen = ip->ip_hl << 2; + + /* + * Make a new reference to the packet; make sure that + * the IP header is actually copied, not just referenced, + * so that ip_output() only scribbles on the copy. + */ + mb_copy = m_copy(m, 0, M_COPYALL); + if (mb_copy && (M_HASCL(mb_copy) || mb_copy->m_len < hlen)) + mb_copy = m_pullup(mb_copy, hlen); + if (mb_copy == NULL) + return; + + if (vifp->v_rate_limit == 0) + tbf_send_packet(vifp, mb_copy); + else + tbf_control(vifp, mb_copy, mtod(mb_copy, struct ip *), ip->ip_len); +} + +static void +encap_send(ip, vifp, m) + register struct ip *ip; + register struct vif *vifp; + register struct mbuf *m; +{ + register struct mbuf *mb_copy; + register struct ip *ip_copy; + register int i, len = ip->ip_len; + + /* + * copy the old packet & pullup its IP header into the + * new mbuf so we can modify it. Try to fill the new + * mbuf since if we don't the ethernet driver will. + */ + MGETHDR(mb_copy, M_DONTWAIT, MT_HEADER); + if (mb_copy == NULL) + return; + mb_copy->m_data += max_linkhdr; + mb_copy->m_len = sizeof(multicast_encap_iphdr); + + if ((mb_copy->m_next = m_copy(m, 0, M_COPYALL)) == NULL) { + m_freem(mb_copy); + return; + } + i = MHLEN - M_LEADINGSPACE(mb_copy); + if (i > len) + i = len; + mb_copy = m_pullup(mb_copy, i); + if (mb_copy == NULL) + return; + mb_copy->m_pkthdr.len = len + sizeof(multicast_encap_iphdr); + + /* + * fill in the encapsulating IP header. + */ + ip_copy = mtod(mb_copy, struct ip *); + *ip_copy = multicast_encap_iphdr; + ip_copy->ip_id = htons(ip_id++); + ip_copy->ip_len += len; + ip_copy->ip_src = vifp->v_lcl_addr; + ip_copy->ip_dst = vifp->v_rmt_addr; + + /* + * turn the encapsulated IP header back into a valid one. + */ + ip = (struct ip *)((caddr_t)ip_copy + sizeof(multicast_encap_iphdr)); + --ip->ip_ttl; + HTONS(ip->ip_len); + HTONS(ip->ip_off); + ip->ip_sum = 0; + mb_copy->m_data += sizeof(multicast_encap_iphdr); + ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2); + mb_copy->m_data -= sizeof(multicast_encap_iphdr); + + if (vifp->v_rate_limit == 0) + tbf_send_packet(vifp, mb_copy); + else + tbf_control(vifp, mb_copy, ip, ip_copy->ip_len); +} + +/* + * De-encapsulate a packet and feed it back through ip input (this + * routine is called whenever IP gets a packet with proto type + * ENCAP_PROTO and a local destination address). + */ +void +#ifdef MROUTE_LKM +X_ipip_input(m, off, proto) +#else +ipip_input(m, off, proto) +#endif + register struct mbuf *m; + int off; + int proto; +{ + struct ifnet *ifp = m->m_pkthdr.rcvif; + register struct ip *ip = mtod(m, struct ip *); + register int hlen = ip->ip_hl << 2; + register struct vif *vifp; + + if (!have_encap_tunnel) { + rip_input(m, off, proto); + return; + } + /* + * dump the packet if it's not to a multicast destination or if + * we don't have an encapsulating tunnel with the source. + * Note: This code assumes that the remote site IP address + * uniquely identifies the tunnel (i.e., that this site has + * at most one tunnel with the remote site). + */ + if (! IN_MULTICAST(ntohl(((struct ip *)((char *)ip + hlen))->ip_dst.s_addr))) { + ++mrtstat.mrts_bad_tunnel; + m_freem(m); + return; + } + if (ip->ip_src.s_addr != last_encap_src) { + register struct vif *vife; + + vifp = viftable; + vife = vifp + numvifs; + last_encap_src = ip->ip_src.s_addr; + last_encap_vif = 0; + for ( ; vifp < vife; ++vifp) + if (vifp->v_rmt_addr.s_addr == ip->ip_src.s_addr) { + if ((vifp->v_flags & (VIFF_TUNNEL|VIFF_SRCRT)) + == VIFF_TUNNEL) + last_encap_vif = vifp; + break; + } + } + if ((vifp = last_encap_vif) == 0) { + last_encap_src = 0; + mrtstat.mrts_cant_tunnel++; /*XXX*/ + m_freem(m); + if (mrtdebug) + log(LOG_DEBUG, "ip_mforward: no tunnel with %lx\n", + (u_long)ntohl(ip->ip_src.s_addr)); + return; + } + ifp = vifp->v_ifp; + + if (hlen > IP_HDR_LEN) + ip_stripoptions(m, (struct mbuf *) 0); + m->m_data += IP_HDR_LEN; + m->m_len -= IP_HDR_LEN; + m->m_pkthdr.len -= IP_HDR_LEN; + m->m_pkthdr.rcvif = ifp; + + (void) IF_HANDOFF(&ipintrq, m, NULL); + /* + * normally we would need a "schednetisr(NETISR_IP)" + * here but we were called by ip_input and it is going + * to loop back & try to dequeue the packet we just + * queued as soon as we return so we avoid the + * unnecessary software interrrupt. + */ +} + +/* + * Token bucket filter module + */ + +static void +tbf_control(vifp, m, ip, p_len) + register struct vif *vifp; + register struct mbuf *m; + register struct ip *ip; + register u_long p_len; +{ + register struct tbf *t = vifp->v_tbf; + + if (p_len > MAX_BKT_SIZE) { + /* drop if packet is too large */ + mrtstat.mrts_pkt2large++; + m_freem(m); + return; + } + + tbf_update_tokens(vifp); + + /* if there are enough tokens, + * and the queue is empty, + * send this packet out + */ + + if (t->tbf_q_len == 0) { + /* queue empty, send packet if enough tokens */ + if (p_len <= t->tbf_n_tok) { + t->tbf_n_tok -= p_len; + tbf_send_packet(vifp, m); + } else { + /* queue packet and timeout till later */ + tbf_queue(vifp, m); + timeout(tbf_reprocess_q, (caddr_t)vifp, TBF_REPROCESS); + } + } else if (t->tbf_q_len < t->tbf_max_q_len) { + /* finite queue length, so queue pkts and process queue */ + tbf_queue(vifp, m); + tbf_process_q(vifp); + } else { + /* queue length too much, try to dq and queue and process */ + if (!tbf_dq_sel(vifp, ip)) { + mrtstat.mrts_q_overflow++; + m_freem(m); + return; + } else { + tbf_queue(vifp, m); + tbf_process_q(vifp); + } + } + return; +} + +/* + * adds a packet to the queue at the interface + */ +static void +tbf_queue(vifp, m) + register struct vif *vifp; + register struct mbuf *m; +{ + register int s = splnet(); + register struct tbf *t = vifp->v_tbf; + + if (t->tbf_t == NULL) { + /* Queue was empty */ + t->tbf_q = m; + } else { + /* Insert at tail */ + t->tbf_t->m_act = m; + } + + /* Set new tail pointer */ + t->tbf_t = m; + +#ifdef DIAGNOSTIC + /* Make sure we didn't get fed a bogus mbuf */ + if (m->m_act) + panic("tbf_queue: m_act"); +#endif + m->m_act = NULL; + + t->tbf_q_len++; + + splx(s); +} + + +/* + * processes the queue at the interface + */ +static void +tbf_process_q(vifp) + register struct vif *vifp; +{ + register struct mbuf *m; + register int len; + register int s = splnet(); + register struct tbf *t = vifp->v_tbf; + + /* loop through the queue at the interface and send as many packets + * as possible + */ + while (t->tbf_q_len > 0) { + m = t->tbf_q; + + len = mtod(m, struct ip *)->ip_len; + + /* determine if the packet can be sent */ + if (len <= t->tbf_n_tok) { + /* if so, + * reduce no of tokens, dequeue the packet, + * send the packet. + */ + t->tbf_n_tok -= len; + + t->tbf_q = m->m_act; + if (--t->tbf_q_len == 0) + t->tbf_t = NULL; + + m->m_act = NULL; + tbf_send_packet(vifp, m); + + } else break; + } + splx(s); +} + +static void +tbf_reprocess_q(xvifp) + void *xvifp; +{ + register struct vif *vifp = xvifp; + if (ip_mrouter == NULL) + return; + + tbf_update_tokens(vifp); + + tbf_process_q(vifp); + + if (vifp->v_tbf->tbf_q_len) + timeout(tbf_reprocess_q, (caddr_t)vifp, TBF_REPROCESS); +} + +/* function that will selectively discard a member of the queue + * based on the precedence value and the priority + */ +static int +tbf_dq_sel(vifp, ip) + register struct vif *vifp; + register struct ip *ip; +{ + register int s = splnet(); + register u_int p; + register struct mbuf *m, *last; + register struct mbuf **np; + register struct tbf *t = vifp->v_tbf; + + p = priority(vifp, ip); + + np = &t->tbf_q; + last = NULL; + while ((m = *np) != NULL) { + if (p > priority(vifp, mtod(m, struct ip *))) { + *np = m->m_act; + /* If we're removing the last packet, fix the tail pointer */ + if (m == t->tbf_t) + t->tbf_t = last; + m_freem(m); + /* it's impossible for the queue to be empty, but + * we check anyway. */ + if (--t->tbf_q_len == 0) + t->tbf_t = NULL; + splx(s); + mrtstat.mrts_drop_sel++; + return(1); + } + np = &m->m_act; + last = m; + } + splx(s); + return(0); +} + +static void +tbf_send_packet(vifp, m) + register struct vif *vifp; + register struct mbuf *m; +{ + struct ip_moptions imo; + int error; + static struct route ro; + int s = splnet(); + + if (vifp->v_flags & VIFF_TUNNEL) { + /* If tunnel options */ + ip_output(m, (struct mbuf *)0, &vifp->v_route, + IP_FORWARDING, (struct ip_moptions *)0); + } else { + imo.imo_multicast_ifp = vifp->v_ifp; + imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1; + imo.imo_multicast_loop = 1; + imo.imo_multicast_vif = -1; + + /* + * Re-entrancy should not be a problem here, because + * the packets that we send out and are looped back at us + * should get rejected because they appear to come from + * the loopback interface, thus preventing looping. + */ + error = ip_output(m, (struct mbuf *)0, &ro, + IP_FORWARDING, &imo); + + if (mrtdebug & DEBUG_XMIT) + log(LOG_DEBUG, "phyint_send on vif %d err %d\n", + vifp - viftable, error); + } + splx(s); +} + +/* determine the current time and then + * the elapsed time (between the last time and time now) + * in milliseconds & update the no. of tokens in the bucket + */ +static void +tbf_update_tokens(vifp) + register struct vif *vifp; +{ + struct timeval tp; + register u_long tm; + register int s = splnet(); + register struct tbf *t = vifp->v_tbf; + + GET_TIME(tp); + + TV_DELTA(tp, t->tbf_last_pkt_t, tm); + + /* + * This formula is actually + * "time in seconds" * "bytes/second". + * + * (tm / 1000000) * (v_rate_limit * 1000 * (1000/1024) / 8) + * + * The (1000/1024) was introduced in add_vif to optimize + * this divide into a shift. + */ + t->tbf_n_tok += tm * vifp->v_rate_limit / 1024 / 8; + t->tbf_last_pkt_t = tp; + + if (t->tbf_n_tok > MAX_BKT_SIZE) + t->tbf_n_tok = MAX_BKT_SIZE; + + splx(s); +} + +static int +priority(vifp, ip) + register struct vif *vifp; + register struct ip *ip; +{ + register int prio; + + /* temporary hack; may add general packet classifier some day */ + + /* + * The UDP port space is divided up into four priority ranges: + * [0, 16384) : unclassified - lowest priority + * [16384, 32768) : audio - highest priority + * [32768, 49152) : whiteboard - medium priority + * [49152, 65536) : video - low priority + */ + if (ip->ip_p == IPPROTO_UDP) { + struct udphdr *udp = (struct udphdr *)(((char *)ip) + (ip->ip_hl << 2)); + switch (ntohs(udp->uh_dport) & 0xc000) { + case 0x4000: + prio = 70; + break; + case 0x8000: + prio = 60; + break; + case 0xc000: + prio = 55; + break; + default: + prio = 50; + break; + } + if (tbfdebug > 1) + log(LOG_DEBUG, "port %x prio%d\n", ntohs(udp->uh_dport), prio); + } else { + prio = 50; + } + return prio; +} + +/* + * End of token bucket filter modifications + */ + +int +ip_rsvp_vif_init(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + int error, i, s; + + if (rsvpdebug) + printf("ip_rsvp_vif_init: so_type = %d, pr_protocol = %d\n", + so->so_type, so->so_proto->pr_protocol); + + if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP) + return EOPNOTSUPP; + + /* Check mbuf. */ + error = sooptcopyin(sopt, &i, sizeof i, sizeof i); + if (error) + return (error); + + if (rsvpdebug) + printf("ip_rsvp_vif_init: vif = %d rsvp_on = %d\n", i, rsvp_on); + + s = splnet(); + + /* Check vif. */ + if (!legal_vif_num(i)) { + splx(s); + return EADDRNOTAVAIL; + } + + /* Check if socket is available. */ + if (viftable[i].v_rsvpd != NULL) { + splx(s); + return EADDRINUSE; + } + + viftable[i].v_rsvpd = so; + /* This may seem silly, but we need to be sure we don't over-increment + * the RSVP counter, in case something slips up. + */ + if (!viftable[i].v_rsvp_on) { + viftable[i].v_rsvp_on = 1; + rsvp_on++; + } + + splx(s); + return 0; +} + +int +ip_rsvp_vif_done(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + int error, i, s; + + if (rsvpdebug) + printf("ip_rsvp_vif_done: so_type = %d, pr_protocol = %d\n", + so->so_type, so->so_proto->pr_protocol); + + if (so->so_type != SOCK_RAW || + so->so_proto->pr_protocol != IPPROTO_RSVP) + return EOPNOTSUPP; + + error = sooptcopyin(sopt, &i, sizeof i, sizeof i); + if (error) + return (error); + + s = splnet(); + + /* Check vif. */ + if (!legal_vif_num(i)) { + splx(s); + return EADDRNOTAVAIL; + } + + if (rsvpdebug) + printf("ip_rsvp_vif_done: v_rsvpd = %p so = %p\n", + viftable[i].v_rsvpd, so); + + viftable[i].v_rsvpd = NULL; + /* + * This may seem silly, but we need to be sure we don't over-decrement + * the RSVP counter, in case something slips up. + */ + if (viftable[i].v_rsvp_on) { + viftable[i].v_rsvp_on = 0; + rsvp_on--; + } + + splx(s); + return 0; +} + +void +ip_rsvp_force_done(so) + struct socket *so; +{ + int vifi; + register int s; + + /* Don't bother if it is not the right type of socket. */ + if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP) + return; + + s = splnet(); + + /* The socket may be attached to more than one vif...this + * is perfectly legal. + */ + for (vifi = 0; vifi < numvifs; vifi++) { + if (viftable[vifi].v_rsvpd == so) { + viftable[vifi].v_rsvpd = NULL; + /* This may seem silly, but we need to be sure we don't + * over-decrement the RSVP counter, in case something slips up. + */ + if (viftable[vifi].v_rsvp_on) { + viftable[vifi].v_rsvp_on = 0; + rsvp_on--; + } + } + } + + splx(s); + return; +} + +void +rsvp_input(m, off, proto) + struct mbuf *m; + int off; + int proto; +{ + int vifi; + register struct ip *ip = mtod(m, struct ip *); + static struct sockaddr_in rsvp_src = { sizeof rsvp_src, AF_INET }; + register int s; + struct ifnet *ifp; + + if (rsvpdebug) + printf("rsvp_input: rsvp_on %d\n",rsvp_on); + + /* Can still get packets with rsvp_on = 0 if there is a local member + * of the group to which the RSVP packet is addressed. But in this + * case we want to throw the packet away. + */ + if (!rsvp_on) { + m_freem(m); + return; + } + + s = splnet(); + + if (rsvpdebug) + printf("rsvp_input: check vifs\n"); + +#ifdef DIAGNOSTIC + if (!(m->m_flags & M_PKTHDR)) + panic("rsvp_input no hdr"); +#endif + + ifp = m->m_pkthdr.rcvif; + /* Find which vif the packet arrived on. */ + for (vifi = 0; vifi < numvifs; vifi++) + if (viftable[vifi].v_ifp == ifp) + break; + + if (vifi == numvifs || viftable[vifi].v_rsvpd == NULL) { + /* + * If the old-style non-vif-associated socket is set, + * then use it. Otherwise, drop packet since there + * is no specific socket for this vif. + */ + if (ip_rsvpd != NULL) { + if (rsvpdebug) + printf("rsvp_input: Sending packet up old-style socket\n"); + rip_input(m, off, proto); /* xxx */ + } else { + if (rsvpdebug && vifi == numvifs) + printf("rsvp_input: Can't find vif for packet.\n"); + else if (rsvpdebug && viftable[vifi].v_rsvpd == NULL) + printf("rsvp_input: No socket defined for vif %d\n",vifi); + m_freem(m); + } + splx(s); + return; + } + rsvp_src.sin_addr = ip->ip_src; + + if (rsvpdebug && m) + printf("rsvp_input: m->m_len = %d, sbspace() = %ld\n", + m->m_len,sbspace(&(viftable[vifi].v_rsvpd->so_rcv))); + + if (socket_send(viftable[vifi].v_rsvpd, m, &rsvp_src) < 0) { + if (rsvpdebug) + printf("rsvp_input: Failed to append to socket\n"); + } else { + if (rsvpdebug) + printf("rsvp_input: send packet up\n"); + } + + splx(s); +} + +#ifdef MROUTE_LKM +#include <sys/conf.h> +#include <sys/exec.h> +#include <sys/sysent.h> +#include <sys/lkm.h> + +MOD_MISC("ip_mroute_mod") + +static int +ip_mroute_mod_handle(struct lkm_table *lkmtp, int cmd) +{ + int i; + struct lkm_misc *args = lkmtp->private.lkm_misc; + int err = 0; + + switch(cmd) { + static int (*old_ip_mrouter_cmd)(); + static int (*old_ip_mrouter_done)(); + static int (*old_ip_mforward)(); + static int (*old_mrt_ioctl)(); + static void (*old_proto4_input)(); + static int (*old_legal_vif_num)(); + extern struct protosw inetsw[]; + + case LKM_E_LOAD: + if(lkmexists(lkmtp) || ip_mrtproto) + return(EEXIST); + old_ip_mrouter_cmd = ip_mrouter_cmd; + ip_mrouter_cmd = X_ip_mrouter_cmd; + old_ip_mrouter_done = ip_mrouter_done; + ip_mrouter_done = X_ip_mrouter_done; + old_ip_mforward = ip_mforward; + ip_mforward = X_ip_mforward; + old_mrt_ioctl = mrt_ioctl; + mrt_ioctl = X_mrt_ioctl; + old_proto4_input = inetsw[ip_protox[ENCAP_PROTO]].pr_input; + inetsw[ip_protox[ENCAP_PROTO]].pr_input = X_ipip_input; + old_legal_vif_num = legal_vif_num; + legal_vif_num = X_legal_vif_num; + ip_mrtproto = IGMP_DVMRP; + + printf("\nIP multicast routing loaded\n"); + break; + + case LKM_E_UNLOAD: + if (ip_mrouter) + return EINVAL; + + ip_mrouter_cmd = old_ip_mrouter_cmd; + ip_mrouter_done = old_ip_mrouter_done; + ip_mforward = old_ip_mforward; + mrt_ioctl = old_mrt_ioctl; + inetsw[ip_protox[ENCAP_PROTO]].pr_input = old_proto4_input; + legal_vif_num = old_legal_vif_num; + ip_mrtproto = 0; + break; + + default: + err = EINVAL; + break; + } + + return(err); +} + +int +ip_mroute_mod(struct lkm_table *lkmtp, int cmd, int ver) { + DISPATCH(lkmtp, cmd, ver, ip_mroute_mod_handle, ip_mroute_mod_handle, + nosys); +} + +#endif /* MROUTE_LKM */ +#endif /* MROUTING */ diff --git a/sys/netinet/ip_mroute.h b/sys/netinet/ip_mroute.h new file mode 100644 index 0000000..8c990c8 --- /dev/null +++ b/sys/netinet/ip_mroute.h @@ -0,0 +1,267 @@ +/* + * Copyright (c) 1989 Stephen Deering. + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Stephen Deering of Stanford University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ip_mroute.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_IP_MROUTE_H_ +#define _NETINET_IP_MROUTE_H_ + +/* + * Definitions for IP multicast forwarding. + * + * Written by David Waitzman, BBN Labs, August 1988. + * Modified by Steve Deering, Stanford, February 1989. + * Modified by Ajit Thyagarajan, PARC, August 1993. + * Modified by Ajit Thyagarajan, PARC, August 1994. + * + * MROUTING Revision: 3.3.1.3 + */ + + +/* + * Multicast Routing set/getsockopt commands. + */ +#define MRT_INIT 100 /* initialize forwarder */ +#define MRT_DONE 101 /* shut down forwarder */ +#define MRT_ADD_VIF 102 /* create virtual interface */ +#define MRT_DEL_VIF 103 /* delete virtual interface */ +#define MRT_ADD_MFC 104 /* insert forwarding cache entry */ +#define MRT_DEL_MFC 105 /* delete forwarding cache entry */ +#define MRT_VERSION 106 /* get kernel version number */ +#define MRT_ASSERT 107 /* enable PIM assert processing */ + + +#define GET_TIME(t) microtime(&t) + +/* + * Types and macros for handling bitmaps with one bit per virtual interface. + */ +#define MAXVIFS 32 +typedef u_long vifbitmap_t; +typedef u_short vifi_t; /* type of a vif index */ +#define ALL_VIFS (vifi_t)-1 + +#define VIFM_SET(n, m) ((m) |= (1 << (n))) +#define VIFM_CLR(n, m) ((m) &= ~(1 << (n))) +#define VIFM_ISSET(n, m) ((m) & (1 << (n))) +#define VIFM_CLRALL(m) ((m) = 0x00000000) +#define VIFM_COPY(mfrom, mto) ((mto) = (mfrom)) +#define VIFM_SAME(m1, m2) ((m1) == (m2)) + + +/* + * Argument structure for MRT_ADD_VIF. + * (MRT_DEL_VIF takes a single vifi_t argument.) + */ +struct vifctl { + vifi_t vifc_vifi; /* the index of the vif to be added */ + u_char vifc_flags; /* VIFF_ flags defined below */ + u_char vifc_threshold; /* min ttl required to forward on vif */ + u_int vifc_rate_limit; /* max rate */ + struct in_addr vifc_lcl_addr; /* local interface address */ + struct in_addr vifc_rmt_addr; /* remote address (tunnels only) */ +}; + +#define VIFF_TUNNEL 0x1 /* vif represents a tunnel end-point */ +#define VIFF_SRCRT 0x2 /* tunnel uses IP source routing */ + +/* + * Argument structure for MRT_ADD_MFC and MRT_DEL_MFC + * (mfcc_tos to be added at a future point) + */ +struct mfcctl { + struct in_addr mfcc_origin; /* ip origin of mcasts */ + struct in_addr mfcc_mcastgrp; /* multicast group associated*/ + vifi_t mfcc_parent; /* incoming vif */ + u_char mfcc_ttls[MAXVIFS]; /* forwarding ttls on vifs */ +}; + +/* + * The kernel's multicast routing statistics. + */ +struct mrtstat { + u_long mrts_mfc_lookups; /* # forw. cache hash table hits */ + u_long mrts_mfc_misses; /* # forw. cache hash table misses */ + u_long mrts_upcalls; /* # calls to mrouted */ + u_long mrts_no_route; /* no route for packet's origin */ + u_long mrts_bad_tunnel; /* malformed tunnel options */ + u_long mrts_cant_tunnel; /* no room for tunnel options */ + u_long mrts_wrong_if; /* arrived on wrong interface */ + u_long mrts_upq_ovflw; /* upcall Q overflow */ + u_long mrts_cache_cleanups; /* # entries with no upcalls */ + u_long mrts_drop_sel; /* pkts dropped selectively */ + u_long mrts_q_overflow; /* pkts dropped - Q overflow */ + u_long mrts_pkt2large; /* pkts dropped - size > BKT SIZE */ + u_long mrts_upq_sockfull; /* upcalls dropped - socket full */ +}; + +/* + * Argument structure used by mrouted to get src-grp pkt counts + */ +struct sioc_sg_req { + struct in_addr src; + struct in_addr grp; + u_long pktcnt; + u_long bytecnt; + u_long wrong_if; +}; + +/* + * Argument structure used by mrouted to get vif pkt counts + */ +struct sioc_vif_req { + vifi_t vifi; /* vif number */ + u_long icount; /* Input packet count on vif */ + u_long ocount; /* Output packet count on vif */ + u_long ibytes; /* Input byte count on vif */ + u_long obytes; /* Output byte count on vif */ +}; + + +/* + * The kernel's virtual-interface structure. + */ +struct vif { + u_char v_flags; /* VIFF_ flags defined above */ + u_char v_threshold; /* min ttl required to forward on vif*/ + u_int v_rate_limit; /* max rate */ + struct tbf *v_tbf; /* token bucket structure at intf. */ + struct in_addr v_lcl_addr; /* local interface address */ + struct in_addr v_rmt_addr; /* remote address (tunnels only) */ + struct ifnet *v_ifp; /* pointer to interface */ + u_long v_pkt_in; /* # pkts in on interface */ + u_long v_pkt_out; /* # pkts out on interface */ + u_long v_bytes_in; /* # bytes in on interface */ + u_long v_bytes_out; /* # bytes out on interface */ + struct route v_route; /* cached route if this is a tunnel */ + u_int v_rsvp_on; /* RSVP listening on this vif */ + struct socket *v_rsvpd; /* RSVP daemon socket */ +}; + +/* + * The kernel's multicast forwarding cache entry structure + * (A field for the type of service (mfc_tos) is to be added + * at a future point) + */ +struct mfc { + struct in_addr mfc_origin; /* IP origin of mcasts */ + struct in_addr mfc_mcastgrp; /* multicast group associated*/ + vifi_t mfc_parent; /* incoming vif */ + u_char mfc_ttls[MAXVIFS]; /* forwarding ttls on vifs */ + u_long mfc_pkt_cnt; /* pkt count for src-grp */ + u_long mfc_byte_cnt; /* byte count for src-grp */ + u_long mfc_wrong_if; /* wrong if for src-grp */ + int mfc_expire; /* time to clean entry up */ + struct timeval mfc_last_assert; /* last time I sent an assert*/ + struct rtdetq *mfc_stall; /* q of packets awaiting mfc */ + struct mfc *mfc_next; /* next mfc entry */ +}; + +/* + * Struct used to communicate from kernel to multicast router + * note the convenient similarity to an IP packet + */ +struct igmpmsg { + u_long unused1; + u_long unused2; + u_char im_msgtype; /* what type of message */ +#define IGMPMSG_NOCACHE 1 +#define IGMPMSG_WRONGVIF 2 + u_char im_mbz; /* must be zero */ + u_char im_vif; /* vif rec'd on */ + u_char unused3; + struct in_addr im_src, im_dst; +}; + +/* + * Argument structure used for pkt info. while upcall is made + */ +struct rtdetq { + struct mbuf *m; /* A copy of the packet */ + struct ifnet *ifp; /* Interface pkt came in on */ + vifi_t xmt_vif; /* Saved copy of imo_multicast_vif */ +#ifdef UPCALL_TIMING + struct timeval t; /* Timestamp */ +#endif /* UPCALL_TIMING */ + struct rtdetq *next; /* Next in list of packets */ +}; + +#define MFCTBLSIZ 256 +#if (MFCTBLSIZ & (MFCTBLSIZ - 1)) == 0 /* from sys:route.h */ +#define MFCHASHMOD(h) ((h) & (MFCTBLSIZ - 1)) +#else +#define MFCHASHMOD(h) ((h) % MFCTBLSIZ) +#endif + +#define MAX_UPQ 4 /* max. no of pkts in upcall Q */ + +/* + * Token Bucket filter code + */ +#define MAX_BKT_SIZE 10000 /* 10K bytes size */ +#define MAXQSIZE 10 /* max # of pkts in queue */ + +/* + * the token bucket filter at each vif + */ +struct tbf +{ + struct timeval tbf_last_pkt_t; /* arr. time of last pkt */ + u_long tbf_n_tok; /* no of tokens in bucket */ + u_long tbf_q_len; /* length of queue at this vif */ + u_long tbf_max_q_len; /* max. queue length */ + struct mbuf *tbf_q; /* Packet queue */ + struct mbuf *tbf_t; /* tail-insertion pointer */ +}; + +#ifdef _KERNEL + +struct sockopt; + +extern int (*ip_mrouter_set) __P((struct socket *, struct sockopt *)); +extern int (*ip_mrouter_get) __P((struct socket *, struct sockopt *)); +extern int (*ip_mrouter_done) __P((void)); +#ifdef MROUTING +extern int (*mrt_ioctl) __P((int, caddr_t)); +#else +extern int (*mrt_ioctl) __P((int, caddr_t, struct proc *)); +#endif + +#endif /* _KERNEL */ + +#endif /* _NETINET_IP_MROUTE_H_ */ diff --git a/sys/netinet/ip_nat.c b/sys/netinet/ip_nat.c new file mode 100644 index 0000000..816d8e7 --- /dev/null +++ b/sys/netinet/ip_nat.c @@ -0,0 +1,2739 @@ +/* + * Copyright (C) 1995-2000 by Darren Reed. + * + * Redistribution and use in source and binary forms are permitted + * provided that this notice is preserved and due credit is given + * to the original author and the contributors. + * + * Added redirect stuff and a LOT of bug fixes. (mcn@EnGarde.com) + */ +#if !defined(lint) +/*static const char rcsid[] = "@(#)$Id: ip_nat.c,v 2.37.2.16 2000/07/18 13:57:40 darrenr Exp $";*/ +static const char rcsid[] = "@(#)$FreeBSD$"; +#endif + +#if defined(__FreeBSD__) && defined(KERNEL) && !defined(_KERNEL) +#define _KERNEL +#endif + +#include <sys/errno.h> +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/file.h> +#if defined(__NetBSD__) && (NetBSD >= 199905) && !defined(IPFILTER_LKM) && \ + defined(_KERNEL) +# include "opt_ipfilter_log.h" +#endif +#if !defined(_KERNEL) && !defined(KERNEL) +# include <stdio.h> +# include <string.h> +# include <stdlib.h> +#endif +#if (defined(KERNEL) || defined(_KERNEL)) && (__FreeBSD_version >= 220000) +# include <sys/filio.h> +# include <sys/fcntl.h> +#else +# include <sys/ioctl.h> +#endif +#include <sys/fcntl.h> +#include <sys/uio.h> +#ifndef linux +# include <sys/protosw.h> +#endif +#include <sys/socket.h> +#if defined(_KERNEL) && !defined(linux) +# include <sys/systm.h> +#endif +#if !defined(__SVR4) && !defined(__svr4__) +# ifndef linux +# include <sys/mbuf.h> +# endif +#else +# include <sys/filio.h> +# include <sys/byteorder.h> +# ifdef _KERNEL +# include <sys/dditypes.h> +# endif +# include <sys/stream.h> +# include <sys/kmem.h> +#endif +#if __FreeBSD_version >= 300000 +# include <sys/queue.h> +#endif +#include <net/if.h> +#if __FreeBSD_version >= 300000 +# include <net/if_var.h> +# if defined(_KERNEL) && !defined(IPFILTER_LKM) +# include "opt_ipfilter.h" +# endif +#endif +#ifdef sun +# include <net/af.h> +#endif +#include <net/route.h> +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> + +#ifdef __sgi +# ifdef IFF_DRVRLOCK /* IRIX6 */ +#include <sys/hashing.h> +#include <netinet/in_var.h> +# endif +#endif + +#ifdef RFC1825 +# include <vpn/md5.h> +# include <vpn/ipsec.h> +extern struct ifnet vpnif; +#endif + +#ifndef linux +# include <netinet/ip_var.h> +#endif +#include <netinet/tcp.h> +#include <netinet/udp.h> +#include <netinet/ip_icmp.h> +#include "netinet/ip_compat.h" +#include <netinet/tcpip.h> +#include "netinet/ip_fil.h" +#include "netinet/ip_proxy.h" +#include "netinet/ip_nat.h" +#include "netinet/ip_frag.h" +#include "netinet/ip_state.h" +#if (__FreeBSD_version >= 300000) +# include <sys/malloc.h> +#endif +#ifndef MIN +# define MIN(a,b) (((a)<(b))?(a):(b)) +#endif +#undef SOCKADDR_IN +#define SOCKADDR_IN struct sockaddr_in + +nat_t **nat_table[2] = { NULL, NULL }, + *nat_instances = NULL; +ipnat_t *nat_list = NULL; +u_int ipf_nattable_sz = NAT_TABLE_SZ; +u_int ipf_natrules_sz = NAT_SIZE; +u_int ipf_rdrrules_sz = RDR_SIZE; +u_int ipf_hostmap_sz = HOSTMAP_SIZE; +u_32_t nat_masks = 0; +u_32_t rdr_masks = 0; +ipnat_t **nat_rules = NULL; +ipnat_t **rdr_rules = NULL; +hostmap_t **maptable = NULL; + +u_long fr_defnatage = DEF_NAT_AGE, + fr_defnaticmpage = 6; /* 3 seconds */ +natstat_t nat_stats; +int fr_nat_lock = 0; +#if (SOLARIS || defined(__sgi)) && defined(_KERNEL) +extern kmutex_t ipf_rw; +extern KRWLOCK_T ipf_nat; +#endif + +static int nat_flushtable __P((void)); +static int nat_clearlist __P((void)); +static void nat_addnat __P((struct ipnat *)); +static void nat_addrdr __P((struct ipnat *)); +static void nat_delete __P((struct nat *)); +static void nat_delrdr __P((struct ipnat *)); +static void nat_delnat __P((struct ipnat *)); +static int fr_natgetent __P((caddr_t)); +static int fr_natgetsz __P((caddr_t)); +static int fr_natputent __P((caddr_t)); +static void nat_tabmove __P((nat_t *, u_32_t)); +static int nat_match __P((fr_info_t *, ipnat_t *, ip_t *)); +static hostmap_t *nat_hostmap __P((ipnat_t *, struct in_addr, + struct in_addr)); +static void nat_hostmapdel __P((struct hostmap *)); + + +int nat_init() +{ + KMALLOCS(nat_table[0], nat_t **, sizeof(nat_t *) * ipf_nattable_sz); + if (nat_table[0] != NULL) + bzero((char *)nat_table[0], ipf_nattable_sz * sizeof(nat_t *)); + else + return -1; + + KMALLOCS(nat_table[1], nat_t **, sizeof(nat_t *) * ipf_nattable_sz); + if (nat_table[1] != NULL) + bzero((char *)nat_table[1], ipf_nattable_sz * sizeof(nat_t *)); + else + return -1; + + KMALLOCS(nat_rules, ipnat_t **, sizeof(ipnat_t *) * ipf_natrules_sz); + if (nat_rules != NULL) + bzero((char *)nat_rules, ipf_natrules_sz * sizeof(ipnat_t *)); + else + return -1; + + KMALLOCS(rdr_rules, ipnat_t **, sizeof(ipnat_t *) * ipf_rdrrules_sz); + if (rdr_rules != NULL) + bzero((char *)rdr_rules, ipf_rdrrules_sz * sizeof(ipnat_t *)); + else + return -1; + + KMALLOCS(maptable, hostmap_t **, sizeof(hostmap_t *) * ipf_hostmap_sz); + if (maptable != NULL) + bzero((char *)maptable, sizeof(hostmap_t *) * ipf_hostmap_sz); + else + return -1; + return 0; +} + + +static void nat_addrdr(n) +ipnat_t *n; +{ + ipnat_t **np; + u_32_t j; + u_int hv; + int k; + + k = countbits(n->in_outmsk); + if ((k >= 0) && (k != 32)) + rdr_masks |= 1 << k; + j = (n->in_outip & n->in_outmsk); + hv = NAT_HASH_FN(j, 0, ipf_rdrrules_sz); + np = rdr_rules + hv; + while (*np != NULL) + np = &(*np)->in_rnext; + n->in_rnext = NULL; + n->in_prnext = np; + *np = n; +} + + +static void nat_addnat(n) +ipnat_t *n; +{ + ipnat_t **np; + u_32_t j; + u_int hv; + int k; + + k = countbits(n->in_inmsk); + if ((k >= 0) && (k != 32)) + nat_masks |= 1 << k; + j = (n->in_inip & n->in_inmsk); + hv = NAT_HASH_FN(j, 0, ipf_natrules_sz); + np = nat_rules + hv; + while (*np != NULL) + np = &(*np)->in_mnext; + n->in_mnext = NULL; + n->in_pmnext = np; + *np = n; +} + + +static void nat_delrdr(n) +ipnat_t *n; +{ + if (n->in_rnext) + n->in_rnext->in_prnext = n->in_prnext; + *n->in_prnext = n->in_rnext; +} + + +static void nat_delnat(n) +ipnat_t *n; +{ + if (n->in_mnext) + n->in_mnext->in_pmnext = n->in_pmnext; + *n->in_pmnext = n->in_mnext; +} + + +/* + * check if an ip address has already been allocated for a given mapping that + * is not doing port based translation. + * + * Must be called with ipf_nat held as a write lock. + */ +static struct hostmap *nat_hostmap(np, real, map) +ipnat_t *np; +struct in_addr real; +struct in_addr map; +{ + hostmap_t *hm; + u_int hv; + + hv = real.s_addr % HOSTMAP_SIZE; + for (hm = maptable[hv]; hm; hm = hm->hm_next) + if ((hm->hm_realip.s_addr == real.s_addr) && + (np == hm->hm_ipnat)) { + hm->hm_ref++; + return hm; + } + + KMALLOC(hm, hostmap_t *); + if (hm) { + hm->hm_next = maptable[hv]; + hm->hm_pnext = maptable + hv; + if (maptable[hv]) + maptable[hv]->hm_pnext = &hm->hm_next; + maptable[hv] = hm; + hm->hm_ipnat = np; + hm->hm_realip = real; + hm->hm_mapip = map; + hm->hm_ref = 1; + } + return hm; +} + + +/* + * Must be called with ipf_nat held as a write lock. + */ +static void nat_hostmapdel(hm) +struct hostmap *hm; +{ + ATOMIC_DEC32(hm->hm_ref); + if (hm->hm_ref == 0) { + if (hm->hm_next) + hm->hm_next->hm_pnext = hm->hm_pnext; + *hm->hm_pnext = hm->hm_next; + KFREE(hm); + } +} + + +void fix_outcksum(sp, n) +u_short *sp; +u_32_t n; +{ + register u_short sumshort; + register u_32_t sum1; + + if (!n) + return; +#if SOLARIS2 >= 6 + else if (n & NAT_HW_CKSUM) { + *sp = n & 0xffff; + return; + } +#endif + sum1 = (~ntohs(*sp)) & 0xffff; + sum1 += (n); + sum1 = (sum1 >> 16) + (sum1 & 0xffff); + /* Again */ + sum1 = (sum1 >> 16) + (sum1 & 0xffff); + sumshort = ~(u_short)sum1; + *(sp) = htons(sumshort); +} + + +void fix_incksum(sp, n) +u_short *sp; +u_32_t n; +{ + register u_short sumshort; + register u_32_t sum1; + + if (!n) + return; +#if SOLARIS2 >= 6 + else if (n & NAT_HW_CKSUM) { + *sp = n & 0xffff; + return; + } +#endif +#ifdef sparc + sum1 = (~(*sp)) & 0xffff; +#else + sum1 = (~ntohs(*sp)) & 0xffff; +#endif + sum1 += ~(n) & 0xffff; + sum1 = (sum1 >> 16) + (sum1 & 0xffff); + /* Again */ + sum1 = (sum1 >> 16) + (sum1 & 0xffff); + sumshort = ~(u_short)sum1; + *(sp) = htons(sumshort); +} + + +/* + * fix_datacksum is used *only* for the adjustments of checksums in the data + * section of an IP packet. + * + * The only situation in which you need to do this is when NAT'ing an + * ICMP error message. Such a message, contains in its body the IP header + * of the original IP packet, that causes the error. + * + * You can't use fix_incksum or fix_outcksum in that case, because for the + * kernel the data section of the ICMP error is just data, and no special + * processing like hardware cksum or ntohs processing have been done by the + * kernel on the data section. + */ +void fix_datacksum(sp, n) +u_short *sp; +u_32_t n; +{ + register u_short sumshort; + register u_32_t sum1; + + if (!n) + return; + + sum1 = (~ntohs(*sp)) & 0xffff; + sum1 += (n); + sum1 = (sum1 >> 16) + (sum1 & 0xffff); + /* Again */ + sum1 = (sum1 >> 16) + (sum1 & 0xffff); + sumshort = ~(u_short)sum1; + *(sp) = htons(sumshort); +} + +/* + * How the NAT is organised and works. + * + * Inside (interface y) NAT Outside (interface x) + * -------------------- -+- ------------------------------------- + * Packet going | out, processsed by ip_natout() for x + * ------------> | ------------> + * src=10.1.1.1 | src=192.1.1.1 + * | + * | in, processed by ip_natin() for x + * <------------ | <------------ + * dst=10.1.1.1 | dst=192.1.1.1 + * -------------------- -+- ------------------------------------- + * ip_natout() - changes ip_src and if required, sport + * - creates a new mapping, if required. + * ip_natin() - changes ip_dst and if required, dport + * + * In the NAT table, internal source is recorded as "in" and externally + * seen as "out". + */ + +/* + * Handle ioctls which manipulate the NAT. + */ +int nat_ioctl(data, cmd, mode) +#if defined(__NetBSD__) || defined(__OpenBSD__) || (__FreeBSD_version >= 300003) +u_long cmd; +#else +int cmd; +#endif +caddr_t data; +int mode; +{ + register ipnat_t *nat, *nt, *n = NULL, **np = NULL; + int error = 0, ret, arg; + ipnat_t natd; + u_32_t i, j; + +#if (BSD >= 199306) && defined(_KERNEL) + if ((securelevel >= 2) && (mode & FWRITE)) + return EPERM; +#endif + + nat = NULL; /* XXX gcc -Wuninitialized */ + KMALLOC(nt, ipnat_t *); + if ((cmd == SIOCADNAT) || (cmd == SIOCRMNAT)) + error = IRCOPYPTR(data, (char *)&natd, sizeof(natd)); + else if (cmd == SIOCIPFFL) { /* SIOCFLNAT & SIOCCNATL */ + error = IRCOPY(data, (char *)&arg, sizeof(arg)); + if (error) + error = EFAULT; + } + + if (error) + goto done; + + /* + * For add/delete, look to see if the NAT entry is already present + */ + WRITE_ENTER(&ipf_nat); + if ((cmd == SIOCADNAT) || (cmd == SIOCRMNAT)) { + nat = &natd; + nat->in_flags &= IPN_USERFLAGS; + if ((nat->in_redir & NAT_MAPBLK) == 0) { + if ((nat->in_flags & IPN_SPLIT) == 0) + nat->in_inip &= nat->in_inmsk; + if ((nat->in_flags & IPN_IPRANGE) == 0) + nat->in_outip &= nat->in_outmsk; + } + for (np = &nat_list; (n = *np); np = &n->in_next) + if (!bcmp((char *)&nat->in_flags, (char *)&n->in_flags, + IPN_CMPSIZ)) + break; + } + + switch (cmd) + { +#ifdef IPFILTER_LOG + case SIOCIPFFB : + { + int tmp; + + if (!(mode & FWRITE)) + error = EPERM; + else { + tmp = ipflog_clear(IPL_LOGNAT); + IWCOPY((char *)&tmp, (char *)data, sizeof(tmp)); + } + break; + } +#endif + case SIOCADNAT : + if (!(mode & FWRITE)) { + error = EPERM; + break; + } + if (n) { + error = EEXIST; + break; + } + if (nt == NULL) { + error = ENOMEM; + break; + } + n = nt; + nt = NULL; + bcopy((char *)nat, (char *)n, sizeof(*n)); + n->in_ifp = (void *)GETUNIT(n->in_ifname, 4); + if (!n->in_ifp) + n->in_ifp = (void *)-1; + if (n->in_plabel[0] != '\0') { + n->in_apr = appr_match(n->in_p, n->in_plabel); + if (!n->in_apr) { + error = ENOENT; + break; + } + } + n->in_next = NULL; + *np = n; + + if (n->in_redir & NAT_REDIRECT) { + n->in_flags &= ~IPN_NOTDST; + nat_addrdr(n); + } + if (n->in_redir & (NAT_MAP|NAT_MAPBLK)) { + n->in_flags &= ~IPN_NOTSRC; + nat_addnat(n); + } + + n->in_use = 0; + if (n->in_redir & NAT_MAPBLK) + n->in_space = USABLE_PORTS * ~ntohl(n->in_outmsk); + else if (n->in_flags & IPN_AUTOPORTMAP) + n->in_space = USABLE_PORTS * ~ntohl(n->in_inmsk); + else if (n->in_flags & IPN_IPRANGE) + n->in_space = ntohl(n->in_outmsk) - ntohl(n->in_outip); + else if (n->in_flags & IPN_SPLIT) + n->in_space = 2; + else + n->in_space = ~ntohl(n->in_outmsk); + /* + * Calculate the number of valid IP addresses in the output + * mapping range. In all cases, the range is inclusive of + * the start and ending IP addresses. + * If to a CIDR address, lose 2: broadcast + network address + * (so subtract 1) + * If to a range, add one. + * If to a single IP address, set to 1. + */ + if (n->in_space) { + if ((n->in_flags & IPN_IPRANGE) != 0) + n->in_space += 1; + else + n->in_space -= 1; + } else + n->in_space = 1; + if ((n->in_outmsk != 0xffffffff) && (n->in_outmsk != 0) && + ((n->in_flags & (IPN_IPRANGE|IPN_SPLIT)) == 0)) + n->in_nip = ntohl(n->in_outip) + 1; + else if ((n->in_flags & IPN_SPLIT) && + (n->in_redir & NAT_REDIRECT)) + n->in_nip = ntohl(n->in_inip); + else + n->in_nip = ntohl(n->in_outip); + if (n->in_redir & NAT_MAP) { + n->in_pnext = ntohs(n->in_pmin); + /* + * Multiply by the number of ports made available. + */ + if (ntohs(n->in_pmax) >= ntohs(n->in_pmin)) { + n->in_space *= (ntohs(n->in_pmax) - + ntohs(n->in_pmin) + 1); + /* + * Because two different sources can map to + * different destinations but use the same + * local IP#/port #. + * If the result is smaller than in_space, then + * we may have wrapped around 32bits. + */ + i = n->in_inmsk; + if ((i != 0) && (i != 0xffffffff)) { + j = n->in_space * (~ntohl(i) + 1); + if (j >= n->in_space) + n->in_space = j; + else + n->in_space = 0xffffffff; + } + } + /* + * If no protocol is specified, multiple by 256. + */ + if ((n->in_flags & IPN_TCPUDP) == 0) { + j = n->in_space * 256; + if (j >= n->in_space) + n->in_space = j; + else + n->in_space = 0xffffffff; + } + } + /* Otherwise, these fields are preset */ + n = NULL; + nat_stats.ns_rules++; + break; + case SIOCRMNAT : + if (!(mode & FWRITE)) { + error = EPERM; + n = NULL; + break; + } + if (!n) { + error = ESRCH; + break; + } + if (n->in_redir & NAT_REDIRECT) + nat_delrdr(n); + if (n->in_redir & (NAT_MAPBLK|NAT_MAP)) + nat_delnat(n); + if (nat_list == NULL) { + nat_masks = 0; + rdr_masks = 0; + } + *np = n->in_next; + if (!n->in_use) { + if (n->in_apr) + appr_free(n->in_apr); + KFREE(n); + nat_stats.ns_rules--; + } else { + n->in_flags |= IPN_DELETE; + n->in_next = NULL; + } + n = NULL; + break; + case SIOCGNATS : + MUTEX_DOWNGRADE(&ipf_nat); + nat_stats.ns_table[0] = nat_table[0]; + nat_stats.ns_table[1] = nat_table[1]; + nat_stats.ns_list = nat_list; + nat_stats.ns_nattab_sz = ipf_nattable_sz; + nat_stats.ns_rultab_sz = ipf_natrules_sz; + nat_stats.ns_rdrtab_sz = ipf_rdrrules_sz; + nat_stats.ns_instances = nat_instances; + nat_stats.ns_apslist = ap_sess_list; + error = IWCOPYPTR((char *)&nat_stats, (char *)data, + sizeof(nat_stats)); + break; + case SIOCGNATL : + { + natlookup_t nl; + + MUTEX_DOWNGRADE(&ipf_nat); + error = IRCOPYPTR((char *)data, (char *)&nl, sizeof(nl)); + if (error) + break; + + if (nat_lookupredir(&nl)) { + error = IWCOPYPTR((char *)&nl, (char *)data, + sizeof(nl)); + } else + error = ESRCH; + break; + } + case SIOCIPFFL : /* old SIOCFLNAT & SIOCCNATL */ + if (!(mode & FWRITE)) { + error = EPERM; + break; + } + error = 0; + if (arg == 0) + ret = nat_flushtable(); + else if (arg == 1) + ret = nat_clearlist(); + else + error = EINVAL; + MUTEX_DOWNGRADE(&ipf_nat); + if (!error) { + error = IWCOPY((caddr_t)&ret, data, sizeof(ret)); + if (error) + error = EFAULT; + } + break; + case SIOCSTLCK : + error = IRCOPY(data, (caddr_t)&arg, sizeof(arg)); + if (!error) { + error = IWCOPY((caddr_t)&fr_nat_lock, data, + sizeof(fr_nat_lock)); + if (!error) + fr_nat_lock = arg; + } else + error = EFAULT; + break; + case SIOCSTPUT : + if (fr_nat_lock) + error = fr_natputent(data); + else + error = EACCES; + break; + case SIOCSTGSZ : + if (fr_nat_lock) + error = fr_natgetsz(data); + else + error = EACCES; + break; + case SIOCSTGET : + if (fr_nat_lock) + error = fr_natgetent(data); + else + error = EACCES; + break; + case FIONREAD : +#ifdef IPFILTER_LOG + arg = (int)iplused[IPL_LOGNAT]; + MUTEX_DOWNGRADE(&ipf_nat); + error = IWCOPY((caddr_t)&arg, (caddr_t)data, sizeof(arg)); + if (error) + error = EFAULT; +#endif + break; + default : + error = EINVAL; + break; + } + RWLOCK_EXIT(&ipf_nat); /* READ/WRITE */ +done: + if (nt) + KFREE(nt); + return error; +} + + +static int fr_natgetsz(data) +caddr_t data; +{ + ap_session_t *aps; + nat_t *nat, *n; + int error = 0; + natget_t ng; + + error = IRCOPY(data, (caddr_t)&ng, sizeof(ng)); + if (error) + return EFAULT; + + nat = ng.ng_ptr; + if (!nat) { + nat = nat_instances; + ng.ng_sz = 0; + if (nat == NULL) { + error = IWCOPY((caddr_t)&ng, data, sizeof(ng)); + if (error) + error = EFAULT; + return error; + } + } else { + /* + * Make sure the pointer we're copying from exists in the + * current list of entries. Security precaution to prevent + * copying of random kernel data. + */ + for (n = nat_instances; n; n = n->nat_next) + if (n == nat) + break; + if (!n) + return ESRCH; + } + + ng.ng_sz = sizeof(nat_save_t); + aps = nat->nat_aps; + if ((aps != NULL) && (aps->aps_data != 0)) { + ng.ng_sz += sizeof(ap_session_t); + ng.ng_sz += aps->aps_psiz; + } + + error = IWCOPY((caddr_t)&ng, data, sizeof(ng)); + if (error) + error = EFAULT; + return error; +} + + +static int fr_natgetent(data) +caddr_t data; +{ + nat_save_t ipn, *ipnp, *ipnn = NULL; + register nat_t *n, *nat; + ap_session_t *aps; + int error; + + error = IRCOPY(data, (caddr_t)&ipnp, sizeof(ipnp)); + if (error) + return EFAULT; + error = IRCOPY((caddr_t)ipnp, (caddr_t)&ipn, sizeof(ipn)); + if (error) + return EFAULT; + + nat = ipn.ipn_next; + if (!nat) { + nat = nat_instances; + if (nat == NULL) { + if (nat_instances == NULL) + return ENOENT; + return 0; + } + } else { + /* + * Make sure the pointer we're copying from exists in the + * current list of entries. Security precaution to prevent + * copying of random kernel data. + */ + for (n = nat_instances; n; n = n->nat_next) + if (n == nat) + break; + if (!n) + return ESRCH; + } + + ipn.ipn_next = nat->nat_next; + ipn.ipn_dsize = 0; + bcopy((char *)nat, (char *)&ipn.ipn_nat, sizeof(ipn.ipn_nat)); + ipn.ipn_nat.nat_data = NULL; + + if (nat->nat_ptr) { + bcopy((char *)nat->nat_ptr, (char *)&ipn.ipn_ipnat, + sizeof(ipn.ipn_ipnat)); + } + + if (nat->nat_fr) + bcopy((char *)nat->nat_fr, (char *)&ipn.ipn_rule, + sizeof(ipn.ipn_rule)); + + if ((aps = nat->nat_aps)) { + ipn.ipn_dsize = sizeof(*aps); + if (aps->aps_data) + ipn.ipn_dsize += aps->aps_psiz; + KMALLOCS(ipnn, nat_save_t *, sizeof(*ipnn) + ipn.ipn_dsize); + if (ipnn == NULL) + return ENOMEM; + bcopy((char *)&ipn, (char *)ipnn, sizeof(ipn)); + + bcopy((char *)aps, ipnn->ipn_data, sizeof(*aps)); + if (aps->aps_data) { + bcopy(aps->aps_data, ipnn->ipn_data + sizeof(*aps), + aps->aps_psiz); + ipnn->ipn_dsize += aps->aps_psiz; + } + error = IWCOPY((caddr_t)ipnn, ipnp, + sizeof(ipn) + ipn.ipn_dsize); + if (error) + error = EFAULT; + KFREES(ipnn, sizeof(*ipnn) + ipn.ipn_dsize); + } else { + error = IWCOPY((caddr_t)&ipn, ipnp, sizeof(ipn)); + if (error) + error = EFAULT; + } + return error; +} + + +static int fr_natputent(data) +caddr_t data; +{ + nat_save_t ipn, *ipnp, *ipnn = NULL; + register nat_t *n, *nat; + ap_session_t *aps; + frentry_t *fr; + ipnat_t *in; + + int error; + + error = IRCOPY(data, (caddr_t)&ipnp, sizeof(ipnp)); + if (error) + return EFAULT; + error = IRCOPY((caddr_t)ipnp, (caddr_t)&ipn, sizeof(ipn)); + if (error) + return EFAULT; + nat = NULL; + if (ipn.ipn_dsize) { + KMALLOCS(ipnn, nat_save_t *, sizeof(ipn) + ipn.ipn_dsize); + if (ipnn == NULL) + return ENOMEM; + bcopy((char *)&ipn, (char *)ipnn, sizeof(ipn)); + error = IRCOPY((caddr_t)ipnp, (caddr_t)ipn.ipn_data, + ipn.ipn_dsize); + if (error) { + error = EFAULT; + goto junkput; + } + } else + ipnn = NULL; + + KMALLOC(nat, nat_t *); + if (nat == NULL) { + error = EFAULT; + goto junkput; + } + + bcopy((char *)&ipn.ipn_nat, (char *)nat, sizeof(*nat)); + /* + * Initialize all these so that nat_delete() doesn't cause a crash. + */ + nat->nat_phnext[0] = NULL; + nat->nat_phnext[1] = NULL; + fr = nat->nat_fr; + nat->nat_fr = NULL; + aps = nat->nat_aps; + nat->nat_aps = NULL; + in = nat->nat_ptr; + nat->nat_ptr = NULL; + nat->nat_data = NULL; + + /* + * Restore the rule associated with this nat session + */ + if (in) { + KMALLOC(in, ipnat_t *); + if (in == NULL) { + error = ENOMEM; + goto junkput; + } + nat->nat_ptr = in; + bcopy((char *)&ipn.ipn_ipnat, (char *)in, sizeof(*in)); + in->in_use = 1; + in->in_flags |= IPN_DELETE; + in->in_next = NULL; + in->in_rnext = NULL; + in->in_prnext = NULL; + in->in_mnext = NULL; + in->in_pmnext = NULL; + in->in_ifp = GETUNIT(in->in_ifname, 4); + if (in->in_plabel[0] != '\0') { + in->in_apr = appr_match(in->in_p, in->in_plabel); + } + } + + /* + * Restore ap_session_t structure. Include the private data allocated + * if it was there. + */ + if (aps) { + KMALLOC(aps, ap_session_t *); + if (aps == NULL) { + error = ENOMEM; + goto junkput; + } + nat->nat_aps = aps; + aps->aps_next = ap_sess_list; + ap_sess_list = aps; + bcopy(ipnn->ipn_data, (char *)aps, sizeof(*aps)); + if (in) + aps->aps_apr = in->in_apr; + if (aps->aps_psiz) { + KMALLOCS(aps->aps_data, void *, aps->aps_psiz); + if (aps->aps_data == NULL) { + error = ENOMEM; + goto junkput; + } + bcopy(ipnn->ipn_data + sizeof(*aps), aps->aps_data, + aps->aps_psiz); + } else { + aps->aps_psiz = 0; + aps->aps_data = NULL; + } + } + + /* + * If there was a filtering rule associated with this entry then + * build up a new one. + */ + if (fr != NULL) { + if (nat->nat_flags & FI_NEWFR) { + KMALLOC(fr, frentry_t *); + nat->nat_fr = fr; + if (fr == NULL) { + error = ENOMEM; + goto junkput; + } + bcopy((char *)&ipn.ipn_fr, (char *)fr, sizeof(*fr)); + ipn.ipn_nat.nat_fr = fr; + error = IWCOPY((caddr_t)&ipn, ipnp, sizeof(ipn)); + if (error) { + error = EFAULT; + goto junkput; + } + } else { + for (n = nat_instances; n; n = n->nat_next) + if (n->nat_fr == fr) + break; + if (!n) { + error = ESRCH; + goto junkput; + } + } + } + + if (ipnn) + KFREES(ipnn, sizeof(ipn) + ipn.ipn_dsize); + nat_insert(nat); + return 0; +junkput: + if (ipnn) + KFREES(ipnn, sizeof(ipn) + ipn.ipn_dsize); + if (nat) + nat_delete(nat); + return error; +} + + +/* + * Delete a nat entry from the various lists and table. + */ +static void nat_delete(natd) +struct nat *natd; +{ + struct ipnat *ipn; + + if (natd->nat_flags & FI_WILDP) + nat_stats.ns_wilds--; + if (natd->nat_hnext[0]) + natd->nat_hnext[0]->nat_phnext[0] = natd->nat_phnext[0]; + *natd->nat_phnext[0] = natd->nat_hnext[0]; + if (natd->nat_hnext[1]) + natd->nat_hnext[1]->nat_phnext[1] = natd->nat_phnext[1]; + *natd->nat_phnext[1] = natd->nat_hnext[1]; + + if (natd->nat_fr != NULL) { + ATOMIC_DEC32(natd->nat_fr->fr_ref); + } + + if (natd->nat_hm != NULL) + nat_hostmapdel(natd->nat_hm); + + /* + * If there is an active reference from the nat entry to its parent + * rule, decrement the rule's reference count and free it too if no + * longer being used. + */ + ipn = natd->nat_ptr; + if (ipn != NULL) { + ipn->in_space++; + ipn->in_use--; + if (!ipn->in_use && (ipn->in_flags & IPN_DELETE)) { + if (ipn->in_apr) + appr_free(ipn->in_apr); + KFREE(ipn); + nat_stats.ns_rules--; + } + } + + MUTEX_DESTROY(&natd->nat_lock); + /* + * If there's a fragment table entry too for this nat entry, then + * dereference that as well. + */ + ipfr_forget((void *)natd); + aps_free(natd->nat_aps); + nat_stats.ns_inuse--; + KFREE(natd); +} + + +/* + * nat_flushtable - clear the NAT table of all mapping entries. + */ +static int nat_flushtable() +{ + register nat_t *nat, **natp; + register int j = 0; + + /* + * ALL NAT mappings deleted, so lets just make the deletions + * quicker. + */ + if (nat_table[0] != NULL) + bzero((char *)nat_table[0], + sizeof(nat_table[0]) * ipf_nattable_sz); + if (nat_table[1] != NULL) + bzero((char *)nat_table[1], + sizeof(nat_table[1]) * ipf_nattable_sz); + + for (natp = &nat_instances; (nat = *natp); ) { + *natp = nat->nat_next; +#ifdef IPFILTER_LOG + nat_log(nat, NL_FLUSH); +#endif + nat_delete(nat); + j++; + } + nat_stats.ns_inuse = 0; + return j; +} + + +/* + * nat_clearlist - delete all rules in the active NAT mapping list. + */ +static int nat_clearlist() +{ + register ipnat_t *n, **np = &nat_list; + int i = 0; + + if (nat_rules != NULL) + bzero((char *)nat_rules, sizeof(*nat_rules) * ipf_natrules_sz); + if (rdr_rules != NULL) + bzero((char *)rdr_rules, sizeof(*rdr_rules) * ipf_rdrrules_sz); + + while ((n = *np)) { + *np = n->in_next; + if (!n->in_use) { + if (n->in_apr) + appr_free(n->in_apr); + KFREE(n); + nat_stats.ns_rules--; + } else { + n->in_flags |= IPN_DELETE; + n->in_next = NULL; + } + i++; + } + nat_masks = 0; + rdr_masks = 0; + return i; +} + + +/* + * Create a new NAT table entry. + * NOTE: assumes write lock on ipf_nat has been obtained already. + */ +nat_t *nat_new(np, ip, fin, flags, direction) +ipnat_t *np; +ip_t *ip; +fr_info_t *fin; +u_int flags; +int direction; +{ + register u_32_t sum1, sum2, sumd, l; + u_short port = 0, sport = 0, dport = 0, nport = 0; + struct in_addr in, inb; + tcphdr_t *tcp = NULL; + hostmap_t *hm = NULL; + nat_t *nat, *natl; + u_short nflags; +#if SOLARIS && defined(_KERNEL) && (SOLARIS2 >= 6) + qif_t *qf = fin->fin_qif; +#endif + + nflags = flags & np->in_flags; + if (flags & IPN_TCPUDP) { + tcp = (tcphdr_t *)fin->fin_dp; + sport = tcp->th_sport; + dport = tcp->th_dport; + } + + /* Give me a new nat */ + KMALLOC(nat, nat_t *); + if (nat == NULL) { + nat_stats.ns_memfail++; + return NULL; + } + + bzero((char *)nat, sizeof(*nat)); + nat->nat_flags = flags; + if (flags & FI_WILDP) + nat_stats.ns_wilds++; + /* + * Search the current table for a match. + */ + if (direction == NAT_OUTBOUND) { + /* + * Values at which the search for a free resouce starts. + */ + u_32_t st_ip; + u_short st_port; + + /* + * If it's an outbound packet which doesn't match any existing + * record, then create a new port + */ + l = 0; + st_ip = np->in_nip; + st_port = np->in_pnext; + + do { + port = 0; + in.s_addr = htonl(np->in_nip); + if (l == 0) { + /* + * Check to see if there is an existing NAT + * setup for this IP address pair. + */ + hm = nat_hostmap(np, ip->ip_src, in); + if (hm != NULL) + in.s_addr = hm->hm_mapip.s_addr; + } else if ((l == 1) && (hm != NULL)) { + nat_hostmapdel(hm); + hm = NULL; + } + in.s_addr = ntohl(in.s_addr); + + nat->nat_hm = hm; + + if ((np->in_outmsk == 0xffffffff) && + (np->in_pnext == 0)) { + if (l > 0) + goto badnat; + } + + if (np->in_redir & NAT_MAPBLK) { + if ((l >= np->in_ppip) || ((l > 0) && + !(flags & IPN_TCPUDP))) + goto badnat; + /* + * map-block - Calculate destination address. + */ + in.s_addr = ntohl(ip->ip_src.s_addr); + in.s_addr &= ntohl(~np->in_inmsk); + inb.s_addr = in.s_addr; + in.s_addr /= np->in_ippip; + in.s_addr &= ntohl(~np->in_outmsk); + in.s_addr += ntohl(np->in_outip); + /* + * Calculate destination port. + */ + if ((flags & IPN_TCPUDP) && + (np->in_ppip != 0)) { + port = ntohs(sport) + l; + port %= np->in_ppip; + port += np->in_ppip * + (inb.s_addr % np->in_ippip); + port += MAPBLK_MINPORT; + port = htons(port); + } + } else if (!np->in_outip && + (np->in_outmsk == 0xffffffff)) { + /* + * 0/32 - use the interface's IP address. + */ + if ((l > 0) || + fr_ifpaddr(4, fin->fin_ifp, &in) == -1) + goto badnat; + in.s_addr = ntohl(in.s_addr); + } else if (!np->in_outip && !np->in_outmsk) { + /* + * 0/0 - use the original source address/port. + */ + if (l > 0) + goto badnat; + in.s_addr = ntohl(ip->ip_src.s_addr); + } else if ((np->in_outmsk != 0xffffffff) && + (np->in_pnext == 0) && + ((l > 0) || (hm == NULL))) + np->in_nip++; + natl = NULL; + + if ((nflags & IPN_TCPUDP) && + ((np->in_redir & NAT_MAPBLK) == 0) && + (np->in_flags & IPN_AUTOPORTMAP)) { + if ((l > 0) && (l % np->in_ppip == 0)) { + if (l > np->in_space) { + goto badnat; + } else if ((l > np->in_ppip) && + np->in_outmsk != 0xffffffff) + np->in_nip++; + } + if (np->in_ppip != 0) { + port = ntohs(sport); + port += (l % np->in_ppip); + port %= np->in_ppip; + port += np->in_ppip * + (ntohl(ip->ip_src.s_addr) % + np->in_ippip); + port += MAPBLK_MINPORT; + port = htons(port); + } + } else if (((np->in_redir & NAT_MAPBLK) == 0) && + (nflags & IPN_TCPUDP) && + (np->in_pnext != 0)) { + port = htons(np->in_pnext++); + if (np->in_pnext > ntohs(np->in_pmax)) { + np->in_pnext = ntohs(np->in_pmin); + if (np->in_outmsk != 0xffffffff) + np->in_nip++; + } + } + + if (np->in_flags & IPN_IPRANGE) { + if (np->in_nip > ntohl(np->in_outmsk)) + np->in_nip = ntohl(np->in_outip); + } else { + if ((np->in_outmsk != 0xffffffff) && + ((np->in_nip + 1) & ntohl(np->in_outmsk)) > + ntohl(np->in_outip)) + np->in_nip = ntohl(np->in_outip) + 1; + } + + if (!port && (flags & IPN_TCPUDP)) + port = sport; + + /* + * Here we do a lookup of the connection as seen from + * the outside. If an IP# pair already exists, try + * again. So if you have A->B becomes C->B, you can + * also have D->E become C->E but not D->B causing + * another C->B. Also take protocol and ports into + * account when determining whether a pre-existing + * NAT setup will cause an external conflict where + * this is appropriate. + */ + inb.s_addr = htonl(in.s_addr); + natl = nat_inlookup(fin->fin_ifp, flags & ~FI_WILDP, + (u_int)ip->ip_p, ip->ip_dst, inb, + (port << 16) | dport, 1); + + /* + * Has the search wrapped around and come back to the + * start ? + */ + if ((natl != NULL) && + (np->in_pnext != 0) && (st_port == np->in_pnext) && + (np->in_nip != 0) && (st_ip == np->in_nip)) + goto badnat; + l++; + } while (natl != NULL); + + if (np->in_space > 0) + np->in_space--; + + /* Setup the NAT table */ + nat->nat_inip = ip->ip_src; + nat->nat_outip.s_addr = htonl(in.s_addr); + nat->nat_oip = ip->ip_dst; + if (nat->nat_hm == NULL) + nat->nat_hm = nat_hostmap(np, ip->ip_src, + nat->nat_outip); + + sum1 = LONG_SUM(ntohl(ip->ip_src.s_addr)) + ntohs(sport); + sum2 = LONG_SUM(in.s_addr) + ntohs(port); + + if (flags & IPN_TCPUDP) { + nat->nat_inport = sport; + nat->nat_outport = port; /* sport */ + nat->nat_oport = dport; + } + } else { + /* + * Otherwise, it's an inbound packet. Most likely, we don't + * want to rewrite source ports and source addresses. Instead, + * we want to rewrite to a fixed internal address and fixed + * internal port. + */ + if (np->in_flags & IPN_SPLIT) { + in.s_addr = np->in_nip; + if (np->in_inip == htonl(in.s_addr)) + np->in_nip = ntohl(np->in_inmsk); + else { + np->in_nip = ntohl(np->in_inip); + if (np->in_flags & IPN_ROUNDR) { + nat_delrdr(np); + nat_addrdr(np); + } + } + } else { + in.s_addr = ntohl(np->in_inip); + if (np->in_flags & IPN_ROUNDR) { + nat_delrdr(np); + nat_addrdr(np); + } + } + if (!np->in_pnext) + nport = dport; + else { + /* + * Whilst not optimized for the case where + * pmin == pmax, the gain is not significant. + */ + nport = ntohs(dport) - ntohs(np->in_pmin) + + ntohs(np->in_pnext); + nport = htons(nport); + } + + /* + * When the redirect-to address is set to 0.0.0.0, just + * assume a blank `forwarding' of the packet. We don't + * setup any translation for this either. + */ + if (in.s_addr == 0) { + if (nport == dport) + goto badnat; + in.s_addr = ntohl(ip->ip_dst.s_addr); + } + + nat->nat_inip.s_addr = htonl(in.s_addr); + nat->nat_outip = ip->ip_dst; + nat->nat_oip = ip->ip_src; + + sum1 = LONG_SUM(ntohl(ip->ip_dst.s_addr)) + ntohs(dport); + sum2 = LONG_SUM(in.s_addr) + ntohs(nport); + + if (flags & IPN_TCPUDP) { + nat->nat_inport = nport; + nat->nat_outport = dport; + nat->nat_oport = sport; + } + } + + CALC_SUMD(sum1, sum2, sumd); + nat->nat_sumd[0] = (sumd & 0xffff) + (sumd >> 16); +#if SOLARIS && defined(_KERNEL) && (SOLARIS2 >= 6) + if ((flags == IPN_TCP) && dohwcksum && + (qf->qf_ill->ill_ick.ick_magic == ICK_M_CTL_MAGIC)) { + if (direction == NAT_OUTBOUND) + sum1 = LONG_SUM(ntohl(in.s_addr)); + else + sum1 = LONG_SUM(ntohl(ip->ip_src.s_addr)); + sum1 += LONG_SUM(ntohl(ip->ip_dst.s_addr)); + sum1 += 30; + sum1 = (sum1 & 0xffff) + (sum1 >> 16); + nat->nat_sumd[1] = NAT_HW_CKSUM|(sum1 & 0xffff); + } else +#endif + nat->nat_sumd[1] = nat->nat_sumd[0]; + + if ((flags & IPN_TCPUDP) && ((sport != port) || (dport != nport))) { + if (direction == NAT_OUTBOUND) + sum1 = LONG_SUM(ntohl(ip->ip_src.s_addr)); + else + sum1 = LONG_SUM(ntohl(ip->ip_dst.s_addr)); + + sum2 = LONG_SUM(in.s_addr); + + CALC_SUMD(sum1, sum2, sumd); + nat->nat_ipsumd = (sumd & 0xffff) + (sumd >> 16); + } else + nat->nat_ipsumd = nat->nat_sumd[0]; + + in.s_addr = htonl(in.s_addr); + +#ifdef _KERNEL + strncpy(nat->nat_ifname, IFNAME(fin->fin_ifp), IFNAMSIZ); +#endif + nat_insert(nat); + + nat->nat_dir = direction; + nat->nat_ifp = fin->fin_ifp; + nat->nat_ptr = np; + nat->nat_p = ip->ip_p; + nat->nat_bytes = 0; + nat->nat_pkts = 0; + nat->nat_fr = fin->fin_fr; + if (nat->nat_fr != NULL) { + ATOMIC_INC32(nat->nat_fr->fr_ref); + } + if (direction == NAT_OUTBOUND) { + if (flags & IPN_TCPUDP) + tcp->th_sport = port; + } else { + if (flags & IPN_TCPUDP) + tcp->th_dport = nport; + } + np->in_use++; +#ifdef IPFILTER_LOG + nat_log(nat, (u_int)np->in_redir); +#endif + return nat; +badnat: + nat_stats.ns_badnat++; + if ((hm = nat->nat_hm) != NULL) + nat_hostmapdel(hm); + KFREE(nat); + return NULL; +} + + +void nat_insert(nat) +nat_t *nat; +{ + nat_t **natp; + u_int hv; + + MUTEX_INIT(&nat->nat_lock, "nat entry lock", NULL); + + nat->nat_age = fr_defnatage; + nat->nat_ifname[sizeof(nat->nat_ifname) - 1] = '\0'; + if (nat->nat_ifname[0] !='\0') { + nat->nat_ifp = GETUNIT(nat->nat_ifname, 4); + } + + nat->nat_next = nat_instances; + nat_instances = nat; + + hv = NAT_HASH_FN(nat->nat_inip.s_addr, nat->nat_inport, + ipf_nattable_sz); + natp = &nat_table[0][hv]; + if (*natp) + (*natp)->nat_phnext[0] = &nat->nat_hnext[0]; + nat->nat_phnext[0] = natp; + nat->nat_hnext[0] = *natp; + *natp = nat; + + hv = NAT_HASH_FN(nat->nat_outip.s_addr, nat->nat_outport, + ipf_nattable_sz); + natp = &nat_table[1][hv]; + if (*natp) + (*natp)->nat_phnext[1] = &nat->nat_hnext[1]; + nat->nat_phnext[1] = natp; + nat->nat_hnext[1] = *natp; + *natp = nat; + + nat_stats.ns_added++; + nat_stats.ns_inuse++; +} + + +nat_t *nat_icmplookup(ip, fin, dir) +ip_t *ip; +fr_info_t *fin; +int dir; +{ + icmphdr_t *icmp; + tcphdr_t *tcp = NULL; + ip_t *oip; + int flags = 0, type, minlen; + + icmp = (icmphdr_t *)fin->fin_dp; + /* + * Does it at least have the return (basic) IP header ? + * Only a basic IP header (no options) should be with an ICMP error + * header. + */ + if ((ip->ip_hl != 5) || (ip->ip_len < ICMPERR_MINPKTLEN)) + return NULL; + type = icmp->icmp_type; + /* + * If it's not an error type, then return. + */ + if ((type != ICMP_UNREACH) && (type != ICMP_SOURCEQUENCH) && + (type != ICMP_REDIRECT) && (type != ICMP_TIMXCEED) && + (type != ICMP_PARAMPROB)) + return NULL; + + oip = (ip_t *)((char *)fin->fin_dp + 8); + minlen = (oip->ip_hl << 2); + if (minlen < sizeof(ip_t)) + return NULL; + if (ip->ip_len < ICMPERR_IPICMPHLEN + minlen) + return NULL; + /* + * Is the buffer big enough for all of it ? It's the size of the IP + * header claimed in the encapsulated part which is of concern. It + * may be too big to be in this buffer but not so big that it's + * outside the ICMP packet, leading to TCP deref's causing problems. + * This is possible because we don't know how big oip_hl is when we + * do the pullup early in fr_check() and thus can't gaurantee it is + * all here now. + */ +#ifdef _KERNEL + { + mb_t *m; + +# if SOLARIS + m = fin->fin_qfm; + if ((char *)oip + fin->fin_dlen - ICMPERR_ICMPHLEN > (char *)m->b_wptr) + return NULL; +# else + m = *(mb_t **)fin->fin_mp; + if ((char *)oip + fin->fin_dlen - ICMPERR_ICMPHLEN > + (char *)ip + m->m_len) + return NULL; +# endif + } +#endif + + if (oip->ip_p == IPPROTO_TCP) + flags = IPN_TCP; + else if (oip->ip_p == IPPROTO_UDP) + flags = IPN_UDP; + if (flags & IPN_TCPUDP) { + minlen += 8; /* + 64bits of data to get ports */ + if (ip->ip_len < ICMPERR_IPICMPHLEN + minlen) + return NULL; + tcp = (tcphdr_t *)((char *)oip + (oip->ip_hl << 2)); + if (dir == NAT_INBOUND) + return nat_inlookup(fin->fin_ifp, flags, + (u_int)oip->ip_p, oip->ip_dst, oip->ip_src, + (tcp->th_sport << 16) | tcp->th_dport, 0); + else + return nat_outlookup(fin->fin_ifp, flags, + (u_int)oip->ip_p, oip->ip_dst, oip->ip_src, + (tcp->th_sport << 16) | tcp->th_dport, 0); + } + if (dir == NAT_INBOUND) + return nat_inlookup(fin->fin_ifp, 0, (u_int)oip->ip_p, + oip->ip_dst, oip->ip_src, 0, 0); + else + return nat_outlookup(fin->fin_ifp, 0, (u_int)oip->ip_p, + oip->ip_dst, oip->ip_src, 0, 0); +} + + +/* + * This should *ONLY* be used for incoming packets to make sure a NAT'd ICMP + * packet gets correctly recognised. + */ +nat_t *nat_icmp(ip, fin, nflags, dir) +ip_t *ip; +fr_info_t *fin; +u_int *nflags; +int dir; +{ + u_32_t sum1, sum2, sumd, sumd2 = 0; + struct in_addr in; + icmphdr_t *icmp; + udphdr_t *udp; + nat_t *nat; + ip_t *oip; + int flags = 0; + + if ((fin->fin_fi.fi_fl & FI_SHORT) || (ip->ip_off & IP_OFFMASK)) + return NULL; + /* + * nat_icmplookup() will return NULL for `defective' packets. + */ + if ((ip->ip_v != 4) || !(nat = nat_icmplookup(ip, fin, dir))) + return NULL; + *nflags = IPN_ICMPERR; + icmp = (icmphdr_t *)fin->fin_dp; + oip = (ip_t *)&icmp->icmp_ip; + if (oip->ip_p == IPPROTO_TCP) + flags = IPN_TCP; + else if (oip->ip_p == IPPROTO_UDP) + flags = IPN_UDP; + udp = (udphdr_t *)((((char *)oip) + (oip->ip_hl << 2))); + /* + * Need to adjust ICMP header to include the real IP#'s and + * port #'s. Only apply a checksum change relative to the + * IP address change as it will be modified again in ip_natout + * for both address and port. Two checksum changes are + * necessary for the two header address changes. Be careful + * to only modify the checksum once for the port # and twice + * for the IP#. + */ + + /* + * Step 1 + * Fix the IP addresses in the offending IP packet. You also need + * to adjust the IP header checksum of that offending IP packet + * and the ICMP checksum of the ICMP error message itself. + * + * Unfortunately, for UDP and TCP, the IP addresses are also contained + * in the pseudo header that is used to compute the UDP resp. TCP + * checksum. So, we must compensate that as well. Even worse, the + * change in the UDP and TCP checksums require yet another + * adjustment of the ICMP checksum of the ICMP error message. + * + * For the moment we forget about TCP, because that checksum is not + * in the first 8 bytes, so it will not be available in most cases. + */ + + if (oip->ip_dst.s_addr == nat->nat_oip.s_addr) { + sum1 = LONG_SUM(ntohl(oip->ip_src.s_addr)); + in = nat->nat_inip; + oip->ip_src = in; + } else { + sum1 = LONG_SUM(ntohl(oip->ip_dst.s_addr)); + in = nat->nat_outip; + oip->ip_dst = in; + } + + sum2 = LONG_SUM(ntohl(in.s_addr)); + + CALC_SUMD(sum1, sum2, sumd); + + if (nat->nat_dir == NAT_OUTBOUND) { + /* + * Fix IP checksum of the offending IP packet to adjust for + * the change in the IP address. + * + * Normally, you would expect that the ICMP checksum of the + * ICMP error message needs to be adjusted as well for the + * IP address change in oip. + * However, this is a NOP, because the ICMP checksum is + * calculated over the complete ICMP packet, which includes the + * changed oip IP addresses and oip->ip_sum. However, these + * two changes cancel each other out (if the delta for + * the IP address is x, then the delta for ip_sum is minus x), + * so no change in the icmp_cksum is necessary. + * + * Be careful that nat_dir refers to the direction of the + * offending IP packet (oip), not to its ICMP response (icmp) + */ + fix_datacksum(&oip->ip_sum, sumd); + + /* + * Fix UDP pseudo header checksum to compensate for the + * IP address change. + */ + if (oip->ip_p == IPPROTO_UDP && udp->uh_sum) { + /* + * The UDP checksum is optional, only adjust it + * if it has been set. + */ + sum1 = ntohs(udp->uh_sum); + fix_datacksum(&udp->uh_sum, sumd); + sum2 = ntohs(udp->uh_sum); + + /* + * Fix ICMP checksum to compensate the UDP + * checksum adjustment. + */ + CALC_SUMD(sum1, sum2, sumd); + sumd2 = sumd; + } + +#if 0 + /* + * Fix TCP pseudo header checksum to compensate for the + * IP address change. Before we can do the change, we + * must make sure that oip is sufficient large to hold + * the TCP checksum (normally it does not!). + */ + if (oip->ip_p == IPPROTO_TCP) { + + } +#endif + } else { + + /* + * Fix IP checksum of the offending IP packet to adjust for + * the change in the IP address. + * + * Normally, you would expect that the ICMP checksum of the + * ICMP error message needs to be adjusted as well for the + * IP address change in oip. + * However, this is a NOP, because the ICMP checksum is + * calculated over the complete ICMP packet, which includes the + * changed oip IP addresses and oip->ip_sum. However, these + * two changes cancel each other out (if the delta for + * the IP address is x, then the delta for ip_sum is minus x), + * so no change in the icmp_cksum is necessary. + * + * Be careful that nat_dir refers to the direction of the + * offending IP packet (oip), not to its ICMP response (icmp) + */ + fix_datacksum(&oip->ip_sum, sumd); + +/* XXX FV : without having looked at Solaris source code, it seems unlikely + * that SOLARIS would compensate this in the kernel (a body of an IP packet + * in the data section of an ICMP packet). I have the feeling that this should + * be unconditional, but I'm not in a position to check. + */ +#if !SOLARIS && !defined(__sgi) + /* + * Fix UDP pseudo header checksum to compensate for the + * IP address change. + */ + if (oip->ip_p == IPPROTO_UDP && udp->uh_sum) { + /* + * The UDP checksum is optional, only adjust it + * if it has been set + */ + sum1 = ntohs(udp->uh_sum); + fix_datacksum(&udp->uh_sum, sumd); + sum2 = ntohs(udp->uh_sum); + + /* + * Fix ICMP checksum to compensate the UDP + * checksum adjustment. + */ + CALC_SUMD(sum1, sum2, sumd); + sumd2 = sumd; + } + +#if 0 + /* + * Fix TCP pseudo header checksum to compensate for the + * IP address change. Before we can do the change, we + * must make sure that oip is sufficient large to hold + * the TCP checksum (normally it does not!). + */ + if (oip->ip_p == IPPROTO_TCP) { + + }; +#endif + +#endif + } + + if ((flags & IPN_TCPUDP) != 0) { + tcphdr_t *tcp; + + /* + * XXX - what if this is bogus hl and we go off the end ? + * In this case, nat_icmpinlookup() will have returned NULL. + */ + tcp = (tcphdr_t *)udp; + + /* + * Step 2 : + * For offending TCP/UDP IP packets, translate the ports as + * well, based on the NAT specification. Of course such + * a change must be reflected in the ICMP checksum as well. + * + * Advance notice : Now it becomes complicated :-) + * + * Since the port fields are part of the TCP/UDP checksum + * of the offending IP packet, you need to adjust that checksum + * as well... but, if you change, you must change the icmp + * checksum *again*, to reflect that change. + * + * To further complicate: the TCP checksum is not in the first + * 8 bytes of the offending ip packet, so it most likely is not + * available (we might have to fix that if the encounter a + * device that returns more than 8 data bytes on icmp error) + */ + + if (nat->nat_oport == tcp->th_dport) { + if (tcp->th_sport != nat->nat_inport) { + /* + * Fix ICMP checksum to compensate port + * adjustment. + */ + sum1 = ntohs(tcp->th_sport); + sum2 = ntohs(nat->nat_inport); + CALC_SUMD(sum1, sum2, sumd); + sumd2 += sumd; + tcp->th_sport = nat->nat_inport; + + /* + * Fix udp checksum to compensate port + * adjustment. NOTE : the offending IP packet + * flows the other direction compared to the + * ICMP message. + * + * The UDP checksum is optional, only adjust + * it if it has been set. + */ + if (oip->ip_p == IPPROTO_UDP && udp->uh_sum) { + + sum1 = ntohs(udp->uh_sum); + fix_datacksum(&udp->uh_sum, sumd); + sum2 = ntohs(udp->uh_sum); + + /* + * Fix ICMP checksum to + * compensate UDP checksum + * adjustment. + */ + CALC_SUMD(sum1, sum2, sumd); + sumd2 += sumd; + } + } + } else { + if (tcp->th_dport != nat->nat_outport) { + /* + * Fix ICMP checksum to compensate port + * adjustment. + */ + sum1 = ntohs(tcp->th_dport); + sum2 = ntohs(nat->nat_outport); + CALC_SUMD(sum1, sum2, sumd); + sumd2 += sumd; + tcp->th_dport = nat->nat_outport; + + /* + * Fix udp checksum to compensate port + * adjustment. NOTE : the offending IP + * packet flows the other direction compared + * to the ICMP message. + * + * The UDP checksum is optional, only adjust + * it if it has been set. + */ + if (oip->ip_p == IPPROTO_UDP && udp->uh_sum) { + + sum1 = ntohs(udp->uh_sum); + fix_datacksum(&udp->uh_sum, sumd); + sum2 = ntohs(udp->uh_sum); + + /* + * Fix ICMP checksum to compensate + * UDP checksum adjustment. + */ + CALC_SUMD(sum1, sum2, sumd); + sumd2 += sumd; + } + } + } + if (sumd2) { + sumd2 = (sumd2 & 0xffff) + (sumd2 >> 16); + sumd2 = (sumd2 & 0xffff) + (sumd2 >> 16); + if (nat->nat_dir == NAT_OUTBOUND) { + fix_outcksum(&icmp->icmp_cksum, sumd2); + } else { + fix_incksum(&icmp->icmp_cksum, sumd2); + } + } + } + nat->nat_age = fr_defnaticmpage; + return nat; +} + + +/* + * NB: these lookups don't lock access to the list, it assume it has already + * been done! + */ +/* + * Lookup a nat entry based on the mapped destination ip address/port and + * real source address/port. We use this lookup when receiving a packet, + * we're looking for a table entry, based on the destination address. + * NOTE: THE PACKET BEING CHECKED (IF FOUND) HAS A MAPPING ALREADY. + */ +nat_t *nat_inlookup(ifp, flags, p, src, mapdst, ports, rw) +void *ifp; +register u_int flags, p; +struct in_addr src , mapdst; +u_32_t ports; +int rw; +{ + register u_short sport, dport; + register nat_t *nat; + register int nflags; + register u_32_t dst; + u_int hv; + + dst = mapdst.s_addr; + dport = ports >> 16; + sport = ports & 0xffff; + flags &= IPN_TCPUDP; + + hv = NAT_HASH_FN(dst, dport, ipf_nattable_sz); + nat = nat_table[1][hv]; + for (; nat; nat = nat->nat_hnext[1]) { + nflags = nat->nat_flags; + if ((!ifp || ifp == nat->nat_ifp) && + nat->nat_oip.s_addr == src.s_addr && + nat->nat_outip.s_addr == dst && + (((p == 0) && (flags == (nat->nat_flags & IPN_TCPUDP))) + || (p == nat->nat_p)) && (!flags || + (((nat->nat_oport == sport) || (nflags & FI_W_DPORT)) && + ((nat->nat_outport == dport) || (nflags & FI_W_SPORT))))) + return nat; + } + if (!nat_stats.ns_wilds || !(flags & IPN_TCPUDP)) + return NULL; + if (!rw) { + RWLOCK_EXIT(&ipf_nat); + } + hv = NAT_HASH_FN(dst, 0, ipf_nattable_sz); + if (!rw) { + WRITE_ENTER(&ipf_nat); + } + nat = nat_table[1][hv]; + for (; nat; nat = nat->nat_hnext[1]) { + nflags = nat->nat_flags; + if (ifp && ifp != nat->nat_ifp) + continue; + if (!(nflags & IPN_TCPUDP)) + continue; + if (!(nflags & FI_WILDP)) + continue; + if (nat->nat_oip.s_addr != src.s_addr || + nat->nat_outip.s_addr != dst) + continue; + if (((nat->nat_oport == sport) || (nflags & FI_W_DPORT)) && + ((nat->nat_outport == dport) || (nflags & FI_W_SPORT))) { + nat_tabmove(nat, ports); + break; + } + } + if (!rw) { + MUTEX_DOWNGRADE(&ipf_nat); + } + return nat; +} + + +/* + * This function is only called for TCP/UDP NAT table entries where the + * original was placed in the table without hashing on the ports and we now + * want to include hashing on port numbers. + */ +static void nat_tabmove(nat, ports) +nat_t *nat; +u_32_t ports; +{ + register u_short sport, dport; + nat_t **natp; + u_int hv; + + dport = ports >> 16; + sport = ports & 0xffff; + + if (nat->nat_oport == dport) { + nat->nat_inport = sport; + nat->nat_outport = sport; + } + + /* + * Remove the NAT entry from the old location + */ + if (nat->nat_hnext[0]) + nat->nat_hnext[0]->nat_phnext[0] = nat->nat_phnext[0]; + *nat->nat_phnext[0] = nat->nat_hnext[0]; + + if (nat->nat_hnext[1]) + nat->nat_hnext[1]->nat_phnext[1] = nat->nat_phnext[1]; + *nat->nat_phnext[1] = nat->nat_hnext[1]; + + /* + * Add into the NAT table in the new position + */ + hv = NAT_HASH_FN(nat->nat_inip.s_addr, sport, ipf_nattable_sz); + natp = &nat_table[0][hv]; + if (*natp) + (*natp)->nat_phnext[0] = &nat->nat_hnext[0]; + nat->nat_phnext[0] = natp; + nat->nat_hnext[0] = *natp; + *natp = nat; + + hv = NAT_HASH_FN(nat->nat_outip.s_addr, sport, ipf_nattable_sz); + natp = &nat_table[1][hv]; + if (*natp) + (*natp)->nat_phnext[1] = &nat->nat_hnext[1]; + nat->nat_phnext[1] = natp; + nat->nat_hnext[1] = *natp; + *natp = nat; +} + + +/* + * Lookup a nat entry based on the source 'real' ip address/port and + * destination address/port. We use this lookup when sending a packet out, + * we're looking for a table entry, based on the source address. + * NOTE: THE PACKET BEING CHECKED (IF FOUND) HAS A MAPPING ALREADY. + */ +nat_t *nat_outlookup(ifp, flags, p, src, dst, ports, rw) +void *ifp; +register u_int flags, p; +struct in_addr src , dst; +u_32_t ports; +int rw; +{ + register u_short sport, dport; + register nat_t *nat; + register int nflags; + u_32_t srcip; + u_int hv; + + sport = ports & 0xffff; + dport = ports >> 16; + flags &= IPN_TCPUDP; + srcip = src.s_addr; + + hv = NAT_HASH_FN(srcip, sport, ipf_nattable_sz); + nat = nat_table[0][hv]; + for (; nat; nat = nat->nat_hnext[0]) { + nflags = nat->nat_flags; + + if ((!ifp || ifp == nat->nat_ifp) && + nat->nat_inip.s_addr == srcip && + nat->nat_oip.s_addr == dst.s_addr && + (((p == 0) && (flags == (nflags & IPN_TCPUDP))) + || (p == nat->nat_p)) && (!flags || + ((nat->nat_inport == sport || nflags & FI_W_SPORT) && + (nat->nat_oport == dport || nflags & FI_W_DPORT)))) + return nat; + } + if (!nat_stats.ns_wilds || !(flags & IPN_TCPUDP)) + return NULL; + if (!rw) { + RWLOCK_EXIT(&ipf_nat); + } + hv = NAT_HASH_FN(srcip, 0, ipf_nattable_sz); + if (!rw) { + WRITE_ENTER(&ipf_nat); + } + nat = nat_table[0][hv]; + for (; nat; nat = nat->nat_hnext[0]) { + nflags = nat->nat_flags; + if (ifp && ifp != nat->nat_ifp) + continue; + if (!(nflags & IPN_TCPUDP)) + continue; + if (!(nflags & FI_WILDP)) + continue; + if ((nat->nat_inip.s_addr != srcip) || + (nat->nat_oip.s_addr != dst.s_addr)) + continue; + if (((nat->nat_inport == sport) || (nflags & FI_W_SPORT)) && + ((nat->nat_oport == dport) || (nflags & FI_W_DPORT))) { + nat_tabmove(nat, ports); + break; + } + } + if (!rw) { + MUTEX_DOWNGRADE(&ipf_nat); + } + return nat; +} + + +/* + * Lookup the NAT tables to search for a matching redirect + */ +nat_t *nat_lookupredir(np) +register natlookup_t *np; +{ + u_32_t ports; + nat_t *nat; + + ports = (np->nl_outport << 16) | np->nl_inport; + /* + * If nl_inip is non null, this is a lookup based on the real + * ip address. Else, we use the fake. + */ + if ((nat = nat_outlookup(NULL, np->nl_flags, 0, np->nl_inip, + np->nl_outip, ports, 0))) { + np->nl_realip = nat->nat_outip; + np->nl_realport = nat->nat_outport; + } + return nat; +} + + +static int nat_match(fin, np, ip) +fr_info_t *fin; +ipnat_t *np; +ip_t *ip; +{ + frtuc_t *ft; + + if (ip->ip_v != 4) + return 0; + + if (np->in_p && ip->ip_p != np->in_p) + return 0; + if (fin->fin_out) { + if (!(np->in_redir & (NAT_MAP|NAT_MAPBLK))) + return 0; + if (((fin->fin_fi.fi_saddr & np->in_inmsk) != np->in_inip) + ^ ((np->in_flags & IPN_NOTSRC) != 0)) + return 0; + if (((fin->fin_fi.fi_daddr & np->in_srcmsk) != np->in_srcip) + ^ ((np->in_flags & IPN_NOTDST) != 0)) + return 0; + } else { + if (!(np->in_redir & NAT_REDIRECT)) + return 0; + if (((fin->fin_fi.fi_saddr & np->in_srcmsk) != np->in_srcip) + ^ ((np->in_flags & IPN_NOTSRC) != 0)) + return 0; + if (((fin->fin_fi.fi_daddr & np->in_outmsk) != np->in_outip) + ^ ((np->in_flags & IPN_NOTDST) != 0)) + return 0; + } + + ft = &np->in_tuc; + if (!(fin->fin_fi.fi_fl & FI_TCPUDP) || + (fin->fin_fi.fi_fl & FI_SHORT) || (ip->ip_off & IP_OFFMASK)) { + if (ft->ftu_scmp || ft->ftu_dcmp) + return 0; + return 1; + } + + return fr_tcpudpchk(ft, fin); +} + + +/* + * Packets going out on the external interface go through this. + * Here, the source address requires alteration, if anything. + */ +int ip_natout(ip, fin) +ip_t *ip; +fr_info_t *fin; +{ + register ipnat_t *np = NULL; + register u_32_t ipa; + tcphdr_t *tcp = NULL; + u_short sport = 0, dport = 0, *csump = NULL; + struct ifnet *ifp; + int natadd = 1; + frentry_t *fr; + u_int nflags = 0, hv, msk; + u_32_t iph; + nat_t *nat; + int i; + + if (nat_list == NULL || (fr_nat_lock)) + return 0; + + if ((fr = fin->fin_fr) && !(fr->fr_flags & FR_DUP) && + fr->fr_tif.fd_ifp && fr->fr_tif.fd_ifp != (void *)-1) + ifp = fr->fr_tif.fd_ifp; + else + ifp = fin->fin_ifp; + + if (!(ip->ip_off & IP_OFFMASK) && !(fin->fin_fi.fi_fl & FI_SHORT)) { + if (ip->ip_p == IPPROTO_TCP) + nflags = IPN_TCP; + else if (ip->ip_p == IPPROTO_UDP) + nflags = IPN_UDP; + if ((nflags & IPN_TCPUDP)) { + tcp = (tcphdr_t *)fin->fin_dp; + sport = tcp->th_sport; + dport = tcp->th_dport; + } + } + + ipa = ip->ip_src.s_addr; + + READ_ENTER(&ipf_nat); + + if ((ip->ip_p == IPPROTO_ICMP) && + (nat = nat_icmp(ip, fin, &nflags, NAT_OUTBOUND))) + ; + else if ((ip->ip_off & (IP_OFFMASK|IP_MF)) && + (nat = ipfr_nat_knownfrag(ip, fin))) + natadd = 0; + else if ((nat = nat_outlookup(ifp, nflags, (u_int)ip->ip_p, + ip->ip_src, ip->ip_dst, + (dport << 16) | sport, 0))) { + nflags = nat->nat_flags; + if ((nflags & (FI_W_SPORT|FI_W_DPORT)) != 0) { + if ((nflags & FI_W_SPORT) && + (nat->nat_inport != sport)) + nat->nat_inport = sport; + else if ((nflags & FI_W_DPORT) && + (nat->nat_oport != dport)) + nat->nat_oport = dport; + if (nat->nat_outport == 0) + nat->nat_outport = sport; + nat->nat_flags &= ~(FI_W_DPORT|FI_W_SPORT); + nflags = nat->nat_flags; + nat_stats.ns_wilds--; + } + } else { + RWLOCK_EXIT(&ipf_nat); + WRITE_ENTER(&ipf_nat); + /* + * If there is no current entry in the nat table for this IP#, + * create one for it (if there is a matching rule). + */ + msk = 0xffffffff; + i = 32; +maskloop: + iph = ipa & htonl(msk); + hv = NAT_HASH_FN(iph, 0, ipf_natrules_sz); + for (np = nat_rules[hv]; np; np = np->in_mnext) + { + if ((np->in_ifp && (np->in_ifp != ifp)) || + !np->in_space) + continue; + if ((np->in_flags & IPN_RF) && + !(np->in_flags & nflags)) + continue; + if (np->in_flags & IPN_FILTER) { + if (!nat_match(fin, np, ip)) + continue; + } else if ((ipa & np->in_inmsk) != np->in_inip) + continue; + if (np->in_redir & (NAT_MAP|NAT_MAPBLK)) { + if (*np->in_plabel && !appr_ok(ip, tcp, np)) + continue; + /* + * If it's a redirection, then we don't want to + * create new outgoing port stuff. + * Redirections are only for incoming + * connections. + */ + if (!(np->in_redir & (NAT_MAP|NAT_MAPBLK))) + continue; + if ((nat = nat_new(np, ip, fin, (u_int)nflags, + NAT_OUTBOUND))) { + np->in_hits++; + break; + } + } + } + if ((np == NULL) && (i > 0)) { + do { + i--; + msk <<= 1; + } while ((i >= 0) && ((nat_masks & (1 << i)) == 0)); + if (i >= 0) + goto maskloop; + } + MUTEX_DOWNGRADE(&ipf_nat); + } + + /* + * NOTE: ipf_nat must now only be held as a read lock + */ + if (nat) { + np = nat->nat_ptr; + if (natadd && (fin->fin_fi.fi_fl & FI_FRAG) && + np && (np->in_flags & IPN_FRAG)) + ipfr_nat_newfrag(ip, fin, 0, nat); + MUTEX_ENTER(&nat->nat_lock); + nat->nat_age = fr_defnatage; + nat->nat_bytes += ip->ip_len; + nat->nat_pkts++; + MUTEX_EXIT(&nat->nat_lock); + + /* + * Fix up checksums, not by recalculating them, but + * simply computing adjustments. + */ + if (nflags == IPN_ICMPERR) { + u_32_t s1, s2, sumd; + + s1 = LONG_SUM(ntohl(ip->ip_src.s_addr)); + s2 = LONG_SUM(ntohl(nat->nat_outip.s_addr)); + CALC_SUMD(s1, s2, sumd); + + if (nat->nat_dir == NAT_OUTBOUND) + fix_incksum(&ip->ip_sum, sumd); + else + fix_outcksum(&ip->ip_sum, sumd); + } +#if SOLARIS || defined(__sgi) + else { + if (nat->nat_dir == NAT_OUTBOUND) + fix_outcksum(&ip->ip_sum, nat->nat_ipsumd); + else + fix_incksum(&ip->ip_sum, nat->nat_ipsumd); + } +#endif + ip->ip_src = nat->nat_outip; + + if (!(ip->ip_off & IP_OFFMASK) && + !(fin->fin_fi.fi_fl & FI_SHORT)) { + + if ((nat->nat_outport != 0) && (nflags & IPN_TCPUDP)) { + tcp->th_sport = nat->nat_outport; + fin->fin_data[0] = ntohs(tcp->th_sport); + } + + if (ip->ip_p == IPPROTO_TCP) { + csump = &tcp->th_sum; + MUTEX_ENTER(&nat->nat_lock); + fr_tcp_age(&nat->nat_age, + nat->nat_tcpstate, fin, 1); + if (nat->nat_age < fr_defnaticmpage) + nat->nat_age = fr_defnaticmpage; +#ifdef LARGE_NAT + else if (nat->nat_age > fr_defnatage) + nat->nat_age = fr_defnatage; +#endif + /* + * Increase this because we may have + * "keep state" following this too and + * packet storms can occur if this is + * removed too quickly. + */ + if (nat->nat_age == fr_tcpclosed) + nat->nat_age = fr_tcplastack; + MUTEX_EXIT(&nat->nat_lock); + } else if (ip->ip_p == IPPROTO_UDP) { + udphdr_t *udp = (udphdr_t *)tcp; + + if (udp->uh_sum) + csump = &udp->uh_sum; + } else if (ip->ip_p == IPPROTO_ICMP) { + nat->nat_age = fr_defnaticmpage; + } + + if (csump) { + if (nat->nat_dir == NAT_OUTBOUND) + fix_outcksum(csump, nat->nat_sumd[1]); + else + fix_incksum(csump, nat->nat_sumd[1]); + } + } + + if ((np->in_apr != NULL) && (np->in_dport == 0 || + (tcp != NULL && dport == np->in_dport))) { + i = appr_check(ip, fin, nat); + if (i == 0) + i = 1; + } else + i = 1; + ATOMIC_INCL(nat_stats.ns_mapped[1]); + RWLOCK_EXIT(&ipf_nat); /* READ */ + return i; + } + RWLOCK_EXIT(&ipf_nat); /* READ/WRITE */ + return 0; +} + + +/* + * Packets coming in from the external interface go through this. + * Here, the destination address requires alteration, if anything. + */ +int ip_natin(ip, fin) +ip_t *ip; +fr_info_t *fin; +{ + register struct in_addr src; + register struct in_addr in; + register ipnat_t *np; + u_int nflags = 0, natadd = 1, hv, msk; + struct ifnet *ifp = fin->fin_ifp; + tcphdr_t *tcp = NULL; + u_short sport = 0, dport = 0, *csump = NULL; + nat_t *nat; + u_32_t iph; + int i; + + if ((nat_list == NULL) || (ip->ip_v != 4) || (fr_nat_lock)) + return 0; + + if (!(ip->ip_off & IP_OFFMASK) && !(fin->fin_fi.fi_fl & FI_SHORT)) { + if (ip->ip_p == IPPROTO_TCP) + nflags = IPN_TCP; + else if (ip->ip_p == IPPROTO_UDP) + nflags = IPN_UDP; + if ((nflags & IPN_TCPUDP)) { + tcp = (tcphdr_t *)fin->fin_dp; + dport = tcp->th_dport; + sport = tcp->th_sport; + } + } + + in = ip->ip_dst; + /* make sure the source address is to be redirected */ + src = ip->ip_src; + + READ_ENTER(&ipf_nat); + + if ((ip->ip_p == IPPROTO_ICMP) && + (nat = nat_icmp(ip, fin, &nflags, NAT_INBOUND))) + ; + else if ((ip->ip_off & (IP_OFFMASK|IP_MF)) && + (nat = ipfr_nat_knownfrag(ip, fin))) + natadd = 0; + else if ((nat = nat_inlookup(fin->fin_ifp, nflags, (u_int)ip->ip_p, + ip->ip_src, in, (dport << 16) | sport, + 0))) { + nflags = nat->nat_flags; + if ((nflags & (FI_W_SPORT|FI_W_DPORT)) != 0) { + if ((nat->nat_oport != sport) && (nflags & FI_W_DPORT)) + nat->nat_oport = sport; + else if ((nat->nat_outport != dport) && + (nflags & FI_W_SPORT)) + nat->nat_outport = dport; + nat->nat_flags &= ~(FI_W_SPORT|FI_W_DPORT); + nflags = nat->nat_flags; + nat_stats.ns_wilds--; + } + } else { + RWLOCK_EXIT(&ipf_nat); + WRITE_ENTER(&ipf_nat); + /* + * If there is no current entry in the nat table for this IP#, + * create one for it (if there is a matching rule). + */ + msk = 0xffffffff; + i = 32; +maskloop: + iph = in.s_addr & htonl(msk); + hv = NAT_HASH_FN(iph, 0, ipf_rdrrules_sz); + for (np = rdr_rules[hv]; np; np = np->in_rnext) { + if ((np->in_ifp && (np->in_ifp != ifp)) || + (np->in_p && (np->in_p != ip->ip_p)) || + (np->in_flags && !(nflags & np->in_flags))) + continue; + if (np->in_flags & IPN_FILTER) { + if (!nat_match(fin, np, ip)) + continue; + } else if ((in.s_addr & np->in_outmsk) != np->in_outip) + continue; + if ((np->in_redir & NAT_REDIRECT) && + (!np->in_pmin || (np->in_flags & IPN_FILTER) || + ((ntohs(np->in_pmax) >= ntohs(dport)) && + (ntohs(dport) >= ntohs(np->in_pmin))))) + if ((nat = nat_new(np, ip, fin, nflags, + NAT_INBOUND))) { + np->in_hits++; + break; + } + } + + if ((np == NULL) && (i > 0)) { + do { + i--; + msk <<= 1; + } while ((i >= 0) && ((rdr_masks & (1 << i)) == 0)); + if (i >= 0) + goto maskloop; + } + MUTEX_DOWNGRADE(&ipf_nat); + } + + /* + * NOTE: ipf_nat must now only be held as a read lock + */ + if (nat) { + np = nat->nat_ptr; + fin->fin_fr = nat->nat_fr; + if (natadd && (fin->fin_fi.fi_fl & FI_FRAG) && + np && (np->in_flags & IPN_FRAG)) + ipfr_nat_newfrag(ip, fin, 0, nat); + if ((np->in_apr != NULL) && (np->in_dport == 0 || + (tcp != NULL && sport == np->in_dport))) { + i = appr_check(ip, fin, nat); + if (i == -1) { + RWLOCK_EXIT(&ipf_nat); + return i; + } + } + + MUTEX_ENTER(&nat->nat_lock); + if (nflags != IPN_ICMPERR) + nat->nat_age = fr_defnatage; + + nat->nat_bytes += ip->ip_len; + nat->nat_pkts++; + MUTEX_EXIT(&nat->nat_lock); + ip->ip_dst = nat->nat_inip; + fin->fin_fi.fi_daddr = nat->nat_inip.s_addr; + + /* + * Fix up checksums, not by recalculating them, but + * simply computing adjustments. + */ +#if SOLARIS || defined(__sgi) + if (nat->nat_dir == NAT_OUTBOUND) + fix_incksum(&ip->ip_sum, nat->nat_ipsumd); + else + fix_outcksum(&ip->ip_sum, nat->nat_ipsumd); +#endif + if (!(ip->ip_off & IP_OFFMASK) && + !(fin->fin_fi.fi_fl & FI_SHORT)) { + + if ((nat->nat_inport != 0) && (nflags & IPN_TCPUDP)) { + tcp->th_dport = nat->nat_inport; + fin->fin_data[1] = ntohs(tcp->th_dport); + } + + if (ip->ip_p == IPPROTO_TCP) { + csump = &tcp->th_sum; + MUTEX_ENTER(&nat->nat_lock); + fr_tcp_age(&nat->nat_age, + nat->nat_tcpstate, fin, 0); + if (nat->nat_age < fr_defnaticmpage) + nat->nat_age = fr_defnaticmpage; +#ifdef LARGE_NAT + else if (nat->nat_age > fr_defnatage) + nat->nat_age = fr_defnatage; +#endif + /* + * Increase this because we may have + * "keep state" following this too and + * packet storms can occur if this is + * removed too quickly. + */ + if (nat->nat_age == fr_tcpclosed) + nat->nat_age = fr_tcplastack; + MUTEX_EXIT(&nat->nat_lock); + } else if (ip->ip_p == IPPROTO_UDP) { + udphdr_t *udp = (udphdr_t *)tcp; + + if (udp->uh_sum) + csump = &udp->uh_sum; + } else if (ip->ip_p == IPPROTO_ICMP) { + nat->nat_age = fr_defnaticmpage; + } + + if (csump) { + if (nat->nat_dir == NAT_OUTBOUND) + fix_incksum(csump, nat->nat_sumd[0]); + else + fix_outcksum(csump, nat->nat_sumd[0]); + } + } + ATOMIC_INCL(nat_stats.ns_mapped[0]); + RWLOCK_EXIT(&ipf_nat); /* READ */ + return 1; + } + RWLOCK_EXIT(&ipf_nat); /* READ/WRITE */ + return 0; +} + + +/* + * Free all memory used by NAT structures allocated at runtime. + */ +void ip_natunload() +{ + WRITE_ENTER(&ipf_nat); + (void) nat_clearlist(); + (void) nat_flushtable(); + RWLOCK_EXIT(&ipf_nat); + + if (nat_table[0] != NULL) { + KFREES(nat_table[0], sizeof(nat_t *) * ipf_nattable_sz); + nat_table[0] = NULL; + } + if (nat_table[1] != NULL) { + KFREES(nat_table[1], sizeof(nat_t *) * ipf_nattable_sz); + nat_table[1] = NULL; + } + if (nat_rules != NULL) { + KFREES(nat_rules, sizeof(ipnat_t *) * ipf_natrules_sz); + nat_rules = NULL; + } + if (rdr_rules != NULL) { + KFREES(rdr_rules, sizeof(ipnat_t *) * ipf_rdrrules_sz); + rdr_rules = NULL; + } + if (maptable != NULL) { + KFREES(maptable, sizeof(hostmap_t *) * ipf_hostmap_sz); + maptable = NULL; + } +} + + +/* + * Slowly expire held state for NAT entries. Timeouts are set in + * expectation of this being called twice per second. + */ +void ip_natexpire() +{ + register struct nat *nat, **natp; +#if defined(_KERNEL) && !SOLARIS + int s; +#endif + + SPL_NET(s); + WRITE_ENTER(&ipf_nat); + for (natp = &nat_instances; (nat = *natp); ) { + nat->nat_age--; + if (nat->nat_age) { + natp = &nat->nat_next; + continue; + } + *natp = nat->nat_next; +#ifdef IPFILTER_LOG + nat_log(nat, NL_EXPIRE); +#endif + nat_delete(nat); + nat_stats.ns_expire++; + } + RWLOCK_EXIT(&ipf_nat); + SPL_X(s); +} + + +/* + */ +void ip_natsync(ifp) +void *ifp; +{ + register ipnat_t *n; + register nat_t *nat; + register u_32_t sum1, sum2, sumd; + struct in_addr in; + ipnat_t *np; + void *ifp2; +#if defined(_KERNEL) && !SOLARIS + int s; +#endif + + /* + * Change IP addresses for NAT sessions for any protocol except TCP + * since it will break the TCP connection anyway. + */ + SPL_NET(s); + WRITE_ENTER(&ipf_nat); + for (nat = nat_instances; nat; nat = nat->nat_next) + if (((ifp == NULL) || (ifp == nat->nat_ifp)) && + !(nat->nat_flags & IPN_TCP) && (np = nat->nat_ptr) && + (np->in_outmsk == 0xffffffff) && !np->in_nip) { + ifp2 = nat->nat_ifp; + /* + * Change the map-to address to be the same as the + * new one. + */ + sum1 = nat->nat_outip.s_addr; + if (fr_ifpaddr(4, ifp2, &in) != -1) + nat->nat_outip = in; + sum2 = nat->nat_outip.s_addr; + + if (sum1 == sum2) + continue; + /* + * Readjust the checksum adjustment to take into + * account the new IP#. + */ + CALC_SUMD(sum1, sum2, sumd); + /* XXX - dont change for TCP when solaris does + * hardware checksumming. + */ + sumd += nat->nat_sumd[0]; + nat->nat_sumd[0] = (sumd & 0xffff) + (sumd >> 16); + nat->nat_sumd[1] = nat->nat_sumd[0]; + } + + for (n = nat_list; (n != NULL); n = n->in_next) + if (n->in_ifp == ifp) { + n->in_ifp = (void *)GETUNIT(n->in_ifname, 4); + if (!n->in_ifp) + n->in_ifp = (void *)-1; + } + RWLOCK_EXIT(&ipf_nat); + SPL_X(s); +} + + +#ifdef IPFILTER_LOG +void nat_log(nat, type) +struct nat *nat; +u_int type; +{ + struct ipnat *np; + struct natlog natl; + void *items[1]; + size_t sizes[1]; + int rulen, types[1]; + + natl.nl_inip = nat->nat_inip; + natl.nl_outip = nat->nat_outip; + natl.nl_origip = nat->nat_oip; + natl.nl_bytes = nat->nat_bytes; + natl.nl_pkts = nat->nat_pkts; + natl.nl_origport = nat->nat_oport; + natl.nl_inport = nat->nat_inport; + natl.nl_outport = nat->nat_outport; + natl.nl_p = nat->nat_p; + natl.nl_type = type; + natl.nl_rule = -1; +#ifndef LARGE_NAT + if (nat->nat_ptr != NULL) { + for (rulen = 0, np = nat_list; np; np = np->in_next, rulen++) + if (np == nat->nat_ptr) { + natl.nl_rule = rulen; + break; + } + } +#endif + items[0] = &natl; + sizes[0] = sizeof(natl); + types[0] = 0; + + (void) ipllog(IPL_LOGNAT, NULL, items, sizes, types, 1); +} +#endif diff --git a/sys/netinet/ip_nat.h b/sys/netinet/ip_nat.h new file mode 100644 index 0000000..afa7e14 --- /dev/null +++ b/sys/netinet/ip_nat.h @@ -0,0 +1,309 @@ +/* + * Copyright (C) 1995-2000 by Darren Reed. + * + * Redistribution and use in source and binary forms are permitted + * provided that this notice is preserved and due credit is given + * to the original author and the contributors. + * + * @(#)ip_nat.h 1.5 2/4/96 + * $Id: ip_nat.h,v 2.17.2.14 2000/11/18 03:58:04 darrenr Exp $ + * $FreeBSD$ + */ + +#ifndef __IP_NAT_H__ +#define __IP_NAT_H__ + +#ifndef SOLARIS +#define SOLARIS (defined(sun) && (defined(__svr4__) || defined(__SVR4))) +#endif + +#if defined(__STDC__) || defined(__GNUC__) +#define SIOCADNAT _IOW('r', 60, struct ipnat *) +#define SIOCRMNAT _IOW('r', 61, struct ipnat *) +#define SIOCGNATS _IOWR('r', 62, struct natstat *) +#define SIOCGNATL _IOWR('r', 63, struct natlookup *) +#else +#define SIOCADNAT _IOW(r, 60, struct ipnat *) +#define SIOCRMNAT _IOW(r, 61, struct ipnat *) +#define SIOCGNATS _IOWR(r, 62, struct natstat *) +#define SIOCGNATL _IOWR(r, 63, struct natlookup *) +#endif + +#undef LARGE_NAT /* define this if you're setting up a system to NAT + * LARGE numbers of networks/hosts - i.e. in the + * hundreds or thousands. In such a case, you should + * also change the RDR_SIZE and NAT_SIZE below to more + * appropriate sizes. The figures below were used for + * a setup with 1000-2000 networks to NAT. + */ +#define NAT_SIZE 127 +#define RDR_SIZE 127 +#define HOSTMAP_SIZE 127 +#define NAT_TABLE_SZ 127 +#ifdef LARGE_NAT +#undef NAT_SIZE +#undef RDR_SIZE +#undef NAT_TABLE_SZ +#undef HOSTMAP_SIZE 127 +#define NAT_SIZE 2047 +#define RDR_SIZE 2047 +#define NAT_TABLE_SZ 16383 +#define HOSTMAP_SIZE 8191 +#endif +#ifndef APR_LABELLEN +#define APR_LABELLEN 16 +#endif +#define NAT_HW_CKSUM 0x80000000 + +#define DEF_NAT_AGE 1200 /* 10 minutes (600 seconds) */ + +struct ap_session; + +typedef struct nat { + u_long nat_age; + int nat_flags; + u_32_t nat_sumd[2]; + u_32_t nat_ipsumd; + void *nat_data; + struct ap_session *nat_aps; /* proxy session */ + struct frentry *nat_fr; /* filter rule ptr if appropriate */ + struct in_addr nat_inip; + struct in_addr nat_outip; + struct in_addr nat_oip; /* other ip */ + U_QUAD_T nat_pkts; + U_QUAD_T nat_bytes; + u_short nat_oport; /* other port */ + u_short nat_inport; + u_short nat_outport; + u_short nat_use; + u_char nat_tcpstate[2]; + u_char nat_p; /* protocol for NAT */ + struct ipnat *nat_ptr; /* pointer back to the rule */ + struct hostmap *nat_hm; + struct nat *nat_next; + struct nat *nat_hnext[2]; + struct nat **nat_phnext[2]; + void *nat_ifp; + int nat_dir; + char nat_ifname[IFNAMSIZ]; +#if SOLARIS || defined(__sgi) + kmutex_t nat_lock; +#endif +} nat_t; + +typedef struct ipnat { + struct ipnat *in_next; + struct ipnat *in_rnext; + struct ipnat **in_prnext; + struct ipnat *in_mnext; + struct ipnat **in_pmnext; + void *in_ifp; + void *in_apr; + u_long in_space; + u_int in_use; + u_int in_hits; + struct in_addr in_nextip; + u_short in_pnext; + u_short in_ippip; /* IP #'s per IP# */ + u_32_t in_flags; /* From here to in_dport must be reflected */ + u_short in_spare; + u_short in_ppip; /* ports per IP */ + u_short in_port[2]; /* correctly in IPN_CMPSIZ */ + struct in_addr in_in[2]; + struct in_addr in_out[2]; + struct in_addr in_src[2]; + struct frtuc in_tuc; + int in_redir; /* 0 if it's a mapping, 1 if it's a hard redir */ + char in_ifname[IFNAMSIZ]; + char in_plabel[APR_LABELLEN]; /* proxy label */ + char in_p; /* protocol */ +} ipnat_t; + +#define in_pmin in_port[0] /* Also holds static redir port */ +#define in_pmax in_port[1] +#define in_nip in_nextip.s_addr +#define in_inip in_in[0].s_addr +#define in_inmsk in_in[1].s_addr +#define in_outip in_out[0].s_addr +#define in_outmsk in_out[1].s_addr +#define in_srcip in_src[0].s_addr +#define in_srcmsk in_src[1].s_addr +#define in_scmp in_tuc.ftu_scmp +#define in_dcmp in_tuc.ftu_dcmp +#define in_stop in_tuc.ftu_stop +#define in_dtop in_tuc.ftu_dtop +#define in_sport in_tuc.ftu_sport +#define in_dport in_tuc.ftu_dport + +#define NAT_OUTBOUND 0 +#define NAT_INBOUND 1 + +#define NAT_MAP 0x01 +#define NAT_REDIRECT 0x02 +#define NAT_BIMAP (NAT_MAP|NAT_REDIRECT) +#define NAT_MAPBLK 0x04 +/* 0x100 reserved for FI_W_SPORT */ +/* 0x200 reserved for FI_W_DPORT */ +/* 0x400 reserved for FI_W_SADDR */ +/* 0x800 reserved for FI_W_DADDR */ +/* 0x1000 reserved for FI_W_NEWFR */ + +#define MAPBLK_MINPORT 1024 /* don't use reserved ports for src port */ +#define USABLE_PORTS (65536 - MAPBLK_MINPORT) + +#define IPN_CMPSIZ (sizeof(ipnat_t) - offsetof(ipnat_t, in_flags)) + +typedef struct natlookup { + struct in_addr nl_inip; + struct in_addr nl_outip; + struct in_addr nl_realip; + int nl_flags; + u_short nl_inport; + u_short nl_outport; + u_short nl_realport; +} natlookup_t; + + +typedef struct nat_save { + void *ipn_next; + struct nat ipn_nat; + struct ipnat ipn_ipnat; + struct frentry ipn_fr; + int ipn_dsize; + char ipn_data[4]; +} nat_save_t; + +#define ipn_rule ipn_nat.nat_fr + +typedef struct natget { + void *ng_ptr; + int ng_sz; +} natget_t; + + +typedef struct hostmap { + struct hostmap *hm_next; + struct hostmap **hm_pnext; + struct ipnat *hm_ipnat; + struct in_addr hm_realip; + struct in_addr hm_mapip; + int hm_ref; +} hostmap_t; + + +typedef struct natstat { + u_long ns_mapped[2]; + u_long ns_rules; + u_long ns_added; + u_long ns_expire; + u_long ns_inuse; + u_long ns_logged; + u_long ns_logfail; + u_long ns_memfail; + u_long ns_badnat; + nat_t **ns_table[2]; + ipnat_t *ns_list; + void *ns_apslist; + u_int ns_nattab_sz; + u_int ns_rultab_sz; + u_int ns_rdrtab_sz; + nat_t *ns_instances; + u_int ns_wilds; +} natstat_t; + +#define IPN_ANY 0x000 +#define IPN_TCP 0x001 +#define IPN_UDP 0x002 +#define IPN_TCPUDP (IPN_TCP|IPN_UDP) +#define IPN_DELETE 0x004 +#define IPN_ICMPERR 0x008 +#define IPN_RF (IPN_TCPUDP|IPN_DELETE|IPN_ICMPERR) +#define IPN_AUTOPORTMAP 0x010 +#define IPN_IPRANGE 0x020 +#define IPN_USERFLAGS (IPN_TCPUDP|IPN_AUTOPORTMAP|IPN_IPRANGE|IPN_SPLIT|\ + IPN_ROUNDR|IPN_FILTER|IPN_NOTSRC|IPN_NOTDST) +#define IPN_FILTER 0x040 +#define IPN_SPLIT 0x080 +#define IPN_ROUNDR 0x100 +#define IPN_NOTSRC 0x080000 +#define IPN_NOTDST 0x100000 +#define IPN_FRAG 0x200000 + + +typedef struct natlog { + struct in_addr nl_origip; + struct in_addr nl_outip; + struct in_addr nl_inip; + u_short nl_origport; + u_short nl_outport; + u_short nl_inport; + u_short nl_type; + int nl_rule; + U_QUAD_T nl_pkts; + U_QUAD_T nl_bytes; + u_char nl_p; +} natlog_t; + + +#define NL_NEWMAP NAT_MAP +#define NL_NEWRDR NAT_REDIRECT +#define NL_NEWBIMAP NAT_BIMAP +#define NL_NEWBLOCK NAT_MAPBLK +#define NL_FLUSH 0xfffe +#define NL_EXPIRE 0xffff + +#define NAT_HASH_FN(k,l,m) (((k) + ((k) >> 12) + l) % (m)) + +#define LONG_SUM(in) (((in) & 0xffff) + ((in) >> 16)) + +#define CALC_SUMD(s1, s2, sd) { \ + (s1) = ((s1) & 0xffff) + ((s1) >> 16); \ + (s2) = ((s2) & 0xffff) + ((s2) >> 16); \ + /* Do it twice */ \ + (s1) = ((s1) & 0xffff) + ((s1) >> 16); \ + (s2) = ((s2) & 0xffff) + ((s2) >> 16); \ + /* Because ~1 == -2, We really need ~1 == -1 */ \ + if ((s1) > (s2)) (s2)--; \ + (sd) = (s2) - (s1); \ + (sd) = ((sd) & 0xffff) + ((sd) >> 16); } + + +extern u_int ipf_nattable_sz; +extern u_int ipf_natrules_sz; +extern u_int ipf_rdrrules_sz; +extern int fr_nat_lock; +extern void ip_natsync __P((void *)); +extern u_long fr_defnatage; +extern u_long fr_defnaticmpage; +extern nat_t **nat_table[2]; +extern nat_t *nat_instances; +extern ipnat_t **nat_rules; +extern ipnat_t **rdr_rules; +extern natstat_t nat_stats; +#if defined(__NetBSD__) || defined(__OpenBSD__) || (__FreeBSD_version >= 300003) +extern int nat_ioctl __P((caddr_t, u_long, int)); +#else +extern int nat_ioctl __P((caddr_t, int, int)); +#endif +extern int nat_init __P((void)); +extern nat_t *nat_new __P((ipnat_t *, ip_t *, fr_info_t *, u_int, int)); +extern nat_t *nat_outlookup __P((void *, u_int, u_int, struct in_addr, + struct in_addr, u_32_t, int)); +extern nat_t *nat_inlookup __P((void *, u_int, u_int, struct in_addr, + struct in_addr, u_32_t, int)); +extern nat_t *nat_maplookup __P((void *, u_int, struct in_addr, + struct in_addr)); +extern nat_t *nat_lookupredir __P((natlookup_t *)); +extern nat_t *nat_icmplookup __P((ip_t *, fr_info_t *, int)); +extern nat_t *nat_icmp __P((ip_t *, fr_info_t *, u_int *, int)); +extern void nat_insert __P((nat_t *)); + +extern int ip_natout __P((ip_t *, fr_info_t *)); +extern int ip_natin __P((ip_t *, fr_info_t *)); +extern void ip_natunload __P((void)), ip_natexpire __P((void)); +extern void nat_log __P((struct nat *, u_int)); +extern void fix_incksum __P((u_short *, u_32_t)); +extern void fix_outcksum __P((u_short *, u_32_t)); +extern void fix_datacksum __P((u_short *, u_32_t)); + +#endif /* __IP_NAT_H__ */ diff --git a/sys/netinet/ip_output.c b/sys/netinet/ip_output.c new file mode 100644 index 0000000..ed20168 --- /dev/null +++ b/sys/netinet/ip_output.c @@ -0,0 +1,1915 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 + * $FreeBSD$ + */ + +#define _IP_VHL + +#include "opt_ipfw.h" +#include "opt_ipdn.h" +#include "opt_ipdivert.h" +#include "opt_ipfilter.h" +#include "opt_ipsec.h" +#include "opt_pfil_hooks.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/in_pcb.h> +#include <netinet/in_var.h> +#include <netinet/ip_var.h> + +#include "faith.h" + +#include <machine/in_cksum.h> + +static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "internet multicast options"); + +#ifdef IPSEC +#include <netinet6/ipsec.h> +#include <netkey/key.h> +#ifdef IPSEC_DEBUG +#include <netkey/key_debug.h> +#else +#define KEYDEBUG(lev,arg) +#endif +#endif /*IPSEC*/ + +#include <netinet/ip_fw.h> + +#ifdef DUMMYNET +#include <netinet/ip_dummynet.h> +#endif + +#ifdef IPFIREWALL_FORWARD_DEBUG +#define print_ip(a) printf("%ld.%ld.%ld.%ld",(ntohl(a.s_addr)>>24)&0xFF,\ + (ntohl(a.s_addr)>>16)&0xFF,\ + (ntohl(a.s_addr)>>8)&0xFF,\ + (ntohl(a.s_addr))&0xFF); +#endif + +u_short ip_id; + +static struct mbuf *ip_insertoptions __P((struct mbuf *, struct mbuf *, int *)); +static void ip_mloopback + __P((struct ifnet *, struct mbuf *, struct sockaddr_in *, int)); +static int ip_getmoptions + __P((struct sockopt *, struct ip_moptions *)); +static int ip_pcbopts __P((int, struct mbuf **, struct mbuf *)); +static int ip_setmoptions + __P((struct sockopt *, struct ip_moptions **)); + +int ip_optcopy __P((struct ip *, struct ip *)); + + +extern struct protosw inetsw[]; + +/* + * IP output. The packet in mbuf chain m contains a skeletal IP + * header (with len, off, ttl, proto, tos, src, dst). + * The mbuf chain containing the packet will be freed. + * The mbuf opt, if present, will not be freed. + */ +int +ip_output(m0, opt, ro, flags, imo) + struct mbuf *m0; + struct mbuf *opt; + struct route *ro; + int flags; + struct ip_moptions *imo; +{ + struct ip *ip, *mhip; + struct ifnet *ifp; + struct mbuf *m = m0; + int hlen = sizeof (struct ip); + int len, off, error = 0; + struct sockaddr_in *dst; + struct in_ifaddr *ia; + int isbroadcast, sw_csum; +#ifdef IPSEC + struct route iproute; + struct socket *so = NULL; + struct secpolicy *sp = NULL; +#endif + u_int16_t divert_cookie; /* firewall cookie */ +#ifdef PFIL_HOOKS + struct packet_filter_hook *pfh; + struct mbuf *m1; + int rv; +#endif /* PFIL_HOOKS */ +#ifdef IPFIREWALL_FORWARD + int fwd_rewrite_src = 0; +#endif + struct ip_fw_chain *rule = NULL; + +#ifdef IPDIVERT + /* Get and reset firewall cookie */ + divert_cookie = ip_divert_cookie; + ip_divert_cookie = 0; +#else + divert_cookie = 0; +#endif + +#if defined(IPFIREWALL) && defined(DUMMYNET) + /* + * dummynet packet are prepended a vestigial mbuf with + * m_type = MT_DUMMYNET and m_data pointing to the matching + * rule. + */ + if (m->m_type == MT_DUMMYNET) { + /* + * the packet was already tagged, so part of the + * processing was already done, and we need to go down. + * Get parameters from the header. + */ + rule = (struct ip_fw_chain *)(m->m_data) ; + opt = NULL ; + ro = & ( ((struct dn_pkt *)m)->ro ) ; + imo = NULL ; + dst = ((struct dn_pkt *)m)->dn_dst ; + ifp = ((struct dn_pkt *)m)->ifp ; + flags = ((struct dn_pkt *)m)->flags ; + + m0 = m = m->m_next ; +#ifdef IPSEC + so = ipsec_getsocket(m); + ipsec_setsocket(m, NULL); +#endif + ip = mtod(m, struct ip *); + hlen = IP_VHL_HL(ip->ip_vhl) << 2 ; + ia = (struct in_ifaddr *)ro->ro_rt->rt_ifa; + goto sendit; + } else + rule = NULL ; +#endif +#ifdef IPSEC + so = ipsec_getsocket(m); + ipsec_setsocket(m, NULL); +#endif + +#ifdef DIAGNOSTIC + if ((m->m_flags & M_PKTHDR) == 0) + panic("ip_output no HDR"); + if (!ro) + panic("ip_output no route, proto = %d", + mtod(m, struct ip *)->ip_p); +#endif + if (opt) { + m = ip_insertoptions(m, opt, &len); + hlen = len; + } + ip = mtod(m, struct ip *); + /* + * Fill in IP header. + */ + if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { + ip->ip_vhl = IP_MAKE_VHL(IPVERSION, hlen >> 2); + ip->ip_off &= IP_DF; + ip->ip_id = htons(ip_id++); + ipstat.ips_localout++; + } else { + hlen = IP_VHL_HL(ip->ip_vhl) << 2; + } + + dst = (struct sockaddr_in *)&ro->ro_dst; + /* + * If there is a cached route, + * check that it is to the same destination + * and is still up. If not, free it and try again. + */ + if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 || + dst->sin_addr.s_addr != ip->ip_dst.s_addr)) { + RTFREE(ro->ro_rt); + ro->ro_rt = (struct rtentry *)0; + } + if (ro->ro_rt == 0) { + dst->sin_family = AF_INET; + dst->sin_len = sizeof(*dst); + dst->sin_addr = ip->ip_dst; + } + /* + * If routing to interface only, + * short circuit routing lookup. + */ +#define ifatoia(ifa) ((struct in_ifaddr *)(ifa)) +#define sintosa(sin) ((struct sockaddr *)(sin)) + if (flags & IP_ROUTETOIF) { + if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == 0 && + (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == 0) { + ipstat.ips_noroute++; + error = ENETUNREACH; + goto bad; + } + ifp = ia->ia_ifp; + ip->ip_ttl = 1; + isbroadcast = in_broadcast(dst->sin_addr, ifp); + } else { + /* + * If this is the case, we probably don't want to allocate + * a protocol-cloned route since we didn't get one from the + * ULP. This lets TCP do its thing, while not burdening + * forwarding or ICMP with the overhead of cloning a route. + * Of course, we still want to do any cloning requested by + * the link layer, as this is probably required in all cases + * for correct operation (as it is for ARP). + */ + if (ro->ro_rt == 0) + rtalloc_ign(ro, RTF_PRCLONING); + if (ro->ro_rt == 0) { + ipstat.ips_noroute++; + error = EHOSTUNREACH; + goto bad; + } + ia = ifatoia(ro->ro_rt->rt_ifa); + ifp = ro->ro_rt->rt_ifp; + ro->ro_rt->rt_use++; + if (ro->ro_rt->rt_flags & RTF_GATEWAY) + dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway; + if (ro->ro_rt->rt_flags & RTF_HOST) + isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST); + else + isbroadcast = in_broadcast(dst->sin_addr, ifp); + } + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { + struct in_multi *inm; + + m->m_flags |= M_MCAST; + /* + * IP destination address is multicast. Make sure "dst" + * still points to the address in "ro". (It may have been + * changed to point to a gateway address, above.) + */ + dst = (struct sockaddr_in *)&ro->ro_dst; + /* + * See if the caller provided any multicast options + */ + if (imo != NULL) { + ip->ip_ttl = imo->imo_multicast_ttl; + if (imo->imo_multicast_ifp != NULL) + ifp = imo->imo_multicast_ifp; + if (imo->imo_multicast_vif != -1) + ip->ip_src.s_addr = + ip_mcast_src(imo->imo_multicast_vif); + } else + ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; + /* + * Confirm that the outgoing interface supports multicast. + */ + if ((imo == NULL) || (imo->imo_multicast_vif == -1)) { + if ((ifp->if_flags & IFF_MULTICAST) == 0) { + ipstat.ips_noroute++; + error = ENETUNREACH; + goto bad; + } + } + /* + * If source address not specified yet, use address + * of outgoing interface. + */ + if (ip->ip_src.s_addr == INADDR_ANY) { + register struct in_ifaddr *ia1; + + TAILQ_FOREACH(ia1, &in_ifaddrhead, ia_link) + if (ia1->ia_ifp == ifp) { + ip->ip_src = IA_SIN(ia1)->sin_addr; + break; + } + } + + IN_LOOKUP_MULTI(ip->ip_dst, ifp, inm); + if (inm != NULL && + (imo == NULL || imo->imo_multicast_loop)) { + /* + * If we belong to the destination multicast group + * on the outgoing interface, and the caller did not + * forbid loopback, loop back a copy. + */ + ip_mloopback(ifp, m, dst, hlen); + } + else { + /* + * If we are acting as a multicast router, perform + * multicast forwarding as if the packet had just + * arrived on the interface to which we are about + * to send. The multicast forwarding function + * recursively calls this function, using the + * IP_FORWARDING flag to prevent infinite recursion. + * + * Multicasts that are looped back by ip_mloopback(), + * above, will be forwarded by the ip_input() routine, + * if necessary. + */ + if (ip_mrouter && (flags & IP_FORWARDING) == 0) { + /* + * Check if rsvp daemon is running. If not, don't + * set ip_moptions. This ensures that the packet + * is multicast and not just sent down one link + * as prescribed by rsvpd. + */ + if (!rsvp_on) + imo = NULL; + if (ip_mforward(ip, ifp, m, imo) != 0) { + m_freem(m); + goto done; + } + } + } + + /* + * Multicasts with a time-to-live of zero may be looped- + * back, above, but must not be transmitted on a network. + * Also, multicasts addressed to the loopback interface + * are not sent -- the above call to ip_mloopback() will + * loop back a copy if this host actually belongs to the + * destination group on the loopback interface. + */ + if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) { + m_freem(m); + goto done; + } + + goto sendit; + } +#ifndef notdef + /* + * If source address not specified yet, use address + * of outgoing interface. + */ + if (ip->ip_src.s_addr == INADDR_ANY) { + ip->ip_src = IA_SIN(ia)->sin_addr; +#ifdef IPFIREWALL_FORWARD + /* Keep note that we did this - if the firewall changes + * the next-hop, our interface may change, changing the + * default source IP. It's a shame so much effort happens + * twice. Oh well. + */ + fwd_rewrite_src++; +#endif /* IPFIREWALL_FORWARD */ + } +#endif /* notdef */ + /* + * Verify that we have any chance at all of being able to queue + * the packet or packet fragments + */ + if ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >= + ifp->if_snd.ifq_maxlen) { + error = ENOBUFS; + goto bad; + } + + /* + * Look for broadcast address and + * and verify user is allowed to send + * such a packet. + */ + if (isbroadcast) { + if ((ifp->if_flags & IFF_BROADCAST) == 0) { + error = EADDRNOTAVAIL; + goto bad; + } + if ((flags & IP_ALLOWBROADCAST) == 0) { + error = EACCES; + goto bad; + } + /* don't allow broadcast messages to be fragmented */ + if ((u_short)ip->ip_len > ifp->if_mtu) { + error = EMSGSIZE; + goto bad; + } + m->m_flags |= M_BCAST; + } else { + m->m_flags &= ~M_BCAST; + } + +sendit: + /* + * IpHack's section. + * - Xlate: translate packet's addr/port (NAT). + * - Firewall: deny/allow/etc. + * - Wrap: fake packet's addr/port <unimpl.> + * - Encapsulate: put it in another IP and send out. <unimp.> + */ +#ifdef PFIL_HOOKS + /* + * Run through list of hooks for output packets. + */ + m1 = m; + pfh = pfil_hook_get(PFIL_OUT, &inetsw[ip_protox[IPPROTO_IP]].pr_pfh); + for (; pfh; pfh = TAILQ_NEXT(pfh, pfil_link)) + if (pfh->pfil_func) { + rv = pfh->pfil_func(ip, hlen, ifp, 1, &m1); + if (rv) { + error = EHOSTUNREACH; + goto done; + } + m = m1; + if (m == NULL) + goto done; + ip = mtod(m, struct ip *); + } +#endif /* PFIL_HOOKS */ + + /* + * Check with the firewall... + */ + if (fw_enable && ip_fw_chk_ptr) { + struct sockaddr_in *old = dst; + + off = (*ip_fw_chk_ptr)(&ip, + hlen, ifp, &divert_cookie, &m, &rule, &dst); + /* + * On return we must do the following: + * m == NULL -> drop the pkt (old interface, deprecated) + * (off & 0x40000) -> drop the pkt (new interface) + * 1<=off<= 0xffff -> DIVERT + * (off & 0x10000) -> send to a DUMMYNET pipe + * (off & 0x20000) -> TEE the packet + * dst != old -> IPFIREWALL_FORWARD + * off==0, dst==old -> accept + * If some of the above modules is not compiled in, then + * we should't have to check the corresponding condition + * (because the ipfw control socket should not accept + * unsupported rules), but better play safe and drop + * packets in case of doubt. + */ + if (off & IP_FW_PORT_DENY_FLAG) { /* XXX new interface-denied */ + if (m) + m_freem(m); + error = EACCES ; + goto done; + } + if (!m) { /* firewall said to reject */ + static int __debug=10; + if (__debug >0) { + printf("firewall returns NULL, please update!\n"); + __debug-- ; + } + error = EACCES; + goto done; + } + if (off == 0 && dst == old) /* common case */ + goto pass ; +#ifdef DUMMYNET + if ((off & IP_FW_PORT_DYNT_FLAG) != 0) { + /* + * pass the pkt to dummynet. Need to include + * pipe number, m, ifp, ro, dst because these are + * not recomputed in the next pass. + * All other parameters have been already used and + * so they are not needed anymore. + * XXX note: if the ifp or ro entry are deleted + * while a pkt is in dummynet, we are in trouble! + */ + error = dummynet_io(off & 0xffff, DN_TO_IP_OUT, m, + ifp,ro,dst,rule, flags); + goto done; + } +#endif +#ifdef IPDIVERT + if (off != 0 && (off & IP_FW_PORT_DYNT_FLAG) == 0) { + struct mbuf *clone = NULL; + + /* Clone packet if we're doing a 'tee' */ + if ((off & IP_FW_PORT_TEE_FLAG) != 0) + clone = m_dup(m, M_DONTWAIT); + + /* + * XXX + * delayed checksums are not currently compatible + * with divert sockets. + */ + if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { + in_delayed_cksum(m); + m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; + } + + /* Restore packet header fields to original values */ + HTONS(ip->ip_len); + HTONS(ip->ip_off); + + /* Deliver packet to divert input routine */ + ip_divert_cookie = divert_cookie; + divert_packet(m, 0, off & 0xffff); + + /* If 'tee', continue with original packet */ + if (clone != NULL) { + m = clone; + ip = mtod(m, struct ip *); + goto pass; + } + goto done; + } +#endif + +#ifdef IPFIREWALL_FORWARD + /* Here we check dst to make sure it's directly reachable on the + * interface we previously thought it was. + * If it isn't (which may be likely in some situations) we have + * to re-route it (ie, find a route for the next-hop and the + * associated interface) and set them here. This is nested + * forwarding which in most cases is undesirable, except where + * such control is nigh impossible. So we do it here. + * And I'm babbling. + */ + if (off == 0 && old != dst) { + struct in_ifaddr *ia; + + /* It's changed... */ + /* There must be a better way to do this next line... */ + static struct route sro_fwd, *ro_fwd = &sro_fwd; +#ifdef IPFIREWALL_FORWARD_DEBUG + printf("IPFIREWALL_FORWARD: New dst ip: "); + print_ip(dst->sin_addr); + printf("\n"); +#endif + /* + * We need to figure out if we have been forwarded + * to a local socket. If so then we should somehow + * "loop back" to ip_input, and get directed to the + * PCB as if we had received this packet. This is + * because it may be dificult to identify the packets + * you want to forward until they are being output + * and have selected an interface. (e.g. locally + * initiated packets) If we used the loopback inteface, + * we would not be able to control what happens + * as the packet runs through ip_input() as + * it is done through a ISR. + */ + TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { + /* + * If the addr to forward to is one + * of ours, we pretend to + * be the destination for this packet. + */ + if (IA_SIN(ia)->sin_addr.s_addr == + dst->sin_addr.s_addr) + break; + } + if (ia) { + /* tell ip_input "dont filter" */ + ip_fw_fwd_addr = dst; + if (m->m_pkthdr.rcvif == NULL) + m->m_pkthdr.rcvif = ifunit("lo0"); + if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { + m->m_pkthdr.csum_flags |= + CSUM_DATA_VALID | CSUM_PSEUDO_HDR; + m0->m_pkthdr.csum_data = 0xffff; + } + m->m_pkthdr.csum_flags |= + CSUM_IP_CHECKED | CSUM_IP_VALID; + HTONS(ip->ip_len); + HTONS(ip->ip_off); + ip_input(m); + goto done; + } + /* Some of the logic for this was + * nicked from above. + * + * This rewrites the cached route in a local PCB. + * Is this what we want to do? + */ + bcopy(dst, &ro_fwd->ro_dst, sizeof(*dst)); + + ro_fwd->ro_rt = 0; + rtalloc_ign(ro_fwd, RTF_PRCLONING); + + if (ro_fwd->ro_rt == 0) { + ipstat.ips_noroute++; + error = EHOSTUNREACH; + goto bad; + } + + ia = ifatoia(ro_fwd->ro_rt->rt_ifa); + ifp = ro_fwd->ro_rt->rt_ifp; + ro_fwd->ro_rt->rt_use++; + if (ro_fwd->ro_rt->rt_flags & RTF_GATEWAY) + dst = (struct sockaddr_in *)ro_fwd->ro_rt->rt_gateway; + if (ro_fwd->ro_rt->rt_flags & RTF_HOST) + isbroadcast = + (ro_fwd->ro_rt->rt_flags & RTF_BROADCAST); + else + isbroadcast = in_broadcast(dst->sin_addr, ifp); + RTFREE(ro->ro_rt); + ro->ro_rt = ro_fwd->ro_rt; + dst = (struct sockaddr_in *)&ro_fwd->ro_dst; + + /* + * If we added a default src ip earlier, + * which would have been gotten from the-then + * interface, do it again, from the new one. + */ + if (fwd_rewrite_src) + ip->ip_src = IA_SIN(ia)->sin_addr; + goto pass ; + } +#endif /* IPFIREWALL_FORWARD */ + /* + * if we get here, none of the above matches, and + * we have to drop the pkt + */ + m_freem(m); + error = EACCES; /* not sure this is the right error msg */ + goto done; + } + +pass: +#ifdef IPSEC + /* get SP for this packet */ + if (so == NULL) + sp = ipsec4_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND, flags, &error); + else + sp = ipsec4_getpolicybysock(m, IPSEC_DIR_OUTBOUND, so, &error); + + if (sp == NULL) { + ipsecstat.out_inval++; + goto bad; + } + + error = 0; + + /* check policy */ + switch (sp->policy) { + case IPSEC_POLICY_DISCARD: + /* + * This packet is just discarded. + */ + ipsecstat.out_polvio++; + goto bad; + + case IPSEC_POLICY_BYPASS: + case IPSEC_POLICY_NONE: + /* no need to do IPsec. */ + goto skip_ipsec; + + case IPSEC_POLICY_IPSEC: + if (sp->req == NULL) { + /* XXX should be panic ? */ + printf("ip_output: No IPsec request specified.\n"); + error = EINVAL; + goto bad; + } + break; + + case IPSEC_POLICY_ENTRUST: + default: + printf("ip_output: Invalid policy found. %d\n", sp->policy); + } + { + struct ipsec_output_state state; + bzero(&state, sizeof(state)); + state.m = m; + if (flags & IP_ROUTETOIF) { + state.ro = &iproute; + bzero(&iproute, sizeof(iproute)); + } else + state.ro = ro; + state.dst = (struct sockaddr *)dst; + + ip->ip_sum = 0; + + /* + * XXX + * delayed checksums are not currently compatible with IPsec + */ + if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { + in_delayed_cksum(m); + m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; + } + + HTONS(ip->ip_len); + HTONS(ip->ip_off); + + error = ipsec4_output(&state, sp, flags); + + m = state.m; + if (flags & IP_ROUTETOIF) { + /* + * if we have tunnel mode SA, we may need to ignore + * IP_ROUTETOIF. + */ + if (state.ro != &iproute || state.ro->ro_rt != NULL) { + flags &= ~IP_ROUTETOIF; + ro = state.ro; + } + } else + ro = state.ro; + dst = (struct sockaddr_in *)state.dst; + if (error) { + /* mbuf is already reclaimed in ipsec4_output. */ + m0 = NULL; + switch (error) { + case EHOSTUNREACH: + case ENETUNREACH: + case EMSGSIZE: + case ENOBUFS: + case ENOMEM: + break; + default: + printf("ip4_output (ipsec): error code %d\n", error); + /*fall through*/ + case ENOENT: + /* don't show these error codes to the user */ + error = 0; + break; + } + goto bad; + } + } + + /* be sure to update variables that are affected by ipsec4_output() */ + ip = mtod(m, struct ip *); +#ifdef _IP_VHL + hlen = IP_VHL_HL(ip->ip_vhl) << 2; +#else + hlen = ip->ip_hl << 2; +#endif + if (ro->ro_rt == NULL) { + if ((flags & IP_ROUTETOIF) == 0) { + printf("ip_output: " + "can't update route after IPsec processing\n"); + error = EHOSTUNREACH; /*XXX*/ + goto bad; + } + } else { + ia = ifatoia(ro->ro_rt->rt_ifa); + ifp = ro->ro_rt->rt_ifp; + } + + /* make it flipped, again. */ + NTOHS(ip->ip_len); + NTOHS(ip->ip_off); +skip_ipsec: +#endif /*IPSEC*/ + + m->m_pkthdr.csum_flags |= CSUM_IP; + sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist; + if (sw_csum & CSUM_DELAY_DATA) { + in_delayed_cksum(m); + sw_csum &= ~CSUM_DELAY_DATA; + } + m->m_pkthdr.csum_flags &= ifp->if_hwassist; + + /* + * If small enough for interface, or the interface will take + * care of the fragmentation for us, can just send directly. + */ + if ((u_short)ip->ip_len <= ifp->if_mtu || + ifp->if_hwassist & CSUM_FRAGMENT) { + HTONS(ip->ip_len); + HTONS(ip->ip_off); + ip->ip_sum = 0; + if (sw_csum & CSUM_DELAY_IP) { + if (ip->ip_vhl == IP_VHL_BORING) { + ip->ip_sum = in_cksum_hdr(ip); + } else { + ip->ip_sum = in_cksum(m, hlen); + } + } + + /* Record statistics for this interface address. */ + if (!(flags & IP_FORWARDING)) { + ia->ia_ifa.if_opackets++; + ia->ia_ifa.if_obytes += m->m_pkthdr.len; + } + + error = (*ifp->if_output)(ifp, m, + (struct sockaddr *)dst, ro->ro_rt); + goto done; + } + /* + * Too large for interface; fragment if possible. + * Must be able to put at least 8 bytes per fragment. + */ + if (ip->ip_off & IP_DF) { + error = EMSGSIZE; + /* + * This case can happen if the user changed the MTU + * of an interface after enabling IP on it. Because + * most netifs don't keep track of routes pointing to + * them, there is no way for one to update all its + * routes when the MTU is changed. + */ + if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) + && !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU) + && (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) { + ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu; + } + ipstat.ips_cantfrag++; + goto bad; + } + len = (ifp->if_mtu - hlen) &~ 7; + if (len < 8) { + error = EMSGSIZE; + goto bad; + } + + /* + * if the interface will not calculate checksums on + * fragmented packets, then do it here. + */ + if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA && + (ifp->if_hwassist & CSUM_IP_FRAGS) == 0) { + in_delayed_cksum(m); + m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; + } + + { + int mhlen, firstlen = len; + struct mbuf **mnext = &m->m_nextpkt; + int nfrags = 1; + + /* + * Loop through length of segment after first fragment, + * make new header and copy data of each part and link onto chain. + */ + m0 = m; + mhlen = sizeof (struct ip); + for (off = hlen + len; off < (u_short)ip->ip_len; off += len) { + MGETHDR(m, M_DONTWAIT, MT_HEADER); + if (m == 0) { + error = ENOBUFS; + ipstat.ips_odropped++; + goto sendorfree; + } + m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG; + m->m_data += max_linkhdr; + mhip = mtod(m, struct ip *); + *mhip = *ip; + if (hlen > sizeof (struct ip)) { + mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip); + mhip->ip_vhl = IP_MAKE_VHL(IPVERSION, mhlen >> 2); + } + m->m_len = mhlen; + mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off; + if (off + len >= (u_short)ip->ip_len) + len = (u_short)ip->ip_len - off; + else + mhip->ip_off |= IP_MF; + mhip->ip_len = htons((u_short)(len + mhlen)); + m->m_next = m_copy(m0, off, len); + if (m->m_next == 0) { + (void) m_free(m); + error = ENOBUFS; /* ??? */ + ipstat.ips_odropped++; + goto sendorfree; + } + m->m_pkthdr.len = mhlen + len; + m->m_pkthdr.rcvif = (struct ifnet *)0; + m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags; + HTONS(mhip->ip_off); + mhip->ip_sum = 0; + if (sw_csum & CSUM_DELAY_IP) { + if (mhip->ip_vhl == IP_VHL_BORING) { + mhip->ip_sum = in_cksum_hdr(mhip); + } else { + mhip->ip_sum = in_cksum(m, mhlen); + } + } + *mnext = m; + mnext = &m->m_nextpkt; + nfrags++; + } + ipstat.ips_ofragments += nfrags; + + /* set first/last markers for fragment chain */ + m->m_flags |= M_LASTFRAG; + m0->m_flags |= M_FIRSTFRAG | M_FRAG; + m0->m_pkthdr.csum_data = nfrags; + + /* + * Update first fragment by trimming what's been copied out + * and updating header, then send each fragment (in order). + */ + m = m0; + m_adj(m, hlen + firstlen - (u_short)ip->ip_len); + m->m_pkthdr.len = hlen + firstlen; + ip->ip_len = htons((u_short)m->m_pkthdr.len); + ip->ip_off |= IP_MF; + HTONS(ip->ip_off); + ip->ip_sum = 0; + if (sw_csum & CSUM_DELAY_IP) { + if (ip->ip_vhl == IP_VHL_BORING) { + ip->ip_sum = in_cksum_hdr(ip); + } else { + ip->ip_sum = in_cksum(m, hlen); + } + } +sendorfree: + for (m = m0; m; m = m0) { + m0 = m->m_nextpkt; + m->m_nextpkt = 0; + if (error == 0) { + /* Record statistics for this interface address. */ + ia->ia_ifa.if_opackets++; + ia->ia_ifa.if_obytes += m->m_pkthdr.len; + + error = (*ifp->if_output)(ifp, m, + (struct sockaddr *)dst, ro->ro_rt); + } else + m_freem(m); + } + + if (error == 0) + ipstat.ips_fragmented++; + } +done: +#ifdef IPSEC + if (ro == &iproute && ro->ro_rt) { + RTFREE(ro->ro_rt); + ro->ro_rt = NULL; + } + if (sp != NULL) { + KEYDEBUG(KEYDEBUG_IPSEC_STAMP, + printf("DP ip_output call free SP:%p\n", sp)); + key_freesp(sp); + } +#endif /* IPSEC */ + return (error); +bad: + m_freem(m0); + goto done; +} + +void +in_delayed_cksum(struct mbuf *m) +{ + struct ip *ip; + u_short csum, offset; + + ip = mtod(m, struct ip *); + offset = IP_VHL_HL(ip->ip_vhl) << 2 ; + csum = in_cksum_skip(m, ip->ip_len, offset); + if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0) + csum = 0xffff; + offset += m->m_pkthdr.csum_data; /* checksum offset */ + + if (offset + sizeof(u_short) > m->m_len) { + printf("delayed m_pullup, m->len: %d off: %d p: %d\n", + m->m_len, offset, ip->ip_p); + /* + * XXX + * this shouldn't happen, but if it does, the + * correct behavior may be to insert the checksum + * in the existing chain instead of rearranging it. + */ + m = m_pullup(m, offset + sizeof(u_short)); + } + *(u_short *)(m->m_data + offset) = csum; +} + +/* + * Insert IP options into preformed packet. + * Adjust IP destination as required for IP source routing, + * as indicated by a non-zero in_addr at the start of the options. + * + * XXX This routine assumes that the packet has no options in place. + */ +static struct mbuf * +ip_insertoptions(m, opt, phlen) + register struct mbuf *m; + struct mbuf *opt; + int *phlen; +{ + register struct ipoption *p = mtod(opt, struct ipoption *); + struct mbuf *n; + register struct ip *ip = mtod(m, struct ip *); + unsigned optlen; + + optlen = opt->m_len - sizeof(p->ipopt_dst); + if (optlen + (u_short)ip->ip_len > IP_MAXPACKET) + return (m); /* XXX should fail */ + if (p->ipopt_dst.s_addr) + ip->ip_dst = p->ipopt_dst; + if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) { + MGETHDR(n, M_DONTWAIT, MT_HEADER); + if (n == 0) + return (m); + n->m_pkthdr.rcvif = (struct ifnet *)0; + n->m_pkthdr.len = m->m_pkthdr.len + optlen; + m->m_len -= sizeof(struct ip); + m->m_data += sizeof(struct ip); + n->m_next = m; + m = n; + m->m_len = optlen + sizeof(struct ip); + m->m_data += max_linkhdr; + (void)memcpy(mtod(m, void *), ip, sizeof(struct ip)); + } else { + m->m_data -= optlen; + m->m_len += optlen; + m->m_pkthdr.len += optlen; + ovbcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); + } + ip = mtod(m, struct ip *); + bcopy(p->ipopt_list, ip + 1, optlen); + *phlen = sizeof(struct ip) + optlen; + ip->ip_vhl = IP_MAKE_VHL(IPVERSION, *phlen >> 2); + ip->ip_len += optlen; + return (m); +} + +/* + * Copy options from ip to jp, + * omitting those not copied during fragmentation. + */ +int +ip_optcopy(ip, jp) + struct ip *ip, *jp; +{ + register u_char *cp, *dp; + int opt, optlen, cnt; + + cp = (u_char *)(ip + 1); + dp = (u_char *)(jp + 1); + cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip); + for (; cnt > 0; cnt -= optlen, cp += optlen) { + opt = cp[0]; + if (opt == IPOPT_EOL) + break; + if (opt == IPOPT_NOP) { + /* Preserve for IP mcast tunnel's LSRR alignment. */ + *dp++ = IPOPT_NOP; + optlen = 1; + continue; + } +#ifdef DIAGNOSTIC + if (cnt < IPOPT_OLEN + sizeof(*cp)) + panic("malformed IPv4 option passed to ip_optcopy"); +#endif + optlen = cp[IPOPT_OLEN]; +#ifdef DIAGNOSTIC + if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) + panic("malformed IPv4 option passed to ip_optcopy"); +#endif + /* bogus lengths should have been caught by ip_dooptions */ + if (optlen > cnt) + optlen = cnt; + if (IPOPT_COPIED(opt)) { + bcopy(cp, dp, optlen); + dp += optlen; + } + } + for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++) + *dp++ = IPOPT_EOL; + return (optlen); +} + +/* + * IP socket option processing. + */ +int +ip_ctloutput(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + struct inpcb *inp = sotoinpcb(so); + int error, optval; + + error = optval = 0; + if (sopt->sopt_level != IPPROTO_IP) { + return (EINVAL); + } + + switch (sopt->sopt_dir) { + case SOPT_SET: + switch (sopt->sopt_name) { + case IP_OPTIONS: +#ifdef notyet + case IP_RETOPTS: +#endif + { + struct mbuf *m; + if (sopt->sopt_valsize > MLEN) { + error = EMSGSIZE; + break; + } + MGET(m, sopt->sopt_p ? M_TRYWAIT : M_DONTWAIT, MT_HEADER); + if (m == 0) { + error = ENOBUFS; + break; + } + m->m_len = sopt->sopt_valsize; + error = sooptcopyin(sopt, mtod(m, char *), m->m_len, + m->m_len); + + return (ip_pcbopts(sopt->sopt_name, &inp->inp_options, + m)); + } + + case IP_TOS: + case IP_TTL: + case IP_RECVOPTS: + case IP_RECVRETOPTS: + case IP_RECVDSTADDR: + case IP_RECVIF: +#if defined(NFAITH) && NFAITH > 0 + case IP_FAITH: +#endif + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + break; + + switch (sopt->sopt_name) { + case IP_TOS: + inp->inp_ip_tos = optval; + break; + + case IP_TTL: + inp->inp_ip_ttl = optval; + break; +#define OPTSET(bit) \ + if (optval) \ + inp->inp_flags |= bit; \ + else \ + inp->inp_flags &= ~bit; + + case IP_RECVOPTS: + OPTSET(INP_RECVOPTS); + break; + + case IP_RECVRETOPTS: + OPTSET(INP_RECVRETOPTS); + break; + + case IP_RECVDSTADDR: + OPTSET(INP_RECVDSTADDR); + break; + + case IP_RECVIF: + OPTSET(INP_RECVIF); + break; + +#if defined(NFAITH) && NFAITH > 0 + case IP_FAITH: + OPTSET(INP_FAITH); + break; +#endif + } + break; +#undef OPTSET + + case IP_MULTICAST_IF: + case IP_MULTICAST_VIF: + case IP_MULTICAST_TTL: + case IP_MULTICAST_LOOP: + case IP_ADD_MEMBERSHIP: + case IP_DROP_MEMBERSHIP: + error = ip_setmoptions(sopt, &inp->inp_moptions); + break; + + case IP_PORTRANGE: + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + break; + + switch (optval) { + case IP_PORTRANGE_DEFAULT: + inp->inp_flags &= ~(INP_LOWPORT); + inp->inp_flags &= ~(INP_HIGHPORT); + break; + + case IP_PORTRANGE_HIGH: + inp->inp_flags &= ~(INP_LOWPORT); + inp->inp_flags |= INP_HIGHPORT; + break; + + case IP_PORTRANGE_LOW: + inp->inp_flags &= ~(INP_HIGHPORT); + inp->inp_flags |= INP_LOWPORT; + break; + + default: + error = EINVAL; + break; + } + break; + +#ifdef IPSEC + case IP_IPSEC_POLICY: + { + caddr_t req; + size_t len = 0; + int priv; + struct mbuf *m; + int optname; + + if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ + break; + if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ + break; + priv = (sopt->sopt_p != NULL && + suser(sopt->sopt_p) != 0) ? 0 : 1; + req = mtod(m, caddr_t); + len = m->m_len; + optname = sopt->sopt_name; + error = ipsec4_set_policy(inp, optname, req, len, priv); + m_freem(m); + break; + } +#endif /*IPSEC*/ + + default: + error = ENOPROTOOPT; + break; + } + break; + + case SOPT_GET: + switch (sopt->sopt_name) { + case IP_OPTIONS: + case IP_RETOPTS: + if (inp->inp_options) + error = sooptcopyout(sopt, + mtod(inp->inp_options, + char *), + inp->inp_options->m_len); + else + sopt->sopt_valsize = 0; + break; + + case IP_TOS: + case IP_TTL: + case IP_RECVOPTS: + case IP_RECVRETOPTS: + case IP_RECVDSTADDR: + case IP_RECVIF: + case IP_PORTRANGE: +#if defined(NFAITH) && NFAITH > 0 + case IP_FAITH: +#endif + switch (sopt->sopt_name) { + + case IP_TOS: + optval = inp->inp_ip_tos; + break; + + case IP_TTL: + optval = inp->inp_ip_ttl; + break; + +#define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0) + + case IP_RECVOPTS: + optval = OPTBIT(INP_RECVOPTS); + break; + + case IP_RECVRETOPTS: + optval = OPTBIT(INP_RECVRETOPTS); + break; + + case IP_RECVDSTADDR: + optval = OPTBIT(INP_RECVDSTADDR); + break; + + case IP_RECVIF: + optval = OPTBIT(INP_RECVIF); + break; + + case IP_PORTRANGE: + if (inp->inp_flags & INP_HIGHPORT) + optval = IP_PORTRANGE_HIGH; + else if (inp->inp_flags & INP_LOWPORT) + optval = IP_PORTRANGE_LOW; + else + optval = 0; + break; + +#if defined(NFAITH) && NFAITH > 0 + case IP_FAITH: + optval = OPTBIT(INP_FAITH); + break; +#endif + } + error = sooptcopyout(sopt, &optval, sizeof optval); + break; + + case IP_MULTICAST_IF: + case IP_MULTICAST_VIF: + case IP_MULTICAST_TTL: + case IP_MULTICAST_LOOP: + case IP_ADD_MEMBERSHIP: + case IP_DROP_MEMBERSHIP: + error = ip_getmoptions(sopt, inp->inp_moptions); + break; + +#ifdef IPSEC + case IP_IPSEC_POLICY: + { + struct mbuf *m = NULL; + caddr_t req = NULL; + size_t len = 0; + + if (m != 0) { + req = mtod(m, caddr_t); + len = m->m_len; + } + error = ipsec4_get_policy(sotoinpcb(so), req, len, &m); + if (error == 0) + error = soopt_mcopyout(sopt, m); /* XXX */ + if (error == 0) + m_freem(m); + break; + } +#endif /*IPSEC*/ + + default: + error = ENOPROTOOPT; + break; + } + break; + } + return (error); +} + +/* + * Set up IP options in pcb for insertion in output packets. + * Store in mbuf with pointer in pcbopt, adding pseudo-option + * with destination address if source routed. + */ +static int +ip_pcbopts(optname, pcbopt, m) + int optname; + struct mbuf **pcbopt; + register struct mbuf *m; +{ + register int cnt, optlen; + register u_char *cp; + u_char opt; + + /* turn off any old options */ + if (*pcbopt) + (void)m_free(*pcbopt); + *pcbopt = 0; + if (m == (struct mbuf *)0 || m->m_len == 0) { + /* + * Only turning off any previous options. + */ + if (m) + (void)m_free(m); + return (0); + } + + if (m->m_len % sizeof(int32_t)) + goto bad; + /* + * IP first-hop destination address will be stored before + * actual options; move other options back + * and clear it when none present. + */ + if (m->m_data + m->m_len + sizeof(struct in_addr) >= &m->m_dat[MLEN]) + goto bad; + cnt = m->m_len; + m->m_len += sizeof(struct in_addr); + cp = mtod(m, u_char *) + sizeof(struct in_addr); + ovbcopy(mtod(m, caddr_t), (caddr_t)cp, (unsigned)cnt); + bzero(mtod(m, caddr_t), sizeof(struct in_addr)); + + for (; cnt > 0; cnt -= optlen, cp += optlen) { + opt = cp[IPOPT_OPTVAL]; + if (opt == IPOPT_EOL) + break; + if (opt == IPOPT_NOP) + optlen = 1; + else { + if (cnt < IPOPT_OLEN + sizeof(*cp)) + goto bad; + optlen = cp[IPOPT_OLEN]; + if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) + goto bad; + } + switch (opt) { + + default: + break; + + case IPOPT_LSRR: + case IPOPT_SSRR: + /* + * user process specifies route as: + * ->A->B->C->D + * D must be our final destination (but we can't + * check that since we may not have connected yet). + * A is first hop destination, which doesn't appear in + * actual IP option, but is stored before the options. + */ + if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr)) + goto bad; + m->m_len -= sizeof(struct in_addr); + cnt -= sizeof(struct in_addr); + optlen -= sizeof(struct in_addr); + cp[IPOPT_OLEN] = optlen; + /* + * Move first hop before start of options. + */ + bcopy((caddr_t)&cp[IPOPT_OFFSET+1], mtod(m, caddr_t), + sizeof(struct in_addr)); + /* + * Then copy rest of options back + * to close up the deleted entry. + */ + ovbcopy((caddr_t)(&cp[IPOPT_OFFSET+1] + + sizeof(struct in_addr)), + (caddr_t)&cp[IPOPT_OFFSET+1], + (unsigned)cnt + sizeof(struct in_addr)); + break; + } + } + if (m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr)) + goto bad; + *pcbopt = m; + return (0); + +bad: + (void)m_free(m); + return (EINVAL); +} + +/* + * XXX + * The whole multicast option thing needs to be re-thought. + * Several of these options are equally applicable to non-multicast + * transmission, and one (IP_MULTICAST_TTL) totally duplicates a + * standard option (IP_TTL). + */ +/* + * Set the IP multicast options in response to user setsockopt(). + */ +static int +ip_setmoptions(sopt, imop) + struct sockopt *sopt; + struct ip_moptions **imop; +{ + int error = 0; + int i; + struct in_addr addr; + struct ip_mreq mreq; + struct ifnet *ifp; + struct ip_moptions *imo = *imop; + struct route ro; + struct sockaddr_in *dst; + int s; + + if (imo == NULL) { + /* + * No multicast option buffer attached to the pcb; + * allocate one and initialize to default values. + */ + imo = (struct ip_moptions*)malloc(sizeof(*imo), M_IPMOPTS, + M_WAITOK); + + if (imo == NULL) + return (ENOBUFS); + *imop = imo; + imo->imo_multicast_ifp = NULL; + imo->imo_multicast_vif = -1; + imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; + imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; + imo->imo_num_memberships = 0; + } + + switch (sopt->sopt_name) { + /* store an index number for the vif you wanna use in the send */ + case IP_MULTICAST_VIF: + if (legal_vif_num == 0) { + error = EOPNOTSUPP; + break; + } + error = sooptcopyin(sopt, &i, sizeof i, sizeof i); + if (error) + break; + if (!legal_vif_num(i) && (i != -1)) { + error = EINVAL; + break; + } + imo->imo_multicast_vif = i; + break; + + case IP_MULTICAST_IF: + /* + * Select the interface for outgoing multicast packets. + */ + error = sooptcopyin(sopt, &addr, sizeof addr, sizeof addr); + if (error) + break; + /* + * INADDR_ANY is used to remove a previous selection. + * When no interface is selected, a default one is + * chosen every time a multicast packet is sent. + */ + if (addr.s_addr == INADDR_ANY) { + imo->imo_multicast_ifp = NULL; + break; + } + /* + * The selected interface is identified by its local + * IP address. Find the interface and confirm that + * it supports multicasting. + */ + s = splimp(); + INADDR_TO_IFP(addr, ifp); + if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { + splx(s); + error = EADDRNOTAVAIL; + break; + } + imo->imo_multicast_ifp = ifp; + splx(s); + break; + + case IP_MULTICAST_TTL: + /* + * Set the IP time-to-live for outgoing multicast packets. + * The original multicast API required a char argument, + * which is inconsistent with the rest of the socket API. + * We allow either a char or an int. + */ + if (sopt->sopt_valsize == 1) { + u_char ttl; + error = sooptcopyin(sopt, &ttl, 1, 1); + if (error) + break; + imo->imo_multicast_ttl = ttl; + } else { + u_int ttl; + error = sooptcopyin(sopt, &ttl, sizeof ttl, + sizeof ttl); + if (error) + break; + if (ttl > 255) + error = EINVAL; + else + imo->imo_multicast_ttl = ttl; + } + break; + + case IP_MULTICAST_LOOP: + /* + * Set the loopback flag for outgoing multicast packets. + * Must be zero or one. The original multicast API required a + * char argument, which is inconsistent with the rest + * of the socket API. We allow either a char or an int. + */ + if (sopt->sopt_valsize == 1) { + u_char loop; + error = sooptcopyin(sopt, &loop, 1, 1); + if (error) + break; + imo->imo_multicast_loop = !!loop; + } else { + u_int loop; + error = sooptcopyin(sopt, &loop, sizeof loop, + sizeof loop); + if (error) + break; + imo->imo_multicast_loop = !!loop; + } + break; + + case IP_ADD_MEMBERSHIP: + /* + * Add a multicast group membership. + * Group must be a valid IP multicast address. + */ + error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq); + if (error) + break; + + if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) { + error = EINVAL; + break; + } + s = splimp(); + /* + * If no interface address was provided, use the interface of + * the route to the given multicast address. + */ + if (mreq.imr_interface.s_addr == INADDR_ANY) { + bzero((caddr_t)&ro, sizeof(ro)); + dst = (struct sockaddr_in *)&ro.ro_dst; + dst->sin_len = sizeof(*dst); + dst->sin_family = AF_INET; + dst->sin_addr = mreq.imr_multiaddr; + rtalloc(&ro); + if (ro.ro_rt == NULL) { + error = EADDRNOTAVAIL; + splx(s); + break; + } + ifp = ro.ro_rt->rt_ifp; + rtfree(ro.ro_rt); + } + else { + INADDR_TO_IFP(mreq.imr_interface, ifp); + } + + /* + * See if we found an interface, and confirm that it + * supports multicast. + */ + if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { + error = EADDRNOTAVAIL; + splx(s); + break; + } + /* + * See if the membership already exists or if all the + * membership slots are full. + */ + for (i = 0; i < imo->imo_num_memberships; ++i) { + if (imo->imo_membership[i]->inm_ifp == ifp && + imo->imo_membership[i]->inm_addr.s_addr + == mreq.imr_multiaddr.s_addr) + break; + } + if (i < imo->imo_num_memberships) { + error = EADDRINUSE; + splx(s); + break; + } + if (i == IP_MAX_MEMBERSHIPS) { + error = ETOOMANYREFS; + splx(s); + break; + } + /* + * Everything looks good; add a new record to the multicast + * address list for the given interface. + */ + if ((imo->imo_membership[i] = + in_addmulti(&mreq.imr_multiaddr, ifp)) == NULL) { + error = ENOBUFS; + splx(s); + break; + } + ++imo->imo_num_memberships; + splx(s); + break; + + case IP_DROP_MEMBERSHIP: + /* + * Drop a multicast group membership. + * Group must be a valid IP multicast address. + */ + error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq); + if (error) + break; + + if (!IN_MULTICAST(ntohl(mreq.imr_multiaddr.s_addr))) { + error = EINVAL; + break; + } + + s = splimp(); + /* + * If an interface address was specified, get a pointer + * to its ifnet structure. + */ + if (mreq.imr_interface.s_addr == INADDR_ANY) + ifp = NULL; + else { + INADDR_TO_IFP(mreq.imr_interface, ifp); + if (ifp == NULL) { + error = EADDRNOTAVAIL; + splx(s); + break; + } + } + /* + * Find the membership in the membership array. + */ + for (i = 0; i < imo->imo_num_memberships; ++i) { + if ((ifp == NULL || + imo->imo_membership[i]->inm_ifp == ifp) && + imo->imo_membership[i]->inm_addr.s_addr == + mreq.imr_multiaddr.s_addr) + break; + } + if (i == imo->imo_num_memberships) { + error = EADDRNOTAVAIL; + splx(s); + break; + } + /* + * Give up the multicast address record to which the + * membership points. + */ + in_delmulti(imo->imo_membership[i]); + /* + * Remove the gap in the membership array. + */ + for (++i; i < imo->imo_num_memberships; ++i) + imo->imo_membership[i-1] = imo->imo_membership[i]; + --imo->imo_num_memberships; + splx(s); + break; + + default: + error = EOPNOTSUPP; + break; + } + + /* + * If all options have default values, no need to keep the mbuf. + */ + if (imo->imo_multicast_ifp == NULL && + imo->imo_multicast_vif == -1 && + imo->imo_multicast_ttl == IP_DEFAULT_MULTICAST_TTL && + imo->imo_multicast_loop == IP_DEFAULT_MULTICAST_LOOP && + imo->imo_num_memberships == 0) { + free(*imop, M_IPMOPTS); + *imop = NULL; + } + + return (error); +} + +/* + * Return the IP multicast options in response to user getsockopt(). + */ +static int +ip_getmoptions(sopt, imo) + struct sockopt *sopt; + register struct ip_moptions *imo; +{ + struct in_addr addr; + struct in_ifaddr *ia; + int error, optval; + u_char coptval; + + error = 0; + switch (sopt->sopt_name) { + case IP_MULTICAST_VIF: + if (imo != NULL) + optval = imo->imo_multicast_vif; + else + optval = -1; + error = sooptcopyout(sopt, &optval, sizeof optval); + break; + + case IP_MULTICAST_IF: + if (imo == NULL || imo->imo_multicast_ifp == NULL) + addr.s_addr = INADDR_ANY; + else { + IFP_TO_IA(imo->imo_multicast_ifp, ia); + addr.s_addr = (ia == NULL) ? INADDR_ANY + : IA_SIN(ia)->sin_addr.s_addr; + } + error = sooptcopyout(sopt, &addr, sizeof addr); + break; + + case IP_MULTICAST_TTL: + if (imo == 0) + optval = coptval = IP_DEFAULT_MULTICAST_TTL; + else + optval = coptval = imo->imo_multicast_ttl; + if (sopt->sopt_valsize == 1) + error = sooptcopyout(sopt, &coptval, 1); + else + error = sooptcopyout(sopt, &optval, sizeof optval); + break; + + case IP_MULTICAST_LOOP: + if (imo == 0) + optval = coptval = IP_DEFAULT_MULTICAST_LOOP; + else + optval = coptval = imo->imo_multicast_loop; + if (sopt->sopt_valsize == 1) + error = sooptcopyout(sopt, &coptval, 1); + else + error = sooptcopyout(sopt, &optval, sizeof optval); + break; + + default: + error = ENOPROTOOPT; + break; + } + return (error); +} + +/* + * Discard the IP multicast options. + */ +void +ip_freemoptions(imo) + register struct ip_moptions *imo; +{ + register int i; + + if (imo != NULL) { + for (i = 0; i < imo->imo_num_memberships; ++i) + in_delmulti(imo->imo_membership[i]); + free(imo, M_IPMOPTS); + } +} + +/* + * Routine called from ip_output() to loop back a copy of an IP multicast + * packet to the input queue of a specified interface. Note that this + * calls the output routine of the loopback "driver", but with an interface + * pointer that might NOT be a loopback interface -- evil, but easier than + * replicating that code here. + */ +static void +ip_mloopback(ifp, m, dst, hlen) + struct ifnet *ifp; + register struct mbuf *m; + register struct sockaddr_in *dst; + int hlen; +{ + register struct ip *ip; + struct mbuf *copym; + + copym = m_copy(m, 0, M_COPYALL); + if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen)) + copym = m_pullup(copym, hlen); + if (copym != NULL) { + /* + * We don't bother to fragment if the IP length is greater + * than the interface's MTU. Can this possibly matter? + */ + ip = mtod(copym, struct ip *); + HTONS(ip->ip_len); + HTONS(ip->ip_off); + ip->ip_sum = 0; + if (ip->ip_vhl == IP_VHL_BORING) { + ip->ip_sum = in_cksum_hdr(ip); + } else { + ip->ip_sum = in_cksum(copym, hlen); + } + /* + * NB: + * It's not clear whether there are any lingering + * reentrancy problems in other areas which might + * be exposed by using ip_input directly (in + * particular, everything which modifies the packet + * in-place). Yet another option is using the + * protosw directly to deliver the looped back + * packet. For the moment, we'll err on the side + * of safety by using if_simloop(). + */ +#if 1 /* XXX */ + if (dst->sin_family != AF_INET) { + printf("ip_mloopback: bad address family %d\n", + dst->sin_family); + dst->sin_family = AF_INET; + } +#endif + +#ifdef notdef + copym->m_pkthdr.rcvif = ifp; + ip_input(copym); +#else + /* if the checksum hasn't been computed, mark it as valid */ + if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { + copym->m_pkthdr.csum_flags |= + CSUM_DATA_VALID | CSUM_PSEUDO_HDR; + copym->m_pkthdr.csum_data = 0xffff; + } + if_simloop(ifp, copym, dst->sin_family, 0); +#endif + } +} diff --git a/sys/netinet/ip_proxy.c b/sys/netinet/ip_proxy.c new file mode 100644 index 0000000..47d0e5e --- /dev/null +++ b/sys/netinet/ip_proxy.c @@ -0,0 +1,452 @@ +/* + * Copyright (C) 1997-2000 by Darren Reed. + * + * Redistribution and use in source and binary forms are permitted + * provided that this notice is preserved and due credit is given + * to the original author and the contributors. + */ +#if !defined(lint) +/*static const char rcsid[] = "@(#)$Id: ip_proxy.c,v 2.2.2.1 1999/09/19 12:18:19 darrenr Exp $";*/ +static const char rcsid[] = "@(#)$FreeBSD$"; +#endif + +#if defined(__FreeBSD__) && defined(KERNEL) && !defined(_KERNEL) +# define _KERNEL +#endif + +#include <sys/errno.h> +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/file.h> +#if !defined(__FreeBSD_version) +# include <sys/ioctl.h> +#endif +#include <sys/fcntl.h> +#include <sys/uio.h> +#if !defined(_KERNEL) && !defined(KERNEL) +# include <stdio.h> +# include <string.h> +# include <stdlib.h> +#endif +#ifndef linux +# include <sys/protosw.h> +#endif +#include <sys/socket.h> +#if defined(_KERNEL) +# if !defined(linux) +# include <sys/systm.h> +# else +# include <linux/string.h> +# endif +#endif +#if !defined(__SVR4) && !defined(__svr4__) +# ifndef linux +# include <sys/mbuf.h> +# endif +#else +# include <sys/byteorder.h> +# ifdef _KERNEL +# include <sys/dditypes.h> +# endif +# include <sys/stream.h> +# include <sys/kmem.h> +#endif +#if __FreeBSD__ > 2 +# include <sys/queue.h> +#endif +#include <net/if.h> +#ifdef sun +# include <net/af.h> +#endif +#include <net/route.h> +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#ifndef linux +# include <netinet/ip_var.h> +#endif +#include <netinet/tcp.h> +#include <netinet/udp.h> +#include <netinet/ip_icmp.h> +#include "netinet/ip_compat.h" +#include <netinet/tcpip.h> +#include "netinet/ip_fil.h" +#include "netinet/ip_proxy.h" +#include "netinet/ip_nat.h" +#include "netinet/ip_state.h" +#if (__FreeBSD_version >= 300000) +# include <sys/malloc.h> +#endif + + +#ifndef MIN +#define MIN(a,b) (((a)<(b))?(a):(b)) +#endif + +static ap_session_t *appr_new_session __P((aproxy_t *, ip_t *, + fr_info_t *, nat_t *)); +static int appr_fixseqack __P((fr_info_t *, ip_t *, ap_session_t *, int )); + + +#define AP_SESS_SIZE 53 + +#if defined(_KERNEL) && !defined(linux) +#include "netinet/ip_ftp_pxy.c" +#include "netinet/ip_rcmd_pxy.c" +#include "netinet/ip_raudio_pxy.c" +#endif + +ap_session_t *ap_sess_tab[AP_SESS_SIZE]; +ap_session_t *ap_sess_list = NULL; +aproxy_t *ap_proxylist = NULL; +aproxy_t ap_proxies[] = { +#ifdef IPF_FTP_PROXY + { NULL, "ftp", (char)IPPROTO_TCP, 0, 0, ippr_ftp_init, NULL, + ippr_ftp_new, ippr_ftp_in, ippr_ftp_out }, +#endif +#ifdef IPF_RCMD_PROXY + { NULL, "rcmd", (char)IPPROTO_TCP, 0, 0, ippr_rcmd_init, NULL, + ippr_rcmd_new, NULL, ippr_rcmd_out }, +#endif +#ifdef IPF_RAUDIO_PROXY + { NULL, "raudio", (char)IPPROTO_TCP, 0, 0, ippr_raudio_init, NULL, + ippr_raudio_new, ippr_raudio_in, ippr_raudio_out }, +#endif + { NULL, "", '\0', 0, 0, NULL, NULL } +}; + + +int appr_add(ap) +aproxy_t *ap; +{ + aproxy_t *a; + + for (a = ap_proxies; a->apr_p; a++) + if ((a->apr_p == ap->apr_p) && + !strncmp(a->apr_label, ap->apr_label, + sizeof(ap->apr_label))) + return -1; + + for (a = ap_proxylist; a->apr_p; a = a->apr_next) + if ((a->apr_p == ap->apr_p) && + !strncmp(a->apr_label, ap->apr_label, + sizeof(ap->apr_label))) + return -1; + ap->apr_next = ap_proxylist; + ap_proxylist = ap; + return (*ap->apr_init)(); +} + + +int appr_del(ap) +aproxy_t *ap; +{ + aproxy_t *a, **app; + + for (app = &ap_proxylist; (a = *app); app = &a->apr_next) + if (a == ap) { + if (ap->apr_ref != 0) + return 1; + *app = a->apr_next; + return 0; + } + return -1; +} + + +int appr_ok(ip, tcp, nat) +ip_t *ip; +tcphdr_t *tcp; +ipnat_t *nat; +{ + aproxy_t *apr = nat->in_apr; + u_short dport = nat->in_dport; + + if (!apr || (apr->apr_flags & APR_DELETE) || + (ip->ip_p != apr->apr_p)) + return 0; + if ((tcp && (tcp->th_dport != dport)) || (!tcp && dport)) + return 0; + return 1; +} + + +/* + * Allocate a new application proxy structure and fill it in with the + * relevant details. call the init function once complete, prior to + * returning. + */ +static ap_session_t *appr_new_session(apr, ip, fin, nat) +aproxy_t *apr; +ip_t *ip; +fr_info_t *fin; +nat_t *nat; +{ + register ap_session_t *aps; + + if (!apr || (apr->apr_flags & APR_DELETE) || (ip->ip_p != apr->apr_p)) + return NULL; + + KMALLOC(aps, ap_session_t *); + if (!aps) + return NULL; + bzero((char *)aps, sizeof(*aps)); + aps->aps_p = ip->ip_p; + aps->aps_data = NULL; + aps->aps_apr = apr; + aps->aps_psiz = 0; + if (apr->apr_new != NULL) + if ((*apr->apr_new)(fin, ip, aps, nat) == -1) { + KFREE(aps); + return NULL; + } + aps->aps_nat = nat; + aps->aps_next = ap_sess_list; + ap_sess_list = aps; + return aps; +} + + +/* + * check to see if a packet should be passed through an active proxy routine + * if one has been setup for it. + */ +int appr_check(ip, fin, nat) +ip_t *ip; +fr_info_t *fin; +nat_t *nat; +{ + ap_session_t *aps; + aproxy_t *apr; + tcphdr_t *tcp = NULL; + u_32_t sum; + short rv; + int err; + + if (nat->nat_aps == NULL) + nat->nat_aps = appr_new_session(nat->nat_ptr->in_apr, ip, + fin, nat); + aps = nat->nat_aps; + if ((aps != NULL) && (aps->aps_p == ip->ip_p)) { + if (ip->ip_p == IPPROTO_TCP) { + tcp = (tcphdr_t *)fin->fin_dp; + /* + * verify that the checksum is correct. If not, then + * don't do anything with this packet. + */ +#if SOLARIS && defined(_KERNEL) + sum = fr_tcpsum(fin->fin_qfm, ip, tcp); +#else + sum = fr_tcpsum(*(mb_t **)fin->fin_mp, ip, tcp); +#endif + if (sum != tcp->th_sum) { + frstats[fin->fin_out].fr_tcpbad++; + return -1; + } + } + + apr = aps->aps_apr; + err = 0; + if (fin->fin_out != 0) { + if (apr->apr_outpkt != NULL) + err = (*apr->apr_outpkt)(fin, ip, aps, nat); + } else { + if (apr->apr_inpkt != NULL) + err = (*apr->apr_inpkt)(fin, ip, aps, nat); + } + + rv = APR_EXIT(err); + if (rv == -1) + return rv; + + if (tcp != NULL) { + err = appr_fixseqack(fin, ip, aps, APR_INC(err)); +#if SOLARIS && defined(_KERNEL) + tcp->th_sum = fr_tcpsum(fin->fin_qfm, ip, tcp); +#else + tcp->th_sum = fr_tcpsum(*(mb_t **)fin->fin_mp, ip, tcp); +#endif + } + aps->aps_bytes += ip->ip_len; + aps->aps_pkts++; + return 1; + } + return 0; +} + + +aproxy_t *appr_match(pr, name) +u_int pr; +char *name; +{ + aproxy_t *ap; + + for (ap = ap_proxies; ap->apr_p; ap++) + if ((ap->apr_p == pr) && + !strncmp(name, ap->apr_label, sizeof(ap->apr_label))) { + ap->apr_ref++; + return ap; + } + + for (ap = ap_proxylist; ap; ap = ap->apr_next) + if ((ap->apr_p == pr) && + !strncmp(name, ap->apr_label, sizeof(ap->apr_label))) { + ap->apr_ref++; + return ap; + } + return NULL; +} + + +void appr_free(ap) +aproxy_t *ap; +{ + ap->apr_ref--; +} + + +void aps_free(aps) +ap_session_t *aps; +{ + ap_session_t *a, **ap; + + if (!aps) + return; + + for (ap = &ap_sess_list; (a = *ap); ap = &a->aps_next) + if (a == aps) { + *ap = a->aps_next; + break; + } + + if ((aps->aps_data != NULL) && (aps->aps_psiz != 0)) + KFREES(aps->aps_data, aps->aps_psiz); + KFREE(aps); +} + + +static int appr_fixseqack(fin, ip, aps, inc) +fr_info_t *fin; +ip_t *ip; +ap_session_t *aps; +int inc; +{ + int sel, ch = 0, out, nlen; + u_32_t seq1, seq2; + tcphdr_t *tcp; + + tcp = (tcphdr_t *)fin->fin_dp; + out = fin->fin_out; + nlen = ip->ip_len; + nlen -= (ip->ip_hl << 2) + (tcp->th_off << 2); + + if (out != 0) { + seq1 = (u_32_t)ntohl(tcp->th_seq); + sel = aps->aps_sel[out]; + + /* switch to other set ? */ + if ((aps->aps_seqmin[!sel] > aps->aps_seqmin[sel]) && + (seq1 > aps->aps_seqmin[!sel])) + sel = aps->aps_sel[out] = !sel; + + if (aps->aps_seqoff[sel]) { + seq2 = aps->aps_seqmin[sel] - aps->aps_seqoff[sel]; + if (seq1 > seq2) { + seq2 = aps->aps_seqoff[sel]; + seq1 += seq2; + tcp->th_seq = htonl(seq1); + ch = 1; + } + } + + if (inc && (seq1 > aps->aps_seqmin[!sel])) { + aps->aps_seqmin[!sel] = seq1 + nlen - 1; + aps->aps_seqoff[!sel] = aps->aps_seqoff[sel] + inc; + } + + /***/ + + seq1 = ntohl(tcp->th_ack); + sel = aps->aps_sel[1 - out]; + + /* switch to other set ? */ + if ((aps->aps_ackmin[!sel] > aps->aps_ackmin[sel]) && + (seq1 > aps->aps_ackmin[!sel])) + sel = aps->aps_sel[1 - out] = !sel; + + if (aps->aps_ackoff[sel] && (seq1 > aps->aps_ackmin[sel])) { + seq2 = aps->aps_ackoff[sel]; + tcp->th_ack = htonl(seq1 - seq2); + ch = 1; + } + } else { + seq1 = ntohl(tcp->th_seq); + sel = aps->aps_sel[out]; + + /* switch to other set ? */ + if ((aps->aps_ackmin[!sel] > aps->aps_ackmin[sel]) && + (seq1 > aps->aps_ackmin[!sel])) + sel = aps->aps_sel[out] = !sel; + + if (aps->aps_ackoff[sel]) { + seq2 = aps->aps_ackmin[sel] - + aps->aps_ackoff[sel]; + if (seq1 > seq2) { + seq2 = aps->aps_ackoff[sel]; + seq1 += seq2; + tcp->th_seq = htonl(seq1); + ch = 1; + } + } + + if (inc && (seq1 > aps->aps_ackmin[!sel])) { + aps->aps_ackmin[!sel] = seq1 + nlen - 1; + aps->aps_ackoff[!sel] = aps->aps_ackoff[sel] + inc; + } + + /***/ + + seq1 = ntohl(tcp->th_ack); + sel = aps->aps_sel[1 - out]; + + /* switch to other set ? */ + if ((aps->aps_seqmin[!sel] > aps->aps_seqmin[sel]) && + (seq1 > aps->aps_seqmin[!sel])) + sel = aps->aps_sel[1 - out] = !sel; + + if (aps->aps_seqoff[sel] && (seq1 > aps->aps_seqmin[sel])) { + seq2 = aps->aps_seqoff[sel]; + tcp->th_ack = htonl(seq1 - seq2); + ch = 1; + } + } + return ch ? 2 : 0; +} + + +int appr_init() +{ + aproxy_t *ap; + int err = 0; + + for (ap = ap_proxies; ap->apr_p; ap++) { + err = (*ap->apr_init)(); + if (err != 0) + break; + } + return err; +} + + +void appr_unload() +{ + aproxy_t *ap; + + for (ap = ap_proxies; ap->apr_p; ap++) + if (ap->apr_fini) + (*ap->apr_fini)(); + for (ap = ap_proxylist; ap; ap = ap->apr_next) + if (ap->apr_fini) + (*ap->apr_fini)(); +} diff --git a/sys/netinet/ip_proxy.h b/sys/netinet/ip_proxy.h new file mode 100644 index 0000000..f22c709 --- /dev/null +++ b/sys/netinet/ip_proxy.h @@ -0,0 +1,157 @@ +/* + * Copyright (C) 1997-2000 by Darren Reed. + * + * Redistribution and use in source and binary forms are permitted + * provided that this notice is preserved and due credit is given + * to the original author and the contributors. + * + * $Id: ip_proxy.h,v 2.8.2.4 2000/12/02 00:15:03 darrenr Exp $ + * $FreeBSD$ + */ + +#ifndef __IP_PROXY_H__ +#define __IP_PROXY_H__ + +#ifndef SOLARIS +#define SOLARIS (defined(sun) && (defined(__svr4__) || defined(__SVR4))) +#endif + +#ifndef APR_LABELLEN +#define APR_LABELLEN 16 +#endif +#define AP_SESS_SIZE 53 + +struct nat; +struct ipnat; + +typedef struct ap_tcp { + u_short apt_sport; /* source port */ + u_short apt_dport; /* destination port */ + short apt_sel[2]; /* {seq,ack}{off,min} set selector */ + short apt_seqoff[2]; /* sequence # difference */ + tcp_seq apt_seqmin[2]; /* don't change seq-off until after this */ + short apt_ackoff[2]; /* sequence # difference */ + tcp_seq apt_ackmin[2]; /* don't change seq-off until after this */ + u_char apt_state[2]; /* connection state */ +} ap_tcp_t; + +typedef struct ap_udp { + u_short apu_sport; /* source port */ + u_short apu_dport; /* destination port */ +} ap_udp_t; + +typedef struct ap_session { + struct aproxy *aps_apr; + union { + struct ap_tcp apu_tcp; + struct ap_udp apu_udp; + } aps_un; + u_int aps_flags; + U_QUAD_T aps_bytes; /* bytes sent */ + U_QUAD_T aps_pkts; /* packets sent */ + void *aps_nat; /* pointer back to nat struct */ + void *aps_data; /* private data */ + int aps_p; /* protocol */ + int aps_psiz; /* size of private data */ + struct ap_session *aps_hnext; + struct ap_session *aps_next; +} ap_session_t; + +#define aps_sport aps_un.apu_tcp.apt_sport +#define aps_dport aps_un.apu_tcp.apt_dport +#define aps_sel aps_un.apu_tcp.apt_sel +#define aps_seqoff aps_un.apu_tcp.apt_seqoff +#define aps_seqmin aps_un.apu_tcp.apt_seqmin +#define aps_state aps_un.apu_tcp.apt_state +#define aps_ackoff aps_un.apu_tcp.apt_ackoff +#define aps_ackmin aps_un.apu_tcp.apt_ackmin + + +typedef struct aproxy { + struct aproxy *apr_next; + char apr_label[APR_LABELLEN]; /* Proxy label # */ + u_char apr_p; /* protocol */ + int apr_ref; /* +1 per rule referencing it */ + int apr_flags; + int (* apr_init) __P((void)); + void (* apr_fini) __P((void)); + int (* apr_new) __P((fr_info_t *, ip_t *, + ap_session_t *, struct nat *)); + int (* apr_inpkt) __P((fr_info_t *, ip_t *, + ap_session_t *, struct nat *)); + int (* apr_outpkt) __P((fr_info_t *, ip_t *, + ap_session_t *, struct nat *)); +} aproxy_t; + +#define APR_DELETE 1 + +#define APR_ERR(x) (((x) & 0xffff) << 16) +#define APR_EXIT(x) (((x) >> 16) & 0xffff) +#define APR_INC(x) ((x) & 0xffff) + +#define FTP_BUFSZ 160 +/* + * For the ftp proxy. + */ +typedef struct ftpside { + char *ftps_rptr; + char *ftps_wptr; + u_32_t ftps_seq; + u_32_t ftps_len; + int ftps_junk; + char ftps_buf[FTP_BUFSZ]; +} ftpside_t; + +typedef struct ftpinfo { + u_int ftp_passok; + ftpside_t ftp_side[2]; +} ftpinfo_t; + +/* + * Real audio proxy structure and #defines + */ +typedef struct { + int rap_seenpna; + int rap_seenver; + int rap_version; + int rap_eos; /* End Of Startup */ + int rap_gotid; + int rap_gotlen; + int rap_mode; + int rap_sdone; + u_short rap_plport; + u_short rap_prport; + u_short rap_srport; + char rap_svr[19]; + u_32_t rap_sbf; /* flag to indicate which of the 19 bytes have + * been filled + */ + tcp_seq rap_sseq; +} raudio_t; + +#define RA_ID_END 0 +#define RA_ID_UDP 1 +#define RA_ID_ROBUST 7 + +#define RAP_M_UDP 1 +#define RAP_M_ROBUST 2 +#define RAP_M_TCP 4 +#define RAP_M_UDP_ROBUST (RAP_M_UDP|RAP_M_ROBUST) + + +extern ap_session_t *ap_sess_tab[AP_SESS_SIZE]; +extern ap_session_t *ap_sess_list; +extern aproxy_t ap_proxies[]; +extern int ippr_ftp_pasvonly; + +extern int appr_add __P((aproxy_t *)); +extern int appr_del __P((aproxy_t *)); +extern int appr_init __P((void)); +extern void appr_unload __P((void)); +extern int appr_ok __P((ip_t *, tcphdr_t *, struct ipnat *)); +extern void appr_free __P((aproxy_t *)); +extern void aps_free __P((ap_session_t *)); +extern int appr_check __P((ip_t *, fr_info_t *, struct nat *)); +extern aproxy_t *appr_match __P((u_int, char *)); + +#endif /* __IP_PROXY_H__ */ diff --git a/sys/netinet/ip_raudio_pxy.c b/sys/netinet/ip_raudio_pxy.c new file mode 100644 index 0000000..8b2c231 --- /dev/null +++ b/sys/netinet/ip_raudio_pxy.c @@ -0,0 +1,308 @@ +/* + * $FreeBSD$ + */ +#if SOLARIS && defined(_KERNEL) +extern kmutex_t ipf_rw; +#endif + +#define IPF_RAUDIO_PROXY + + +int ippr_raudio_init __P((void)); +int ippr_raudio_new __P((fr_info_t *, ip_t *, ap_session_t *, nat_t *)); +int ippr_raudio_in __P((fr_info_t *, ip_t *, ap_session_t *, nat_t *)); +int ippr_raudio_out __P((fr_info_t *, ip_t *, ap_session_t *, nat_t *)); + +static frentry_t raudiofr; + + +/* + * Real Audio application proxy initialization. + */ +int ippr_raudio_init() +{ + bzero((char *)&raudiofr, sizeof(raudiofr)); + raudiofr.fr_ref = 1; + raudiofr.fr_flags = FR_INQUE|FR_PASS|FR_QUICK|FR_KEEPSTATE; + return 0; +} + + +/* + * Setup for a new proxy to handle Real Audio. + */ +int ippr_raudio_new(fin, ip, aps, nat) +fr_info_t *fin; +ip_t *ip; +ap_session_t *aps; +nat_t *nat; +{ + raudio_t *rap; + + + KMALLOCS(aps->aps_data, void *, sizeof(raudio_t)); + if (aps->aps_data == NULL) + return -1; + + bzero(aps->aps_data, sizeof(raudio_t)); + rap = aps->aps_data; + aps->aps_psiz = sizeof(raudio_t); + rap->rap_mode = RAP_M_TCP; /* default is for TCP */ + return 0; +} + + + +int ippr_raudio_out(fin, ip, aps, nat) +fr_info_t *fin; +ip_t *ip; +ap_session_t *aps; +nat_t *nat; +{ + raudio_t *rap = aps->aps_data; + unsigned char membuf[512 + 1], *s; + u_short id = 0; + tcphdr_t *tcp; + int off, dlen; + int len = 0; + mb_t *m; +#if SOLARIS + mb_t *m1; +#endif + + /* + * If we've already processed the start messages, then nothing left + * for the proxy to do. + */ + if (rap->rap_eos == 1) + return 0; + + tcp = (tcphdr_t *)fin->fin_dp; + off = (ip->ip_hl << 2) + (tcp->th_off << 2); + bzero(membuf, sizeof(membuf)); +#if SOLARIS + m = fin->fin_qfm; + + dlen = msgdsize(m) - off; + if (dlen <= 0) + return 0; + copyout_mblk(m, off, MIN(sizeof(membuf), dlen), (char *)membuf); +#else + m = *(mb_t **)fin->fin_mp; + + dlen = mbufchainlen(m) - off; + if (dlen <= 0) + return 0; + m_copydata(m, off, MIN(sizeof(membuf), dlen), (char *)membuf); +#endif + /* + * In all the startup parsing, ensure that we don't go outside + * the packet buffer boundary. + */ + /* + * Look for the start of connection "PNA" string if not seen yet. + */ + if (rap->rap_seenpna == 0) { + s = (u_char *)memstr("PNA", (char *)membuf, 3, dlen); + if (s == NULL) + return 0; + s += 3; + rap->rap_seenpna = 1; + } else + s = membuf; + + /* + * Directly after the PNA will be the version number of this + * connection. + */ + if (rap->rap_seenpna == 1 && rap->rap_seenver == 0) { + if ((s + 1) - membuf < dlen) { + rap->rap_version = (*s << 8) | *(s + 1); + s += 2; + rap->rap_seenver = 1; + } else + return 0; + } + + /* + * Now that we've been past the PNA and version number, we're into the + * startup messages block. This ends when a message with an ID of 0. + */ + while ((rap->rap_eos == 0) && ((s + 1) - membuf < dlen)) { + if (rap->rap_gotid == 0) { + id = (*s << 8) | *(s + 1); + s += 2; + rap->rap_gotid = 1; + if (id == RA_ID_END) { + rap->rap_eos = 1; + break; + } + } else if (rap->rap_gotlen == 0) { + len = (*s << 8) | *(s + 1); + s += 2; + rap->rap_gotlen = 1; + } + + if (rap->rap_gotid == 1 && rap->rap_gotlen == 1) { + if (id == RA_ID_UDP) { + rap->rap_mode &= ~RAP_M_TCP; + rap->rap_mode |= RAP_M_UDP; + rap->rap_plport = (*s << 8) | *(s + 1); + } else if (id == RA_ID_ROBUST) { + rap->rap_mode |= RAP_M_ROBUST; + rap->rap_prport = (*s << 8) | *(s + 1); + } + s += len; + rap->rap_gotlen = 0; + rap->rap_gotid = 0; + } + } + return 0; +} + + +int ippr_raudio_in(fin, ip, aps, nat) +fr_info_t *fin; +ip_t *ip; +ap_session_t *aps; +nat_t *nat; +{ + unsigned char membuf[IPF_MAXPORTLEN + 1], *s; + tcphdr_t *tcp, tcph, *tcp2 = &tcph; + raudio_t *rap = aps->aps_data; + struct in_addr swa, swb; + int off, dlen, slen; + int a1, a2, a3, a4; + u_short sp, dp; + fr_info_t fi; + tcp_seq seq; + nat_t *ipn; + u_char swp; + mb_t *m; +#if SOLARIS + mb_t *m1; +#endif + + /* + * Wait until we've seen the end of the start messages and even then + * only proceed further if we're using UDP. If they want to use TCP + * then data is sent back on the same channel that is already open. + */ + if (rap->rap_sdone != 0) + return 0; + + tcp = (tcphdr_t *)fin->fin_dp; + off = (ip->ip_hl << 2) + (tcp->th_off << 2); + m = *(mb_t **)fin->fin_mp; + +#if SOLARIS + m = fin->fin_qfm; + + dlen = msgdsize(m) - off; + if (dlen <= 0) + return 0; + bzero(membuf, sizeof(membuf)); + copyout_mblk(m, off, MIN(sizeof(membuf), dlen), (char *)membuf); +#else + dlen = mbufchainlen(m) - off; + if (dlen <= 0) + return 0; + bzero(membuf, sizeof(membuf)); + m_copydata(m, off, MIN(sizeof(membuf), dlen), (char *)membuf); +#endif + + seq = ntohl(tcp->th_seq); + /* + * Check to see if the data in this packet is of interest to us. + * We only care for the first 19 bytes coming back from the server. + */ + if (rap->rap_sseq == 0) { + s = (u_char *)memstr("PNA", (char *)membuf, 3, dlen); + if (s == NULL) + return 0; + a1 = s - membuf; + dlen -= a1; + a1 = 0; + rap->rap_sseq = seq; + a2 = MIN(dlen, sizeof(rap->rap_svr)); + } else if (seq <= rap->rap_sseq + sizeof(rap->rap_svr)) { + /* + * seq # which is the start of data and from that the offset + * into the buffer array. + */ + a1 = seq - rap->rap_sseq; + a2 = MIN(dlen, sizeof(rap->rap_svr)); + a2 -= a1; + s = membuf; + } else + return 0; + + for (a3 = a1, a4 = a2; (a4 > 0) && (a3 < 19) && (a3 >= 0); a4--,a3++) { + rap->rap_sbf |= (1 << a3); + rap->rap_svr[a3] = *s++; + } + + if ((rap->rap_sbf != 0x7ffff) || (!rap->rap_eos)) /* 19 bits */ + return 0; + rap->rap_sdone = 1; + + s = (u_char *)rap->rap_svr + 11; + if (((*s << 8) | *(s + 1)) == RA_ID_ROBUST) { + s += 2; + rap->rap_srport = (*s << 8) | *(s + 1); + } + + swp = ip->ip_p; + swa = ip->ip_src; + swb = ip->ip_dst; + + ip->ip_p = IPPROTO_UDP; + ip->ip_src = nat->nat_inip; + ip->ip_dst = nat->nat_oip; + + bcopy((char *)fin, (char *)&fi, sizeof(fi)); + bzero((char *)tcp2, sizeof(*tcp2)); + tcp2->th_off = 5; + fi.fin_dp = (char *)tcp2; + fi.fin_fr = &raudiofr; + fi.fin_dlen = sizeof(*tcp2); + tcp2->th_win = htons(8192); + slen = ip->ip_len; + ip->ip_len = fin->fin_hlen + sizeof(*tcp); + + if (((rap->rap_mode & RAP_M_UDP_ROBUST) == RAP_M_UDP_ROBUST) && + (rap->rap_srport != 0)) { + dp = rap->rap_srport; + sp = rap->rap_prport; + tcp2->th_sport = htons(sp); + tcp2->th_dport = htons(dp); + fi.fin_data[0] = dp; + fi.fin_data[1] = sp; + ipn = nat_new(nat->nat_ptr, ip, &fi, + IPN_UDP | (sp ? 0 : FI_W_SPORT), NAT_OUTBOUND); + if (ipn != NULL) { + ipn->nat_age = fr_defnatage; + (void) fr_addstate(ip, &fi, sp ? 0 : FI_W_SPORT); + } + } + + if ((rap->rap_mode & RAP_M_UDP) == RAP_M_UDP) { + sp = rap->rap_plport; + tcp2->th_sport = htons(sp); + tcp2->th_dport = 0; /* XXX - don't specify remote port */ + fi.fin_data[0] = sp; + fi.fin_data[1] = 0; + ipn = nat_new(nat->nat_ptr, ip, &fi, IPN_UDP|FI_W_DPORT, + NAT_OUTBOUND); + if (ipn != NULL) { + ipn->nat_age = fr_defnatage; + (void) fr_addstate(ip, &fi, FI_W_DPORT); + } + } + + ip->ip_p = swp; + ip->ip_len = slen; + ip->ip_src = swa; + ip->ip_dst = swb; + return 0; +} diff --git a/sys/netinet/ip_rcmd_pxy.c b/sys/netinet/ip_rcmd_pxy.c new file mode 100644 index 0000000..0ae0210 --- /dev/null +++ b/sys/netinet/ip_rcmd_pxy.c @@ -0,0 +1,174 @@ +/* + * $Id: ip_rcmd_pxy.c,v 1.4.2.4 2000/11/01 14:34:20 darrenr Exp $ + */ +/* + * Simple RCMD transparent proxy for in-kernel use. For use with the NAT + * code. + * $FreeBSD$ + */ +#if SOLARIS && defined(_KERNEL) +extern kmutex_t ipf_rw; +#endif + +#define isdigit(x) ((x) >= '0' && (x) <= '9') + +#define IPF_RCMD_PROXY + + +int ippr_rcmd_init __P((void)); +int ippr_rcmd_new __P((fr_info_t *, ip_t *, ap_session_t *, nat_t *)); +int ippr_rcmd_out __P((fr_info_t *, ip_t *, ap_session_t *, nat_t *)); +u_short ipf_rcmd_atoi __P((char *)); +int ippr_rcmd_portmsg __P((fr_info_t *, ip_t *, ap_session_t *, nat_t *)); + +static frentry_t rcmdfr; + + +/* + * RCMD application proxy initialization. + */ +int ippr_rcmd_init() +{ + bzero((char *)&rcmdfr, sizeof(rcmdfr)); + rcmdfr.fr_ref = 1; + rcmdfr.fr_flags = FR_INQUE|FR_PASS|FR_QUICK|FR_KEEPSTATE; + return 0; +} + + +/* + * Setup for a new RCMD proxy. + */ +int ippr_rcmd_new(fin, ip, aps, nat) +fr_info_t *fin; +ip_t *ip; +ap_session_t *aps; +nat_t *nat; +{ + tcphdr_t *tcp = (tcphdr_t *)fin->fin_dp; + + aps->aps_psiz = sizeof(u_32_t); + KMALLOCS(aps->aps_data, u_32_t *, sizeof(u_32_t)); + if (aps->aps_data == NULL) + return -1; + *(u_32_t *)aps->aps_data = 0; + aps->aps_sport = tcp->th_sport; + aps->aps_dport = tcp->th_dport; + return 0; +} + + +/* + * ipf_rcmd_atoi - implement a simple version of atoi + */ +u_short ipf_rcmd_atoi(ptr) +char *ptr; +{ + register char *s = ptr, c; + register u_short i = 0; + + while ((c = *s++) && isdigit(c)) { + i *= 10; + i += c - '0'; + } + return i; +} + + +int ippr_rcmd_portmsg(fin, ip, aps, nat) +fr_info_t *fin; +ip_t *ip; +ap_session_t *aps; +nat_t *nat; +{ + char portbuf[8], *s; + struct in_addr swip; + u_short sp, dp; + int off, dlen; + tcphdr_t *tcp, tcph, *tcp2 = &tcph; + fr_info_t fi; + nat_t *ipn; + mb_t *m; +#if SOLARIS + mb_t *m1; +#endif + + tcp = (tcphdr_t *)fin->fin_dp; + + if (tcp->th_flags & TH_SYN) { + *(u_32_t *)aps->aps_data = htonl(ntohl(tcp->th_seq) + 1); + return 0; + } + + if ((*(u_32_t *)aps->aps_data != 0) && + (tcp->th_seq != *(u_32_t *)aps->aps_data)) + return 0; + + off = (ip->ip_hl << 2) + (tcp->th_off << 2); + +#if SOLARIS + m = fin->fin_qfm; + + dlen = msgdsize(m) - off; + bzero(portbuf, sizeof(portbuf)); + copyout_mblk(m, off, MIN(sizeof(portbuf), dlen), portbuf); +#else + m = *(mb_t **)fin->fin_mp; + dlen = mbufchainlen(m) - off; + bzero(portbuf, sizeof(portbuf)); + m_copydata(m, off, MIN(sizeof(portbuf), dlen), portbuf); +#endif + + portbuf[sizeof(portbuf) - 1] = '\0'; + s = portbuf; + sp = ipf_rcmd_atoi(s); + if (!sp) + return 0; + + /* + * Add skeleton NAT entry for connection which will come back the + * other way. + */ + sp = htons(sp); + dp = htons(fin->fin_data[1]); + ipn = nat_outlookup(fin->fin_ifp, IPN_TCP, nat->nat_p, nat->nat_inip, + ip->ip_dst, (dp << 16) | sp, 0); + if (ipn == NULL) { + int slen; + + slen = ip->ip_len; + ip->ip_len = fin->fin_hlen + sizeof(*tcp); + bcopy((char *)fin, (char *)&fi, sizeof(fi)); + bzero((char *)tcp2, sizeof(*tcp2)); + tcp2->th_win = htons(8192); + tcp2->th_sport = sp; + tcp2->th_dport = 0; /* XXX - don't specify remote port */ + tcp2->th_off = 5; + fi.fin_data[0] = ntohs(sp); + fi.fin_data[1] = 0; + fi.fin_dp = (char *)tcp2; + fi.fin_dlen = sizeof(*tcp2); + swip = ip->ip_src; + ip->ip_src = nat->nat_inip; + ipn = nat_new(nat->nat_ptr, ip, &fi, IPN_TCP|FI_W_DPORT, + NAT_OUTBOUND); + if (ipn != NULL) { + ipn->nat_age = fr_defnatage; + fi.fin_fr = &rcmdfr; + (void) fr_addstate(ip, &fi, FI_W_DPORT); + } + ip->ip_len = slen; + ip->ip_src = swip; + } + return 0; +} + + +int ippr_rcmd_out(fin, ip, aps, nat) +fr_info_t *fin; +ip_t *ip; +ap_session_t *aps; +nat_t *nat; +{ + return ippr_rcmd_portmsg(fin, ip, aps, nat); +} diff --git a/sys/netinet/ip_state.c b/sys/netinet/ip_state.c new file mode 100644 index 0000000..8cfe62d --- /dev/null +++ b/sys/netinet/ip_state.c @@ -0,0 +1,1901 @@ +/* + * Copyright (C) 1995-2000 by Darren Reed. + * + * Redistribution and use in source and binary forms are permitted + * provided that this notice is preserved and due credit is given + * to the original author and the contributors. + */ +#if !defined(lint) +static const char sccsid[] = "@(#)ip_state.c 1.8 6/5/96 (C) 1993-1995 Darren Reed"; +static const char rcsid[] = "@(#)$FreeBSD$"; +#endif + +#include <sys/errno.h> +#include <sys/types.h> +#include <sys/param.h> +#include <sys/file.h> +#if defined(__NetBSD__) && (NetBSD >= 199905) && !defined(IPFILTER_LKM) && \ + defined(_KERNEL) +# include "opt_ipfilter_log.h" +#endif +#if defined(_KERNEL) && defined(__FreeBSD_version) && \ + (__FreeBSD_version >= 400000) && !defined(KLD_MODULE) +#include "opt_inet6.h" +#endif +#if !defined(_KERNEL) && !defined(KERNEL) && !defined(__KERNEL__) +# include <stdio.h> +# include <stdlib.h> +# include <string.h> +#else +# ifdef linux +# include <linux/kernel.h> +# include <linux/module.h> +# endif +#endif +#if (defined(KERNEL) || defined(_KERNEL)) && (__FreeBSD_version >= 220000) +# include <sys/filio.h> +# include <sys/fcntl.h> +# if (__FreeBSD_version >= 300000) && !defined(IPFILTER_LKM) +# include "opt_ipfilter.h" +# endif +#else +# include <sys/ioctl.h> +#endif +#include <sys/time.h> +#include <sys/uio.h> +#ifndef linux +# include <sys/protosw.h> +#endif +#include <sys/socket.h> +#if (defined(_KERNEL) || defined(KERNEL)) && !defined(linux) +# include <sys/systm.h> +#endif +#if !defined(__SVR4) && !defined(__svr4__) +# ifndef linux +# include <sys/mbuf.h> +# endif +#else +# include <sys/filio.h> +# include <sys/byteorder.h> +# ifdef _KERNEL +# include <sys/dditypes.h> +# endif +# include <sys/stream.h> +# include <sys/kmem.h> +#endif + +#include <net/if.h> +#ifdef sun +# include <net/af.h> +#endif +#include <net/route.h> +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/tcp.h> +#ifndef linux +# include <netinet/ip_var.h> +# include <netinet/tcp_fsm.h> +#endif +#include <netinet/udp.h> +#include <netinet/ip_icmp.h> +#include "netinet/ip_compat.h" +#include <netinet/tcpip.h> +#include "netinet/ip_fil.h" +#include "netinet/ip_nat.h" +#include "netinet/ip_frag.h" +#include "netinet/ip_proxy.h" +#include "netinet/ip_state.h" +#ifdef USE_INET6 +#include <netinet/icmp6.h> +#endif +#if (__FreeBSD_version >= 300000) +# include <sys/malloc.h> +# if (defined(_KERNEL) || defined(KERNEL)) && !defined(IPFILTER_LKM) +# include <sys/libkern.h> +# include <sys/systm.h> +# endif +#endif + +#ifndef MIN +# define MIN(a,b) (((a)<(b))?(a):(b)) +#endif + +#define TCP_CLOSE (TH_FIN|TH_RST) + +static ipstate_t **ips_table = NULL; +static ipstate_t *ips_list = NULL; +static int ips_num = 0; +static int ips_wild = 0; +static ips_stat_t ips_stats; +#if (SOLARIS || defined(__sgi)) && defined(_KERNEL) +extern KRWLOCK_T ipf_state, ipf_mutex; +extern kmutex_t ipf_rw; +#endif + +#ifdef USE_INET6 +static frentry_t *fr_checkicmp6matchingstate __P((ip6_t *, fr_info_t *)); +#endif +static int fr_matchsrcdst __P((ipstate_t *, union i6addr, union i6addr, + fr_info_t *, tcphdr_t *)); +static frentry_t *fr_checkicmpmatchingstate __P((ip_t *, fr_info_t *)); +static int fr_matchicmpqueryreply __P((int, ipstate_t *, icmphdr_t *)); +static int fr_state_flush __P((int)); +static ips_stat_t *fr_statetstats __P((void)); +static void fr_delstate __P((ipstate_t *)); +static int fr_state_remove __P((caddr_t)); +static void fr_ipsmove __P((ipstate_t **, ipstate_t *, u_int)); +int fr_stputent __P((caddr_t)); +int fr_stgetent __P((caddr_t)); +void fr_stinsert __P((ipstate_t *)); + + +#define FIVE_DAYS (2 * 5 * 86400) /* 5 days: half closed session */ + +#define TCP_MSL 240 /* 2 minutes */ +u_long fr_tcpidletimeout = FIVE_DAYS, + fr_tcpclosewait = 2 * TCP_MSL, + fr_tcplastack = 2 * TCP_MSL, + fr_tcptimeout = 2 * TCP_MSL, + fr_tcpclosed = 120, + fr_tcphalfclosed = 2 * 2 * 3600, /* 2 hours */ + fr_udptimeout = 240, + fr_icmptimeout = 120; +int fr_statemax = IPSTATE_MAX, + fr_statesize = IPSTATE_SIZE; +int fr_state_doflush = 0, + fr_state_lock = 0; + +static int icmpreplytype4[ICMP_MAXTYPE + 1]; + +int fr_stateinit() +{ + int i; + + KMALLOCS(ips_table, ipstate_t **, fr_statesize * sizeof(ipstate_t *)); + if (ips_table != NULL) + bzero((char *)ips_table, fr_statesize * sizeof(ipstate_t *)); + else + return -1; + + /* fill icmp reply type table */ + for (i = 0; i <= ICMP_MAXTYPE; i++) + icmpreplytype4[i] = -1; + icmpreplytype4[ICMP_ECHO] = ICMP_ECHOREPLY; + icmpreplytype4[ICMP_TSTAMP] = ICMP_TSTAMPREPLY; + icmpreplytype4[ICMP_IREQ] = ICMP_IREQREPLY; + icmpreplytype4[ICMP_MASKREQ] = ICMP_MASKREPLY; + + return 0; +} + + +static ips_stat_t *fr_statetstats() +{ + ips_stats.iss_active = ips_num; + ips_stats.iss_table = ips_table; + ips_stats.iss_list = ips_list; + return &ips_stats; +} + + +/* + * flush state tables. two actions currently defined: + * which == 0 : flush all state table entries + * which == 1 : flush TCP connections which have started to close but are + * stuck for some reason. + */ +static int fr_state_flush(which) +int which; +{ + register ipstate_t *is, **isp; +#if defined(_KERNEL) && !SOLARIS + int s; +#endif + int delete, removed = 0; + + SPL_NET(s); + for (isp = &ips_list; (is = *isp); ) { + delete = 0; + + switch (which) + { + case 0 : + delete = 1; + break; + case 1 : + if (is->is_p != IPPROTO_TCP) + break; + if ((is->is_state[0] != TCPS_ESTABLISHED) || + (is->is_state[1] != TCPS_ESTABLISHED)) + delete = 1; + break; + } + + if (delete) { + if (is->is_p == IPPROTO_TCP) + ips_stats.iss_fin++; + else + ips_stats.iss_expire++; +#ifdef IPFILTER_LOG + ipstate_log(is, ISL_FLUSH); +#endif + fr_delstate(is); + removed++; + } else + isp = &is->is_next; + } + SPL_X(s); + return removed; +} + + +static int fr_state_remove(data) +caddr_t data; +{ + ipstate_t *sp, st; + int error; + + sp = &st; + error = IRCOPYPTR(data, (caddr_t)&st, sizeof(st)); + if (error) + return EFAULT; + + for (sp = ips_list; sp; sp = sp->is_next) + if ((sp->is_p == st.is_p) && (sp->is_v == st.is_v) && + !bcmp((char *)&sp->is_src, (char *)&st.is_src, + sizeof(st.is_src)) && + !bcmp((char *)&sp->is_dst, (char *)&st.is_src, + sizeof(st.is_dst)) && + !bcmp((char *)&sp->is_ps, (char *)&st.is_ps, + sizeof(st.is_ps))) { + WRITE_ENTER(&ipf_state); +#ifdef IPFILTER_LOG + ipstate_log(sp, ISL_REMOVE); +#endif + fr_delstate(sp); + RWLOCK_EXIT(&ipf_state); + return 0; + } + return ESRCH; +} + + +int fr_state_ioctl(data, cmd, mode) +caddr_t data; +#if defined(__NetBSD__) || defined(__OpenBSD__) +u_long cmd; +#else +int cmd; +#endif +int mode; +{ + int arg, ret, error = 0; + + switch (cmd) + { + case SIOCDELST : + error = fr_state_remove(data); + break; + case SIOCIPFFL : + error = IRCOPY(data, (caddr_t)&arg, sizeof(arg)); + if (error) + break; + if (arg == 0 || arg == 1) { + WRITE_ENTER(&ipf_state); + ret = fr_state_flush(arg); + RWLOCK_EXIT(&ipf_state); + error = IWCOPY((caddr_t)&ret, data, sizeof(ret)); + } else + error = EINVAL; + break; +#ifdef IPFILTER_LOG + case SIOCIPFFB : + if (!(mode & FWRITE)) + error = EPERM; + else { + int tmp; + + tmp = ipflog_clear(IPL_LOGSTATE); + IWCOPY((char *)&tmp, data, sizeof(tmp)); + } + break; +#endif + case SIOCGETFS : + error = IWCOPYPTR((caddr_t)fr_statetstats(), data, + sizeof(ips_stat_t)); + break; + case FIONREAD : +#ifdef IPFILTER_LOG + arg = (int)iplused[IPL_LOGSTATE]; + error = IWCOPY((caddr_t)&arg, (caddr_t)data, sizeof(arg)); +#endif + break; + case SIOCSTLCK : + error = fr_lock(data, &fr_state_lock); + break; + case SIOCSTPUT : + if (!fr_state_lock) { + error = EACCES; + break; + } + error = fr_stputent(data); + break; + case SIOCSTGET : + if (!fr_state_lock) { + error = EACCES; + break; + } + error = fr_stgetent(data); + break; + default : + error = EINVAL; + break; + } + return error; +} + + +int fr_stgetent(data) +caddr_t data; +{ + register ipstate_t *is, *isn; + ipstate_save_t ips, *ipsp; + int error; + + error = IRCOPY(data, (caddr_t)&ipsp, sizeof(ipsp)); + if (error) + return EFAULT; + error = IRCOPY((caddr_t)ipsp, (caddr_t)&ips, sizeof(ips)); + if (error) + return EFAULT; + + isn = ips.ips_next; + if (!isn) { + isn = ips_list; + if (isn == NULL) { + if (ips.ips_next == NULL) + return ENOENT; + return 0; + } + } else { + /* + * Make sure the pointer we're copying from exists in the + * current list of entries. Security precaution to prevent + * copying of random kernel data. + */ + for (is = ips_list; is; is = is->is_next) + if (is == isn) + break; + if (!is) + return ESRCH; + } + ips.ips_next = isn->is_next; + bcopy((char *)isn, (char *)&ips.ips_is, sizeof(ips.ips_is)); + if (isn->is_rule) + bcopy((char *)isn->is_rule, (char *)&ips.ips_fr, + sizeof(ips.ips_fr)); + error = IWCOPY((caddr_t)&ips, ipsp, sizeof(ips)); + if (error) + error = EFAULT; + return error; +} + + +int fr_stputent(data) +caddr_t data; +{ + register ipstate_t *is, *isn; + ipstate_save_t ips, *ipsp; + int error, out; + frentry_t *fr; + + error = IRCOPY(data, (caddr_t)&ipsp, sizeof(ipsp)); + if (error) + return EFAULT; + error = IRCOPY((caddr_t)ipsp, (caddr_t)&ips, sizeof(ips)); + if (error) + return EFAULT; + + KMALLOC(isn, ipstate_t *); + if (isn == NULL) + return ENOMEM; + + bcopy((char *)&ips.ips_is, (char *)isn, sizeof(*isn)); + fr = isn->is_rule; + if (fr != NULL) { + if (isn->is_flags & FI_NEWFR) { + KMALLOC(fr, frentry_t *); + if (fr == NULL) { + KFREE(isn); + return ENOMEM; + } + bcopy((char *)&ips.ips_fr, (char *)fr, sizeof(*fr)); + out = fr->fr_flags & FR_OUTQUE ? 1 : 0; + isn->is_rule = fr; + ips.ips_is.is_rule = fr; + if (*fr->fr_ifname) { + fr->fr_ifa = GETUNIT(fr->fr_ifname, fr->fr_v); + if (fr->fr_ifa == NULL) + fr->fr_ifa = (void *)-1; +#ifdef _KERNEL + else { + strncpy(isn->is_ifname[out], + IFNAME(fr->fr_ifa), IFNAMSIZ); + isn->is_ifp[out] = fr->fr_ifa; + } +#endif + } else + fr->fr_ifa = NULL; + /* + * send a copy back to userland of what we ended up + * to allow for verification. + */ + error = IWCOPY((caddr_t)&ips, ipsp, sizeof(ips)); + if (error) { + KFREE(isn); + KFREE(fr); + return EFAULT; + } + } else { + for (is = ips_list; is; is = is->is_next) + if (is->is_rule == fr) + break; + if (!is) { + KFREE(isn); + return ESRCH; + } + } + } + fr_stinsert(isn); + return 0; +} + + +void fr_stinsert(is) +register ipstate_t *is; +{ + register u_int hv = is->is_hv; + + MUTEX_INIT(&is->is_lock, "ipf state entry", NULL); + + is->is_ifname[0][sizeof(is->is_ifname[0]) - 1] = '\0'; + if (is->is_ifname[0][0] != '\0') { + is->is_ifp[0] = GETUNIT(is->is_ifname[0], is->is_v); + } + is->is_ifname[1][sizeof(is->is_ifname[0]) - 1] = '\0'; + if (is->is_ifname[1][0] != '\0') { + is->is_ifp[1] = GETUNIT(is->is_ifname[1], is->is_v); + } + + /* + * add into list table. + */ + if (ips_list) + ips_list->is_pnext = &is->is_next; + is->is_pnext = &ips_list; + is->is_next = ips_list; + ips_list = is; + if (ips_table[hv]) + ips_table[hv]->is_phnext = &is->is_hnext; + else + ips_stats.iss_inuse++; + is->is_phnext = ips_table + hv; + is->is_hnext = ips_table[hv]; + ips_table[hv] = is; + ips_num++; +} + + +/* + * Create a new ipstate structure and hang it off the hash table. + */ +ipstate_t *fr_addstate(ip, fin, flags) +ip_t *ip; +fr_info_t *fin; +u_int flags; +{ + register tcphdr_t *tcp = NULL; + register ipstate_t *is; + register u_int hv; + ipstate_t ips; + u_int pass; + int out; + + if (fr_state_lock || (fin->fin_off & IP_OFFMASK) || + (fin->fin_fi.fi_fl & FI_SHORT)) + return NULL; + if (ips_num == fr_statemax) { + ips_stats.iss_max++; + fr_state_doflush = 1; + return NULL; + } + out = fin->fin_out; + is = &ips; + bzero((char *)is, sizeof(*is)); + ips.is_age = 1; + ips.is_state[0] = 0; + ips.is_state[1] = 0; + /* + * Copy and calculate... + */ + hv = (is->is_p = fin->fin_fi.fi_p); + is->is_src = fin->fin_fi.fi_src; + hv += is->is_saddr; + is->is_dst = fin->fin_fi.fi_dst; + hv += is->is_daddr; +#ifdef USE_INET6 + if (fin->fin_v == 6) { + if (is->is_p == IPPROTO_ICMPV6) { + if (IN6_IS_ADDR_MULTICAST(&is->is_dst.in6)) + flags |= FI_W_DADDR; + if (out) + hv -= is->is_daddr; + else + hv -= is->is_saddr; + } + } +#endif + + switch (is->is_p) + { +#ifdef USE_INET6 + case IPPROTO_ICMPV6 : +#endif + case IPPROTO_ICMP : + { + struct icmp *ic = (struct icmp *)fin->fin_dp; + +#ifdef USE_INET6 + if ((is->is_p == IPPROTO_ICMPV6) && + ((ic->icmp_type & ICMP6_INFOMSG_MASK) == 0)) + return NULL; +#endif + switch (ic->icmp_type) + { +#ifdef USE_INET6 + case ICMP6_ECHO_REQUEST : + is->is_icmp.ics_type = ICMP6_ECHO_REPLY; + hv += (is->is_icmp.ics_id = ic->icmp_id); + hv += (is->is_icmp.ics_seq = ic->icmp_seq); + break; + case ICMP6_MEMBERSHIP_QUERY : + case ND_ROUTER_SOLICIT : + case ND_NEIGHBOR_SOLICIT : + is->is_icmp.ics_type = ic->icmp_type + 1; + break; +#endif + case ICMP_ECHO : + case ICMP_TSTAMP : + case ICMP_IREQ : + case ICMP_MASKREQ : + is->is_icmp.ics_type = ic->icmp_type; + hv += (is->is_icmp.ics_id = ic->icmp_id); + hv += (is->is_icmp.ics_seq = ic->icmp_seq); + break; + default : + return NULL; + } + ATOMIC_INCL(ips_stats.iss_icmp); + is->is_age = fr_icmptimeout; + break; + } + case IPPROTO_TCP : + { + tcp = (tcphdr_t *)fin->fin_dp; + + if (tcp->th_flags & TH_RST) + return NULL; + /* + * The endian of the ports doesn't matter, but the ack and + * sequence numbers do as we do mathematics on them later. + */ + is->is_dport = tcp->th_dport; + is->is_sport = tcp->th_sport; + if ((flags & (FI_W_DPORT|FI_W_SPORT)) == 0) { + hv += tcp->th_dport; + hv += tcp->th_sport; + } + is->is_send = ntohl(tcp->th_seq) + fin->fin_dlen - + (tcp->th_off << 2) + + ((tcp->th_flags & TH_SYN) ? 1 : 0) + + ((tcp->th_flags & TH_FIN) ? 1 : 0); + is->is_maxsend = is->is_send; + is->is_dend = 0; + is->is_maxdwin = 1; + is->is_maxswin = ntohs(tcp->th_win); + if (is->is_maxswin == 0) + is->is_maxswin = 1; + /* + * If we're creating state for a starting connection, start the + * timer on it as we'll never see an error if it fails to + * connect. + */ + ATOMIC_INCL(ips_stats.iss_tcp); + break; + } + case IPPROTO_UDP : + { + tcp = (tcphdr_t *)fin->fin_dp; + + is->is_dport = tcp->th_dport; + is->is_sport = tcp->th_sport; + if ((flags & (FI_W_DPORT|FI_W_SPORT)) == 0) { + hv += tcp->th_dport; + hv += tcp->th_sport; + } + ATOMIC_INCL(ips_stats.iss_udp); + is->is_age = fr_udptimeout; + break; + } + default : + return NULL; + } + + KMALLOC(is, ipstate_t *); + if (is == NULL) { + ATOMIC_INCL(ips_stats.iss_nomem); + return NULL; + } + bcopy((char *)&ips, (char *)is, sizeof(*is)); + hv %= fr_statesize; + is->is_hv = hv; + is->is_rule = fin->fin_fr; + if (is->is_rule != NULL) { + ATOMIC_INC32(is->is_rule->fr_ref); + pass = is->is_rule->fr_flags; + } else + pass = fr_flags; + WRITE_ENTER(&ipf_state); + + is->is_pass = pass; + is->is_pkts = 1; + is->is_bytes = fin->fin_dlen + fin->fin_hlen; + /* + * We want to check everything that is a property of this packet, + * but we don't (automatically) care about it's fragment status as + * this may change. + */ + is->is_v = fin->fin_fi.fi_v; + is->is_opt = fin->fin_fi.fi_optmsk; + is->is_optmsk = 0xffffffff; + is->is_sec = fin->fin_fi.fi_secmsk; + is->is_secmsk = 0xffff; + is->is_auth = fin->fin_fi.fi_auth; + is->is_authmsk = 0xffff; + is->is_flags = fin->fin_fi.fi_fl & FI_CMP; + is->is_flags |= FI_CMP << 4; + is->is_flags |= flags & (FI_WILDP|FI_WILDA); + if (flags & (FI_WILDP|FI_WILDA)) + ips_wild++; + is->is_ifp[1 - out] = NULL; + is->is_ifp[out] = fin->fin_ifp; +#ifdef _KERNEL + strncpy(is->is_ifname[out], IFNAME(fin->fin_ifp), IFNAMSIZ); +#endif + is->is_ifname[1 - out][0] = '\0'; + if (pass & FR_LOGFIRST) + is->is_pass &= ~(FR_LOGFIRST|FR_LOG); + fr_stinsert(is); + if (is->is_p == IPPROTO_TCP) { + MUTEX_ENTER(&is->is_lock); + fr_tcp_age(&is->is_age, is->is_state, fin, + 0); /* 0 = packet from the source */ + MUTEX_EXIT(&is->is_lock); + } +#ifdef IPFILTER_LOG + ipstate_log(is, ISL_NEW); +#endif + RWLOCK_EXIT(&ipf_state); + fin->fin_rev = IP6NEQ(is->is_dst, fin->fin_fi.fi_dst); + if ((fin->fin_fi.fi_fl & FI_FRAG) && (pass & FR_KEEPFRAG)) + ipfr_newfrag(ip, fin, pass ^ FR_KEEPSTATE); + return is; +} + + + +/* + * check to see if a packet with TCP headers fits within the TCP window. + * change timeout depending on whether new packet is a SYN-ACK returning for a + * SYN or a RST or FIN which indicate time to close up shop. + */ +int fr_tcpstate(is, fin, ip, tcp) +register ipstate_t *is; +fr_info_t *fin; +ip_t *ip; +tcphdr_t *tcp; +{ + register tcp_seq seq, ack, end; + register int ackskew; + tcpdata_t *fdata, *tdata; + u_short win, maxwin; + int ret = 0; + int source; + + /* + * Find difference between last checked packet and this packet. + */ + source = IP6EQ(fin->fin_fi.fi_src, is->is_src); + fdata = &is->is_tcp.ts_data[!source]; + tdata = &is->is_tcp.ts_data[source]; + seq = ntohl(tcp->th_seq); + ack = ntohl(tcp->th_ack); + win = ntohs(tcp->th_win); + end = seq + fin->fin_dlen - (tcp->th_off << 2) + + ((tcp->th_flags & TH_SYN) ? 1 : 0) + + ((tcp->th_flags & TH_FIN) ? 1 : 0); + + MUTEX_ENTER(&is->is_lock); + if (fdata->td_end == 0) { + /* + * Must be a (outgoing) SYN-ACK in reply to a SYN. + */ + fdata->td_end = end; + fdata->td_maxwin = 1; + fdata->td_maxend = end + 1; + } + + if (!(tcp->th_flags & TH_ACK)) { /* Pretend an ack was sent */ + ack = tdata->td_end; + } else if (((tcp->th_flags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) && + (ack == 0)) { + /* gross hack to get around certain broken tcp stacks */ + ack = tdata->td_end; + } + + if (seq == end) + seq = end = fdata->td_end; + + maxwin = tdata->td_maxwin; + ackskew = tdata->td_end - ack; + +#define SEQ_GE(a,b) ((int)((a) - (b)) >= 0) +#define SEQ_GT(a,b) ((int)((a) - (b)) > 0) + if ((SEQ_GE(fdata->td_maxend, end)) && + (SEQ_GE(seq, fdata->td_end - maxwin)) && +/* XXX what about big packets */ +#define MAXACKWINDOW 66000 + (ackskew >= -MAXACKWINDOW) && + (ackskew <= MAXACKWINDOW)) { + /* if ackskew < 0 then this should be due to fragented + * packets. There is no way to know the length of the + * total packet in advance. + * We do know the total length from the fragment cache though. + * Note however that there might be more sessions with + * exactly the same source and destination paramters in the + * state cache (and source and destination is the only stuff + * that is saved in the fragment cache). Note further that + * some TCP connections in the state cache are hashed with + * sport and dport as well which makes it not worthwhile to + * look for them. + * Thus, when ackskew is negative but still seems to belong + * to this session, we bump up the destinations end value. + */ + if (ackskew < 0) + tdata->td_end = ack; + + /* update max window seen */ + if (fdata->td_maxwin < win) + fdata->td_maxwin = win; + if (SEQ_GT(end, fdata->td_end)) + fdata->td_end = end; + if (SEQ_GE(ack + win, tdata->td_maxend)) { + tdata->td_maxend = ack + win; + if (win == 0) + tdata->td_maxend++; + } + + ATOMIC_INCL(ips_stats.iss_hits); + /* + * Nearing end of connection, start timeout. + */ + /* source ? 0 : 1 -> !source */ + fr_tcp_age(&is->is_age, is->is_state, fin, !source); + ret = 1; + } + MUTEX_EXIT(&is->is_lock); + return ret; +} + + +static int fr_matchsrcdst(is, src, dst, fin, tcp) +ipstate_t *is; +union i6addr src, dst; +fr_info_t *fin; +tcphdr_t *tcp; +{ + int ret = 0, rev, out, flags; + u_short sp, dp; + void *ifp; + + rev = fin->fin_rev = IP6NEQ(is->is_dst, dst); + ifp = fin->fin_ifp; + out = fin->fin_out; + + if (tcp != NULL) { + flags = is->is_flags; + sp = tcp->th_sport; + dp = tcp->th_dport; + } else { + flags = is->is_flags & FI_WILDA; + sp = 0; + dp = 0; + } + + if (rev == 0) { + if (!out) { + if (is->is_ifpin == NULL || is->is_ifpin == ifp) + ret = 1; + } else { + if (is->is_ifpout == NULL || is->is_ifpout == ifp) + ret = 1; + } + } else { + if (out) { + if (is->is_ifpin == NULL || is->is_ifpin == ifp) + ret = 1; + } else { + if (is->is_ifpout == NULL || is->is_ifpout == ifp) + ret = 1; + } + } + if (ret == 0) + return 0; + ret = 0; + + if (rev == 0) { + if ( + (IP6EQ(is->is_dst, dst) || (flags & FI_W_DADDR)) && + (IP6EQ(is->is_src, src) || (flags & FI_W_SADDR)) && + (!tcp || ((sp == is->is_sport || flags & FI_W_SPORT) && + (dp == is->is_dport || flags & FI_W_DPORT)))) { + ret = 1; + } + } else { + if ( + (IP6EQ(is->is_dst, src) || (flags & FI_W_DADDR)) && + (IP6EQ(is->is_src, dst) || (flags & FI_W_SADDR)) && + (!tcp || ((sp == is->is_dport || flags & FI_W_DPORT) && + (dp == is->is_sport || flags & FI_W_SPORT)))) { + ret = 1; + } + } + if (ret == 0) + return 0; + + /* + * Whether or not this should be here, is questionable, but the aim + * is to get this out of the main line. + */ + if (tcp == NULL) + flags = is->is_flags & (FI_CMP|(FI_CMP<<4)); + + if (((fin->fin_fi.fi_fl & (flags >> 4)) != (flags & FI_CMP)) || + ((fin->fin_fi.fi_optmsk & is->is_optmsk) != is->is_opt) || + ((fin->fin_fi.fi_secmsk & is->is_secmsk) != is->is_sec) || + ((fin->fin_fi.fi_auth & is->is_authmsk) != is->is_auth)) + return 0; + + if ((flags & (FI_W_SPORT|FI_W_DPORT))) { + if ((flags & FI_W_SPORT) != 0) { + if (rev == 0) { + is->is_sport = sp; + is->is_send = htonl(tcp->th_seq); + } else { + is->is_sport = dp; + is->is_send = htonl(tcp->th_ack); + } + is->is_maxsend = is->is_send + 1; + } else if ((flags & FI_W_DPORT) != 0) { + if (rev == 0) { + is->is_dport = dp; + is->is_dend = htonl(tcp->th_ack); + } else { + is->is_dport = sp; + is->is_dend = htonl(tcp->th_seq); + } + is->is_maxdend = is->is_dend + 1; + } + is->is_flags &= ~(FI_W_SPORT|FI_W_DPORT); + ips_wild--; + } + + ret = -1; + + if (!rev) { + if (out) { + if (!is->is_ifpout) + ret = 1; + } else { + if (!is->is_ifpin) + ret = 0; + } + } else { + if (out) { + if (!is->is_ifpin) + ret = 0; + } else { + if (!is->is_ifpout) + ret = 1; + } + } + + if (ret >= 0) { + is->is_ifp[ret] = ifp; +#ifdef _KERNEL + strncpy(is->is_ifname[out], IFNAME(fin->fin_ifp), + sizeof(is->is_ifname[1])); +#endif + } +#ifdef _KERNEL + if (ret >= 0) { + strncpy(is->is_ifname[out], IFNAME(fin->fin_ifp), + sizeof(is->is_ifname[1])); + } +#endif + return 1; +} + +static int fr_matchicmpqueryreply(v, is, icmp) +int v; +ipstate_t *is; +icmphdr_t *icmp; +{ + if (v == 4) { + /* + * If we matched its type on the way in, then when going out + * it will still be the same type. + */ + if (((icmp->icmp_type == is->is_type) || + (icmpreplytype4[is->is_type] == icmp->icmp_type)) && + (icmp->icmp_id == is->is_icmp.ics_id) && + (icmp->icmp_seq == is->is_icmp.ics_seq)) { + return 1; + }; + } +#ifdef USE_INET6 + else if (is->is_v == 6) { + if ((is->is_type == ICMP6_ECHO_REPLY) && + (icmp->icmp_type == ICMP6_ECHO_REQUEST) && + (icmp->icmp_id == is->is_icmp.ics_id) && + (icmp->icmp_seq == is->is_icmp.ics_seq)) { + return 1; + }; + } +#endif + return 0; +} + +static frentry_t *fr_checkicmpmatchingstate(ip, fin) +ip_t *ip; +fr_info_t *fin; +{ + register ipstate_t *is, **isp; + register u_short sport, dport; + register u_char pr; + union i6addr dst, src; + struct icmp *ic; + u_short savelen; + icmphdr_t *icmp; + fr_info_t ofin; + int type, len; + tcphdr_t *tcp; + frentry_t *fr; + ip_t *oip; + u_int hv; + + /* + * Does it at least have the return (basic) IP header ? + * Only a basic IP header (no options) should be with + * an ICMP error header. + */ + if (((ip->ip_v != 4) || (ip->ip_hl != 5)) || + (fin->fin_plen < ICMPERR_MINPKTLEN)) + return NULL; + ic = (struct icmp *)fin->fin_dp; + type = ic->icmp_type; + /* + * If it's not an error type, then return + */ + if ((type != ICMP_UNREACH) && (type != ICMP_SOURCEQUENCH) && + (type != ICMP_REDIRECT) && (type != ICMP_TIMXCEED) && + (type != ICMP_PARAMPROB)) + return NULL; + + oip = (ip_t *)((char *)ic + ICMPERR_ICMPHLEN); + if (fin->fin_plen < ICMPERR_MAXPKTLEN + ((oip->ip_hl - 5) << 2)) + return NULL; + + /* + * Sanity checks. + */ + len = fin->fin_dlen - ICMPERR_ICMPHLEN; + if ((len <= 0) || ((oip->ip_hl << 2) > len)) + return NULL; + + /* + * Is the buffer big enough for all of it ? It's the size of the IP + * header claimed in the encapsulated part which is of concern. It + * may be too big to be in this buffer but not so big that it's + * outside the ICMP packet, leading to TCP deref's causing problems. + * This is possible because we don't know how big oip_hl is when we + * do the pullup early in fr_check() and thus can't gaurantee it is + * all here now. + */ +#ifdef _KERNEL + { + mb_t *m; + +# if SOLARIS + m = fin->fin_qfm; + if ((char *)oip + len > (char *)m->b_wptr) + return NULL; +# else + m = *(mb_t **)fin->fin_mp; + if ((char *)oip + len > (char *)ip + m->m_len) + return NULL; +# endif + } +#endif + + /* + * in the IPv4 case we must zero the i6addr union otherwise + * the IP6EQ and IP6NEQ macros produce the wrong results because + * of the 'junk' in the unused part of the union + */ + bzero((char *)&src, sizeof(src)); + bzero((char *)&dst, sizeof(dst)); + + if (oip->ip_p == IPPROTO_ICMP) { + icmp = (icmphdr_t *)((char *)oip + (oip->ip_hl << 2)); + + /* + * a ICMP error can only be generated as a result of an + * ICMP query, not as the response on an ICMP error + * + * XXX theoretically ICMP_ECHOREP and the other reply's are + * ICMP query's as well, but adding them here seems strange XXX + */ + if ((icmp->icmp_type != ICMP_ECHO) && + (icmp->icmp_type != ICMP_TSTAMP) && + (icmp->icmp_type != ICMP_IREQ) && + (icmp->icmp_type != ICMP_MASKREQ)) + return NULL; + + /* + * perform a lookup of the ICMP packet in the state table + */ + hv = (pr = oip->ip_p); + src.in4 = oip->ip_src; + hv += src.in4.s_addr; + dst.in4 = oip->ip_dst; + hv += dst.in4.s_addr; + hv += icmp->icmp_id; + hv += icmp->icmp_seq; + hv %= fr_statesize; + + savelen = oip->ip_len; + oip->ip_len = len; + ofin.fin_v = 4; + fr_makefrip(oip->ip_hl << 2, oip, &ofin); + oip->ip_len = savelen; + ofin.fin_ifp = fin->fin_ifp; + ofin.fin_out = !fin->fin_out; + ofin.fin_mp = NULL; /* if dereferenced, panic XXX */ + + READ_ENTER(&ipf_state); + for (isp = &ips_table[hv]; (is = *isp); isp = &is->is_hnext) + if ((is->is_p == pr) && (is->is_v == 4) && + fr_matchsrcdst(is, src, dst, &ofin, NULL) && + fr_matchicmpqueryreply(is->is_v, is, icmp)) { + ips_stats.iss_hits++; + is->is_pkts++; + is->is_bytes += ip->ip_len; + fr = is->is_rule; + RWLOCK_EXIT(&ipf_state); + return fr; + } + RWLOCK_EXIT(&ipf_state); + return NULL; + }; + + if ((oip->ip_p != IPPROTO_TCP) && (oip->ip_p != IPPROTO_UDP)) + return NULL; + + tcp = (tcphdr_t *)((char *)oip + (oip->ip_hl << 2)); + dport = tcp->th_dport; + sport = tcp->th_sport; + + hv = (pr = oip->ip_p); + src.in4 = oip->ip_src; + hv += src.in4.s_addr; + dst.in4 = oip->ip_dst; + hv += dst.in4.s_addr; + hv += dport; + hv += sport; + hv %= fr_statesize; + /* + * we make an fin entry to be able to feed it to + * matchsrcdst note that not all fields are encessary + * but this is the cleanest way. Note further we fill + * in fin_mp such that if someone uses it we'll get + * a kernel panic. fr_matchsrcdst does not use this. + * + * watch out here, as ip is in host order and oip in network + * order. Any change we make must be undone afterwards. + */ + savelen = oip->ip_len; + oip->ip_len = len; + ofin.fin_v = 4; + fr_makefrip(oip->ip_hl << 2, oip, &ofin); + oip->ip_len = savelen; + ofin.fin_ifp = fin->fin_ifp; + ofin.fin_out = !fin->fin_out; + ofin.fin_mp = NULL; /* if dereferenced, panic XXX */ + READ_ENTER(&ipf_state); + for (isp = &ips_table[hv]; (is = *isp); isp = &is->is_hnext) { + /* + * Only allow this icmp though if the + * encapsulated packet was allowed through the + * other way around. Note that the minimal amount + * of info present does not allow for checking against + * tcp internals such as seq and ack numbers. + */ + if ((is->is_p == pr) && (is->is_v == 4) && + fr_matchsrcdst(is, src, dst, &ofin, tcp)) { + fr = is->is_rule; + ips_stats.iss_hits++; + is->is_pkts++; + is->is_bytes += fin->fin_plen; + /* + * we deliberately do not touch the timeouts + * for the accompanying state table entry. + * It remains to be seen if that is correct. XXX + */ + RWLOCK_EXIT(&ipf_state); + return fr; + } + } + RWLOCK_EXIT(&ipf_state); + return NULL; +} + + +static void fr_ipsmove(isp, is, hv) +ipstate_t **isp, *is; +u_int hv; +{ + u_int hvm; + + hvm = is->is_hv; + /* + * Remove the hash from the old location... + */ + if (is->is_hnext) + is->is_hnext->is_phnext = isp; + *isp = is->is_hnext; + if (ips_table[hvm] == NULL) + ips_stats.iss_inuse--; + + /* + * ...and put the hash in the new one. + */ + hvm = hv % fr_statesize; + is->is_hv = hvm; + isp = &ips_table[hvm]; + if (*isp) + (*isp)->is_phnext = &is->is_hnext; + else + ips_stats.iss_inuse++; + is->is_phnext = isp; + is->is_hnext = *isp; + *isp = is; +} + + +/* + * Check if a packet has a registered state. + */ +frentry_t *fr_checkstate(ip, fin) +ip_t *ip; +fr_info_t *fin; +{ + union i6addr dst, src; + register ipstate_t *is, **isp; + register u_char pr; + u_int hv, hvm, hlen, tryagain, pass, v; + struct icmp *ic; + frentry_t *fr; + tcphdr_t *tcp; + + if (fr_state_lock || (fin->fin_off & IP_OFFMASK) || + (fin->fin_fi.fi_fl & FI_SHORT)) + return NULL; + + is = NULL; + hlen = fin->fin_hlen; + tcp = (tcphdr_t *)((char *)ip + hlen); + ic = (struct icmp *)tcp; + hv = (pr = fin->fin_fi.fi_p); + src = fin->fin_fi.fi_src; + dst = fin->fin_fi.fi_dst; + hv += src.in4.s_addr; + hv += dst.in4.s_addr; + + /* + * Search the hash table for matching packet header info. + */ + v = fin->fin_fi.fi_v; + switch (fin->fin_fi.fi_p) + { +#ifdef USE_INET6 + case IPPROTO_ICMPV6 : + if (v == 6) { + if (fin->fin_out) + hv -= dst.in4.s_addr; + else + hv -= src.in4.s_addr; + if ((ic->icmp_type == ICMP6_ECHO_REQUEST) || + (ic->icmp_type == ICMP6_ECHO_REPLY)) { + hv += ic->icmp_id; + hv += ic->icmp_seq; + } + } +#endif + case IPPROTO_ICMP : + if (v == 4) { + hv += ic->icmp_id; + hv += ic->icmp_seq; + } + hv %= fr_statesize; + READ_ENTER(&ipf_state); + for (isp = &ips_table[hv]; (is = *isp); isp = &is->is_hnext) { + if ((is->is_p == pr) && (is->is_v == v) && + fr_matchsrcdst(is, src, dst, fin, NULL) && + fr_matchicmpqueryreply(v, is, ic)) { + is->is_age = fr_icmptimeout; + break; + } + } + if (is != NULL) + break; + RWLOCK_EXIT(&ipf_state); + /* + * No matching icmp state entry. Perhaps this is a + * response to another state entry. + */ +#ifdef USE_INET6 + if (v == 6) + fr = fr_checkicmp6matchingstate((ip6_t *)ip, fin); + else +#endif + fr = fr_checkicmpmatchingstate(ip, fin); + if (fr) + return fr; + break; + case IPPROTO_TCP : + { + register u_short dport, sport; + register int i; + + i = tcp->th_flags; + /* + * Just plain ignore RST flag set with either FIN or SYN. + */ + if ((i & TH_RST) && + ((i & (TH_FIN|TH_SYN|TH_RST)) != TH_RST)) + break; + case IPPROTO_UDP : + dport = tcp->th_dport; + sport = tcp->th_sport; + tryagain = 0; + hv += dport; + hv += sport; + READ_ENTER(&ipf_state); +retry_tcpudp: + hvm = hv % fr_statesize; + for (isp = &ips_table[hvm]; (is = *isp); isp = &is->is_hnext) + if ((is->is_p == pr) && (is->is_v == v) && + fr_matchsrcdst(is, src, dst, fin, tcp)) { + if ((pr == IPPROTO_TCP)) { + if (!fr_tcpstate(is, fin, ip, tcp)) { + continue; + } + } + break; + } + if (is != NULL) { + if (tryagain && + !(is->is_flags & (FI_WILDP|FI_WILDA))) { + hv += dport; + hv += sport; + fr_ipsmove(isp, is, hv); + MUTEX_DOWNGRADE(&ipf_state); + } + break; + } + RWLOCK_EXIT(&ipf_state); + if (!tryagain && ips_wild) { + hv -= dport; + hv -= sport; + tryagain = 1; + WRITE_ENTER(&ipf_state); + goto retry_tcpudp; + } + break; + } + default : + break; + } + if (is == NULL) { + ATOMIC_INCL(ips_stats.iss_miss); + return NULL; + } + MUTEX_ENTER(&is->is_lock); + is->is_bytes += fin->fin_plen; + ips_stats.iss_hits++; + is->is_pkts++; + MUTEX_EXIT(&is->is_lock); + fr = is->is_rule; + fin->fin_fr = fr; + pass = is->is_pass; +#ifndef _KERNEL + if (tcp->th_flags & TCP_CLOSE) + fr_delstate(is); +#endif + RWLOCK_EXIT(&ipf_state); + if ((fin->fin_fi.fi_fl & FI_FRAG) && (pass & FR_KEEPFRAG)) + ipfr_newfrag(ip, fin, pass ^ FR_KEEPSTATE); + return fr; +} + + +void ip_statesync(ifp) +void *ifp; +{ + register ipstate_t *is; + + WRITE_ENTER(&ipf_state); + for (is = ips_list; is; is = is->is_next) { + if (is->is_ifpin == ifp) { + is->is_ifpin = GETUNIT(is->is_ifname[0], is->is_v); + if (!is->is_ifpin) + is->is_ifpin = (void *)-1; + } + if (is->is_ifpout == ifp) { + is->is_ifpout = GETUNIT(is->is_ifname[1], is->is_v); + if (!is->is_ifpout) + is->is_ifpout = (void *)-1; + } + } + RWLOCK_EXIT(&ipf_state); +} + + +/* + * Must always be called with fr_ipfstate held as a write lock. + */ +static void fr_delstate(is) +ipstate_t *is; +{ + frentry_t *fr; + + if (is->is_flags & (FI_WILDP|FI_WILDA)) + ips_wild--; + if (is->is_next) + is->is_next->is_pnext = is->is_pnext; + *is->is_pnext = is->is_next; + if (is->is_hnext) + is->is_hnext->is_phnext = is->is_phnext; + *is->is_phnext = is->is_hnext; + if (ips_table[is->is_hv] == NULL) + ips_stats.iss_inuse--; + + fr = is->is_rule; + if (fr != NULL) { + fr->fr_ref--; + if (fr->fr_ref == 0) { + KFREE(fr); + } + } +#ifdef _KERNEL + MUTEX_DESTROY(&is->is_lock); +#endif + KFREE(is); + ips_num--; +} + + +/* + * Free memory in use by all state info. kept. + */ +void fr_stateunload() +{ + register ipstate_t *is; + + WRITE_ENTER(&ipf_state); + while ((is = ips_list)) + fr_delstate(is); + ips_stats.iss_inuse = 0; + ips_num = 0; + RWLOCK_EXIT(&ipf_state); + KFREES(ips_table, fr_statesize * sizeof(ipstate_t *)); + ips_table = NULL; +} + + +/* + * Slowly expire held state for thingslike UDP and ICMP. Timeouts are set + * in expectation of this being called twice per second. + */ +void fr_timeoutstate() +{ + register ipstate_t *is, **isp; +#if defined(_KERNEL) && !SOLARIS + int s; +#endif + + SPL_NET(s); + WRITE_ENTER(&ipf_state); + for (isp = &ips_list; (is = *isp); ) + if (is->is_age && !--is->is_age) { + if (is->is_p == IPPROTO_TCP) + ips_stats.iss_fin++; + else + ips_stats.iss_expire++; +#ifdef IPFILTER_LOG + ipstate_log(is, ISL_EXPIRE); +#endif + fr_delstate(is); + } else + isp = &is->is_next; + if (fr_state_doflush) { + (void) fr_state_flush(1); + fr_state_doflush = 0; + } + RWLOCK_EXIT(&ipf_state); + SPL_X(s); +} + + +/* + * Original idea freom Pradeep Krishnan for use primarily with NAT code. + * (pkrishna@netcom.com) + * + * Rewritten by Arjan de Vet <Arjan.deVet@adv.iae.nl>, 2000-07-29: + * + * - (try to) base state transitions on real evidence only, + * i.e. packets that are sent and have been received by ipfilter; + * diagram 18.12 of TCP/IP volume 1 by W. Richard Stevens was used. + * + * - deal with half-closed connections correctly; + * + * - store the state of the source in state[0] such that ipfstat + * displays the state as source/dest instead of dest/source; the calls + * to fr_tcp_age have been changed accordingly. + * + * Parameters: + * + * state[0] = state of source (host that initiated connection) + * state[1] = state of dest (host that accepted the connection) + * + * dir == 0 : a packet from source to dest + * dir == 1 : a packet from dest to source + * + */ +void fr_tcp_age(age, state, fin, dir) +u_long *age; +u_char *state; +fr_info_t *fin; +int dir; +{ + tcphdr_t *tcp = (tcphdr_t *)fin->fin_dp; + u_char flags = tcp->th_flags; + int dlen, ostate; + + ostate = state[1 - dir]; + + dlen = fin->fin_plen - fin->fin_hlen - (tcp->th_off << 2); + + if (flags & TH_RST) { + if (!(tcp->th_flags & TH_PUSH) && !dlen) { + *age = fr_tcpclosed; + state[dir] = TCPS_CLOSED; + } else { + *age = fr_tcpclosewait; + state[dir] = TCPS_CLOSE_WAIT; + } + return; + } + + *age = fr_tcptimeout; /* default 4 mins */ + + switch(state[dir]) + { + case TCPS_CLOSED: /* 0 */ + if ((flags & TH_OPENING) == TH_OPENING) { + /* + * 'dir' received an S and sends SA in response, + * CLOSED -> SYN_RECEIVED + */ + state[dir] = TCPS_SYN_RECEIVED; + *age = fr_tcptimeout; + } else if ((flags & (TH_SYN|TH_ACK)) == TH_SYN) { + /* 'dir' sent S, CLOSED -> SYN_SENT */ + state[dir] = TCPS_SYN_SENT; + *age = fr_tcptimeout; + } + /* + * The next piece of code makes it possible to get + * already established connections into the state table + * after a restart or reload of the filter rules; this + * does not work when a strict 'flags S keep state' is + * used for tcp connections of course + */ + if ((flags & (TH_FIN|TH_SYN|TH_RST|TH_ACK)) == TH_ACK) { + /* we saw an A, guess 'dir' is in ESTABLISHED mode */ + state[dir] = TCPS_ESTABLISHED; + *age = fr_tcpidletimeout; + } + /* + * TODO: besides regular ACK packets we can have other + * packets as well; it is yet to be determined how we + * should initialize the states in those cases + */ + break; + + case TCPS_LISTEN: /* 1 */ + /* NOT USED */ + break; + + case TCPS_SYN_SENT: /* 2 */ + if ((flags & (TH_SYN|TH_FIN|TH_ACK)) == TH_ACK) { + /* + * We see an A from 'dir' which is in SYN_SENT + * state: 'dir' sent an A in response to an SA + * which it received, SYN_SENT -> ESTABLISHED + */ + state[dir] = TCPS_ESTABLISHED; + *age = fr_tcpidletimeout; + } else if (flags & TH_FIN) { + /* + * We see an F from 'dir' which is in SYN_SENT + * state and wants to close its side of the + * connection; SYN_SENT -> FIN_WAIT_1 + */ + state[dir] = TCPS_FIN_WAIT_1; + *age = fr_tcpidletimeout; /* or fr_tcptimeout? */ + } else if ((flags & TH_OPENING) == TH_OPENING) { + /* + * We see an SA from 'dir' which is already in + * SYN_SENT state, this means we have a + * simultaneous open; SYN_SENT -> SYN_RECEIVED + */ + state[dir] = TCPS_SYN_RECEIVED; + *age = fr_tcptimeout; + } + break; + + case TCPS_SYN_RECEIVED: /* 3 */ + if ((flags & (TH_SYN|TH_FIN|TH_ACK)) == TH_ACK) { + /* + * We see an A from 'dir' which was in SYN_RECEIVED + * state so it must now be in established state, + * SYN_RECEIVED -> ESTABLISHED + */ + state[dir] = TCPS_ESTABLISHED; + *age = fr_tcpidletimeout; + } else if (flags & TH_FIN) { + /* + * We see an F from 'dir' which is in SYN_RECEIVED + * state and wants to close its side of the connection; + * SYN_RECEIVED -> FIN_WAIT_1 + */ + state[dir] = TCPS_FIN_WAIT_1; + *age = fr_tcpidletimeout; + } + break; + + case TCPS_ESTABLISHED: /* 4 */ + if (flags & TH_FIN) { + /* + * 'dir' closed its side of the connection; this + * gives us a half-closed connection; + * ESTABLISHED -> FIN_WAIT_1 + */ + state[dir] = TCPS_FIN_WAIT_1; + *age = fr_tcphalfclosed; + } else if (flags & TH_ACK) { + /* an ACK, should we exclude other flags here? */ + if (ostate == TCPS_FIN_WAIT_1) { + /* + * We know the other side did an active close, + * so we are ACKing the recvd FIN packet (does + * the window matching code guarantee this?) + * and go into CLOSE_WAIT state; this gives us + * a half-closed connection + */ + state[dir] = TCPS_CLOSE_WAIT; + *age = fr_tcphalfclosed; + } else if (ostate < TCPS_CLOSE_WAIT) + /* + * Still a fully established connection, + * reset timeout + */ + *age = fr_tcpidletimeout; + } + break; + + case TCPS_CLOSE_WAIT: /* 5 */ + if (flags & TH_FIN) { + /* + * Application closed and 'dir' sent a FIN, we're now + * going into LAST_ACK state + */ + *age = fr_tcplastack; + state[dir] = TCPS_LAST_ACK; + } else { + /* + * We remain in CLOSE_WAIT because the other side has + * closed already and we did not close our side yet; + * reset timeout + */ + *age = fr_tcphalfclosed; + } + break; + + case TCPS_FIN_WAIT_1: /* 6 */ + if ((flags & TH_ACK) && ostate > TCPS_CLOSE_WAIT) { + /* + * If the other side is not active anymore it has sent + * us a FIN packet that we are ack'ing now with an ACK; + * this means both sides have now closed the connection + * and we go into TIME_WAIT + */ + /* + * XXX: how do we know we really are ACKing the FIN + * packet here? does the window code guarantee that? + */ + state[dir] = TCPS_TIME_WAIT; + *age = fr_tcptimeout; + } else + /* + * We closed our side of the connection already but the + * other side is still active (ESTABLISHED/CLOSE_WAIT); + * continue with this half-closed connection + */ + *age = fr_tcphalfclosed; + break; + + case TCPS_CLOSING: /* 7 */ + /* NOT USED */ + break; + + case TCPS_LAST_ACK: /* 8 */ + if (flags & TH_ACK) { + if ((flags & TH_PUSH) || dlen) + /* + * There is still data to be delivered, reset + * timeout + */ + *age = fr_tcplastack; + } + /* + * We cannot detect when we go out of LAST_ACK state to CLOSED + * because that is based on the reception of ACK packets; + * ipfilter can only detect that a packet has been sent by a + * host + */ + break; + + case TCPS_FIN_WAIT_2: /* 9 */ + /* NOT USED */ + break; + + case TCPS_TIME_WAIT: /* 10 */ + /* we're in 2MSL timeout now */ + break; + } +} + + +#ifdef IPFILTER_LOG +void ipstate_log(is, type) +struct ipstate *is; +u_int type; +{ + struct ipslog ipsl; + void *items[1]; + size_t sizes[1]; + int types[1]; + + ipsl.isl_type = type; + ipsl.isl_pkts = is->is_pkts; + ipsl.isl_bytes = is->is_bytes; + ipsl.isl_src = is->is_src; + ipsl.isl_dst = is->is_dst; + ipsl.isl_p = is->is_p; + ipsl.isl_v = is->is_v; + ipsl.isl_flags = is->is_flags; + if (ipsl.isl_p == IPPROTO_TCP || ipsl.isl_p == IPPROTO_UDP) { + ipsl.isl_sport = is->is_sport; + ipsl.isl_dport = is->is_dport; + if (ipsl.isl_p == IPPROTO_TCP) { + ipsl.isl_state[0] = is->is_state[0]; + ipsl.isl_state[1] = is->is_state[1]; + } + } else if (ipsl.isl_p == IPPROTO_ICMP) + ipsl.isl_itype = is->is_icmp.ics_type; + else { + ipsl.isl_ps.isl_filler[0] = 0; + ipsl.isl_ps.isl_filler[1] = 0; + } + items[0] = &ipsl; + sizes[0] = sizeof(ipsl); + types[0] = 0; + + (void) ipllog(IPL_LOGSTATE, NULL, items, sizes, types, 1); +} +#endif + + +#ifdef USE_INET6 +frentry_t *fr_checkicmp6matchingstate(ip, fin) +ip6_t *ip; +fr_info_t *fin; +{ + register ipstate_t *is, **isp; + register u_short sport, dport; + register u_char pr; + struct icmp6_hdr *ic, *oic; + union i6addr dst, src; + u_short savelen; + fr_info_t ofin; + tcphdr_t *tcp; + frentry_t *fr; + ip6_t *oip; + int type; + u_int hv; + + /* + * Does it at least have the return (basic) IP header ? + * Only a basic IP header (no options) should be with + * an ICMP error header. + */ + if ((fin->fin_v != 6) || (fin->fin_plen < ICMP6ERR_MINPKTLEN)) + return NULL; + ic = (struct icmp6_hdr *)fin->fin_dp; + type = ic->icmp6_type; + /* + * If it's not an error type, then return + */ + if ((type != ICMP6_DST_UNREACH) && (type != ICMP6_PACKET_TOO_BIG) && + (type != ICMP6_TIME_EXCEEDED) && (type != ICMP6_PARAM_PROB)) + return NULL; + + oip = (ip6_t *)((char *)ic + ICMPERR_ICMPHLEN); + if (fin->fin_plen < sizeof(*oip)) + return NULL; + + if (oip->ip6_nxt == IPPROTO_ICMPV6) { + oic = (struct icmp6_hdr *)(oip + 1); + /* + * a ICMP error can only be generated as a result of an + * ICMP query, not as the response on an ICMP error + * + * XXX theoretically ICMP_ECHOREP and the other reply's are + * ICMP query's as well, but adding them here seems strange XXX + */ + if (!(oic->icmp6_type & ICMP6_INFOMSG_MASK)) + return NULL; + + /* + * perform a lookup of the ICMP packet in the state table + */ + hv = (pr = oip->ip6_nxt); + src.in6 = oip->ip6_src; + hv += src.in4.s_addr; + dst.in6 = oip->ip6_dst; + hv += dst.in4.s_addr; + hv += oic->icmp6_id; + hv += oic->icmp6_seq; + hv %= fr_statesize; + + oip->ip6_plen = ntohs(oip->ip6_plen); + ofin.fin_v = 6; + fr_makefrip(sizeof(*oip), (ip_t *)oip, &ofin); + oip->ip6_plen = htons(oip->ip6_plen); + ofin.fin_ifp = fin->fin_ifp; + ofin.fin_out = !fin->fin_out; + ofin.fin_mp = NULL; /* if dereferenced, panic XXX */ + + READ_ENTER(&ipf_state); + for (isp = &ips_table[hv]; (is = *isp); isp = &is->is_hnext) + if ((is->is_p == pr) && + (oic->icmp6_id == is->is_icmp.ics_id) && + (oic->icmp6_seq == is->is_icmp.ics_seq) && + fr_matchsrcdst(is, src, dst, &ofin, NULL)) { + /* + * in the state table ICMP query's are stored + * with the type of the corresponding ICMP + * response. Correct here + */ + if (((is->is_type == ICMP6_ECHO_REPLY) && + (oic->icmp6_type == ICMP6_ECHO_REQUEST)) || + (is->is_type - 1 == oic->icmp6_type )) { + ips_stats.iss_hits++; + is->is_pkts++; + is->is_bytes += fin->fin_plen; + return is->is_rule; + } + } + RWLOCK_EXIT(&ipf_state); + + return NULL; + }; + + if ((oip->ip6_nxt != IPPROTO_TCP) && (oip->ip6_nxt != IPPROTO_UDP)) + return NULL; + tcp = (tcphdr_t *)(oip + 1); + dport = tcp->th_dport; + sport = tcp->th_sport; + + hv = (pr = oip->ip6_nxt); + src.in6 = oip->ip6_src; + hv += src.in4.s_addr; + dst.in6 = oip->ip6_dst; + hv += dst.in4.s_addr; + hv += dport; + hv += sport; + hv %= fr_statesize; + /* + * we make an fin entry to be able to feed it to + * matchsrcdst note that not all fields are encessary + * but this is the cleanest way. Note further we fill + * in fin_mp such that if someone uses it we'll get + * a kernel panic. fr_matchsrcdst does not use this. + * + * watch out here, as ip is in host order and oip in network + * order. Any change we make must be undone afterwards. + */ + savelen = oip->ip6_plen; + oip->ip6_plen = ip->ip6_plen - sizeof(*ip) - ICMPERR_ICMPHLEN; + ofin.fin_v = 6; + fr_makefrip(sizeof(*oip), (ip_t *)oip, &ofin); + oip->ip6_plen = savelen; + ofin.fin_ifp = fin->fin_ifp; + ofin.fin_out = !fin->fin_out; + ofin.fin_mp = NULL; /* if dereferenced, panic XXX */ + READ_ENTER(&ipf_state); + for (isp = &ips_table[hv]; (is = *isp); isp = &is->is_hnext) { + /* + * Only allow this icmp though if the + * encapsulated packet was allowed through the + * other way around. Note that the minimal amount + * of info present does not allow for checking against + * tcp internals such as seq and ack numbers. + */ + if ((is->is_p == pr) && (is->is_v == 6) && + fr_matchsrcdst(is, src, dst, &ofin, tcp)) { + fr = is->is_rule; + ips_stats.iss_hits++; + /* + * we must swap src and dst here because the icmp + * comes the other way around + */ + is->is_pkts++; + is->is_bytes += fin->fin_plen; + /* + * we deliberately do not touch the timeouts + * for the accompanying state table entry. + * It remains to be seen if that is correct. XXX + */ + RWLOCK_EXIT(&ipf_state); + return fr; + } + } + RWLOCK_EXIT(&ipf_state); + return NULL; +} +#endif diff --git a/sys/netinet/ip_state.h b/sys/netinet/ip_state.h new file mode 100644 index 0000000..765709c --- /dev/null +++ b/sys/netinet/ip_state.h @@ -0,0 +1,196 @@ +/* + * Copyright (C) 1995-2000 by Darren Reed. + * + * Redistribution and use in source and binary forms are permitted + * provided that this notice is preserved and due credit is given + * to the original author and the contributors. + * + * @(#)ip_state.h 1.3 1/12/96 (C) 1995 Darren Reed + * $Id: ip_state.h,v 2.13.2.1 2000/07/08 02:15:35 darrenr Exp $ + * $FreeBSD$ + */ +#ifndef __IP_STATE_H__ +#define __IP_STATE_H__ + +#if defined(__STDC__) || defined(__GNUC__) +# define SIOCDELST _IOW('r', 61, struct ipstate *) +#else +# define SIOCDELST _IOW(r, 61, struct ipstate *) +#endif + +#define IPSTATE_SIZE 5737 +#define IPSTATE_MAX 4013 /* Maximum number of states held */ + +#define PAIRS(s1,d1,s2,d2) ((((s1) == (s2)) && ((d1) == (d2))) ||\ + (((s1) == (d2)) && ((d1) == (s2)))) +#define IPPAIR(s1,d1,s2,d2) PAIRS((s1).s_addr, (d1).s_addr, \ + (s2).s_addr, (d2).s_addr) + + +typedef struct udpstate { + u_short us_sport; + u_short us_dport; +} udpstate_t; + +typedef struct icmpstate { + u_short ics_id; + u_short ics_seq; + u_char ics_type; +} icmpstate_t; + +typedef struct tcpdata { + u_32_t td_end; + u_32_t td_maxend; + u_short td_maxwin; +} tcpdata_t; + +typedef struct tcpstate { + u_short ts_sport; + u_short ts_dport; + tcpdata_t ts_data[2]; + u_char ts_state[2]; +} tcpstate_t; + +typedef struct ipstate { + struct ipstate *is_next; + struct ipstate **is_pnext; + struct ipstate *is_hnext; + struct ipstate **is_phnext; + u_long is_age; + u_int is_pass; + U_QUAD_T is_pkts; + U_QUAD_T is_bytes; + void *is_ifp[2]; + frentry_t *is_rule; + union i6addr is_src; + union i6addr is_dst; + u_char is_p; /* Protocol */ + u_char is_v; + u_int is_hv; + u_32_t is_flags; + u_32_t is_opt; /* packet options set */ + u_32_t is_optmsk; /* " " mask */ + u_short is_sec; /* security options set */ + u_short is_secmsk; /* " " mask */ + u_short is_auth; /* authentication options set */ + u_short is_authmsk; /* " " mask */ + union { + icmpstate_t is_ics; + tcpstate_t is_ts; + udpstate_t is_us; + } is_ps; + char is_ifname[2][IFNAMSIZ]; +#if SOLARIS || defined(__sgi) + kmutex_t is_lock; +#endif +} ipstate_t; + +#define is_saddr is_src.in4.s_addr +#define is_daddr is_dst.in4.s_addr +#define is_icmp is_ps.is_ics +#define is_type is_icmp.ics_type +#define is_code is_icmp.ics_code +#define is_tcp is_ps.is_ts +#define is_udp is_ps.is_us +#define is_send is_tcp.ts_data[0].td_end +#define is_dend is_tcp.ts_data[1].td_end +#define is_maxswin is_tcp.ts_data[0].td_maxwin +#define is_maxdwin is_tcp.ts_data[1].td_maxwin +#define is_maxsend is_tcp.ts_data[0].td_maxend +#define is_maxdend is_tcp.ts_data[1].td_maxend +#define is_sport is_tcp.ts_sport +#define is_dport is_tcp.ts_dport +#define is_state is_tcp.ts_state +#define is_ifpin is_ifp[0] +#define is_ifpout is_ifp[1] + +#define TH_OPENING (TH_SYN|TH_ACK) +/* + * is_flags: + * Bits 0 - 3 are use as a mask with the current packet's bits to check for + * whether it is short, tcp/udp, a fragment or the presence of IP options. + * Bits 4 - 7 are set from the initial packet and contain what the packet + * anded with bits 0-3 must match. + * Bits 8,9 are used to indicate wildcard source/destination port matching. + */ + +typedef struct ipstate_save { + void *ips_next; + struct ipstate ips_is; + struct frentry ips_fr; +} ipstate_save_t; + +#define ips_rule ips_is.is_rule + + +typedef struct ipslog { + U_QUAD_T isl_pkts; + U_QUAD_T isl_bytes; + union i6addr isl_src; + union i6addr isl_dst; + u_short isl_type; + union { + u_short isl_filler[2]; + u_short isl_ports[2]; + u_short isl_icmp; + } isl_ps; + u_char isl_v; + u_char isl_p; + u_char isl_flags; + u_char isl_state[2]; +} ipslog_t; + +#define isl_sport isl_ps.isl_ports[0] +#define isl_dport isl_ps.isl_ports[1] +#define isl_itype isl_ps.isl_icmp + +#define ISL_NEW 0 +#define ISL_EXPIRE 0xffff +#define ISL_FLUSH 0xfffe +#define ISL_REMOVE 0xfffd + + +typedef struct ips_stat { + u_long iss_hits; + u_long iss_miss; + u_long iss_max; + u_long iss_tcp; + u_long iss_udp; + u_long iss_icmp; + u_long iss_nomem; + u_long iss_expire; + u_long iss_fin; + u_long iss_active; + u_long iss_logged; + u_long iss_logfail; + u_long iss_inuse; + ipstate_t **iss_table; + ipstate_t *iss_list; +} ips_stat_t; + + +extern u_long fr_tcpidletimeout; +extern u_long fr_tcpclosewait; +extern u_long fr_tcplastack; +extern u_long fr_tcptimeout; +extern u_long fr_tcpclosed; +extern u_long fr_tcphalfclosed; +extern u_long fr_udptimeout; +extern u_long fr_icmptimeout; +extern int fr_state_lock; +extern int fr_stateinit __P((void)); +extern int fr_tcpstate __P((ipstate_t *, fr_info_t *, ip_t *, tcphdr_t *)); +extern ipstate_t *fr_addstate __P((ip_t *, fr_info_t *, u_int)); +extern frentry_t *fr_checkstate __P((ip_t *, fr_info_t *)); +extern void ip_statesync __P((void *)); +extern void fr_timeoutstate __P((void)); +extern void fr_tcp_age __P((u_long *, u_char *, fr_info_t *, int)); +extern void fr_stateunload __P((void)); +extern void ipstate_log __P((struct ipstate *, u_int)); +#if defined(__NetBSD__) || defined(__OpenBSD__) +extern int fr_state_ioctl __P((caddr_t, u_long, int)); +#else +extern int fr_state_ioctl __P((caddr_t, int, int)); +#endif + +#endif /* __IP_STATE_H__ */ diff --git a/sys/netinet/ip_var.h b/sys/netinet/ip_var.h new file mode 100644 index 0000000..bc8b797 --- /dev/null +++ b/sys/netinet/ip_var.h @@ -0,0 +1,194 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ip_var.h 8.2 (Berkeley) 1/9/95 + * $FreeBSD$ + */ + +#ifndef _NETINET_IP_VAR_H_ +#define _NETINET_IP_VAR_H_ + +/* + * Overlay for ip header used by other protocols (tcp, udp). + */ +struct ipovly { + u_char ih_x1[9]; /* (unused) */ + u_char ih_pr; /* protocol */ + u_short ih_len; /* protocol length */ + struct in_addr ih_src; /* source internet address */ + struct in_addr ih_dst; /* destination internet address */ +}; + +/* + * Ip reassembly queue structure. Each fragment + * being reassembled is attached to one of these structures. + * They are timed out after ipq_ttl drops to 0, and may also + * be reclaimed if memory becomes tight. + */ +struct ipq { + TAILQ_ENTRY(ipq) ipq_list; /* to other reass headers */ + u_char ipq_ttl; /* time for reass q to live */ + u_char ipq_p; /* protocol of this fragment */ + u_short ipq_id; /* sequence id for reassembly */ + struct mbuf *ipq_frags; /* to ip headers of fragments */ + struct in_addr ipq_src,ipq_dst; +#ifdef IPDIVERT + u_int32_t ipq_div_info; /* ipfw divert port & flags */ + u_int16_t ipq_div_cookie; /* ipfw divert cookie */ +#endif +}; + +/* + * Structure stored in mbuf in inpcb.ip_options + * and passed to ip_output when ip options are in use. + * The actual length of the options (including ipopt_dst) + * is in m_len. + */ +#define MAX_IPOPTLEN 40 + +struct ipoption { + struct in_addr ipopt_dst; /* first-hop dst if source routed */ + char ipopt_list[MAX_IPOPTLEN]; /* options proper */ +}; + +/* + * Structure attached to inpcb.ip_moptions and + * passed to ip_output when IP multicast options are in use. + */ +struct ip_moptions { + struct ifnet *imo_multicast_ifp; /* ifp for outgoing multicasts */ + u_char imo_multicast_ttl; /* TTL for outgoing multicasts */ + u_char imo_multicast_loop; /* 1 => hear sends if a member */ + u_short imo_num_memberships; /* no. memberships this socket */ + struct in_multi *imo_membership[IP_MAX_MEMBERSHIPS]; + u_long imo_multicast_vif; /* vif num outgoing multicasts */ +}; + +struct ipstat { + u_long ips_total; /* total packets received */ + u_long ips_badsum; /* checksum bad */ + u_long ips_tooshort; /* packet too short */ + u_long ips_toosmall; /* not enough data */ + u_long ips_badhlen; /* ip header length < data size */ + u_long ips_badlen; /* ip length < ip header length */ + u_long ips_fragments; /* fragments received */ + u_long ips_fragdropped; /* frags dropped (dups, out of space) */ + u_long ips_fragtimeout; /* fragments timed out */ + u_long ips_forward; /* packets forwarded */ + u_long ips_fastforward; /* packets fast forwarded */ + u_long ips_cantforward; /* packets rcvd for unreachable dest */ + u_long ips_redirectsent; /* packets forwarded on same net */ + u_long ips_noproto; /* unknown or unsupported protocol */ + u_long ips_delivered; /* datagrams delivered to upper level*/ + u_long ips_localout; /* total ip packets generated here */ + u_long ips_odropped; /* lost packets due to nobufs, etc. */ + u_long ips_reassembled; /* total packets reassembled ok */ + u_long ips_fragmented; /* datagrams successfully fragmented */ + u_long ips_ofragments; /* output fragments created */ + u_long ips_cantfrag; /* don't fragment flag was set, etc. */ + u_long ips_badoptions; /* error in option processing */ + u_long ips_noroute; /* packets discarded due to no route */ + u_long ips_badvers; /* ip version != 4 */ + u_long ips_rawout; /* total raw ip packets generated */ + u_long ips_toolong; /* ip length > max ip packet size */ + u_long ips_notmember; /* multicasts for unregistered grps */ + u_long ips_nogif; /* no match gif found */ +}; + +#ifdef _KERNEL + +/* flags passed to ip_output as last parameter */ +#define IP_FORWARDING 0x1 /* most of ip header exists */ +#define IP_RAWOUTPUT 0x2 /* raw ip header exists */ +#define IP_ROUTETOIF SO_DONTROUTE /* bypass routing tables */ +#define IP_ALLOWBROADCAST SO_BROADCAST /* can send broadcast packets */ + +struct ip; +struct inpcb; +struct route; +struct sockopt; + +extern struct ipstat ipstat; +extern u_short ip_id; /* ip packet ctr, for ids */ +extern int ip_defttl; /* default IP ttl */ +extern int ipforwarding; /* ip forwarding */ +extern struct route ipforward_rt; /* ip forwarding cached route */ +extern u_char ip_protox[]; +extern struct socket *ip_rsvpd; /* reservation protocol daemon */ +extern struct socket *ip_mrouter; /* multicast routing daemon */ +extern int (*legal_vif_num) __P((int)); +extern u_long (*ip_mcast_src) __P((int)); +extern int rsvp_on; +extern struct pr_usrreqs rip_usrreqs; + +int ip_ctloutput __P((struct socket *, struct sockopt *sopt)); +void ip_drain __P((void)); +void ip_freemoptions __P((struct ip_moptions *)); +void ip_init __P((void)); +extern int (*ip_mforward) __P((struct ip *, struct ifnet *, struct mbuf *, + struct ip_moptions *)); +int ip_output __P((struct mbuf *, + struct mbuf *, struct route *, int, struct ip_moptions *)); +void ip_savecontrol __P((struct inpcb *, struct mbuf **, struct ip *, + struct mbuf *)); +void ip_slowtimo __P((void)); +struct mbuf * + ip_srcroute __P((void)); +void ip_stripoptions __P((struct mbuf *, struct mbuf *)); +int rip_ctloutput __P((struct socket *, struct sockopt *)); +void rip_ctlinput __P((int, struct sockaddr *, void *)); +void rip_init __P((void)); +void rip_input __P((struct mbuf *, int, int)); +int rip_output __P((struct mbuf *, struct socket *, u_long)); +void ipip_input __P((struct mbuf *, int, int)); +void rsvp_input __P((struct mbuf *, int, int)); +int ip_rsvp_init __P((struct socket *)); +int ip_rsvp_done __P((void)); +int ip_rsvp_vif_init __P((struct socket *, struct sockopt *)); +int ip_rsvp_vif_done __P((struct socket *, struct sockopt *)); +void ip_rsvp_force_done __P((struct socket *)); + +#ifdef IPDIVERT +void div_init __P((void)); +void div_input __P((struct mbuf *, int, int)); +void divert_packet __P((struct mbuf *, int, int)); +extern struct pr_usrreqs div_usrreqs; +extern u_int16_t ip_divert_cookie; +#endif + +extern struct sockaddr_in *ip_fw_fwd_addr; + +void in_delayed_cksum(struct mbuf *m); + +#endif /* _KERNEL */ + +#endif /* !_NETINET_IP_VAR_H_ */ diff --git a/sys/netinet/ipl.h b/sys/netinet/ipl.h new file mode 100644 index 0000000..79d0bd3 --- /dev/null +++ b/sys/netinet/ipl.h @@ -0,0 +1,17 @@ +/* + * Copyright (C) 1993-2000 by Darren Reed. + * + * Redistribution and use in source and binary forms are permitted + * provided that this notice is preserved and due credit is given + * to the original author and the contributors. + * + * @(#)ipl.h 1.21 6/5/96 + * $FreeBSD$ + */ + +#ifndef __IPL_H__ +#define __IPL_H__ + +#define IPL_VERSION "IP Filter: v3.4.16" + +#endif diff --git a/sys/netinet/ipprotosw.h b/sys/netinet/ipprotosw.h new file mode 100644 index 0000000..1d65f0c --- /dev/null +++ b/sys/netinet/ipprotosw.h @@ -0,0 +1,109 @@ +/* + * Copyright (C) 1995, 1996, 1997, 1998, and 1999 WIDE Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)protosw.h 8.1 (Berkeley) 6/2/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_IPPROTOSW_H_ +#define _NETINET_IPPROTOSW_H_ + +/* + * For pfil_head structure. + */ +#include <net/pfil.h> + +/* Forward declare these structures referenced from prototypes below. */ +struct mbuf; +struct proc; +struct sockaddr; +struct socket; +struct sockopt; + +struct ipprotosw { + short pr_type; /* socket type used for */ + struct domain *pr_domain; /* domain protocol a member of */ + short pr_protocol; /* protocol number */ + short pr_flags; /* see below */ +/* protocol-protocol hooks */ + void (*pr_input) __P((struct mbuf *, int off, int proto)); + /* input to protocol (from below) */ + int (*pr_output) __P((struct mbuf *m, struct socket *so)); + /* output to protocol (from above) */ + void (*pr_ctlinput)__P((int, struct sockaddr *, void *)); + /* control input (from below) */ + int (*pr_ctloutput)__P((struct socket *, struct sockopt *)); + /* control output (from above) */ +/* user-protocol hook */ + void *pr_ousrreq; +/* utility hooks */ + void (*pr_init) __P((void)); /* initialization hook */ + void (*pr_fasttimo) __P((void)); + /* fast timeout (200ms) */ + void (*pr_slowtimo) __P((void)); + /* slow timeout (500ms) */ + void (*pr_drain) __P((void)); + /* flush any excess space possible */ + struct pr_usrreqs *pr_usrreqs; /* supersedes pr_usrreq() */ + struct pfil_head pr_pfh; +}; + +#endif /* !_NETINET_IPPROTOSW_H_ */ diff --git a/sys/netinet/libalias/HISTORY b/sys/netinet/libalias/HISTORY new file mode 100644 index 0000000..c5bca59 --- /dev/null +++ b/sys/netinet/libalias/HISTORY @@ -0,0 +1,145 @@ +$FreeBSD$ + +Version 1.0: August 11, 1996 (cjm) + +Version 1.1: August 20, 1996 (cjm) + - Host accepts incoming connections for ports 0 to 1023. + +Version 1.2: September 7, 1996 (cjm) + - Fragment handling error in alias_db.c corrected. + +Version 1.3: September 15, 1996 (cjm) + - Generalized mechanism for handling incoming + connections (no more 0 to 1023 restriction). + + - Increased ICMP support (will handle traceroute now). + + - Improved TCP close connection logic. + +Version 1.4: September 16, 1996 (cjm) + +Version 1.5: September 17, 1996 (cjm) + - Corrected error in handling incoming UDP packets + with zero checksum. + +Version 1.6: September 18, 1996 + - Simplified ICMP data storage. Will now handle + tracert from Win95 and NT as well as FreeBSD + traceroute, which uses UDP packets to non-existent + ports. + +Version 1.7: January 9, 1997 (cjm) + - Reduced malloc() activity for ICMP echo and + timestamp requests. + + - Added handling for out-of-order IP fragments. + + - Switched to differential checksum computation + for IP headers (TCP, UDP and ICMP checksums + were already differential). + + - Accepts FTP data connections from other than + port 20. This allows one ftp connections + from two hosts which are both running packet + aliasing. + + - Checksum error on FTP transfers. Problem + in code located by Martin Renters and + Brian Somers. + +Version 1.8: January 14, 1997 (cjm) + - Fixed data type error in function StartPoint() + in alias_db.c (this bug did not exist before v1.7) + Problem in code located by Ari Suutari. + +Version 1.9: February 1, 1997 (Eivind Eklund <perhaps@yes.no>) + - Added support for IRC DCC (ee) + + - Changed the aliasing routines to use ANSI style + throughout (ee) + + - Minor API changes for integration with other + programs than PPP (ee) + + - Fixed minor security hole in alias_ftp.c for + other applications of the aliasing software. + Hole could _not_ manifest in ppp+pktAlias, but + could potentially manifest in other applications + of the aliasing. (ee) + + - Connections initiated from packet aliasing + host machine will not have their port number + aliased unless it conflicts with an aliasing + port already being used. (There is an option + to disable this for debugging) (cjm) + + - Sockets will be allocated in cases where + there might be port interference with the + host machine. This can be disabled in cases + where the ppp host will be acting purely as a + masquerading router and not generate any + traffic of its own. + (cjm) + +Version 2.0: March, 1997 (cjm) + - Aliasing links are cleared only when a host interface address + changes. + + - PacketAliasPermanentLink() API added. + + - Option for only aliasing private, unregistered + IP addresses added. + + - Substantial rework to the aliasing lookup engine. + +Version 2.1: May, 1997 (cjm) + - Continuing rework to the aliasing lookup engine + to support multiple incoming addresses and static + NAT. PacketAliasRedirectPort() and + PacketAliasRedirectAddr() added to API. + + - Now supports outgoing as well as incoming ICMP + error messages. + +Version 2.2: July, 1997 (cjm) + - Rationalized API function names to all begin with + "PacketAlias..." Old function names are retained + for backwards compatibility. + + - Packet aliasing engine will now free memory of + fragments which are never resolved after a timeout + period. Once a fragment is resolved, it becomes + the users responsibility to free the memory. + +Version 2.3: August 11, 1997 (cjm) + - Problem associated with socket file descriptor + accumulation in alias_db.c corrected. The sockets + had to be closed when a binding failed. Problem + in code located by Gordon Burditt. + +Version 2.4: September 1, 1997 (cjm) + - PKT_ALIAS_UNREGISTERED_ONLY option repaired. + This part of the code was incorrectly re-implemented + in version 2.1. + +Version 2.5: December, 1997 (ee) + - Added PKT_ALIAS_PUNCH_FW mode for firewall + bypass of FTP/IRC DCC data connections. Also added + improved TCP connection monitoring. + +Version 2.6: May, 1998 (amurai) + - Added supporting routine for NetBios over TCP/IP. + +Version 3.0: January 1, 1999 + - Transparent proxying support added. + - PPTP redirecting support added based on patches + contributed by Dru Nelson <dnelson@redwoodsoft.com>. + +Version 3.1: May, 2000 (Erik Salander, erik@whistle.com) + - Added support to alias 227 replies, allows aliasing for + FTP servers in passive mode. + - Added support for PPTP aliasing. + +Version 3.2: July, 2000 (Erik Salander, erik@whistle.com and + Junichi Satoh, junichi@junichi.org) + - Added support for streaming media (RTSP and PNA) aliasing. diff --git a/sys/netinet/libalias/Makefile b/sys/netinet/libalias/Makefile new file mode 100644 index 0000000..a6f577d --- /dev/null +++ b/sys/netinet/libalias/Makefile @@ -0,0 +1,13 @@ +# $FreeBSD$ + +LIB= alias +SHLIB_MAJOR= 4 +SHLIB_MINOR= 0 +CFLAGS+= -Wall -Wmissing-prototypes +SRCS= alias.c alias_cuseeme.c alias_db.c alias_ftp.c alias_irc.c \ + alias_nbt.c alias_pptp.c alias_proxy.c alias_smedia.c \ + alias_util.c +INCS= alias.h +MAN= libalias.3 + +.include <bsd.lib.mk> diff --git a/sys/netinet/libalias/alias.c b/sys/netinet/libalias/alias.c new file mode 100644 index 0000000..bf5fd47 --- /dev/null +++ b/sys/netinet/libalias/alias.c @@ -0,0 +1,1538 @@ +/* -*- mode: c; tab-width: 8; c-basic-indent: 4; -*- */ +/* + Alias.c provides supervisory control for the functions of the + packet aliasing software. It consists of routines to monitor + TCP connection state, protocol-specific aliasing routines, + fragment handling and the following outside world functional + interfaces: SaveFragmentPtr, GetFragmentPtr, FragmentAliasIn, + PacketAliasIn and PacketAliasOut. + + The other C program files are briefly described. The data + structure framework which holds information needed to translate + packets is encapsulated in alias_db.c. Data is accessed by + function calls, so other segments of the program need not know + about the underlying data structures. Alias_ftp.c contains + special code for modifying the ftp PORT command used to establish + data connections, while alias_irc.c does the same for IRC + DCC. Alias_util.c contains a few utility routines. + + This software is placed into the public domain with no restrictions + on its distribution. + + Version 1.0 August, 1996 (cjm) + + Version 1.1 August 20, 1996 (cjm) + PPP host accepts incoming connections for ports 0 to 1023. + (Gary Roberts pointed out the need to handle incoming + connections.) + + Version 1.2 September 7, 1996 (cjm) + Fragment handling error in alias_db.c corrected. + (Tom Torrance helped fix this problem.) + + Version 1.4 September 16, 1996 (cjm) + - A more generalized method for handling incoming + connections, without the 0-1023 restriction, is + implemented in alias_db.c + - Improved ICMP support in alias.c. Traceroute + packet streams can now be correctly aliased. + - TCP connection closing logic simplified in + alias.c and now allows for additional 1 minute + "grace period" after FIN or RST is observed. + + Version 1.5 September 17, 1996 (cjm) + Corrected error in handling incoming UDP packets with 0 checksum. + (Tom Torrance helped fix this problem.) + + Version 1.6 September 18, 1996 (cjm) + Simplified ICMP aliasing scheme. Should now support + traceroute from Win95 as well as FreeBSD. + + Version 1.7 January 9, 1997 (cjm) + - Out-of-order fragment handling. + - IP checksum error fixed for ftp transfers + from aliasing host. + - Integer return codes added to all + aliasing/de-aliasing functions. + - Some obsolete comments cleaned up. + - Differential checksum computations for + IP header (TCP, UDP and ICMP were already + differential). + + Version 2.1 May 1997 (cjm) + - Added support for outgoing ICMP error + messages. + - Added two functions PacketAliasIn2() + and PacketAliasOut2() for dynamic address + control (e.g. round-robin allocation of + incoming packets). + + Version 2.2 July 1997 (cjm) + - Rationalized API function names to begin + with "PacketAlias..." + - Eliminated PacketAliasIn2() and + PacketAliasOut2() as poorly conceived. + + Version 2.3 Dec 1998 (dillon) + - Major bounds checking additions, see FreeBSD/CVS + + Version 3.1 May, 2000 (salander) + - Added hooks to handle PPTP. + + Version 3.2 July, 2000 (salander and satoh) + - Added PacketUnaliasOut routine. + - Added hooks to handle RTSP/RTP. + + See HISTORY file for additional revisions. + + $FreeBSD$ +*/ + +#include <sys/types.h> + +#include <netinet/in_systm.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/ip_icmp.h> +#include <netinet/tcp.h> +#include <netinet/udp.h> + +#include <stdio.h> + +#include "alias_local.h" +#include "alias.h" + +#define NETBIOS_NS_PORT_NUMBER 137 +#define NETBIOS_DGM_PORT_NUMBER 138 +#define FTP_CONTROL_PORT_NUMBER 21 +#define IRC_CONTROL_PORT_NUMBER_1 6667 +#define IRC_CONTROL_PORT_NUMBER_2 6668 +#define CUSEEME_PORT_NUMBER 7648 +#define RTSP_CONTROL_PORT_NUMBER_1 554 +#define RTSP_CONTROL_PORT_NUMBER_2 7070 +#define PPTP_CONTROL_PORT_NUMBER 1723 + + + + +/* TCP Handling Routines + + TcpMonitorIn() -- These routines monitor TCP connections, and + TcpMonitorOut() delete a link when a connection is closed. + +These routines look for SYN, FIN and RST flags to determine when TCP +connections open and close. When a TCP connection closes, the data +structure containing packet aliasing information is deleted after +a timeout period. +*/ + +/* Local prototypes */ +static void TcpMonitorIn(struct ip *, struct alias_link *); + +static void TcpMonitorOut(struct ip *, struct alias_link *); + + +static void +TcpMonitorIn(struct ip *pip, struct alias_link *link) +{ + struct tcphdr *tc; + + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + + switch (GetStateIn(link)) + { + case ALIAS_TCP_STATE_NOT_CONNECTED: + if (tc->th_flags & TH_RST) + SetStateIn(link, ALIAS_TCP_STATE_DISCONNECTED); + else if (tc->th_flags & TH_SYN) + SetStateIn(link, ALIAS_TCP_STATE_CONNECTED); + break; + case ALIAS_TCP_STATE_CONNECTED: + if (tc->th_flags & (TH_FIN | TH_RST)) + SetStateIn(link, ALIAS_TCP_STATE_DISCONNECTED); + break; + } +} + +static void +TcpMonitorOut(struct ip *pip, struct alias_link *link) +{ + struct tcphdr *tc; + + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + + switch (GetStateOut(link)) + { + case ALIAS_TCP_STATE_NOT_CONNECTED: + if (tc->th_flags & TH_RST) + SetStateOut(link, ALIAS_TCP_STATE_DISCONNECTED); + else if (tc->th_flags & TH_SYN) + SetStateOut(link, ALIAS_TCP_STATE_CONNECTED); + break; + case ALIAS_TCP_STATE_CONNECTED: + if (tc->th_flags & (TH_FIN | TH_RST)) + SetStateOut(link, ALIAS_TCP_STATE_DISCONNECTED); + break; + } +} + + + + + +/* Protocol Specific Packet Aliasing Routines + + IcmpAliasIn(), IcmpAliasIn1(), IcmpAliasIn2() + IcmpAliasOut(), IcmpAliasOut1(), IcmpAliasOut2() + ProtoAliasIn(), ProtoAliasOut() + UdpAliasIn(), UdpAliasOut() + TcpAliasIn(), TcpAliasOut() + +These routines handle protocol specific details of packet aliasing. +One may observe a certain amount of repetitive arithmetic in these +functions, the purpose of which is to compute a revised checksum +without actually summing over the entire data packet, which could be +unnecessarily time consuming. + +The purpose of the packet aliasing routines is to replace the source +address of the outgoing packet and then correctly put it back for +any incoming packets. For TCP and UDP, ports are also re-mapped. + +For ICMP echo/timestamp requests and replies, the following scheme +is used: the ID number is replaced by an alias for the outgoing +packet. + +ICMP error messages are handled by looking at the IP fragment +in the data section of the message. + +For TCP and UDP protocols, a port number is chosen for an outgoing +packet, and then incoming packets are identified by IP address and +port numbers. For TCP packets, there is additional logic in the event +that sequence and ACK numbers have been altered (as in the case for +FTP data port commands). + +The port numbers used by the packet aliasing module are not true +ports in the Unix sense. No sockets are actually bound to ports. +They are more correctly thought of as placeholders. + +All packets go through the aliasing mechanism, whether they come from +the gateway machine or other machines on a local area network. +*/ + + +/* Local prototypes */ +static int IcmpAliasIn1(struct ip *); +static int IcmpAliasIn2(struct ip *); +static int IcmpAliasIn (struct ip *); + +static int IcmpAliasOut1(struct ip *); +static int IcmpAliasOut2(struct ip *); +static int IcmpAliasOut (struct ip *); + +static int ProtoAliasIn(struct ip *); +static int ProtoAliasOut(struct ip *); + +static int UdpAliasOut(struct ip *); +static int UdpAliasIn (struct ip *); + +static int TcpAliasOut(struct ip *, int); +static int TcpAliasIn (struct ip *); + + +static int +IcmpAliasIn1(struct ip *pip) +{ +/* + De-alias incoming echo and timestamp replies. + Alias incoming echo and timestamp requests. +*/ + struct alias_link *link; + struct icmp *ic; + + ic = (struct icmp *) ((char *) pip + (pip->ip_hl << 2)); + +/* Get source address from ICMP data field and restore original data */ + link = FindIcmpIn(pip->ip_src, pip->ip_dst, ic->icmp_id, 1); + if (link != NULL) + { + u_short original_id; + int accumulate; + + original_id = GetOriginalPort(link); + +/* Adjust ICMP checksum */ + accumulate = ic->icmp_id; + accumulate -= original_id; + ADJUST_CHECKSUM(accumulate, ic->icmp_cksum); + +/* Put original sequence number back in */ + ic->icmp_id = original_id; + +/* Put original address back into IP header */ + { + struct in_addr original_address; + + original_address = GetOriginalAddress(link); + DifferentialChecksum(&pip->ip_sum, + (u_short *) &original_address, + (u_short *) &pip->ip_dst, + 2); + pip->ip_dst = original_address; + } + + return(PKT_ALIAS_OK); + } + return(PKT_ALIAS_IGNORED); +} + +static int +IcmpAliasIn2(struct ip *pip) +{ +/* + Alias incoming ICMP error messages containing + IP header and first 64 bits of datagram. +*/ + struct ip *ip; + struct icmp *ic, *ic2; + struct udphdr *ud; + struct tcphdr *tc; + struct alias_link *link; + + ic = (struct icmp *) ((char *) pip + (pip->ip_hl << 2)); + ip = &ic->icmp_ip; + + ud = (struct udphdr *) ((char *) ip + (ip->ip_hl <<2)); + tc = (struct tcphdr *) ud; + ic2 = (struct icmp *) ud; + + if (ip->ip_p == IPPROTO_UDP) + link = FindUdpTcpIn(ip->ip_dst, ip->ip_src, + ud->uh_dport, ud->uh_sport, + IPPROTO_UDP, 0); + else if (ip->ip_p == IPPROTO_TCP) + link = FindUdpTcpIn(ip->ip_dst, ip->ip_src, + tc->th_dport, tc->th_sport, + IPPROTO_TCP, 0); + else if (ip->ip_p == IPPROTO_ICMP) { + if (ic2->icmp_type == ICMP_ECHO || ic2->icmp_type == ICMP_TSTAMP) + link = FindIcmpIn(ip->ip_dst, ip->ip_src, ic2->icmp_id, 0); + else + link = NULL; + } else + link = NULL; + + if (link != NULL) + { + if (ip->ip_p == IPPROTO_UDP || ip->ip_p == IPPROTO_TCP) + { + u_short *sptr; + int accumulate; + struct in_addr original_address; + u_short original_port; + + original_address = GetOriginalAddress(link); + original_port = GetOriginalPort(link); + +/* Adjust ICMP checksum */ + sptr = (u_short *) &(ip->ip_src); + accumulate = *sptr++; + accumulate += *sptr; + sptr = (u_short *) &original_address; + accumulate -= *sptr++; + accumulate -= *sptr; + accumulate += ud->uh_sport; + accumulate -= original_port; + ADJUST_CHECKSUM(accumulate, ic->icmp_cksum); + +/* Un-alias address in IP header */ + DifferentialChecksum(&pip->ip_sum, + (u_short *) &original_address, + (u_short *) &pip->ip_dst, + 2); + pip->ip_dst = original_address; + +/* Un-alias address and port number of original IP packet +fragment contained in ICMP data section */ + ip->ip_src = original_address; + ud->uh_sport = original_port; + } + else if (ip->ip_p == IPPROTO_ICMP) + { + u_short *sptr; + int accumulate; + struct in_addr original_address; + u_short original_id; + + original_address = GetOriginalAddress(link); + original_id = GetOriginalPort(link); + +/* Adjust ICMP checksum */ + sptr = (u_short *) &(ip->ip_src); + accumulate = *sptr++; + accumulate += *sptr; + sptr = (u_short *) &original_address; + accumulate -= *sptr++; + accumulate -= *sptr; + accumulate += ic2->icmp_id; + accumulate -= original_id; + ADJUST_CHECKSUM(accumulate, ic->icmp_cksum); + +/* Un-alias address in IP header */ + DifferentialChecksum(&pip->ip_sum, + (u_short *) &original_address, + (u_short *) &pip->ip_dst, + 2); + pip->ip_dst = original_address; + +/* Un-alias address of original IP packet and sequence number of + embedded ICMP datagram */ + ip->ip_src = original_address; + ic2->icmp_id = original_id; + } + return(PKT_ALIAS_OK); + } + return(PKT_ALIAS_IGNORED); +} + + +static int +IcmpAliasIn(struct ip *pip) +{ + int iresult; + struct icmp *ic; + +/* Return if proxy-only mode is enabled */ + if (packetAliasMode & PKT_ALIAS_PROXY_ONLY) + return PKT_ALIAS_OK; + + ic = (struct icmp *) ((char *) pip + (pip->ip_hl << 2)); + + iresult = PKT_ALIAS_IGNORED; + switch (ic->icmp_type) + { + case ICMP_ECHOREPLY: + case ICMP_TSTAMPREPLY: + if (ic->icmp_code == 0) + { + iresult = IcmpAliasIn1(pip); + } + break; + case ICMP_UNREACH: + case ICMP_SOURCEQUENCH: + case ICMP_TIMXCEED: + case ICMP_PARAMPROB: + iresult = IcmpAliasIn2(pip); + break; + case ICMP_ECHO: + case ICMP_TSTAMP: + iresult = IcmpAliasIn1(pip); + break; + } + return(iresult); +} + + +static int +IcmpAliasOut1(struct ip *pip) +{ +/* + Alias outgoing echo and timestamp requests. + De-alias outgoing echo and timestamp replies. +*/ + struct alias_link *link; + struct icmp *ic; + + ic = (struct icmp *) ((char *) pip + (pip->ip_hl << 2)); + +/* Save overwritten data for when echo packet returns */ + link = FindIcmpOut(pip->ip_src, pip->ip_dst, ic->icmp_id, 1); + if (link != NULL) + { + u_short alias_id; + int accumulate; + + alias_id = GetAliasPort(link); + +/* Since data field is being modified, adjust ICMP checksum */ + accumulate = ic->icmp_id; + accumulate -= alias_id; + ADJUST_CHECKSUM(accumulate, ic->icmp_cksum); + +/* Alias sequence number */ + ic->icmp_id = alias_id; + +/* Change source address */ + { + struct in_addr alias_address; + + alias_address = GetAliasAddress(link); + DifferentialChecksum(&pip->ip_sum, + (u_short *) &alias_address, + (u_short *) &pip->ip_src, + 2); + pip->ip_src = alias_address; + } + + return(PKT_ALIAS_OK); + } + return(PKT_ALIAS_IGNORED); +} + + +static int +IcmpAliasOut2(struct ip *pip) +{ +/* + Alias outgoing ICMP error messages containing + IP header and first 64 bits of datagram. +*/ + struct ip *ip; + struct icmp *ic, *ic2; + struct udphdr *ud; + struct tcphdr *tc; + struct alias_link *link; + + ic = (struct icmp *) ((char *) pip + (pip->ip_hl << 2)); + ip = &ic->icmp_ip; + + ud = (struct udphdr *) ((char *) ip + (ip->ip_hl <<2)); + tc = (struct tcphdr *) ud; + ic2 = (struct icmp *) ud; + + if (ip->ip_p == IPPROTO_UDP) + link = FindUdpTcpOut(ip->ip_dst, ip->ip_src, + ud->uh_dport, ud->uh_sport, + IPPROTO_UDP, 0); + else if (ip->ip_p == IPPROTO_TCP) + link = FindUdpTcpOut(ip->ip_dst, ip->ip_src, + tc->th_dport, tc->th_sport, + IPPROTO_TCP, 0); + else if (ip->ip_p == IPPROTO_ICMP) { + if (ic2->icmp_type == ICMP_ECHO || ic2->icmp_type == ICMP_TSTAMP) + link = FindIcmpOut(ip->ip_dst, ip->ip_src, ic2->icmp_id, 0); + else + link = NULL; + } else + link = NULL; + + if (link != NULL) + { + if (ip->ip_p == IPPROTO_UDP || ip->ip_p == IPPROTO_TCP) + { + u_short *sptr; + int accumulate; + struct in_addr alias_address; + u_short alias_port; + + alias_address = GetAliasAddress(link); + alias_port = GetAliasPort(link); + +/* Adjust ICMP checksum */ + sptr = (u_short *) &(ip->ip_dst); + accumulate = *sptr++; + accumulate += *sptr; + sptr = (u_short *) &alias_address; + accumulate -= *sptr++; + accumulate -= *sptr; + accumulate += ud->uh_dport; + accumulate -= alias_port; + ADJUST_CHECKSUM(accumulate, ic->icmp_cksum); + +/* + * Alias address in IP header if it comes from the host + * the original TCP/UDP packet was destined for. + */ + if (pip->ip_src.s_addr == ip->ip_dst.s_addr) { + DifferentialChecksum(&pip->ip_sum, + (u_short *) &alias_address, + (u_short *) &pip->ip_src, + 2); + pip->ip_src = alias_address; + } + +/* Alias address and port number of original IP packet +fragment contained in ICMP data section */ + ip->ip_dst = alias_address; + ud->uh_dport = alias_port; + } + else if (ip->ip_p == IPPROTO_ICMP) + { + u_short *sptr; + int accumulate; + struct in_addr alias_address; + u_short alias_id; + + alias_address = GetAliasAddress(link); + alias_id = GetAliasPort(link); + +/* Adjust ICMP checksum */ + sptr = (u_short *) &(ip->ip_dst); + accumulate = *sptr++; + accumulate += *sptr; + sptr = (u_short *) &alias_address; + accumulate -= *sptr++; + accumulate -= *sptr; + accumulate += ic2->icmp_id; + accumulate -= alias_id; + ADJUST_CHECKSUM(accumulate, ic->icmp_cksum); + +/* + * Alias address in IP header if it comes from the host + * the original ICMP message was destined for. + */ + if (pip->ip_src.s_addr == ip->ip_dst.s_addr) { + DifferentialChecksum(&pip->ip_sum, + (u_short *) &alias_address, + (u_short *) &pip->ip_src, + 2); + pip->ip_src = alias_address; + } + +/* Alias address of original IP packet and sequence number of + embedded ICMP datagram */ + ip->ip_dst = alias_address; + ic2->icmp_id = alias_id; + } + return(PKT_ALIAS_OK); + } + return(PKT_ALIAS_IGNORED); +} + + +static int +IcmpAliasOut(struct ip *pip) +{ + int iresult; + struct icmp *ic; + +/* Return if proxy-only mode is enabled */ + if (packetAliasMode & PKT_ALIAS_PROXY_ONLY) + return PKT_ALIAS_OK; + + ic = (struct icmp *) ((char *) pip + (pip->ip_hl << 2)); + + iresult = PKT_ALIAS_IGNORED; + switch (ic->icmp_type) + { + case ICMP_ECHO: + case ICMP_TSTAMP: + if (ic->icmp_code == 0) + { + iresult = IcmpAliasOut1(pip); + } + break; + case ICMP_UNREACH: + case ICMP_SOURCEQUENCH: + case ICMP_TIMXCEED: + case ICMP_PARAMPROB: + iresult = IcmpAliasOut2(pip); + break; + case ICMP_ECHOREPLY: + case ICMP_TSTAMPREPLY: + iresult = IcmpAliasOut1(pip); + } + return(iresult); +} + + + +static int +ProtoAliasIn(struct ip *pip) +{ +/* + Handle incoming IP packets. The + only thing which is done in this case is to alias + the dest IP address of the packet to our inside + machine. +*/ + struct alias_link *link; + +/* Return if proxy-only mode is enabled */ + if (packetAliasMode & PKT_ALIAS_PROXY_ONLY) + return PKT_ALIAS_OK; + + link = FindProtoIn(pip->ip_src, pip->ip_dst, pip->ip_p); + if (link != NULL) + { + struct in_addr original_address; + + original_address = GetOriginalAddress(link); + +/* Restore original IP address */ + DifferentialChecksum(&pip->ip_sum, + (u_short *) &original_address, + (u_short *) &pip->ip_dst, + 2); + pip->ip_dst = original_address; + + return(PKT_ALIAS_OK); + } + return(PKT_ALIAS_IGNORED); +} + + +static int +ProtoAliasOut(struct ip *pip) +{ +/* + Handle outgoing IP packets. The + only thing which is done in this case is to alias + the source IP address of the packet. +*/ + struct alias_link *link; + +/* Return if proxy-only mode is enabled */ + if (packetAliasMode & PKT_ALIAS_PROXY_ONLY) + return PKT_ALIAS_OK; + + link = FindProtoOut(pip->ip_src, pip->ip_dst, pip->ip_p); + if (link != NULL) + { + struct in_addr alias_address; + + alias_address = GetAliasAddress(link); + +/* Change source address */ + DifferentialChecksum(&pip->ip_sum, + (u_short *) &alias_address, + (u_short *) &pip->ip_src, + 2); + pip->ip_src = alias_address; + + return(PKT_ALIAS_OK); + } + return(PKT_ALIAS_IGNORED); +} + + +static int +UdpAliasIn(struct ip *pip) +{ + struct udphdr *ud; + struct alias_link *link; + +/* Return if proxy-only mode is enabled */ + if (packetAliasMode & PKT_ALIAS_PROXY_ONLY) + return PKT_ALIAS_OK; + + ud = (struct udphdr *) ((char *) pip + (pip->ip_hl << 2)); + + link = FindUdpTcpIn(pip->ip_src, pip->ip_dst, + ud->uh_sport, ud->uh_dport, + IPPROTO_UDP, 1); + if (link != NULL) + { + struct in_addr alias_address; + struct in_addr original_address; + u_short alias_port; + int accumulate; + u_short *sptr; + int r = 0; + + alias_address = GetAliasAddress(link); + original_address = GetOriginalAddress(link); + alias_port = ud->uh_dport; + ud->uh_dport = GetOriginalPort(link); + +/* Special processing for IP encoding protocols */ + if (ntohs(ud->uh_dport) == CUSEEME_PORT_NUMBER) + AliasHandleCUSeeMeIn(pip, original_address); +/* If NETBIOS Datagram, It should be alias address in UDP Data, too */ + else if (ntohs(ud->uh_dport) == NETBIOS_DGM_PORT_NUMBER + || ntohs(ud->uh_sport) == NETBIOS_DGM_PORT_NUMBER) + r = AliasHandleUdpNbt(pip, link, &original_address, ud->uh_dport); + else if (ntohs(ud->uh_dport) == NETBIOS_NS_PORT_NUMBER + || ntohs(ud->uh_sport) == NETBIOS_NS_PORT_NUMBER) + r = AliasHandleUdpNbtNS(pip, link, &alias_address, &alias_port, + &original_address, &ud->uh_dport); + +/* If UDP checksum is not zero, then adjust since destination port */ +/* is being unaliased and destination address is being altered. */ + if (ud->uh_sum != 0) + { + accumulate = alias_port; + accumulate -= ud->uh_dport; + sptr = (u_short *) &alias_address; + accumulate += *sptr++; + accumulate += *sptr; + sptr = (u_short *) &original_address; + accumulate -= *sptr++; + accumulate -= *sptr; + ADJUST_CHECKSUM(accumulate, ud->uh_sum); + } + +/* Restore original IP address */ + DifferentialChecksum(&pip->ip_sum, + (u_short *) &original_address, + (u_short *) &pip->ip_dst, + 2); + pip->ip_dst = original_address; + + /* + * If we cannot figure out the packet, ignore it. + */ + if (r < 0) + return(PKT_ALIAS_IGNORED); + else + return(PKT_ALIAS_OK); + } + return(PKT_ALIAS_IGNORED); +} + +static int +UdpAliasOut(struct ip *pip) +{ + struct udphdr *ud; + struct alias_link *link; + +/* Return if proxy-only mode is enabled */ + if (packetAliasMode & PKT_ALIAS_PROXY_ONLY) + return PKT_ALIAS_OK; + + ud = (struct udphdr *) ((char *) pip + (pip->ip_hl << 2)); + + link = FindUdpTcpOut(pip->ip_src, pip->ip_dst, + ud->uh_sport, ud->uh_dport, + IPPROTO_UDP, 1); + if (link != NULL) + { + u_short alias_port; + struct in_addr alias_address; + + alias_address = GetAliasAddress(link); + alias_port = GetAliasPort(link); + +/* Special processing for IP encoding protocols */ + if (ntohs(ud->uh_dport) == CUSEEME_PORT_NUMBER) + AliasHandleCUSeeMeOut(pip, link); +/* If NETBIOS Datagram, It should be alias address in UDP Data, too */ + else if (ntohs(ud->uh_dport) == NETBIOS_DGM_PORT_NUMBER + || ntohs(ud->uh_sport) == NETBIOS_DGM_PORT_NUMBER) + AliasHandleUdpNbt(pip, link, &alias_address, alias_port); + else if (ntohs(ud->uh_dport) == NETBIOS_NS_PORT_NUMBER + || ntohs(ud->uh_sport) == NETBIOS_NS_PORT_NUMBER) + AliasHandleUdpNbtNS(pip, link, &pip->ip_src, &ud->uh_sport, + &alias_address, &alias_port); + +/* If UDP checksum is not zero, adjust since source port is */ +/* being aliased and source address is being altered */ + if (ud->uh_sum != 0) + { + int accumulate; + u_short *sptr; + + accumulate = ud->uh_sport; + accumulate -= alias_port; + sptr = (u_short *) &(pip->ip_src); + accumulate += *sptr++; + accumulate += *sptr; + sptr = (u_short *) &alias_address; + accumulate -= *sptr++; + accumulate -= *sptr; + ADJUST_CHECKSUM(accumulate, ud->uh_sum); + } + +/* Put alias port in UDP header */ + ud->uh_sport = alias_port; + +/* Change source address */ + DifferentialChecksum(&pip->ip_sum, + (u_short *) &alias_address, + (u_short *) &pip->ip_src, + 2); + pip->ip_src = alias_address; + + return(PKT_ALIAS_OK); + } + return(PKT_ALIAS_IGNORED); +} + + + +static int +TcpAliasIn(struct ip *pip) +{ + struct tcphdr *tc; + struct alias_link *link; + + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + + link = FindUdpTcpIn(pip->ip_src, pip->ip_dst, + tc->th_sport, tc->th_dport, + IPPROTO_TCP, + !(packetAliasMode & PKT_ALIAS_PROXY_ONLY)); + if (link != NULL) + { + struct in_addr alias_address; + struct in_addr original_address; + struct in_addr proxy_address; + u_short alias_port; + u_short proxy_port; + int accumulate; + u_short *sptr; + +/* Special processing for IP encoding protocols */ + if (ntohs(tc->th_dport) == PPTP_CONTROL_PORT_NUMBER + || ntohs(tc->th_sport) == PPTP_CONTROL_PORT_NUMBER) + AliasHandlePptpIn(pip, link); + + alias_address = GetAliasAddress(link); + original_address = GetOriginalAddress(link); + proxy_address = GetProxyAddress(link); + alias_port = tc->th_dport; + tc->th_dport = GetOriginalPort(link); + proxy_port = GetProxyPort(link); + +/* Adjust TCP checksum since destination port is being unaliased */ +/* and destination port is being altered. */ + accumulate = alias_port; + accumulate -= tc->th_dport; + sptr = (u_short *) &alias_address; + accumulate += *sptr++; + accumulate += *sptr; + sptr = (u_short *) &original_address; + accumulate -= *sptr++; + accumulate -= *sptr; + +/* If this is a proxy, then modify the TCP source port and + checksum accumulation */ + if (proxy_port != 0) + { + accumulate += tc->th_sport; + tc->th_sport = proxy_port; + accumulate -= tc->th_sport; + + sptr = (u_short *) &pip->ip_src; + accumulate += *sptr++; + accumulate += *sptr; + sptr = (u_short *) &proxy_address; + accumulate -= *sptr++; + accumulate -= *sptr; + } + +/* See if ACK number needs to be modified */ + if (GetAckModified(link) == 1) + { + int delta; + + delta = GetDeltaAckIn(pip, link); + if (delta != 0) + { + sptr = (u_short *) &tc->th_ack; + accumulate += *sptr++; + accumulate += *sptr; + tc->th_ack = htonl(ntohl(tc->th_ack) - delta); + sptr = (u_short *) &tc->th_ack; + accumulate -= *sptr++; + accumulate -= *sptr; + } + } + + ADJUST_CHECKSUM(accumulate, tc->th_sum); + +/* Restore original IP address */ + sptr = (u_short *) &pip->ip_dst; + accumulate = *sptr++; + accumulate += *sptr; + pip->ip_dst = original_address; + sptr = (u_short *) &pip->ip_dst; + accumulate -= *sptr++; + accumulate -= *sptr; + +/* If this is a transparent proxy packet, then modify the source + address */ + if (proxy_address.s_addr != 0) + { + sptr = (u_short *) &pip->ip_src; + accumulate += *sptr++; + accumulate += *sptr; + pip->ip_src = proxy_address; + sptr = (u_short *) &pip->ip_src; + accumulate -= *sptr++; + accumulate -= *sptr; + } + + ADJUST_CHECKSUM(accumulate, pip->ip_sum); + +/* Monitor TCP connection state */ + TcpMonitorIn(pip, link); + + return(PKT_ALIAS_OK); + } + return(PKT_ALIAS_IGNORED); +} + +static int +TcpAliasOut(struct ip *pip, int maxpacketsize) +{ + int proxy_type; + u_short dest_port; + u_short proxy_server_port; + struct in_addr dest_address; + struct in_addr proxy_server_address; + struct tcphdr *tc; + struct alias_link *link; + + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + + proxy_type = ProxyCheck(pip, &proxy_server_address, &proxy_server_port); + + if (proxy_type == 0 && (packetAliasMode & PKT_ALIAS_PROXY_ONLY)) + return PKT_ALIAS_OK; + +/* If this is a transparent proxy, save original destination, + then alter the destination and adjust checksums */ + dest_port = tc->th_dport; + dest_address = pip->ip_dst; + if (proxy_type != 0) + { + int accumulate; + u_short *sptr; + + accumulate = tc->th_dport; + tc->th_dport = proxy_server_port; + accumulate -= tc->th_dport; + + sptr = (u_short *) &(pip->ip_dst); + accumulate += *sptr++; + accumulate += *sptr; + sptr = (u_short *) &proxy_server_address; + accumulate -= *sptr++; + accumulate -= *sptr; + + ADJUST_CHECKSUM(accumulate, tc->th_sum); + + sptr = (u_short *) &(pip->ip_dst); + accumulate = *sptr++; + accumulate += *sptr; + pip->ip_dst = proxy_server_address; + sptr = (u_short *) &(pip->ip_dst); + accumulate -= *sptr++; + accumulate -= *sptr; + + ADJUST_CHECKSUM(accumulate, pip->ip_sum); + } + + link = FindUdpTcpOut(pip->ip_src, pip->ip_dst, + tc->th_sport, tc->th_dport, + IPPROTO_TCP, 1); + if (link !=NULL) + { + u_short alias_port; + struct in_addr alias_address; + int accumulate; + u_short *sptr; + +/* Save original destination address, if this is a proxy packet. + Also modify packet to include destination encoding. */ + if (proxy_type != 0) + { + SetProxyPort(link, dest_port); + SetProxyAddress(link, dest_address); + ProxyModify(link, pip, maxpacketsize, proxy_type); + } + +/* Get alias address and port */ + alias_port = GetAliasPort(link); + alias_address = GetAliasAddress(link); + +/* Monitor TCP connection state */ + TcpMonitorOut(pip, link); + +/* Special processing for IP encoding protocols */ + if (ntohs(tc->th_dport) == FTP_CONTROL_PORT_NUMBER + || ntohs(tc->th_sport) == FTP_CONTROL_PORT_NUMBER) + AliasHandleFtpOut(pip, link, maxpacketsize); + else if (ntohs(tc->th_dport) == IRC_CONTROL_PORT_NUMBER_1 + || ntohs(tc->th_dport) == IRC_CONTROL_PORT_NUMBER_2) + AliasHandleIrcOut(pip, link, maxpacketsize); + else if (ntohs(tc->th_dport) == RTSP_CONTROL_PORT_NUMBER_1 + || ntohs(tc->th_sport) == RTSP_CONTROL_PORT_NUMBER_1 + || ntohs(tc->th_dport) == RTSP_CONTROL_PORT_NUMBER_2 + || ntohs(tc->th_sport) == RTSP_CONTROL_PORT_NUMBER_2) + AliasHandleRtspOut(pip, link, maxpacketsize); + else if (ntohs(tc->th_dport) == PPTP_CONTROL_PORT_NUMBER + || ntohs(tc->th_sport) == PPTP_CONTROL_PORT_NUMBER) + AliasHandlePptpOut(pip, link); + +/* Adjust TCP checksum since source port is being aliased */ +/* and source address is being altered */ + accumulate = tc->th_sport; + tc->th_sport = alias_port; + accumulate -= tc->th_sport; + + sptr = (u_short *) &(pip->ip_src); + accumulate += *sptr++; + accumulate += *sptr; + sptr = (u_short *) &alias_address; + accumulate -= *sptr++; + accumulate -= *sptr; + +/* Modify sequence number if necessary */ + if (GetAckModified(link) == 1) + { + int delta; + + delta = GetDeltaSeqOut(pip, link); + if (delta != 0) + { + sptr = (u_short *) &tc->th_seq; + accumulate += *sptr++; + accumulate += *sptr; + tc->th_seq = htonl(ntohl(tc->th_seq) + delta); + sptr = (u_short *) &tc->th_seq; + accumulate -= *sptr++; + accumulate -= *sptr; + } + } + + ADJUST_CHECKSUM(accumulate, tc->th_sum); + +/* Change source address */ + sptr = (u_short *) &(pip->ip_src); + accumulate = *sptr++; + accumulate += *sptr; + pip->ip_src = alias_address; + sptr = (u_short *) &(pip->ip_src); + accumulate -= *sptr++; + accumulate -= *sptr; + + ADJUST_CHECKSUM(accumulate, pip->ip_sum); + + return(PKT_ALIAS_OK); + } + return(PKT_ALIAS_IGNORED); +} + + + + +/* Fragment Handling + + FragmentIn() + FragmentOut() + +The packet aliasing module has a limited ability for handling IP +fragments. If the ICMP, TCP or UDP header is in the first fragment +received, then the ID number of the IP packet is saved, and other +fragments are identified according to their ID number and IP address +they were sent from. Pointers to unresolved fragments can also be +saved and recalled when a header fragment is seen. +*/ + +/* Local prototypes */ +static int FragmentIn(struct ip *); +static int FragmentOut(struct ip *); + + +static int +FragmentIn(struct ip *pip) +{ + struct alias_link *link; + + link = FindFragmentIn2(pip->ip_src, pip->ip_dst, pip->ip_id); + if (link != NULL) + { + struct in_addr original_address; + + GetFragmentAddr(link, &original_address); + DifferentialChecksum(&pip->ip_sum, + (u_short *) &original_address, + (u_short *) &pip->ip_dst, + 2); + pip->ip_dst = original_address; + + return(PKT_ALIAS_OK); + } + return(PKT_ALIAS_UNRESOLVED_FRAGMENT); +} + + +static int +FragmentOut(struct ip *pip) +{ + struct in_addr alias_address; + + alias_address = FindAliasAddress(pip->ip_src); + DifferentialChecksum(&pip->ip_sum, + (u_short *) &alias_address, + (u_short *) &pip->ip_src, + 2); + pip->ip_src = alias_address; + + return(PKT_ALIAS_OK); +} + + + + + + +/* Outside World Access + + PacketAliasSaveFragment() + PacketAliasGetFragment() + PacketAliasFragmentIn() + PacketAliasIn() + PacketAliasOut() + PacketUnaliasOut() + +(prototypes in alias.h) +*/ + + +int +PacketAliasSaveFragment(char *ptr) +{ + int iresult; + struct alias_link *link; + struct ip *pip; + + pip = (struct ip *) ptr; + link = AddFragmentPtrLink(pip->ip_src, pip->ip_id); + iresult = PKT_ALIAS_ERROR; + if (link != NULL) + { + SetFragmentPtr(link, ptr); + iresult = PKT_ALIAS_OK; + } + return(iresult); +} + + +char * +PacketAliasGetFragment(char *ptr) +{ + struct alias_link *link; + char *fptr; + struct ip *pip; + + pip = (struct ip *) ptr; + link = FindFragmentPtr(pip->ip_src, pip->ip_id); + if (link != NULL) + { + GetFragmentPtr(link, &fptr); + SetFragmentPtr(link, NULL); + SetExpire(link, 0); /* Deletes link */ + + return(fptr); + } + else + { + return(NULL); + } +} + + +void +PacketAliasFragmentIn(char *ptr, /* Points to correctly de-aliased + header fragment */ + char *ptr_fragment /* Points to fragment which must + be de-aliased */ + ) +{ + struct ip *pip; + struct ip *fpip; + + pip = (struct ip *) ptr; + fpip = (struct ip *) ptr_fragment; + + DifferentialChecksum(&fpip->ip_sum, + (u_short *) &pip->ip_dst, + (u_short *) &fpip->ip_dst, + 2); + fpip->ip_dst = pip->ip_dst; +} + + +int +PacketAliasIn(char *ptr, int maxpacketsize) +{ + struct in_addr alias_addr; + struct ip *pip; + int iresult; + + if (packetAliasMode & PKT_ALIAS_REVERSE) { + packetAliasMode &= ~PKT_ALIAS_REVERSE; + iresult = PacketAliasOut(ptr, maxpacketsize); + packetAliasMode |= PKT_ALIAS_REVERSE; + return iresult; + } + + HouseKeeping(); + ClearCheckNewLink(); + pip = (struct ip *) ptr; + alias_addr = pip->ip_dst; + + /* Defense against mangled packets */ + if (ntohs(pip->ip_len) > maxpacketsize + || (pip->ip_hl<<2) > maxpacketsize) + return PKT_ALIAS_IGNORED; + + iresult = PKT_ALIAS_IGNORED; + if ( (ntohs(pip->ip_off) & IP_OFFMASK) == 0 ) + { + switch (pip->ip_p) + { + case IPPROTO_ICMP: + iresult = IcmpAliasIn(pip); + break; + case IPPROTO_UDP: + iresult = UdpAliasIn(pip); + break; + case IPPROTO_TCP: + iresult = TcpAliasIn(pip); + break; + case IPPROTO_GRE: + if (packetAliasMode & PKT_ALIAS_PROXY_ONLY || + AliasHandlePptpGreIn(pip) == 0) + iresult = PKT_ALIAS_OK; + else + iresult = ProtoAliasIn(pip); + break; + default: + iresult = ProtoAliasIn(pip); + break; + } + + if (ntohs(pip->ip_off) & IP_MF) + { + struct alias_link *link; + + link = FindFragmentIn1(pip->ip_src, alias_addr, pip->ip_id); + if (link != NULL) + { + iresult = PKT_ALIAS_FOUND_HEADER_FRAGMENT; + SetFragmentAddr(link, pip->ip_dst); + } + else + { + iresult = PKT_ALIAS_ERROR; + } + } + } + else + { + iresult = FragmentIn(pip); + } + + return(iresult); +} + + + +/* Unregistered address ranges */ + +/* 10.0.0.0 -> 10.255.255.255 */ +#define UNREG_ADDR_A_LOWER 0x0a000000 +#define UNREG_ADDR_A_UPPER 0x0affffff + +/* 172.16.0.0 -> 172.31.255.255 */ +#define UNREG_ADDR_B_LOWER 0xac100000 +#define UNREG_ADDR_B_UPPER 0xac1fffff + +/* 192.168.0.0 -> 192.168.255.255 */ +#define UNREG_ADDR_C_LOWER 0xc0a80000 +#define UNREG_ADDR_C_UPPER 0xc0a8ffff + +int +PacketAliasOut(char *ptr, /* valid IP packet */ + int maxpacketsize /* How much the packet data may grow + (FTP and IRC inline changes) */ + ) +{ + int iresult; + struct in_addr addr_save; + struct ip *pip; + + if (packetAliasMode & PKT_ALIAS_REVERSE) { + packetAliasMode &= ~PKT_ALIAS_REVERSE; + iresult = PacketAliasIn(ptr, maxpacketsize); + packetAliasMode |= PKT_ALIAS_REVERSE; + return iresult; + } + + HouseKeeping(); + ClearCheckNewLink(); + pip = (struct ip *) ptr; + + /* Defense against mangled packets */ + if (ntohs(pip->ip_len) > maxpacketsize + || (pip->ip_hl<<2) > maxpacketsize) + return PKT_ALIAS_IGNORED; + + addr_save = GetDefaultAliasAddress(); + if (packetAliasMode & PKT_ALIAS_UNREGISTERED_ONLY) + { + u_long addr; + int iclass; + + iclass = 0; + addr = ntohl(pip->ip_src.s_addr); + if (addr >= UNREG_ADDR_C_LOWER && addr <= UNREG_ADDR_C_UPPER) + iclass = 3; + else if (addr >= UNREG_ADDR_B_LOWER && addr <= UNREG_ADDR_B_UPPER) + iclass = 2; + else if (addr >= UNREG_ADDR_A_LOWER && addr <= UNREG_ADDR_A_UPPER) + iclass = 1; + + if (iclass == 0) + { + SetDefaultAliasAddress(pip->ip_src); + } + } + + iresult = PKT_ALIAS_IGNORED; + if ((ntohs(pip->ip_off) & IP_OFFMASK) == 0) + { + switch (pip->ip_p) + { + case IPPROTO_ICMP: + iresult = IcmpAliasOut(pip); + break; + case IPPROTO_UDP: + iresult = UdpAliasOut(pip); + break; + case IPPROTO_TCP: + iresult = TcpAliasOut(pip, maxpacketsize); + break; + case IPPROTO_GRE: + if (AliasHandlePptpGreOut(pip) == 0) + iresult = PKT_ALIAS_OK; + else + iresult = ProtoAliasOut(pip); + break; + default: + iresult = ProtoAliasOut(pip); + break; + } + } + else + { + iresult = FragmentOut(pip); + } + + SetDefaultAliasAddress(addr_save); + return(iresult); +} + +int +PacketUnaliasOut(char *ptr, /* valid IP packet */ + int maxpacketsize /* for error checking */ + ) +{ + struct ip *pip; + struct icmp *ic; + struct udphdr *ud; + struct tcphdr *tc; + struct alias_link *link; + int iresult = PKT_ALIAS_IGNORED; + + pip = (struct ip *) ptr; + + /* Defense against mangled packets */ + if (ntohs(pip->ip_len) > maxpacketsize + || (pip->ip_hl<<2) > maxpacketsize) + return(iresult); + + ud = (struct udphdr *) ((char *) pip + (pip->ip_hl << 2)); + tc = (struct tcphdr *) ud; + ic = (struct icmp *) ud; + + /* Find a link */ + if (pip->ip_p == IPPROTO_UDP) + link = FindUdpTcpIn(pip->ip_dst, pip->ip_src, + ud->uh_dport, ud->uh_sport, + IPPROTO_UDP, 0); + else if (pip->ip_p == IPPROTO_TCP) + link = FindUdpTcpIn(pip->ip_dst, pip->ip_src, + tc->th_dport, tc->th_sport, + IPPROTO_TCP, 0); + else if (pip->ip_p == IPPROTO_ICMP) + link = FindIcmpIn(pip->ip_dst, pip->ip_src, ic->icmp_id, 0); + else + link = NULL; + + /* Change it from an aliased packet to an unaliased packet */ + if (link != NULL) + { + if (pip->ip_p == IPPROTO_UDP || pip->ip_p == IPPROTO_TCP) + { + u_short *sptr; + int accumulate; + struct in_addr original_address; + u_short original_port; + + original_address = GetOriginalAddress(link); + original_port = GetOriginalPort(link); + + /* Adjust TCP/UDP checksum */ + sptr = (u_short *) &(pip->ip_src); + accumulate = *sptr++; + accumulate += *sptr; + sptr = (u_short *) &original_address; + accumulate -= *sptr++; + accumulate -= *sptr; + + if (pip->ip_p == IPPROTO_UDP) { + accumulate += ud->uh_sport; + accumulate -= original_port; + ADJUST_CHECKSUM(accumulate, ud->uh_sum); + } else { + accumulate += tc->th_sport; + accumulate -= original_port; + ADJUST_CHECKSUM(accumulate, tc->th_sum); + } + + /* Adjust IP checksum */ + DifferentialChecksum(&pip->ip_sum, + (u_short *) &original_address, + (u_short *) &pip->ip_src, + 2); + + /* Un-alias source address and port number */ + pip->ip_src = original_address; + if (pip->ip_p == IPPROTO_UDP) + ud->uh_sport = original_port; + else + tc->th_sport = original_port; + + iresult = PKT_ALIAS_OK; + + } else if (pip->ip_p == IPPROTO_ICMP) { + + u_short *sptr; + int accumulate; + struct in_addr original_address; + u_short original_id; + + original_address = GetOriginalAddress(link); + original_id = GetOriginalPort(link); + + /* Adjust ICMP checksum */ + sptr = (u_short *) &(pip->ip_src); + accumulate = *sptr++; + accumulate += *sptr; + sptr = (u_short *) &original_address; + accumulate -= *sptr++; + accumulate -= *sptr; + accumulate += ic->icmp_id; + accumulate -= original_id; + ADJUST_CHECKSUM(accumulate, ic->icmp_cksum); + + /* Adjust IP checksum */ + DifferentialChecksum(&pip->ip_sum, + (u_short *) &original_address, + (u_short *) &pip->ip_src, + 2); + + /* Un-alias source address and port number */ + pip->ip_src = original_address; + ic->icmp_id = original_id; + + iresult = PKT_ALIAS_OK; + } + } + return(iresult); + +} diff --git a/sys/netinet/libalias/alias.h b/sys/netinet/libalias/alias.h new file mode 100644 index 0000000..607021f --- /dev/null +++ b/sys/netinet/libalias/alias.h @@ -0,0 +1,156 @@ +/* lint -save -library Flexelint comment for external headers */ + +/*- + * Alias.h defines the outside world interfaces for the packet aliasing + * software. + * + * This software is placed into the public domain with no restrictions on its + * distribution. + * + * $FreeBSD$ + */ + +#ifndef _ALIAS_H_ +#define _ALIAS_H_ + +/* The external interface to libalias, the packet aliasing engine. */ + +/* Initialization and control functions. */ +void PacketAliasInit(void); +void PacketAliasSetAddress(struct in_addr _addr); +void PacketAliasSetFWBase(unsigned int _base, unsigned int _num); +unsigned int + PacketAliasSetMode(unsigned int _flags, unsigned int _mask); +void PacketAliasUninit(void); + +/* Packet Handling functions. */ +int PacketAliasIn(char *_ptr, int _maxpacketsize); +int PacketAliasOut(char *_ptr, int _maxpacketsize); +int PacketUnaliasOut(char *_ptr, int _maxpacketsize); + +/* Port and address redirection functions. */ + +/* + * An anonymous structure, a pointer to which is returned from + * PacketAliasRedirectAddr(), PacketAliasRedirectPort() or + * PacketAliasRedirectProto(), passed to PacketAliasAddServer(), + * and freed by PacketAliasRedirectDelete(). + */ +struct alias_link; + +int PacketAliasAddServer(struct alias_link *_link, + struct in_addr _addr, unsigned short _port); +struct alias_link * + PacketAliasRedirectAddr(struct in_addr _src_addr, + struct in_addr _alias_addr); +void PacketAliasRedirectDelete(struct alias_link *_link); +struct alias_link * + PacketAliasRedirectPort(struct in_addr _src_addr, + unsigned short _src_port, struct in_addr _dst_addr, + unsigned short _dst_port, struct in_addr _alias_addr, + unsigned short _alias_port, unsigned char _proto); +struct alias_link * + PacketAliasRedirectProto(struct in_addr _src_addr, + struct in_addr _dst_addr, struct in_addr _alias_addr, + unsigned char _proto); + +/* Fragment Handling functions. */ +void PacketAliasFragmentIn(char *_ptr, char *_ptr_fragment); +char *PacketAliasGetFragment(char *_ptr); +int PacketAliasSaveFragment(char *_ptr); + +/* Miscellaneous functions. */ +int PacketAliasCheckNewLink(void); +unsigned short + PacketAliasInternetChecksum(unsigned short *_ptr, int _nbytes); +void PacketAliasSetTarget(struct in_addr _target_addr); + +/* Transparent proxying routines. */ +int PacketAliasProxyRule(const char *_cmd); + +/* Mode flags, set using PacketAliasSetMode() */ + +/* + * If PKT_ALIAS_LOG is set, a message will be printed to /var/log/alias.log + * every time a link is created or deleted. This is useful for debugging. + */ +#define PKT_ALIAS_LOG 0x01 + +/* + * If PKT_ALIAS_DENY_INCOMING is set, then incoming connections (e.g. to ftp, + * telnet or web servers will be prevented by the aliasing mechanism. + */ +#define PKT_ALIAS_DENY_INCOMING 0x02 + +/* + * If PKT_ALIAS_SAME_PORTS is set, packets will be attempted sent from the + * same port as they originated on. This allows e.g. rsh to work *99% of the + * time*, but _not_ 100% (it will be slightly flakey instead of not working + * at all). This mode bit is set by PacketAliasInit(), so it is a default + * mode of operation. + */ +#define PKT_ALIAS_SAME_PORTS 0x04 + +/* + * If PKT_ALIAS_USE_SOCKETS is set, then when partially specified links (e.g. + * destination port and/or address is zero), the packet aliasing engine will + * attempt to allocate a socket for the aliasing port it chooses. This will + * avoid interference with the host machine. Fully specified links do not + * require this. This bit is set after a call to PacketAliasInit(), so it is + * a default mode of operation. + */ +#define PKT_ALIAS_USE_SOCKETS 0x08 + +/*- + * If PKT_ALIAS_UNREGISTERED_ONLY is set, then only packets with + * unregistered source addresses will be aliased. Private + * addresses are those in the following ranges: + * + * 10.0.0.0 -> 10.255.255.255 + * 172.16.0.0 -> 172.31.255.255 + * 192.168.0.0 -> 192.168.255.255 + */ +#define PKT_ALIAS_UNREGISTERED_ONLY 0x10 + +/* + * If PKT_ALIAS_RESET_ON_ADDR_CHANGE is set, then the table of dynamic + * aliasing links will be reset whenever PacketAliasSetAddress() changes the + * default aliasing address. If the default aliasing address is left + * unchanged by this function call, then the table of dynamic aliasing links + * will be left intact. This bit is set after a call to PacketAliasInit(). + */ +#define PKT_ALIAS_RESET_ON_ADDR_CHANGE 0x20 + +#ifndef NO_FW_PUNCH +/* + * If PKT_ALIAS_PUNCH_FW is set, active FTP and IRC DCC connections will + * create a 'hole' in the firewall to allow the transfers to work. The + * ipfw rule number that the hole is created with is controlled by + * PacketAliasSetFWBase(). The hole will be attached to that + * particular alias_link, so when the link goes away the hole is deleted. + */ +#define PKT_ALIAS_PUNCH_FW 0x100 +#endif + +/* + * If PKT_ALIAS_PROXY_ONLY is set, then NAT will be disabled and only + * transparent proxying is performed. + */ +#define PKT_ALIAS_PROXY_ONLY 0x40 + +/* + * If PKT_ALIAS_REVERSE is set, the actions of PacketAliasIn() and + * PacketAliasOut() are reversed. + */ +#define PKT_ALIAS_REVERSE 0x80 + +/* Function return codes. */ +#define PKT_ALIAS_ERROR -1 +#define PKT_ALIAS_OK 1 +#define PKT_ALIAS_IGNORED 2 +#define PKT_ALIAS_UNRESOLVED_FRAGMENT 3 +#define PKT_ALIAS_FOUND_HEADER_FRAGMENT 4 + +#endif /* !_ALIAS_H_ */ + +/* lint -restore */ diff --git a/sys/netinet/libalias/alias_cuseeme.c b/sys/netinet/libalias/alias_cuseeme.c new file mode 100644 index 0000000..1a7fbf1 --- /dev/null +++ b/sys/netinet/libalias/alias_cuseeme.c @@ -0,0 +1,120 @@ +/*- + * Copyright (c) 1998 Brian Somers <brian@Awfulhak.org> + * with the aid of code written by + * Junichi SATOH <junichi@astec.co.jp> 1996, 1997. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/types.h> +#include <netinet/in_systm.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/udp.h> + +#include "alias_local.h" + +/* CU-SeeMe Data Header */ +struct cu_header { + u_int16_t dest_family; + u_int16_t dest_port; + u_int32_t dest_addr; + int16_t family; + u_int16_t port; + u_int32_t addr; + u_int32_t seq; + u_int16_t msg; + u_int16_t data_type; + u_int16_t packet_len; +}; + +/* Open Continue Header */ +struct oc_header { + u_int16_t client_count; /* Number of client info structs */ + u_int32_t seq_no; + char user_name[20]; + char reserved[4]; /* flags, version stuff, etc */ +}; + +/* client info structures */ +struct client_info { + u_int32_t address; /* Client address */ + char reserved[8]; /* Flags, pruning bitfield, packet counts etc */ +}; + +void +AliasHandleCUSeeMeOut(struct ip *pip, struct alias_link *link) +{ + struct udphdr *ud; + + ud = (struct udphdr *)((char *)pip + (pip->ip_hl << 2)); + if (ntohs(ud->uh_ulen) - sizeof(struct udphdr) >= sizeof(struct cu_header)) { + struct cu_header *cu; + struct alias_link *cu_link; + + cu = (struct cu_header *)(ud + 1); + if (cu->addr) + cu->addr = (u_int32_t)GetAliasAddress(link).s_addr; + + cu_link = FindUdpTcpOut(pip->ip_src, GetDestAddress(link), + ud->uh_dport, 0, IPPROTO_UDP, 1); + +#ifndef NO_FW_PUNCH + if (cu_link) + PunchFWHole(cu_link); +#endif + } +} + +void +AliasHandleCUSeeMeIn(struct ip *pip, struct in_addr original_addr) +{ + struct in_addr alias_addr; + struct udphdr *ud; + struct cu_header *cu; + struct oc_header *oc; + struct client_info *ci; + char *end; + int i; + + alias_addr.s_addr = pip->ip_dst.s_addr; + ud = (struct udphdr *)((char *)pip + (pip->ip_hl << 2)); + cu = (struct cu_header *)(ud + 1); + oc = (struct oc_header *)(cu + 1); + ci = (struct client_info *)(oc + 1); + end = (char *)ud + ntohs(ud->uh_ulen); + + if ((char *)oc <= end) { + if(cu->dest_addr) + cu->dest_addr = (u_int32_t)original_addr.s_addr; + if(ntohs(cu->data_type) == 101) + /* Find and change our address */ + for(i = 0; (char *)(ci + 1) <= end && i < oc->client_count; i++, ci++) + if(ci->address == (u_int32_t)alias_addr.s_addr) { + ci->address = (u_int32_t)original_addr.s_addr; + break; + } + } +} diff --git a/sys/netinet/libalias/alias_db.c b/sys/netinet/libalias/alias_db.c new file mode 100644 index 0000000..60d425f --- /dev/null +++ b/sys/netinet/libalias/alias_db.c @@ -0,0 +1,2788 @@ +/* -*- mode: c; tab-width: 8; c-basic-indent: 4; -*- + Alias_db.c encapsulates all data structures used for storing + packet aliasing data. Other parts of the aliasing software + access data through functions provided in this file. + + Data storage is based on the notion of a "link", which is + established for ICMP echo/reply packets, UDP datagrams and + TCP stream connections. A link stores the original source + and destination addresses. For UDP and TCP, it also stores + source and destination port numbers, as well as an alias + port number. Links are also used to store information about + fragments. + + There is a facility for sweeping through and deleting old + links as new packets are sent through. A simple timeout is + used for ICMP and UDP links. TCP links are left alone unless + there is an incomplete connection, in which case the link + can be deleted after a certain amount of time. + + + This software is placed into the public domain with no restrictions + on its distribution. + + Initial version: August, 1996 (cjm) + + Version 1.4: September 16, 1996 (cjm) + Facility for handling incoming links added. + + Version 1.6: September 18, 1996 (cjm) + ICMP data handling simplified. + + Version 1.7: January 9, 1997 (cjm) + Fragment handling simplified. + Saves pointers for unresolved fragments. + Permits links for unspecified remote ports + or unspecified remote addresses. + Fixed bug which did not properly zero port + table entries after a link was deleted. + Cleaned up some obsolete comments. + + Version 1.8: January 14, 1997 (cjm) + Fixed data type error in StartPoint(). + (This error did not exist prior to v1.7 + and was discovered and fixed by Ari Suutari) + + Version 1.9: February 1, 1997 + Optionally, connections initiated from packet aliasing host + machine will will not have their port number aliased unless it + conflicts with an aliasing port already being used. (cjm) + + All options earlier being #ifdef'ed are now available through + a new interface, SetPacketAliasMode(). This allows run time + control (which is now available in PPP+pktAlias through the + 'alias' keyword). (ee) + + Added ability to create an alias port without + either destination address or port specified. + port type = ALIAS_PORT_UNKNOWN_DEST_ALL (ee) + + Removed K&R style function headers + and general cleanup. (ee) + + Added packetAliasMode to replace compiler #defines's (ee) + + Allocates sockets for partially specified + ports if ALIAS_USE_SOCKETS defined. (cjm) + + Version 2.0: March, 1997 + SetAliasAddress() will now clean up alias links + if the aliasing address is changed. (cjm) + + PacketAliasPermanentLink() function added to support permanent + links. (J. Fortes suggested the need for this.) + Examples: + + (192.168.0.1, port 23) <-> alias port 6002, unknown dest addr/port + + (192.168.0.2, port 21) <-> alias port 3604, known dest addr + unknown dest port + + These permanent links allow for incoming connections to + machines on the local network. They can be given with a + user-chosen amount of specificity, with increasing specificity + meaning more security. (cjm) + + Quite a bit of rework to the basic engine. The portTable[] + array, which kept track of which ports were in use was replaced + by a table/linked list structure. (cjm) + + SetExpire() function added. (cjm) + + DeleteLink() no longer frees memory association with a pointer + to a fragment (this bug was first recognized by E. Eklund in + v1.9). + + Version 2.1: May, 1997 (cjm) + Packet aliasing engine reworked so that it can handle + multiple external addresses rather than just a single + host address. + + PacketAliasRedirectPort() and PacketAliasRedirectAddr() + added to the API. The first function is a more generalized + version of PacketAliasPermanentLink(). The second function + implements static network address translation. + + Version 3.2: July, 2000 (salander and satoh) + Added FindNewPortGroup to get contiguous range of port values. + + Added QueryUdpTcpIn and QueryUdpTcpOut to look for an aliasing + link but not actually add one. + + Added FindRtspOut, which is closely derived from FindUdpTcpOut, + except that the alias port (from FindNewPortGroup) is provided + as input. + + See HISTORY file for additional revisions. + + $FreeBSD$ +*/ + + +/* System include files */ +#include <errno.h> +#include <stdlib.h> +#include <stdio.h> +#include <unistd.h> + +#include <sys/queue.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <sys/types.h> + +/* BSD network include files */ +#include <netinet/in_systm.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/tcp.h> +#include <arpa/inet.h> + +#include "alias.h" +#include "alias_local.h" + + + +/* + Constants (note: constants are also defined + near relevant functions or structs) +*/ + +/* Sizes of input and output link tables */ +#define LINK_TABLE_OUT_SIZE 101 +#define LINK_TABLE_IN_SIZE 4001 + +/* Parameters used for cleanup of expired links */ +#define ALIAS_CLEANUP_INTERVAL_SECS 60 +#define ALIAS_CLEANUP_MAX_SPOKES 30 + +/* Timeouts (in seconds) for different link types */ +#define ICMP_EXPIRE_TIME 60 +#define UDP_EXPIRE_TIME 60 +#define PROTO_EXPIRE_TIME 60 +#define FRAGMENT_ID_EXPIRE_TIME 10 +#define FRAGMENT_PTR_EXPIRE_TIME 30 + +/* TCP link expire time for different cases */ +/* When the link has been used and closed - minimal grace time to + allow ACKs and potential re-connect in FTP (XXX - is this allowed?) */ +#ifndef TCP_EXPIRE_DEAD +# define TCP_EXPIRE_DEAD 10 +#endif + +/* When the link has been used and closed on one side - the other side + is allowed to still send data */ +#ifndef TCP_EXPIRE_SINGLEDEAD +# define TCP_EXPIRE_SINGLEDEAD 90 +#endif + +/* When the link isn't yet up */ +#ifndef TCP_EXPIRE_INITIAL +# define TCP_EXPIRE_INITIAL 300 +#endif + +/* When the link is up */ +#ifndef TCP_EXPIRE_CONNECTED +# define TCP_EXPIRE_CONNECTED 86400 +#endif + + +/* Dummy port number codes used for FindLinkIn/Out() and AddLink(). + These constants can be anything except zero, which indicates an + unknown port number. */ + +#define NO_DEST_PORT 1 +#define NO_SRC_PORT 1 + + + +/* Data Structures + + The fundamental data structure used in this program is + "struct alias_link". Whenever a TCP connection is made, + a UDP datagram is sent out, or an ICMP echo request is made, + a link record is made (if it has not already been created). + The link record is identified by the source address/port + and the destination address/port. In the case of an ICMP + echo request, the source port is treated as being equivalent + with the 16-bit ID number of the ICMP packet. + + The link record also can store some auxiliary data. For + TCP connections that have had sequence and acknowledgment + modifications, data space is available to track these changes. + A state field is used to keep track in changes to the TCP + connection state. ID numbers of fragments can also be + stored in the auxiliary space. Pointers to unresolved + fragments can also be stored. + + The link records support two independent chainings. Lookup + tables for input and out tables hold the initial pointers + the link chains. On input, the lookup table indexes on alias + port and link type. On output, the lookup table indexes on + source address, destination address, source port, destination + port and link type. +*/ + +struct ack_data_record /* used to save changes to ACK/sequence numbers */ +{ + u_long ack_old; + u_long ack_new; + int delta; + int active; +}; + +struct tcp_state /* Information about TCP connection */ +{ + int in; /* State for outside -> inside */ + int out; /* State for inside -> outside */ + int index; /* Index to ACK data array */ + int ack_modified; /* Indicates whether ACK and sequence numbers */ + /* been modified */ +}; + +#define N_LINK_TCP_DATA 3 /* Number of distinct ACK number changes + saved for a modified TCP stream */ +struct tcp_dat +{ + struct tcp_state state; + struct ack_data_record ack[N_LINK_TCP_DATA]; + int fwhole; /* Which firewall record is used for this hole? */ +}; + +struct server /* LSNAT server pool (circular list) */ +{ + struct in_addr addr; + u_short port; + struct server *next; +}; + +struct alias_link /* Main data structure */ +{ + struct in_addr src_addr; /* Address and port information */ + struct in_addr dst_addr; + struct in_addr alias_addr; + struct in_addr proxy_addr; + u_short src_port; + u_short dst_port; + u_short alias_port; + u_short proxy_port; + struct server *server; + + int link_type; /* Type of link: TCP, UDP, ICMP, proto, frag */ + +/* values for link_type */ +#define LINK_ICMP IPPROTO_ICMP +#define LINK_UDP IPPROTO_UDP +#define LINK_TCP IPPROTO_TCP +#define LINK_FRAGMENT_ID (IPPROTO_MAX + 1) +#define LINK_FRAGMENT_PTR (IPPROTO_MAX + 2) +#define LINK_ADDR (IPPROTO_MAX + 3) +#define LINK_PPTP (IPPROTO_MAX + 4) + + int flags; /* indicates special characteristics */ + +/* flag bits */ +#define LINK_UNKNOWN_DEST_PORT 0x01 +#define LINK_UNKNOWN_DEST_ADDR 0x02 +#define LINK_PERMANENT 0x04 +#define LINK_PARTIALLY_SPECIFIED 0x03 /* logical-or of first two bits */ +#define LINK_UNFIREWALLED 0x08 +#define LINK_LAST_LINE_CRLF_TERMED 0x10 + + int timestamp; /* Time link was last accessed */ + int expire_time; /* Expire time for link */ + + int sockfd; /* socket descriptor */ + + LIST_ENTRY(alias_link) list_out; /* Linked list of pointers for */ + LIST_ENTRY(alias_link) list_in; /* input and output lookup tables */ + + union /* Auxiliary data */ + { + char *frag_ptr; + struct in_addr frag_addr; + struct tcp_dat *tcp; + } data; +}; + + + + + +/* Global Variables + + The global variables listed here are only accessed from + within alias_db.c and so are prefixed with the static + designation. +*/ + +int packetAliasMode; /* Mode flags */ + /* - documented in alias.h */ + +static struct in_addr aliasAddress; /* Address written onto source */ + /* field of IP packet. */ + +static struct in_addr targetAddress; /* IP address incoming packets */ + /* are sent to if no aliasing */ + /* link already exists */ + +static struct in_addr nullAddress; /* Used as a dummy parameter for */ + /* some function calls */ +static LIST_HEAD(, alias_link) +linkTableOut[LINK_TABLE_OUT_SIZE]; /* Lookup table of pointers to */ + /* chains of link records. Each */ +static LIST_HEAD(, alias_link) /* link record is doubly indexed */ +linkTableIn[LINK_TABLE_IN_SIZE]; /* into input and output lookup */ + /* tables. */ + +static int icmpLinkCount; /* Link statistics */ +static int udpLinkCount; +static int tcpLinkCount; +static int pptpLinkCount; +static int protoLinkCount; +static int fragmentIdLinkCount; +static int fragmentPtrLinkCount; +static int sockCount; + +static int cleanupIndex; /* Index to chain of link table */ + /* being inspected for old links */ + +static int timeStamp; /* System time in seconds for */ + /* current packet */ + +static int lastCleanupTime; /* Last time IncrementalCleanup() */ + /* was called */ + +static int houseKeepingResidual; /* used by HouseKeeping() */ + +static int deleteAllLinks; /* If equal to zero, DeleteLink() */ + /* will not remove permanent links */ + +static FILE *monitorFile; /* File descriptor for link */ + /* statistics monitoring file */ + +static int newDefaultLink; /* Indicates if a new aliasing */ + /* link has been created after a */ + /* call to PacketAliasIn/Out(). */ + +#ifndef NO_FW_PUNCH +static int fireWallFD = -1; /* File descriptor to be able to */ + /* control firewall. Opened by */ + /* PacketAliasSetMode on first */ + /* setting the PKT_ALIAS_PUNCH_FW */ + /* flag. */ +#endif + + + + + + + +/* Internal utility routines (used only in alias_db.c) + +Lookup table starting points: + StartPointIn() -- link table initial search point for + incoming packets + StartPointOut() -- link table initial search point for + outgoing packets + +Miscellaneous: + SeqDiff() -- difference between two TCP sequences + ShowAliasStats() -- send alias statistics to a monitor file +*/ + + +/* Local prototypes */ +static u_int StartPointIn(struct in_addr, u_short, int); + +static u_int StartPointOut(struct in_addr, struct in_addr, + u_short, u_short, int); + +static int SeqDiff(u_long, u_long); + +static void ShowAliasStats(void); + +#ifndef NO_FW_PUNCH +/* Firewall control */ +static void InitPunchFW(void); +static void UninitPunchFW(void); +static void ClearFWHole(struct alias_link *link); +#endif + +/* Log file control */ +static void InitPacketAliasLog(void); +static void UninitPacketAliasLog(void); + +static u_int +StartPointIn(struct in_addr alias_addr, + u_short alias_port, + int link_type) +{ + u_int n; + + n = alias_addr.s_addr; + if (link_type != LINK_PPTP) + n += alias_port; + n += link_type; + return(n % LINK_TABLE_IN_SIZE); +} + + +static u_int +StartPointOut(struct in_addr src_addr, struct in_addr dst_addr, + u_short src_port, u_short dst_port, int link_type) +{ + u_int n; + + n = src_addr.s_addr; + n += dst_addr.s_addr; + if (link_type != LINK_PPTP) { + n += src_port; + n += dst_port; + } + n += link_type; + + return(n % LINK_TABLE_OUT_SIZE); +} + + +static int +SeqDiff(u_long x, u_long y) +{ +/* Return the difference between two TCP sequence numbers */ + +/* + This function is encapsulated in case there are any unusual + arithmetic conditions that need to be considered. +*/ + + return (ntohl(y) - ntohl(x)); +} + + +static void +ShowAliasStats(void) +{ +/* Used for debugging */ + + if (monitorFile) + { + fprintf(monitorFile, "icmp=%d, udp=%d, tcp=%d, pptp=%d, proto=%d, frag_id=%d frag_ptr=%d", + icmpLinkCount, + udpLinkCount, + tcpLinkCount, + pptpLinkCount, + protoLinkCount, + fragmentIdLinkCount, + fragmentPtrLinkCount); + + fprintf(monitorFile, " / tot=%d (sock=%d)\n", + icmpLinkCount + udpLinkCount + + tcpLinkCount + + pptpLinkCount + + protoLinkCount + + fragmentIdLinkCount + + fragmentPtrLinkCount, + sockCount); + + fflush(monitorFile); + } +} + + + + + +/* Internal routines for finding, deleting and adding links + +Port Allocation: + GetNewPort() -- find and reserve new alias port number + GetSocket() -- try to allocate a socket for a given port + +Link creation and deletion: + CleanupAliasData() - remove all link chains from lookup table + IncrementalCleanup() - look for stale links in a single chain + DeleteLink() - remove link + AddLink() - add link + ReLink() - change link + +Link search: + FindLinkOut() - find link for outgoing packets + FindLinkIn() - find link for incoming packets + +Port search: + FindNewPortGroup() - find an available group of ports +*/ + +/* Local prototypes */ +static int GetNewPort(struct alias_link *, int); + +static u_short GetSocket(u_short, int *, int); + +static void CleanupAliasData(void); + +static void IncrementalCleanup(void); + +static void DeleteLink(struct alias_link *); + +static struct alias_link * +AddLink(struct in_addr, struct in_addr, struct in_addr, + u_short, u_short, int, int); + +static struct alias_link * +ReLink(struct alias_link *, + struct in_addr, struct in_addr, struct in_addr, + u_short, u_short, int, int); + +static struct alias_link * +FindLinkOut(struct in_addr, struct in_addr, u_short, u_short, int, int); + +static struct alias_link * +FindLinkIn(struct in_addr, struct in_addr, u_short, u_short, int, int); + + +#define ALIAS_PORT_BASE 0x08000 +#define ALIAS_PORT_MASK 0x07fff +#define ALIAS_PORT_MASK_EVEN 0x07ffe +#define GET_NEW_PORT_MAX_ATTEMPTS 20 + +#define GET_ALIAS_PORT -1 +#define GET_ALIAS_ID GET_ALIAS_PORT + +#define FIND_EVEN_ALIAS_BASE 1 + +/* GetNewPort() allocates port numbers. Note that if a port number + is already in use, that does not mean that it cannot be used by + another link concurrently. This is because GetNewPort() looks for + unused triplets: (dest addr, dest port, alias port). */ + +static int +GetNewPort(struct alias_link *link, int alias_port_param) +{ + int i; + int max_trials; + u_short port_sys; + u_short port_net; + +/* + Description of alias_port_param for GetNewPort(). When + this parameter is zero or positive, it precisely specifies + the port number. GetNewPort() will return this number + without check that it is in use. + + When this parameter is GET_ALIAS_PORT, it indicates to get a randomly + selected port number. +*/ + + if (alias_port_param == GET_ALIAS_PORT) + { + /* + * The aliasing port is automatically selected + * by one of two methods below: + */ + max_trials = GET_NEW_PORT_MAX_ATTEMPTS; + + if (packetAliasMode & PKT_ALIAS_SAME_PORTS) + { + /* + * When the PKT_ALIAS_SAME_PORTS option is + * chosen, the first try will be the + * actual source port. If this is already + * in use, the remainder of the trials + * will be random. + */ + port_net = link->src_port; + port_sys = ntohs(port_net); + } + else + { + /* First trial and all subsequent are random. */ + port_sys = random() & ALIAS_PORT_MASK; + port_sys += ALIAS_PORT_BASE; + port_net = htons(port_sys); + } + } + else if (alias_port_param >= 0 && alias_port_param < 0x10000) + { + link->alias_port = (u_short) alias_port_param; + return(0); + } + else + { +#ifdef DEBUG + fprintf(stderr, "PacketAlias/GetNewPort(): "); + fprintf(stderr, "input parameter error\n"); +#endif + return(-1); + } + + +/* Port number search */ + for (i=0; i<max_trials; i++) + { + int go_ahead; + struct alias_link *search_result; + + search_result = FindLinkIn(link->dst_addr, link->alias_addr, + link->dst_port, port_net, + link->link_type, 0); + + if (search_result == NULL) + go_ahead = 1; + else if (!(link->flags & LINK_PARTIALLY_SPECIFIED) + && (search_result->flags & LINK_PARTIALLY_SPECIFIED)) + go_ahead = 1; + else + go_ahead = 0; + + if (go_ahead) + { + if ((packetAliasMode & PKT_ALIAS_USE_SOCKETS) + && (link->flags & LINK_PARTIALLY_SPECIFIED) + && ((link->link_type == LINK_TCP) || + (link->link_type == LINK_UDP))) + { + if (GetSocket(port_net, &link->sockfd, link->link_type)) + { + link->alias_port = port_net; + return(0); + } + } + else + { + link->alias_port = port_net; + return(0); + } + } + + port_sys = random() & ALIAS_PORT_MASK; + port_sys += ALIAS_PORT_BASE; + port_net = htons(port_sys); + } + +#ifdef DEBUG + fprintf(stderr, "PacketAlias/GetnewPort(): "); + fprintf(stderr, "could not find free port\n"); +#endif + + return(-1); +} + + +static u_short +GetSocket(u_short port_net, int *sockfd, int link_type) +{ + int err; + int sock; + struct sockaddr_in sock_addr; + + if (link_type == LINK_TCP) + sock = socket(AF_INET, SOCK_STREAM, 0); + else if (link_type == LINK_UDP) + sock = socket(AF_INET, SOCK_DGRAM, 0); + else + { +#ifdef DEBUG + fprintf(stderr, "PacketAlias/GetSocket(): "); + fprintf(stderr, "incorrect link type\n"); +#endif + return(0); + } + + if (sock < 0) + { +#ifdef DEBUG + fprintf(stderr, "PacketAlias/GetSocket(): "); + fprintf(stderr, "socket() error %d\n", *sockfd); +#endif + return(0); + } + + sock_addr.sin_family = AF_INET; + sock_addr.sin_addr.s_addr = htonl(INADDR_ANY); + sock_addr.sin_port = port_net; + + err = bind(sock, + (struct sockaddr *) &sock_addr, + sizeof(sock_addr)); + if (err == 0) + { + sockCount++; + *sockfd = sock; + return(1); + } + else + { + close(sock); + return(0); + } +} + + +/* FindNewPortGroup() returns a base port number for an available + range of contiguous port numbers. Note that if a port number + is already in use, that does not mean that it cannot be used by + another link concurrently. This is because FindNewPortGroup() + looks for unused triplets: (dest addr, dest port, alias port). */ + +int +FindNewPortGroup(struct in_addr dst_addr, + struct in_addr alias_addr, + u_short src_port, + u_short dst_port, + u_short port_count, + u_char proto, + u_char align) +{ + int i, j; + int max_trials; + u_short port_sys; + int link_type; + + /* + * Get link_type from protocol + */ + + switch (proto) + { + case IPPROTO_UDP: + link_type = LINK_UDP; + break; + case IPPROTO_TCP: + link_type = LINK_TCP; + break; + default: + return (0); + break; + } + + /* + * The aliasing port is automatically selected + * by one of two methods below: + */ + max_trials = GET_NEW_PORT_MAX_ATTEMPTS; + + if (packetAliasMode & PKT_ALIAS_SAME_PORTS) { + /* + * When the ALIAS_SAME_PORTS option is + * chosen, the first try will be the + * actual source port. If this is already + * in use, the remainder of the trials + * will be random. + */ + port_sys = ntohs(src_port); + + } else { + + /* First trial and all subsequent are random. */ + if (align == FIND_EVEN_ALIAS_BASE) + port_sys = random() & ALIAS_PORT_MASK_EVEN; + else + port_sys = random() & ALIAS_PORT_MASK; + + port_sys += ALIAS_PORT_BASE; + } + +/* Port number search */ + for (i = 0; i < max_trials; i++) { + + struct alias_link *search_result; + + for (j = 0; j < port_count; j++) + if (0 != (search_result = FindLinkIn(dst_addr, alias_addr, + dst_port, htons(port_sys + j), + link_type, 0))) + break; + + /* Found a good range, return base */ + if (j == port_count) + return (htons(port_sys)); + + /* Find a new base to try */ + if (align == FIND_EVEN_ALIAS_BASE) + port_sys = random() & ALIAS_PORT_MASK_EVEN; + else + port_sys = random() & ALIAS_PORT_MASK; + + port_sys += ALIAS_PORT_BASE; + } + +#ifdef DEBUG + fprintf(stderr, "PacketAlias/FindNewPortGroup(): "); + fprintf(stderr, "could not find free port(s)\n"); +#endif + + return(0); +} + +static void +CleanupAliasData(void) +{ + struct alias_link *link; + int i, icount; + + icount = 0; + for (i=0; i<LINK_TABLE_OUT_SIZE; i++) + { + link = LIST_FIRST(&linkTableOut[i]); + while (link != NULL) + { + struct alias_link *link_next; + link_next = LIST_NEXT(link, list_out); + icount++; + DeleteLink(link); + link = link_next; + } + } + + cleanupIndex =0; +} + + +static void +IncrementalCleanup(void) +{ + int icount; + struct alias_link *link; + + icount = 0; + link = LIST_FIRST(&linkTableOut[cleanupIndex++]); + while (link != NULL) + { + int idelta; + struct alias_link *link_next; + + link_next = LIST_NEXT(link, list_out); + idelta = timeStamp - link->timestamp; + switch (link->link_type) + { + case LINK_TCP: + if (idelta > link->expire_time) + { + struct tcp_dat *tcp_aux; + + tcp_aux = link->data.tcp; + if (tcp_aux->state.in != ALIAS_TCP_STATE_CONNECTED + || tcp_aux->state.out != ALIAS_TCP_STATE_CONNECTED) + { + DeleteLink(link); + icount++; + } + } + break; + default: + if (idelta > link->expire_time) + { + DeleteLink(link); + icount++; + } + break; + } + link = link_next; + } + + if (cleanupIndex == LINK_TABLE_OUT_SIZE) + cleanupIndex = 0; +} + +static void +DeleteLink(struct alias_link *link) +{ + +/* Don't do anything if the link is marked permanent */ + if (deleteAllLinks == 0 && link->flags & LINK_PERMANENT) + return; + +#ifndef NO_FW_PUNCH +/* Delete associated firewall hole, if any */ + ClearFWHole(link); +#endif + +/* Free memory allocated for LSNAT server pool */ + if (link->server != NULL) { + struct server *head, *curr, *next; + + head = curr = link->server; + do { + next = curr->next; + free(curr); + } while ((curr = next) != head); + } + +/* Adjust output table pointers */ + LIST_REMOVE(link, list_out); + +/* Adjust input table pointers */ + LIST_REMOVE(link, list_in); + +/* Close socket, if one has been allocated */ + if (link->sockfd != -1) + { + sockCount--; + close(link->sockfd); + } + +/* Link-type dependent cleanup */ + switch(link->link_type) + { + case LINK_ICMP: + icmpLinkCount--; + break; + case LINK_UDP: + udpLinkCount--; + break; + case LINK_TCP: + tcpLinkCount--; + free(link->data.tcp); + break; + case LINK_PPTP: + pptpLinkCount--; + break; + case LINK_FRAGMENT_ID: + fragmentIdLinkCount--; + break; + case LINK_FRAGMENT_PTR: + fragmentPtrLinkCount--; + if (link->data.frag_ptr != NULL) + free(link->data.frag_ptr); + break; + case LINK_ADDR: + break; + default: + protoLinkCount--; + break; + } + +/* Free memory */ + free(link); + +/* Write statistics, if logging enabled */ + if (packetAliasMode & PKT_ALIAS_LOG) + { + ShowAliasStats(); + } +} + + +static struct alias_link * +AddLink(struct in_addr src_addr, + struct in_addr dst_addr, + struct in_addr alias_addr, + u_short src_port, + u_short dst_port, + int alias_port_param, /* if less than zero, alias */ + int link_type) /* port will be automatically */ +{ /* chosen. If greater than */ + u_int start_point; /* zero, equal to alias port */ + struct alias_link *link; + + link = malloc(sizeof(struct alias_link)); + if (link != NULL) + { + /* Basic initialization */ + link->src_addr = src_addr; + link->dst_addr = dst_addr; + link->alias_addr = alias_addr; + link->proxy_addr.s_addr = INADDR_ANY; + link->src_port = src_port; + link->dst_port = dst_port; + link->proxy_port = 0; + link->server = NULL; + link->link_type = link_type; + link->sockfd = -1; + link->flags = 0; + link->timestamp = timeStamp; + + /* Expiration time */ + switch (link_type) + { + case LINK_ICMP: + link->expire_time = ICMP_EXPIRE_TIME; + break; + case LINK_UDP: + link->expire_time = UDP_EXPIRE_TIME; + break; + case LINK_TCP: + link->expire_time = TCP_EXPIRE_INITIAL; + break; + case LINK_PPTP: + link->flags |= LINK_PERMANENT; /* no timeout. */ + break; + case LINK_FRAGMENT_ID: + link->expire_time = FRAGMENT_ID_EXPIRE_TIME; + break; + case LINK_FRAGMENT_PTR: + link->expire_time = FRAGMENT_PTR_EXPIRE_TIME; + break; + case LINK_ADDR: + break; + default: + link->expire_time = PROTO_EXPIRE_TIME; + break; + } + + /* Determine alias flags */ + if (dst_addr.s_addr == INADDR_ANY) + link->flags |= LINK_UNKNOWN_DEST_ADDR; + if (dst_port == 0) + link->flags |= LINK_UNKNOWN_DEST_PORT; + + /* Determine alias port */ + if (GetNewPort(link, alias_port_param) != 0) + { + free(link); + return(NULL); + } + + /* Link-type dependent initialization */ + switch(link_type) + { + struct tcp_dat *aux_tcp; + + case LINK_ICMP: + icmpLinkCount++; + break; + case LINK_UDP: + udpLinkCount++; + break; + case LINK_TCP: + aux_tcp = malloc(sizeof(struct tcp_dat)); + if (aux_tcp != NULL) + { + int i; + + tcpLinkCount++; + aux_tcp->state.in = ALIAS_TCP_STATE_NOT_CONNECTED; + aux_tcp->state.out = ALIAS_TCP_STATE_NOT_CONNECTED; + aux_tcp->state.index = 0; + aux_tcp->state.ack_modified = 0; + for (i=0; i<N_LINK_TCP_DATA; i++) + aux_tcp->ack[i].active = 0; + aux_tcp->fwhole = -1; + link->data.tcp = aux_tcp; + } + else + { +#ifdef DEBUG + fprintf(stderr, "PacketAlias/AddLink: "); + fprintf(stderr, " cannot allocate auxiliary TCP data\n"); +#endif + free(link); + return (NULL); + } + break; + case LINK_PPTP: + pptpLinkCount++; + break; + case LINK_FRAGMENT_ID: + fragmentIdLinkCount++; + break; + case LINK_FRAGMENT_PTR: + fragmentPtrLinkCount++; + break; + case LINK_ADDR: + break; + default: + protoLinkCount++; + break; + } + + /* Set up pointers for output lookup table */ + start_point = StartPointOut(src_addr, dst_addr, + src_port, dst_port, link_type); + LIST_INSERT_HEAD(&linkTableOut[start_point], link, list_out); + + /* Set up pointers for input lookup table */ + start_point = StartPointIn(alias_addr, link->alias_port, link_type); + LIST_INSERT_HEAD(&linkTableIn[start_point], link, list_in); + } + else + { +#ifdef DEBUG + fprintf(stderr, "PacketAlias/AddLink(): "); + fprintf(stderr, "malloc() call failed.\n"); +#endif + } + + if (packetAliasMode & PKT_ALIAS_LOG) + { + ShowAliasStats(); + } + + return(link); +} + +static struct alias_link * +ReLink(struct alias_link *old_link, + struct in_addr src_addr, + struct in_addr dst_addr, + struct in_addr alias_addr, + u_short src_port, + u_short dst_port, + int alias_port_param, /* if less than zero, alias */ + int link_type) /* port will be automatically */ +{ /* chosen. If greater than */ + struct alias_link *new_link; /* zero, equal to alias port */ + + new_link = AddLink(src_addr, dst_addr, alias_addr, + src_port, dst_port, alias_port_param, + link_type); +#ifndef NO_FW_PUNCH + if (new_link != NULL && + old_link->link_type == LINK_TCP && + old_link->data.tcp->fwhole > 0) { + PunchFWHole(new_link); + } +#endif + DeleteLink(old_link); + return new_link; +} + +static struct alias_link * +_FindLinkOut(struct in_addr src_addr, + struct in_addr dst_addr, + u_short src_port, + u_short dst_port, + int link_type, + int replace_partial_links) +{ + u_int i; + struct alias_link *link; + + i = StartPointOut(src_addr, dst_addr, src_port, dst_port, link_type); + LIST_FOREACH(link, &linkTableOut[i], list_out) + { + if (link->src_addr.s_addr == src_addr.s_addr + && link->server == NULL + && link->dst_addr.s_addr == dst_addr.s_addr + && link->dst_port == dst_port + && link->src_port == src_port + && link->link_type == link_type) + { + link->timestamp = timeStamp; + break; + } + } + +/* Search for partially specified links. */ + if (link == NULL && replace_partial_links) + { + if (dst_port != 0 && dst_addr.s_addr != INADDR_ANY) + { + link = _FindLinkOut(src_addr, dst_addr, src_port, 0, + link_type, 0); + if (link == NULL) + link = _FindLinkOut(src_addr, nullAddress, src_port, + dst_port, link_type, 0); + } + if (link == NULL && + (dst_port != 0 || dst_addr.s_addr != INADDR_ANY)) + { + link = _FindLinkOut(src_addr, nullAddress, src_port, 0, + link_type, 0); + } + if (link != NULL) + { + link = ReLink(link, + src_addr, dst_addr, link->alias_addr, + src_port, dst_port, link->alias_port, + link_type); + } + } + + return(link); +} + +static struct alias_link * +FindLinkOut(struct in_addr src_addr, + struct in_addr dst_addr, + u_short src_port, + u_short dst_port, + int link_type, + int replace_partial_links) +{ + struct alias_link *link; + + link = _FindLinkOut(src_addr, dst_addr, src_port, dst_port, + link_type, replace_partial_links); + + if (link == NULL) + { + /* The following allows permanent links to be + specified as using the default source address + (i.e. device interface address) without knowing + in advance what that address is. */ + if (aliasAddress.s_addr != 0 && + src_addr.s_addr == aliasAddress.s_addr) + { + link = _FindLinkOut(nullAddress, dst_addr, src_port, dst_port, + link_type, replace_partial_links); + } + } + + return(link); +} + + +static struct alias_link * +_FindLinkIn(struct in_addr dst_addr, + struct in_addr alias_addr, + u_short dst_port, + u_short alias_port, + int link_type, + int replace_partial_links) +{ + int flags_in; + u_int start_point; + struct alias_link *link; + struct alias_link *link_fully_specified; + struct alias_link *link_unknown_all; + struct alias_link *link_unknown_dst_addr; + struct alias_link *link_unknown_dst_port; + +/* Initialize pointers */ + link_fully_specified = NULL; + link_unknown_all = NULL; + link_unknown_dst_addr = NULL; + link_unknown_dst_port = NULL; + +/* If either the dest addr or port is unknown, the search + loop will have to know about this. */ + + flags_in = 0; + if (dst_addr.s_addr == INADDR_ANY) + flags_in |= LINK_UNKNOWN_DEST_ADDR; + if (dst_port == 0) + flags_in |= LINK_UNKNOWN_DEST_PORT; + +/* Search loop */ + start_point = StartPointIn(alias_addr, alias_port, link_type); + LIST_FOREACH(link, &linkTableIn[start_point], list_in) + { + int flags; + + flags = flags_in | link->flags; + if (!(flags & LINK_PARTIALLY_SPECIFIED)) + { + if (link->alias_addr.s_addr == alias_addr.s_addr + && link->alias_port == alias_port + && link->dst_addr.s_addr == dst_addr.s_addr + && link->dst_port == dst_port + && link->link_type == link_type) + { + link_fully_specified = link; + break; + } + } + else if ((flags & LINK_UNKNOWN_DEST_ADDR) + && (flags & LINK_UNKNOWN_DEST_PORT)) + { + if (link->alias_addr.s_addr == alias_addr.s_addr + && link->alias_port == alias_port + && link->link_type == link_type) + { + if (link_unknown_all == NULL) + link_unknown_all = link; + } + } + else if (flags & LINK_UNKNOWN_DEST_ADDR) + { + if (link->alias_addr.s_addr == alias_addr.s_addr + && link->alias_port == alias_port + && link->link_type == link_type + && link->dst_port == dst_port) + { + if (link_unknown_dst_addr == NULL) + link_unknown_dst_addr = link; + } + } + else if (flags & LINK_UNKNOWN_DEST_PORT) + { + if (link->alias_addr.s_addr == alias_addr.s_addr + && link->alias_port == alias_port + && link->link_type == link_type + && link->dst_addr.s_addr == dst_addr.s_addr) + { + if (link_unknown_dst_port == NULL) + link_unknown_dst_port = link; + } + } + } + + + + if (link_fully_specified != NULL) + { + link_fully_specified->timestamp = timeStamp; + link = link_fully_specified; + } + else if (link_unknown_dst_port != NULL) + link = link_unknown_dst_port; + else if (link_unknown_dst_addr != NULL) + link = link_unknown_dst_addr; + else if (link_unknown_all != NULL) + link = link_unknown_all; + else + return (NULL); + + if (replace_partial_links && + (link->flags & LINK_PARTIALLY_SPECIFIED || link->server != NULL)) + { + struct in_addr src_addr; + u_short src_port; + + if (link->server != NULL) { /* LSNAT link */ + src_addr = link->server->addr; + src_port = link->server->port; + link->server = link->server->next; + } else { + src_addr = link->src_addr; + src_port = link->src_port; + } + + link = ReLink(link, + src_addr, dst_addr, alias_addr, + src_port, dst_port, alias_port, + link_type); + } + + return (link); +} + +static struct alias_link * +FindLinkIn(struct in_addr dst_addr, + struct in_addr alias_addr, + u_short dst_port, + u_short alias_port, + int link_type, + int replace_partial_links) +{ + struct alias_link *link; + + link = _FindLinkIn(dst_addr, alias_addr, dst_port, alias_port, + link_type, replace_partial_links); + + if (link == NULL) + { + /* The following allows permanent links to be + specified as using the default aliasing address + (i.e. device interface address) without knowing + in advance what that address is. */ + if (aliasAddress.s_addr != 0 && + alias_addr.s_addr == aliasAddress.s_addr) + { + link = _FindLinkIn(dst_addr, nullAddress, dst_port, alias_port, + link_type, replace_partial_links); + } + } + + return(link); +} + + + + +/* External routines for finding/adding links + +-- "external" means outside alias_db.c, but within alias*.c -- + + FindIcmpIn(), FindIcmpOut() + FindFragmentIn1(), FindFragmentIn2() + AddFragmentPtrLink(), FindFragmentPtr() + FindProtoIn(), FindProtoOut() + FindUdpTcpIn(), FindUdpTcpOut() + AddPptp(), FindPptpOutByCallId(), FindPptpInByCallId(), + FindPptpOutByPeerCallId(), FindPptpInByPeerCallId() + FindOriginalAddress(), FindAliasAddress() + +(prototypes in alias_local.h) +*/ + + +struct alias_link * +FindIcmpIn(struct in_addr dst_addr, + struct in_addr alias_addr, + u_short id_alias, + int create) +{ + struct alias_link *link; + + link = FindLinkIn(dst_addr, alias_addr, + NO_DEST_PORT, id_alias, + LINK_ICMP, 0); + if (link == NULL && create && !(packetAliasMode & PKT_ALIAS_DENY_INCOMING)) + { + struct in_addr target_addr; + + target_addr = FindOriginalAddress(alias_addr); + link = AddLink(target_addr, dst_addr, alias_addr, + id_alias, NO_DEST_PORT, id_alias, + LINK_ICMP); + } + + return (link); +} + + +struct alias_link * +FindIcmpOut(struct in_addr src_addr, + struct in_addr dst_addr, + u_short id, + int create) +{ + struct alias_link * link; + + link = FindLinkOut(src_addr, dst_addr, + id, NO_DEST_PORT, + LINK_ICMP, 0); + if (link == NULL && create) + { + struct in_addr alias_addr; + + alias_addr = FindAliasAddress(src_addr); + link = AddLink(src_addr, dst_addr, alias_addr, + id, NO_DEST_PORT, GET_ALIAS_ID, + LINK_ICMP); + } + + return(link); +} + + +struct alias_link * +FindFragmentIn1(struct in_addr dst_addr, + struct in_addr alias_addr, + u_short ip_id) +{ + struct alias_link *link; + + link = FindLinkIn(dst_addr, alias_addr, + NO_DEST_PORT, ip_id, + LINK_FRAGMENT_ID, 0); + + if (link == NULL) + { + link = AddLink(nullAddress, dst_addr, alias_addr, + NO_SRC_PORT, NO_DEST_PORT, ip_id, + LINK_FRAGMENT_ID); + } + + return(link); +} + + +struct alias_link * +FindFragmentIn2(struct in_addr dst_addr, /* Doesn't add a link if one */ + struct in_addr alias_addr, /* is not found. */ + u_short ip_id) +{ + return FindLinkIn(dst_addr, alias_addr, + NO_DEST_PORT, ip_id, + LINK_FRAGMENT_ID, 0); +} + + +struct alias_link * +AddFragmentPtrLink(struct in_addr dst_addr, + u_short ip_id) +{ + return AddLink(nullAddress, dst_addr, nullAddress, + NO_SRC_PORT, NO_DEST_PORT, ip_id, + LINK_FRAGMENT_PTR); +} + + +struct alias_link * +FindFragmentPtr(struct in_addr dst_addr, + u_short ip_id) +{ + return FindLinkIn(dst_addr, nullAddress, + NO_DEST_PORT, ip_id, + LINK_FRAGMENT_PTR, 0); +} + + +struct alias_link * +FindProtoIn(struct in_addr dst_addr, + struct in_addr alias_addr, + u_char proto) +{ + struct alias_link *link; + + link = FindLinkIn(dst_addr, alias_addr, + NO_DEST_PORT, 0, + proto, 1); + + if (link == NULL && !(packetAliasMode & PKT_ALIAS_DENY_INCOMING)) + { + struct in_addr target_addr; + + target_addr = FindOriginalAddress(alias_addr); + link = AddLink(target_addr, dst_addr, alias_addr, + NO_SRC_PORT, NO_DEST_PORT, 0, + proto); + } + + return (link); +} + + +struct alias_link * +FindProtoOut(struct in_addr src_addr, + struct in_addr dst_addr, + u_char proto) +{ + struct alias_link *link; + + link = FindLinkOut(src_addr, dst_addr, + NO_SRC_PORT, NO_DEST_PORT, + proto, 1); + + if (link == NULL) + { + struct in_addr alias_addr; + + alias_addr = FindAliasAddress(src_addr); + link = AddLink(src_addr, dst_addr, alias_addr, + NO_SRC_PORT, NO_DEST_PORT, 0, + proto); + } + + return (link); +} + + +struct alias_link * +FindUdpTcpIn(struct in_addr dst_addr, + struct in_addr alias_addr, + u_short dst_port, + u_short alias_port, + u_char proto, + int create) +{ + int link_type; + struct alias_link *link; + + switch (proto) + { + case IPPROTO_UDP: + link_type = LINK_UDP; + break; + case IPPROTO_TCP: + link_type = LINK_TCP; + break; + default: + return NULL; + break; + } + + link = FindLinkIn(dst_addr, alias_addr, + dst_port, alias_port, + link_type, create); + + if (link == NULL && create && !(packetAliasMode & PKT_ALIAS_DENY_INCOMING)) + { + struct in_addr target_addr; + + target_addr = FindOriginalAddress(alias_addr); + link = AddLink(target_addr, dst_addr, alias_addr, + alias_port, dst_port, alias_port, + link_type); + } + + return(link); +} + + +struct alias_link * +FindUdpTcpOut(struct in_addr src_addr, + struct in_addr dst_addr, + u_short src_port, + u_short dst_port, + u_char proto, + int create) +{ + int link_type; + struct alias_link *link; + + switch (proto) + { + case IPPROTO_UDP: + link_type = LINK_UDP; + break; + case IPPROTO_TCP: + link_type = LINK_TCP; + break; + default: + return NULL; + break; + } + + link = FindLinkOut(src_addr, dst_addr, src_port, dst_port, link_type, create); + + if (link == NULL && create) + { + struct in_addr alias_addr; + + alias_addr = FindAliasAddress(src_addr); + link = AddLink(src_addr, dst_addr, alias_addr, + src_port, dst_port, GET_ALIAS_PORT, + link_type); + } + + return(link); +} + + +struct alias_link * +AddPptp(struct in_addr src_addr, + struct in_addr dst_addr, + struct in_addr alias_addr, + u_int16_t src_call_id) +{ + struct alias_link *link; + + link = AddLink(src_addr, dst_addr, alias_addr, + src_call_id, 0, GET_ALIAS_PORT, + LINK_PPTP); + + return (link); +} + + +struct alias_link * +FindPptpOutByCallId(struct in_addr src_addr, + struct in_addr dst_addr, + u_int16_t src_call_id) +{ + u_int i; + struct alias_link *link; + + i = StartPointOut(src_addr, dst_addr, 0, 0, LINK_PPTP); + LIST_FOREACH(link, &linkTableOut[i], list_out) + if (link->link_type == LINK_PPTP && + link->src_addr.s_addr == src_addr.s_addr && + link->dst_addr.s_addr == dst_addr.s_addr && + link->src_port == src_call_id) + break; + + return (link); +} + + +struct alias_link * +FindPptpOutByPeerCallId(struct in_addr src_addr, + struct in_addr dst_addr, + u_int16_t dst_call_id) +{ + u_int i; + struct alias_link *link; + + i = StartPointOut(src_addr, dst_addr, 0, 0, LINK_PPTP); + LIST_FOREACH(link, &linkTableOut[i], list_out) + if (link->link_type == LINK_PPTP && + link->src_addr.s_addr == src_addr.s_addr && + link->dst_addr.s_addr == dst_addr.s_addr && + link->dst_port == dst_call_id) + break; + + return (link); +} + + +struct alias_link * +FindPptpInByCallId(struct in_addr dst_addr, + struct in_addr alias_addr, + u_int16_t dst_call_id) +{ + u_int i; + struct alias_link *link; + + i = StartPointIn(alias_addr, 0, LINK_PPTP); + LIST_FOREACH(link, &linkTableIn[i], list_in) + if (link->link_type == LINK_PPTP && + link->dst_addr.s_addr == dst_addr.s_addr && + link->alias_addr.s_addr == alias_addr.s_addr && + link->dst_port == dst_call_id) + break; + + return (link); +} + + +struct alias_link * +FindPptpInByPeerCallId(struct in_addr dst_addr, + struct in_addr alias_addr, + u_int16_t alias_call_id) +{ + struct alias_link *link; + + link = FindLinkIn(dst_addr, alias_addr, + 0/* any */, alias_call_id, + LINK_PPTP, 0); + + + return (link); +} + + +struct alias_link * +FindRtspOut(struct in_addr src_addr, + struct in_addr dst_addr, + u_short src_port, + u_short alias_port, + u_char proto) +{ + int link_type; + struct alias_link *link; + + switch (proto) + { + case IPPROTO_UDP: + link_type = LINK_UDP; + break; + case IPPROTO_TCP: + link_type = LINK_TCP; + break; + default: + return NULL; + break; + } + + link = FindLinkOut(src_addr, dst_addr, src_port, 0, link_type, 1); + + if (link == NULL) + { + struct in_addr alias_addr; + + alias_addr = FindAliasAddress(src_addr); + link = AddLink(src_addr, dst_addr, alias_addr, + src_port, 0, alias_port, + link_type); + } + + return(link); +} + + +struct in_addr +FindOriginalAddress(struct in_addr alias_addr) +{ + struct alias_link *link; + + link = FindLinkIn(nullAddress, alias_addr, + 0, 0, LINK_ADDR, 0); + if (link == NULL) + { + newDefaultLink = 1; + if (targetAddress.s_addr == INADDR_ANY) + return alias_addr; + else if (targetAddress.s_addr == INADDR_NONE) + return aliasAddress; + else + return targetAddress; + } + else + { + if (link->server != NULL) { /* LSNAT link */ + struct in_addr src_addr; + + src_addr = link->server->addr; + link->server = link->server->next; + return (src_addr); + } else if (link->src_addr.s_addr == INADDR_ANY) + return aliasAddress; + else + return link->src_addr; + } +} + + +struct in_addr +FindAliasAddress(struct in_addr original_addr) +{ + struct alias_link *link; + + link = FindLinkOut(original_addr, nullAddress, + 0, 0, LINK_ADDR, 0); + if (link == NULL) + { + return aliasAddress; + } + else + { + if (link->alias_addr.s_addr == INADDR_ANY) + return aliasAddress; + else + return link->alias_addr; + } +} + + +/* External routines for getting or changing link data + (external to alias_db.c, but internal to alias*.c) + + SetFragmentData(), GetFragmentData() + SetFragmentPtr(), GetFragmentPtr() + SetStateIn(), SetStateOut(), GetStateIn(), GetStateOut() + GetOriginalAddress(), GetDestAddress(), GetAliasAddress() + GetOriginalPort(), GetAliasPort() + SetAckModified(), GetAckModified() + GetDeltaAckIn(), GetDeltaSeqOut(), AddSeq() + SetLastLineCrlfTermed(), GetLastLineCrlfTermed() + SetDestCallId() +*/ + + +void +SetFragmentAddr(struct alias_link *link, struct in_addr src_addr) +{ + link->data.frag_addr = src_addr; +} + + +void +GetFragmentAddr(struct alias_link *link, struct in_addr *src_addr) +{ + *src_addr = link->data.frag_addr; +} + + +void +SetFragmentPtr(struct alias_link *link, char *fptr) +{ + link->data.frag_ptr = fptr; +} + + +void +GetFragmentPtr(struct alias_link *link, char **fptr) +{ + *fptr = link->data.frag_ptr; +} + + +void +SetStateIn(struct alias_link *link, int state) +{ + /* TCP input state */ + switch (state) { + case ALIAS_TCP_STATE_DISCONNECTED: + if (link->data.tcp->state.out != ALIAS_TCP_STATE_CONNECTED) + link->expire_time = TCP_EXPIRE_DEAD; + else + link->expire_time = TCP_EXPIRE_SINGLEDEAD; + break; + case ALIAS_TCP_STATE_CONNECTED: + if (link->data.tcp->state.out == ALIAS_TCP_STATE_CONNECTED) + link->expire_time = TCP_EXPIRE_CONNECTED; + break; + default: + abort(); + } + link->data.tcp->state.in = state; +} + + +void +SetStateOut(struct alias_link *link, int state) +{ + /* TCP output state */ + switch (state) { + case ALIAS_TCP_STATE_DISCONNECTED: + if (link->data.tcp->state.in != ALIAS_TCP_STATE_CONNECTED) + link->expire_time = TCP_EXPIRE_DEAD; + else + link->expire_time = TCP_EXPIRE_SINGLEDEAD; + break; + case ALIAS_TCP_STATE_CONNECTED: + if (link->data.tcp->state.in == ALIAS_TCP_STATE_CONNECTED) + link->expire_time = TCP_EXPIRE_CONNECTED; + break; + default: + abort(); + } + link->data.tcp->state.out = state; +} + + +int +GetStateIn(struct alias_link *link) +{ + /* TCP input state */ + return link->data.tcp->state.in; +} + + +int +GetStateOut(struct alias_link *link) +{ + /* TCP output state */ + return link->data.tcp->state.out; +} + + +struct in_addr +GetOriginalAddress(struct alias_link *link) +{ + if (link->src_addr.s_addr == INADDR_ANY) + return aliasAddress; + else + return(link->src_addr); +} + + +struct in_addr +GetDestAddress(struct alias_link *link) +{ + return(link->dst_addr); +} + + +struct in_addr +GetAliasAddress(struct alias_link *link) +{ + if (link->alias_addr.s_addr == INADDR_ANY) + return aliasAddress; + else + return link->alias_addr; +} + + +struct in_addr +GetDefaultAliasAddress() +{ + return aliasAddress; +} + + +void +SetDefaultAliasAddress(struct in_addr alias_addr) +{ + aliasAddress = alias_addr; +} + + +u_short +GetOriginalPort(struct alias_link *link) +{ + return(link->src_port); +} + + +u_short +GetAliasPort(struct alias_link *link) +{ + return(link->alias_port); +} + +#ifndef NO_FW_PUNCH +static u_short +GetDestPort(struct alias_link *link) +{ + return(link->dst_port); +} +#endif + +void +SetAckModified(struct alias_link *link) +{ +/* Indicate that ACK numbers have been modified in a TCP connection */ + link->data.tcp->state.ack_modified = 1; +} + + +struct in_addr +GetProxyAddress(struct alias_link *link) +{ + return link->proxy_addr; +} + + +void +SetProxyAddress(struct alias_link *link, struct in_addr addr) +{ + link->proxy_addr = addr; +} + + +u_short +GetProxyPort(struct alias_link *link) +{ + return link->proxy_port; +} + + +void +SetProxyPort(struct alias_link *link, u_short port) +{ + link->proxy_port = port; +} + + +int +GetAckModified(struct alias_link *link) +{ +/* See if ACK numbers have been modified */ + return link->data.tcp->state.ack_modified; +} + + +int +GetDeltaAckIn(struct ip *pip, struct alias_link *link) +{ +/* +Find out how much the ACK number has been altered for an incoming +TCP packet. To do this, a circular list of ACK numbers where the TCP +packet size was altered is searched. +*/ + + int i; + struct tcphdr *tc; + int delta, ack_diff_min; + u_long ack; + + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + ack = tc->th_ack; + + delta = 0; + ack_diff_min = -1; + for (i=0; i<N_LINK_TCP_DATA; i++) + { + struct ack_data_record x; + + x = link->data.tcp->ack[i]; + if (x.active == 1) + { + int ack_diff; + + ack_diff = SeqDiff(x.ack_new, ack); + if (ack_diff >= 0) + { + if (ack_diff_min >= 0) + { + if (ack_diff < ack_diff_min) + { + delta = x.delta; + ack_diff_min = ack_diff; + } + } + else + { + delta = x.delta; + ack_diff_min = ack_diff; + } + } + } + } + return (delta); +} + + +int +GetDeltaSeqOut(struct ip *pip, struct alias_link *link) +{ +/* +Find out how much the sequence number has been altered for an outgoing +TCP packet. To do this, a circular list of ACK numbers where the TCP +packet size was altered is searched. +*/ + + int i; + struct tcphdr *tc; + int delta, seq_diff_min; + u_long seq; + + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + seq = tc->th_seq; + + delta = 0; + seq_diff_min = -1; + for (i=0; i<N_LINK_TCP_DATA; i++) + { + struct ack_data_record x; + + x = link->data.tcp->ack[i]; + if (x.active == 1) + { + int seq_diff; + + seq_diff = SeqDiff(x.ack_old, seq); + if (seq_diff >= 0) + { + if (seq_diff_min >= 0) + { + if (seq_diff < seq_diff_min) + { + delta = x.delta; + seq_diff_min = seq_diff; + } + } + else + { + delta = x.delta; + seq_diff_min = seq_diff; + } + } + } + } + return (delta); +} + + +void +AddSeq(struct ip *pip, struct alias_link *link, int delta) +{ +/* +When a TCP packet has been altered in length, save this +information in a circular list. If enough packets have +been altered, then this list will begin to overwrite itself. +*/ + + struct tcphdr *tc; + struct ack_data_record x; + int hlen, tlen, dlen; + int i; + + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + + hlen = (pip->ip_hl + tc->th_off) << 2; + tlen = ntohs(pip->ip_len); + dlen = tlen - hlen; + + x.ack_old = htonl(ntohl(tc->th_seq) + dlen); + x.ack_new = htonl(ntohl(tc->th_seq) + dlen + delta); + x.delta = delta; + x.active = 1; + + i = link->data.tcp->state.index; + link->data.tcp->ack[i] = x; + + i++; + if (i == N_LINK_TCP_DATA) + link->data.tcp->state.index = 0; + else + link->data.tcp->state.index = i; +} + +void +SetExpire(struct alias_link *link, int expire) +{ + if (expire == 0) + { + link->flags &= ~LINK_PERMANENT; + DeleteLink(link); + } + else if (expire == -1) + { + link->flags |= LINK_PERMANENT; + } + else if (expire > 0) + { + link->expire_time = expire; + } + else + { +#ifdef DEBUG + fprintf(stderr, "PacketAlias/SetExpire(): "); + fprintf(stderr, "error in expire parameter\n"); +#endif + } +} + +void +ClearCheckNewLink(void) +{ + newDefaultLink = 0; +} + +void +SetLastLineCrlfTermed(struct alias_link *link, int yes) +{ + + if (yes) + link->flags |= LINK_LAST_LINE_CRLF_TERMED; + else + link->flags &= ~LINK_LAST_LINE_CRLF_TERMED; +} + +int +GetLastLineCrlfTermed(struct alias_link *link) +{ + + return (link->flags & LINK_LAST_LINE_CRLF_TERMED); +} + +void +SetDestCallId(struct alias_link *link, u_int16_t cid) +{ + + deleteAllLinks = 1; + link = ReLink(link, link->src_addr, link->dst_addr, link->alias_addr, + link->src_port, cid, link->alias_port, link->link_type); + deleteAllLinks = 0; +} + + +/* Miscellaneous Functions + + HouseKeeping() + InitPacketAliasLog() + UninitPacketAliasLog() +*/ + +/* + Whenever an outgoing or incoming packet is handled, HouseKeeping() + is called to find and remove timed-out aliasing links. Logic exists + to sweep through the entire table and linked list structure + every 60 seconds. + + (prototype in alias_local.h) +*/ + +void +HouseKeeping(void) +{ + int i, n, n100; + struct timeval tv; + struct timezone tz; + + /* + * Save system time (seconds) in global variable timeStamp for + * use by other functions. This is done so as not to unnecessarily + * waste timeline by making system calls. + */ + gettimeofday(&tv, &tz); + timeStamp = tv.tv_sec; + + /* Compute number of spokes (output table link chains) to cover */ + n100 = LINK_TABLE_OUT_SIZE * 100 + houseKeepingResidual; + n100 *= timeStamp - lastCleanupTime; + n100 /= ALIAS_CLEANUP_INTERVAL_SECS; + + n = n100/100; + + /* Handle different cases */ + if (n > ALIAS_CLEANUP_MAX_SPOKES) + { + n = ALIAS_CLEANUP_MAX_SPOKES; + lastCleanupTime = timeStamp; + houseKeepingResidual = 0; + + for (i=0; i<n; i++) + IncrementalCleanup(); + } + else if (n > 0) + { + lastCleanupTime = timeStamp; + houseKeepingResidual = n100 - 100*n; + + for (i=0; i<n; i++) + IncrementalCleanup(); + } + else if (n < 0) + { +#ifdef DEBUG + fprintf(stderr, "PacketAlias/HouseKeeping(): "); + fprintf(stderr, "something unexpected in time values\n"); +#endif + lastCleanupTime = timeStamp; + houseKeepingResidual = 0; + } +} + + +/* Init the log file and enable logging */ +static void +InitPacketAliasLog(void) +{ + if ((~packetAliasMode & PKT_ALIAS_LOG) + && (monitorFile = fopen("/var/log/alias.log", "w"))) + { + packetAliasMode |= PKT_ALIAS_LOG; + fprintf(monitorFile, + "PacketAlias/InitPacketAliasLog: Packet alias logging enabled.\n"); + } +} + + +/* Close the log-file and disable logging. */ +static void +UninitPacketAliasLog(void) +{ + if (monitorFile) { + fclose(monitorFile); + monitorFile = NULL; + } + packetAliasMode &= ~PKT_ALIAS_LOG; +} + + + + + + +/* Outside world interfaces + +-- "outside world" means other than alias*.c routines -- + + PacketAliasRedirectPort() + PacketAliasAddServer() + PacketAliasRedirectProto() + PacketAliasRedirectAddr() + PacketAliasRedirectDelete() + PacketAliasSetAddress() + PacketAliasInit() + PacketAliasUninit() + PacketAliasSetMode() + +(prototypes in alias.h) +*/ + +/* Redirection from a specific public addr:port to a + private addr:port */ +struct alias_link * +PacketAliasRedirectPort(struct in_addr src_addr, u_short src_port, + struct in_addr dst_addr, u_short dst_port, + struct in_addr alias_addr, u_short alias_port, + u_char proto) +{ + int link_type; + struct alias_link *link; + + switch(proto) + { + case IPPROTO_UDP: + link_type = LINK_UDP; + break; + case IPPROTO_TCP: + link_type = LINK_TCP; + break; + default: +#ifdef DEBUG + fprintf(stderr, "PacketAliasRedirectPort(): "); + fprintf(stderr, "only TCP and UDP protocols allowed\n"); +#endif + return NULL; + } + + link = AddLink(src_addr, dst_addr, alias_addr, + src_port, dst_port, alias_port, + link_type); + + if (link != NULL) + { + link->flags |= LINK_PERMANENT; + } +#ifdef DEBUG + else + { + fprintf(stderr, "PacketAliasRedirectPort(): " + "call to AddLink() failed\n"); + } +#endif + + return link; +} + +/* Add server to the pool of servers */ +int +PacketAliasAddServer(struct alias_link *link, struct in_addr addr, u_short port) +{ + struct server *server; + + server = malloc(sizeof(struct server)); + + if (server != NULL) { + struct server *head; + + server->addr = addr; + server->port = port; + + head = link->server; + if (head == NULL) + server->next = server; + else { + struct server *s; + + for (s = head; s->next != head; s = s->next); + s->next = server; + server->next = head; + } + link->server = server; + return (0); + } else + return (-1); +} + +/* Redirect packets of a given IP protocol from a specific + public address to a private address */ +struct alias_link * +PacketAliasRedirectProto(struct in_addr src_addr, + struct in_addr dst_addr, + struct in_addr alias_addr, + u_char proto) +{ + struct alias_link *link; + + link = AddLink(src_addr, dst_addr, alias_addr, + NO_SRC_PORT, NO_DEST_PORT, 0, + proto); + + if (link != NULL) + { + link->flags |= LINK_PERMANENT; + } +#ifdef DEBUG + else + { + fprintf(stderr, "PacketAliasRedirectProto(): " + "call to AddLink() failed\n"); + } +#endif + + return link; +} + +/* Static address translation */ +struct alias_link * +PacketAliasRedirectAddr(struct in_addr src_addr, + struct in_addr alias_addr) +{ + struct alias_link *link; + + link = AddLink(src_addr, nullAddress, alias_addr, + 0, 0, 0, + LINK_ADDR); + + if (link != NULL) + { + link->flags |= LINK_PERMANENT; + } +#ifdef DEBUG + else + { + fprintf(stderr, "PacketAliasRedirectAddr(): " + "call to AddLink() failed\n"); + } +#endif + + return link; +} + + +void +PacketAliasRedirectDelete(struct alias_link *link) +{ +/* This is a dangerous function to put in the API, + because an invalid pointer can crash the program. */ + + deleteAllLinks = 1; + DeleteLink(link); + deleteAllLinks = 0; +} + + +void +PacketAliasSetAddress(struct in_addr addr) +{ + if (packetAliasMode & PKT_ALIAS_RESET_ON_ADDR_CHANGE + && aliasAddress.s_addr != addr.s_addr) + CleanupAliasData(); + + aliasAddress = addr; +} + + +void +PacketAliasSetTarget(struct in_addr target_addr) +{ + targetAddress = target_addr; +} + + +void +PacketAliasInit(void) +{ + int i; + struct timeval tv; + struct timezone tz; + static int firstCall = 1; + + if (firstCall == 1) + { + gettimeofday(&tv, &tz); + timeStamp = tv.tv_sec; + lastCleanupTime = tv.tv_sec; + houseKeepingResidual = 0; + + for (i=0; i<LINK_TABLE_OUT_SIZE; i++) + LIST_INIT(&linkTableOut[i]); + for (i=0; i<LINK_TABLE_IN_SIZE; i++) + LIST_INIT(&linkTableIn[i]); + + atexit(PacketAliasUninit); + firstCall = 0; + } + else + { + deleteAllLinks = 1; + CleanupAliasData(); + deleteAllLinks = 0; + } + + aliasAddress.s_addr = INADDR_ANY; + targetAddress.s_addr = INADDR_ANY; + + icmpLinkCount = 0; + udpLinkCount = 0; + tcpLinkCount = 0; + pptpLinkCount = 0; + protoLinkCount = 0; + fragmentIdLinkCount = 0; + fragmentPtrLinkCount = 0; + sockCount = 0; + + cleanupIndex =0; + + packetAliasMode = PKT_ALIAS_SAME_PORTS + | PKT_ALIAS_USE_SOCKETS + | PKT_ALIAS_RESET_ON_ADDR_CHANGE; +} + +void +PacketAliasUninit(void) { + deleteAllLinks = 1; + CleanupAliasData(); + deleteAllLinks = 0; + UninitPacketAliasLog(); +#ifndef NO_FW_PUNCH + UninitPunchFW(); +#endif +} + + +/* Change mode for some operations */ +unsigned int +PacketAliasSetMode( + unsigned int flags, /* Which state to bring flags to */ + unsigned int mask /* Mask of which flags to affect (use 0 to do a + probe for flag values) */ +) +{ +/* Enable logging? */ + if (flags & mask & PKT_ALIAS_LOG) + { + InitPacketAliasLog(); /* Do the enable */ + } else +/* _Disable_ logging? */ + if (~flags & mask & PKT_ALIAS_LOG) { + UninitPacketAliasLog(); + } + +#ifndef NO_FW_PUNCH +/* Start punching holes in the firewall? */ + if (flags & mask & PKT_ALIAS_PUNCH_FW) { + InitPunchFW(); + } else +/* Stop punching holes in the firewall? */ + if (~flags & mask & PKT_ALIAS_PUNCH_FW) { + UninitPunchFW(); + } +#endif + +/* Other flags can be set/cleared without special action */ + packetAliasMode = (flags & mask) | (packetAliasMode & ~mask); + return packetAliasMode; +} + + +int +PacketAliasCheckNewLink(void) +{ + return newDefaultLink; +} + + +#ifndef NO_FW_PUNCH + +/***************** + Code to support firewall punching. This shouldn't really be in this + file, but making variables global is evil too. + ****************/ + +/* Firewall include files */ +#include <net/if.h> +#include <netinet/ip_fw.h> +#include <string.h> +#include <err.h> + +static void ClearAllFWHoles(void); + +static int fireWallBaseNum; /* The first firewall entry free for our use */ +static int fireWallNumNums; /* How many entries can we use? */ +static int fireWallActiveNum; /* Which entry did we last use? */ +static char *fireWallField; /* bool array for entries */ + +#define fw_setfield(field, num) \ +do { \ + (field)[(num) - fireWallBaseNum] = 1; \ +} /*lint -save -e717 */ while(0) /*lint -restore */ +#define fw_clrfield(field, num) \ +do { \ + (field)[(num) - fireWallBaseNum] = 0; \ +} /*lint -save -e717 */ while(0) /*lint -restore */ +#define fw_tstfield(field, num) ((field)[(num) - fireWallBaseNum]) + +static void +InitPunchFW(void) { + fireWallField = malloc(fireWallNumNums); + if (fireWallField) { + memset(fireWallField, 0, fireWallNumNums); + if (fireWallFD < 0) { + fireWallFD = socket(AF_INET, SOCK_RAW, IPPROTO_RAW); + } + ClearAllFWHoles(); + fireWallActiveNum = fireWallBaseNum; + } +} + +static void +UninitPunchFW(void) { + ClearAllFWHoles(); + if (fireWallFD >= 0) + close(fireWallFD); + fireWallFD = -1; + if (fireWallField) + free(fireWallField); + fireWallField = NULL; + packetAliasMode &= ~PKT_ALIAS_PUNCH_FW; +} + +/* Make a certain link go through the firewall */ +void +PunchFWHole(struct alias_link *link) { + int r; /* Result code */ + struct ip_fw rule; /* On-the-fly built rule */ + int fwhole; /* Where to punch hole */ + +/* Don't do anything unless we are asked to */ + if ( !(packetAliasMode & PKT_ALIAS_PUNCH_FW) || + fireWallFD < 0 || + link->link_type != LINK_TCP) + return; + + memset(&rule, 0, sizeof rule); + +/** Build rule **/ + + /* Find empty slot */ + for (fwhole = fireWallActiveNum; + fwhole < fireWallBaseNum + fireWallNumNums && + fw_tstfield(fireWallField, fwhole); + fwhole++) + ; + if (fwhole == fireWallBaseNum + fireWallNumNums) { + for (fwhole = fireWallBaseNum; + fwhole < fireWallActiveNum && + fw_tstfield(fireWallField, fwhole); + fwhole++) + ; + if (fwhole == fireWallActiveNum) { + /* No rule point empty - we can't punch more holes. */ + fireWallActiveNum = fireWallBaseNum; +#ifdef DEBUG + fprintf(stderr, "libalias: Unable to create firewall hole!\n"); +#endif + return; + } + } + /* Start next search at next position */ + fireWallActiveNum = fwhole+1; + + /* Build generic part of the two rules */ + rule.fw_number = fwhole; + IP_FW_SETNSRCP(&rule, 1); /* Number of source ports. */ + IP_FW_SETNDSTP(&rule, 1); /* Number of destination ports. */ + rule.fw_flg = IP_FW_F_ACCEPT | IP_FW_F_IN | IP_FW_F_OUT; + rule.fw_prot = IPPROTO_TCP; + rule.fw_smsk.s_addr = INADDR_BROADCAST; + rule.fw_dmsk.s_addr = INADDR_BROADCAST; + + /* Build and apply specific part of the rules */ + rule.fw_src = GetOriginalAddress(link); + rule.fw_dst = GetDestAddress(link); + rule.fw_uar.fw_pts[0] = ntohs(GetOriginalPort(link)); + rule.fw_uar.fw_pts[1] = ntohs(GetDestPort(link)); + + /* Skip non-bound links - XXX should not be strictly necessary, + but seems to leave hole if not done. Leak of non-bound links? + (Code should be left even if the problem is fixed - it is a + clear optimization) */ + if (rule.fw_uar.fw_pts[0] != 0 && rule.fw_uar.fw_pts[1] != 0) { + r = setsockopt(fireWallFD, IPPROTO_IP, IP_FW_ADD, &rule, sizeof rule); +#ifdef DEBUG + if (r) + err(1, "alias punch inbound(1) setsockopt(IP_FW_ADD)"); +#endif + rule.fw_src = GetDestAddress(link); + rule.fw_dst = GetOriginalAddress(link); + rule.fw_uar.fw_pts[0] = ntohs(GetDestPort(link)); + rule.fw_uar.fw_pts[1] = ntohs(GetOriginalPort(link)); + r = setsockopt(fireWallFD, IPPROTO_IP, IP_FW_ADD, &rule, sizeof rule); +#ifdef DEBUG + if (r) + err(1, "alias punch inbound(2) setsockopt(IP_FW_ADD)"); +#endif + } +/* Indicate hole applied */ + link->data.tcp->fwhole = fwhole; + fw_setfield(fireWallField, fwhole); +} + +/* Remove a hole in a firewall associated with a particular alias + link. Calling this too often is harmless. */ +static void +ClearFWHole(struct alias_link *link) { + if (link->link_type == LINK_TCP) { + int fwhole = link->data.tcp->fwhole; /* Where is the firewall hole? */ + struct ip_fw rule; + + if (fwhole < 0) + return; + + memset(&rule, 0, sizeof rule); + rule.fw_number = fwhole; + while (!setsockopt(fireWallFD, IPPROTO_IP, IP_FW_DEL, &rule, sizeof rule)) + ; + fw_clrfield(fireWallField, fwhole); + link->data.tcp->fwhole = -1; + } +} + +/* Clear out the entire range dedicated to firewall holes. */ +static void +ClearAllFWHoles(void) { + struct ip_fw rule; /* On-the-fly built rule */ + int i; + + if (fireWallFD < 0) + return; + + memset(&rule, 0, sizeof rule); + for (i = fireWallBaseNum; i < fireWallBaseNum + fireWallNumNums; i++) { + rule.fw_number = i; + while (!setsockopt(fireWallFD, IPPROTO_IP, IP_FW_DEL, &rule, sizeof rule)) + ; + } + memset(fireWallField, 0, fireWallNumNums); +} +#endif + +void +PacketAliasSetFWBase(unsigned int base, unsigned int num) { +#ifndef NO_FW_PUNCH + fireWallBaseNum = base; + fireWallNumNums = num; +#endif +} diff --git a/sys/netinet/libalias/alias_ftp.c b/sys/netinet/libalias/alias_ftp.c new file mode 100644 index 0000000..d5978f9 --- /dev/null +++ b/sys/netinet/libalias/alias_ftp.c @@ -0,0 +1,554 @@ +/* + Alias_ftp.c performs special processing for FTP sessions under + TCP. Specifically, when a PORT/EPRT command from the client + side or 227/229 reply from the server is sent, it is intercepted + and modified. The address is changed to the gateway machine + and an aliasing port is used. + + For this routine to work, the message must fit entirely into a + single TCP packet. This is typically the case, but exceptions + can easily be envisioned under the actual specifications. + + Probably the most troubling aspect of the approach taken here is + that the new message will typically be a different length, and + this causes a certain amount of bookkeeping to keep track of the + changes of sequence and acknowledgment numbers, since the client + machine is totally unaware of the modification to the TCP stream. + + + This software is placed into the public domain with no restrictions + on its distribution. + + References: RFC 959, RFC 2428. + + Initial version: August, 1996 (cjm) + + Version 1.6 + Brian Somers and Martin Renters identified an IP checksum + error for modified IP packets. + + Version 1.7: January 9, 1996 (cjm) + Differential checksum computation for change + in IP packet length. + + Version 2.1: May, 1997 (cjm) + Very minor changes to conform with + local/global/function naming conventions + within the packet aliasing module. + + Version 3.1: May, 2000 (eds) + Add support for passive mode, alias the 227 replies. + + See HISTORY file for record of revisions. + + $FreeBSD$ +*/ + +/* Includes */ +#include <ctype.h> +#include <stdio.h> +#include <string.h> +#include <sys/types.h> +#include <netinet/in_systm.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/tcp.h> + +#include "alias_local.h" + +#define FTP_CONTROL_PORT_NUMBER 21 +#define MAX_MESSAGE_SIZE 128 + +enum ftp_message_type { + FTP_PORT_COMMAND, + FTP_EPRT_COMMAND, + FTP_227_REPLY, + FTP_229_REPLY, + FTP_UNKNOWN_MESSAGE +}; + +static int ParseFtpPortCommand(char *, int); +static int ParseFtpEprtCommand(char *, int); +static int ParseFtp227Reply(char *, int); +static int ParseFtp229Reply(char *, int); +static void NewFtpMessage(struct ip *, struct alias_link *, int, int); + +static struct in_addr true_addr; /* in network byte order. */ +static u_short true_port; /* in host byte order. */ + +void +AliasHandleFtpOut( +struct ip *pip, /* IP packet to examine/patch */ +struct alias_link *link, /* The link to go through (aliased port) */ +int maxpacketsize /* The maximum size this packet can grow to (including headers) */) +{ + int hlen, tlen, dlen; + char *sptr; + struct tcphdr *tc; + int ftp_message_type; + +/* Calculate data length of TCP packet */ + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + hlen = (pip->ip_hl + tc->th_off) << 2; + tlen = ntohs(pip->ip_len); + dlen = tlen - hlen; + +/* Place string pointer and beginning of data */ + sptr = (char *) pip; + sptr += hlen; + +/* + * Check that data length is not too long and previous message was + * properly terminated with CRLF. + */ + if (dlen <= MAX_MESSAGE_SIZE && GetLastLineCrlfTermed(link)) { + ftp_message_type = FTP_UNKNOWN_MESSAGE; + + if (ntohs(tc->th_dport) == FTP_CONTROL_PORT_NUMBER) { +/* + * When aliasing a client, check for the PORT/EPRT command. + */ + if (ParseFtpPortCommand(sptr, dlen)) + ftp_message_type = FTP_PORT_COMMAND; + else if (ParseFtpEprtCommand(sptr, dlen)) + ftp_message_type = FTP_EPRT_COMMAND; + } else { +/* + * When aliasing a server, check for the 227/229 reply. + */ + if (ParseFtp227Reply(sptr, dlen)) + ftp_message_type = FTP_227_REPLY; + else if (ParseFtp229Reply(sptr, dlen)) + ftp_message_type = FTP_229_REPLY; + } + + if (ftp_message_type != FTP_UNKNOWN_MESSAGE) + NewFtpMessage(pip, link, maxpacketsize, ftp_message_type); + } + +/* Track the msgs which are CRLF term'd for PORT/PASV FW breach */ + + if (dlen) { /* only if there's data */ + sptr = (char *) pip; /* start over at beginning */ + tlen = ntohs(pip->ip_len); /* recalc tlen, pkt may have grown */ + SetLastLineCrlfTermed(link, + (sptr[tlen-2] == '\r') && (sptr[tlen-1] == '\n')); + } +} + +static int +ParseFtpPortCommand(char *sptr, int dlen) +{ + char ch; + int i, state; + u_int32_t addr; + u_short port; + u_int8_t octet; + + /* Format: "PORT A,D,D,R,PO,RT". */ + + /* Return if data length is too short. */ + if (dlen < 18) + return 0; + + addr = port = octet = 0; + state = -4; + for (i = 0; i < dlen; i++) { + ch = sptr[i]; + switch (state) { + case -4: if (ch == 'P') state++; else return 0; break; + case -3: if (ch == 'O') state++; else return 0; break; + case -2: if (ch == 'R') state++; else return 0; break; + case -1: if (ch == 'T') state++; else return 0; break; + + case 0: + if (isspace(ch)) + break; + else + state++; + case 1: case 3: case 5: case 7: case 9: case 11: + if (isdigit(ch)) { + octet = ch - '0'; + state++; + } else + return 0; + break; + case 2: case 4: case 6: case 8: + if (isdigit(ch)) + octet = 10 * octet + ch - '0'; + else if (ch == ',') { + addr = (addr << 8) + octet; + state++; + } else + return 0; + break; + case 10: case 12: + if (isdigit(ch)) + octet = 10 * octet + ch - '0'; + else if (ch == ',' || state == 12) { + port = (port << 8) + octet; + state++; + } else + return 0; + break; + } + } + + if (state == 13) { + true_addr.s_addr = htonl(addr); + true_port = port; + return 1; + } else + return 0; +} + +static int +ParseFtpEprtCommand(char *sptr, int dlen) +{ + char ch, delim; + int i, state; + u_int32_t addr; + u_short port; + u_int8_t octet; + + /* Format: "EPRT |1|A.D.D.R|PORT|". */ + + /* Return if data length is too short. */ + if (dlen < 18) + return 0; + + addr = port = octet = 0; + delim = '|'; /* XXX gcc -Wuninitialized */ + state = -4; + for (i = 0; i < dlen; i++) { + ch = sptr[i]; + switch (state) + { + case -4: if (ch == 'E') state++; else return 0; break; + case -3: if (ch == 'P') state++; else return 0; break; + case -2: if (ch == 'R') state++; else return 0; break; + case -1: if (ch == 'T') state++; else return 0; break; + + case 0: + if (!isspace(ch)) { + delim = ch; + state++; + } + break; + case 1: + if (ch == '1') /* IPv4 address */ + state++; + else + return 0; + break; + case 2: + if (ch == delim) + state++; + else + return 0; + break; + case 3: case 5: case 7: case 9: + if (isdigit(ch)) { + octet = ch - '0'; + state++; + } else + return 0; + break; + case 4: case 6: case 8: case 10: + if (isdigit(ch)) + octet = 10 * octet + ch - '0'; + else if (ch == '.' || state == 10) { + addr = (addr << 8) + octet; + state++; + } else + return 0; + break; + case 11: + if (isdigit(ch)) { + port = ch - '0'; + state++; + } else + return 0; + break; + case 12: + if (isdigit(ch)) + port = 10 * port + ch - '0'; + else if (ch == delim) + state++; + else + return 0; + break; + } + } + + if (state == 13) { + true_addr.s_addr = htonl(addr); + true_port = port; + return 1; + } else + return 0; +} + +static int +ParseFtp227Reply(char *sptr, int dlen) +{ + char ch; + int i, state; + u_int32_t addr; + u_short port; + u_int8_t octet; + + /* Format: "227 Entering Passive Mode (A,D,D,R,PO,RT)" */ + + /* Return if data length is too short. */ + if (dlen < 17) + return 0; + + addr = port = octet = 0; + + state = -3; + for (i = 0; i < dlen; i++) { + ch = sptr[i]; + switch (state) + { + case -3: if (ch == '2') state++; else return 0; break; + case -2: if (ch == '2') state++; else return 0; break; + case -1: if (ch == '7') state++; else return 0; break; + + case 0: + if (ch == '(') + state++; + break; + case 1: case 3: case 5: case 7: case 9: case 11: + if (isdigit(ch)) { + octet = ch - '0'; + state++; + } else + return 0; + break; + case 2: case 4: case 6: case 8: + if (isdigit(ch)) + octet = 10 * octet + ch - '0'; + else if (ch == ',') { + addr = (addr << 8) + octet; + state++; + } else + return 0; + break; + case 10: case 12: + if (isdigit(ch)) + octet = 10 * octet + ch - '0'; + else if (ch == ',' || (state == 12 && ch == ')')) { + port = (port << 8) + octet; + state++; + } else + return 0; + break; + } + } + + if (state == 13) { + true_port = port; + true_addr.s_addr = htonl(addr); + return 1; + } else + return 0; +} + +static int +ParseFtp229Reply(char *sptr, int dlen) +{ + char ch, delim; + int i, state; + u_short port; + + /* Format: "229 Entering Extended Passive Mode (|||PORT|)" */ + + /* Return if data length is too short. */ + if (dlen < 11) + return 0; + + port = 0; + delim = '|'; /* XXX gcc -Wuninitialized */ + + state = -3; + for (i = 0; i < dlen; i++) { + ch = sptr[i]; + switch (state) + { + case -3: if (ch == '2') state++; else return 0; break; + case -2: if (ch == '2') state++; else return 0; break; + case -1: if (ch == '9') state++; else return 0; break; + + case 0: + if (ch == '(') + state++; + break; + case 1: + delim = ch; + state++; + break; + case 2: case 3: + if (ch == delim) + state++; + else + return 0; + break; + case 4: + if (isdigit(ch)) { + port = ch - '0'; + state++; + } else + return 0; + break; + case 5: + if (isdigit(ch)) + port = 10 * port + ch - '0'; + else if (ch == delim) + state++; + else + return 0; + break; + case 6: + if (ch == ')') + state++; + else + return 0; + break; + } + } + + if (state == 7) { + true_port = port; + return 1; + } else + return 0; +} + +static void +NewFtpMessage(struct ip *pip, + struct alias_link *link, + int maxpacketsize, + int ftp_message_type) +{ + struct alias_link *ftp_link; + +/* Security checks. */ + if (ftp_message_type != FTP_229_REPLY && + pip->ip_src.s_addr != true_addr.s_addr) + return; + + if (true_port < IPPORT_RESERVED) + return; + +/* Establish link to address and port found in FTP control message. */ + ftp_link = FindUdpTcpOut(true_addr, GetDestAddress(link), + htons(true_port), 0, IPPROTO_TCP, 1); + + if (ftp_link != NULL) + { + int slen, hlen, tlen, dlen; + struct tcphdr *tc; + +#ifndef NO_FW_PUNCH + if (ftp_message_type == FTP_PORT_COMMAND || + ftp_message_type == FTP_EPRT_COMMAND) { + /* Punch hole in firewall */ + PunchFWHole(ftp_link); + } +#endif + +/* Calculate data length of TCP packet */ + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + hlen = (pip->ip_hl + tc->th_off) << 2; + tlen = ntohs(pip->ip_len); + dlen = tlen - hlen; + +/* Create new FTP message. */ + { + char stemp[MAX_MESSAGE_SIZE + 1]; + char *sptr; + u_short alias_port; + u_char *ptr; + int a1, a2, a3, a4, p1, p2; + struct in_addr alias_address; + +/* Decompose alias address into quad format */ + alias_address = GetAliasAddress(link); + ptr = (u_char *) &alias_address.s_addr; + a1 = *ptr++; a2=*ptr++; a3=*ptr++; a4=*ptr; + + alias_port = GetAliasPort(ftp_link); + + switch (ftp_message_type) + { + case FTP_PORT_COMMAND: + case FTP_227_REPLY: + /* Decompose alias port into pair format. */ + ptr = (char *) &alias_port; + p1 = *ptr++; p2=*ptr; + + if (ftp_message_type == FTP_PORT_COMMAND) { + /* Generate PORT command string. */ + sprintf(stemp, "PORT %d,%d,%d,%d,%d,%d\r\n", + a1,a2,a3,a4,p1,p2); + } else { + /* Generate 227 reply string. */ + sprintf(stemp, + "227 Entering Passive Mode (%d,%d,%d,%d,%d,%d)\r\n", + a1,a2,a3,a4,p1,p2); + } + break; + case FTP_EPRT_COMMAND: + /* Generate EPRT command string. */ + sprintf(stemp, "EPRT |1|%d.%d.%d.%d|%d|\r\n", + a1,a2,a3,a4,ntohs(alias_port)); + break; + case FTP_229_REPLY: + /* Generate 229 reply string. */ + sprintf(stemp, "229 Entering Extended Passive Mode (|||%d|)\r\n", + ntohs(alias_port)); + break; + } + +/* Save string length for IP header modification */ + slen = strlen(stemp); + +/* Copy modified buffer into IP packet. */ + sptr = (char *) pip; sptr += hlen; + strncpy(sptr, stemp, maxpacketsize-hlen); + } + +/* Save information regarding modified seq and ack numbers */ + { + int delta; + + SetAckModified(link); + delta = GetDeltaSeqOut(pip, link); + AddSeq(pip, link, delta+slen-dlen); + } + +/* Revise IP header */ + { + u_short new_len; + + new_len = htons(hlen + slen); + DifferentialChecksum(&pip->ip_sum, + &new_len, + &pip->ip_len, + 1); + pip->ip_len = new_len; + } + +/* Compute TCP checksum for revised packet */ + tc->th_sum = 0; + tc->th_sum = TcpChecksum(pip); + } + else + { +#ifdef DEBUG + fprintf(stderr, + "PacketAlias/HandleFtpOut: Cannot allocate FTP data port\n"); +#endif + } +} diff --git a/sys/netinet/libalias/alias_irc.c b/sys/netinet/libalias/alias_irc.c new file mode 100644 index 0000000..afd032d --- /dev/null +++ b/sys/netinet/libalias/alias_irc.c @@ -0,0 +1,317 @@ +/* Alias_irc.c intercepts packages contain IRC CTCP commands, and + changes DCC commands to export a port on the aliasing host instead + of an aliased host. + + For this routine to work, the DCC command must fit entirely into a + single TCP packet. This will usually happen, but is not + guaranteed. + + The interception is likely to change the length of the packet. + The handling of this is copied more-or-less verbatim from + ftp_alias.c + + This software is placed into the public domain with no restrictions + on its distribution. + + Initial version: Eivind Eklund <perhaps@yes.no> (ee) 97-01-29 + + Version 2.1: May, 1997 (cjm) + Very minor changes to conform with + local/global/function naming conventions + withing the packet alising module. + + $FreeBSD$ +*/ + +/* Includes */ +#include <ctype.h> +#include <stdio.h> +#include <string.h> +#include <sys/types.h> +#include <netinet/in_systm.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/tcp.h> +#include <limits.h> + +#include "alias_local.h" + +/* Local defines */ +#define DBprintf(a) + + +void +AliasHandleIrcOut(struct ip *pip, /* IP packet to examine */ + struct alias_link *link, /* Which link are we on? */ + int maxsize /* Maximum size of IP packet including headers */ + ) +{ + int hlen, tlen, dlen; + struct in_addr true_addr; + u_short true_port; + char *sptr; + struct tcphdr *tc; + int i; /* Iterator through the source */ + +/* Calculate data length of TCP packet */ + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + hlen = (pip->ip_hl + tc->th_off) << 2; + tlen = ntohs(pip->ip_len); + dlen = tlen - hlen; + + /* Return if data length is too short - assume an entire PRIVMSG in each packet. */ + if (dlen<sizeof(":A!a@n.n PRIVMSG A :aDCC 1 1a")-1) + return; + +/* Place string pointer at beginning of data */ + sptr = (char *) pip; + sptr += hlen; + maxsize -= hlen; /* We're interested in maximum size of data, not packet */ + + /* Search for a CTCP command [Note 1] */ + for( i=0; i<dlen; i++ ) { + if(sptr[i]=='\001') + goto lFOUND_CTCP; + } + return; /* No CTCP commands in */ + /* Handle CTCP commands - the buffer may have to be copied */ +lFOUND_CTCP: + { + char newpacket[65536]; /* Estimate of maximum packet size :) */ + int copyat = i; /* Same */ + int iCopy = 0; /* How much data have we written to copy-back string? */ + unsigned long org_addr; /* Original IP address */ + unsigned short org_port; /* Original source port address */ + lCTCP_START: + if( i >= dlen || iCopy >= sizeof(newpacket) ) + goto lPACKET_DONE; + newpacket[iCopy++] = sptr[i++]; /* Copy the CTCP start character */ + /* Start of a CTCP */ + if( i+4 >= dlen ) /* Too short for DCC */ + goto lBAD_CTCP; + if( sptr[i+0] != 'D' ) + goto lBAD_CTCP; + if( sptr[i+1] != 'C' ) + goto lBAD_CTCP; + if( sptr[i+2] != 'C' ) + goto lBAD_CTCP; + if( sptr[i+3] != ' ' ) + goto lBAD_CTCP; + /* We have a DCC command - handle it! */ + i+= 4; /* Skip "DCC " */ + if( iCopy+4 > sizeof(newpacket) ) + goto lPACKET_DONE; + newpacket[iCopy++] = 'D'; + newpacket[iCopy++] = 'C'; + newpacket[iCopy++] = 'C'; + newpacket[iCopy++] = ' '; + + DBprintf(("Found DCC\n")); + /* Skip any extra spaces (should not occur according to + protocol, but DCC breaks CTCP protocol anyway */ + while(sptr[i] == ' ') { + if( ++i >= dlen) { + DBprintf(("DCC packet terminated in just spaces\n")); + goto lPACKET_DONE; + } + } + + DBprintf(("Transferring command...\n")); + while(sptr[i] != ' ') { + newpacket[iCopy++] = sptr[i]; + if( ++i >= dlen || iCopy >= sizeof(newpacket) ) { + DBprintf(("DCC packet terminated during command\n")); + goto lPACKET_DONE; + } + } + /* Copy _one_ space */ + if( i+1 < dlen && iCopy < sizeof(newpacket) ) + newpacket[iCopy++] = sptr[i++]; + + DBprintf(("Done command - removing spaces\n")); + /* Skip any extra spaces (should not occur according to + protocol, but DCC breaks CTCP protocol anyway */ + while(sptr[i] == ' ') { + if( ++i >= dlen ) { + DBprintf(("DCC packet terminated in just spaces (post-command)\n")); + goto lPACKET_DONE; + } + } + + DBprintf(("Transferring filename...\n")); + while(sptr[i] != ' ') { + newpacket[iCopy++] = sptr[i]; + if( ++i >= dlen || iCopy >= sizeof(newpacket) ) { + DBprintf(("DCC packet terminated during filename\n")); + goto lPACKET_DONE; + } + } + /* Copy _one_ space */ + if( i+1 < dlen && iCopy < sizeof(newpacket) ) + newpacket[iCopy++] = sptr[i++]; + + DBprintf(("Done filename - removing spaces\n")); + /* Skip any extra spaces (should not occur according to + protocol, but DCC breaks CTCP protocol anyway */ + while(sptr[i] == ' ') { + if( ++i >= dlen ) { + DBprintf(("DCC packet terminated in just spaces (post-filename)\n")); + goto lPACKET_DONE; + } + } + + DBprintf(("Fetching IP address\n")); + /* Fetch IP address */ + org_addr = 0; + while(i<dlen && isdigit(sptr[i])) { + if( org_addr > ULONG_MAX/10UL ) { /* Terminate on overflow */ + DBprintf(("DCC Address overflow (org_addr == 0x%08lx, next char %c\n", org_addr, sptr[i])); + goto lBAD_CTCP; + } + org_addr *= 10; + org_addr += sptr[i++]-'0'; + } + DBprintf(("Skipping space\n")); + if( i+1 >= dlen || sptr[i] != ' ' ) { + DBprintf(("Overflow (%d >= %d) or bad character (%02x) terminating IP address\n", i+1, dlen, sptr[i])); + goto lBAD_CTCP; + } + /* Skip any extra spaces (should not occur according to + protocol, but DCC breaks CTCP protocol anyway, so we might + as well play it safe */ + while(sptr[i] == ' ') { + if( ++i >= dlen ) { + DBprintf(("Packet failure - space overflow.\n")); + goto lPACKET_DONE; + } + } + DBprintf(("Fetching port number\n")); + /* Fetch source port */ + org_port = 0; + while(i<dlen && isdigit(sptr[i])) { + if( org_port > 6554 ) { /* Terminate on overflow (65536/10 rounded up*/ + DBprintf(("DCC: port number overflow\n")); + goto lBAD_CTCP; + } + org_port *= 10; + org_port += sptr[i++]-'0'; + } + /* Skip illegal addresses (or early termination) */ + if( i >= dlen || (sptr[i] != '\001' && sptr[i] != ' ') ) { + DBprintf(("Bad port termination\n")); + goto lBAD_CTCP; + } + DBprintf(("Got IP %lu and port %u\n", org_addr, (unsigned)org_port)); + + /* We've got the address and port - now alias it */ + { + struct alias_link *dcc_link; + struct in_addr destaddr; + + + true_port = htons(org_port); + true_addr.s_addr = htonl(org_addr); + destaddr.s_addr = 0; + + /* Steal the FTP_DATA_PORT - it doesn't really matter, and this + would probably allow it through at least _some_ + firewalls. */ + dcc_link = FindUdpTcpOut(true_addr, destaddr, + true_port, 0, + IPPROTO_TCP, 1); + DBprintf(("Got a DCC link\n")); + if ( dcc_link ) { + struct in_addr alias_address; /* Address from aliasing */ + u_short alias_port; /* Port given by aliasing */ + +#ifndef NO_FW_PUNCH + /* Generate firewall hole as appropriate */ + PunchFWHole(dcc_link); +#endif + + alias_address = GetAliasAddress(link); + iCopy += snprintf(&newpacket[iCopy], + sizeof(newpacket)-iCopy, + "%lu ", (u_long)htonl(alias_address.s_addr)); + if( iCopy >= sizeof(newpacket) ) { /* Truncated/fit exactly - bad news */ + DBprintf(("DCC constructed packet overflow.\n")); + goto lBAD_CTCP; + } + alias_port = GetAliasPort(dcc_link); + iCopy += snprintf(&newpacket[iCopy], + sizeof(newpacket)-iCopy, + "%u", htons(alias_port) ); + /* Done - truncated cases will be taken care of by lBAD_CTCP */ + DBprintf(("Aliased IP %lu and port %u\n", alias_address.s_addr, (unsigned)alias_port)); + } + } + /* An uninteresting CTCP - state entered right after '\001' has + been pushed. Also used to copy the rest of a DCC, after IP + address and port has been handled */ + lBAD_CTCP: + for(; i<dlen && iCopy<sizeof(newpacket); i++,iCopy++) { + newpacket[iCopy] = sptr[i]; /* Copy CTCP unchanged */ + if(sptr[i] == '\001') { + goto lNORMAL_TEXT; + } + } + goto lPACKET_DONE; + /* Normal text */ + lNORMAL_TEXT: + for(; i<dlen && iCopy<sizeof(newpacket); i++,iCopy++) { + newpacket[iCopy] = sptr[i]; /* Copy CTCP unchanged */ + if(sptr[i] == '\001') { + goto lCTCP_START; + } + } + /* Handle the end of a packet */ + lPACKET_DONE: + iCopy = iCopy > maxsize-copyat ? maxsize-copyat : iCopy; + memcpy(sptr+copyat, newpacket, iCopy); + +/* Save information regarding modified seq and ack numbers */ + { + int delta; + + SetAckModified(link); + delta = GetDeltaSeqOut(pip, link); + AddSeq(pip, link, delta+copyat+iCopy-dlen); + } + + /* Revise IP header */ + { + u_short new_len; + + new_len = htons(hlen + iCopy + copyat); + DifferentialChecksum(&pip->ip_sum, + &new_len, + &pip->ip_len, + 1); + pip->ip_len = new_len; + } + + /* Compute TCP checksum for revised packet */ + tc->th_sum = 0; + tc->th_sum = TcpChecksum(pip); + return; + } +} + +/* Notes: + [Note 1] + The initial search will most often fail; it could be replaced with a 32-bit specific search. + Such a search would be done for 32-bit unsigned value V: + V ^= 0x01010101; (Search is for null bytes) + if( ((V-0x01010101)^V) & 0x80808080 ) { + (found a null bytes which was a 01 byte) + } + To assert that the processor is 32-bits, do + extern int ircdccar[32]; (32 bits) + extern int ircdccar[CHAR_BIT*sizeof(unsigned int)]; + which will generate a type-error on all but 32-bit machines. + + [Note 2] This routine really ought to be replaced with one that + creates a transparent proxy on the aliasing host, to allow arbitary + changes in the TCP stream. This should not be too difficult given + this base; I (ee) will try to do this some time later. + */ diff --git a/sys/netinet/libalias/alias_local.h b/sys/netinet/libalias/alias_local.h new file mode 100644 index 0000000..152406d --- /dev/null +++ b/sys/netinet/libalias/alias_local.h @@ -0,0 +1,203 @@ +/* + * Alias_local.h contains the function prototypes for alias.c, + * alias_db.c, alias_util.c and alias_ftp.c, alias_irc.c (as well + * as any future add-ons). It also includes macros, globals and + * struct definitions shared by more than one alias*.c file. + * + * This include file is intended to be used only within the aliasing + * software. Outside world interfaces are defined in alias.h + * + * This software is placed into the public domain with no restrictions + * on its distribution. + * + * Initial version: August, 1996 (cjm) + * + * <updated several times by original author and Eivind Eklund> + * + * $FreeBSD$ + */ + +#ifndef _ALIAS_LOCAL_H_ +#define _ALIAS_LOCAL_H_ + +/* Macros */ + +/* + * The following macro is used to update an + * internet checksum. "delta" is a 32-bit + * accumulation of all the changes to the + * checksum (adding in new 16-bit words and + * subtracting out old words), and "cksum" + * is the checksum value to be updated. + */ +#define ADJUST_CHECKSUM(acc, cksum) \ + do { \ + acc += cksum; \ + if (acc < 0) { \ + acc = -acc; \ + acc = (acc >> 16) + (acc & 0xffff); \ + acc += acc >> 16; \ + cksum = (u_short) ~acc; \ + } else { \ + acc = (acc >> 16) + (acc & 0xffff); \ + acc += acc >> 16; \ + cksum = (u_short) acc; \ + } \ + } while (0) + +/* Globals */ + +extern int packetAliasMode; + +/* Prototypes */ + +/* General utilities */ +u_short IpChecksum(struct ip *_pip); +u_short TcpChecksum(struct ip *_pip); +void DifferentialChecksum(u_short *_cksum, u_short *_new, u_short *_old, + int _n); + +/* Internal data access */ +struct alias_link * + FindIcmpIn(struct in_addr _dst_addr, struct in_addr _alias_addr, + u_short _id_alias, int _create); +struct alias_link * + FindIcmpOut(struct in_addr _src_addr, struct in_addr _dst_addr, + u_short _id, int _create); +struct alias_link * + FindFragmentIn1(struct in_addr _dst_addr, struct in_addr _alias_addr, + u_short _ip_id); +struct alias_link * + FindFragmentIn2(struct in_addr _dst_addr, struct in_addr _alias_addr, + u_short _ip_id); +struct alias_link * + AddFragmentPtrLink(struct in_addr _dst_addr, u_short _ip_id); +struct alias_link * + FindFragmentPtr(struct in_addr _dst_addr, u_short _ip_id); +struct alias_link * + FindProtoIn(struct in_addr _dst_addr, struct in_addr _alias_addr, + u_char _proto); +struct alias_link * + FindProtoOut(struct in_addr _src_addr, struct in_addr _dst_addr, + u_char _proto); +struct alias_link * + FindUdpTcpIn(struct in_addr _dst_addr, struct in_addr _alias_addr, + u_short _dst_port, u_short _alias_port, u_char _proto, int _create); +struct alias_link * + FindUdpTcpOut(struct in_addr _src_addr, struct in_addr _dst_addr, + u_short _src_port, u_short _dst_port, u_char _proto, int _create); +struct alias_link * + AddPptp(struct in_addr _src_addr, struct in_addr _dst_addr, + struct in_addr _alias_addr, u_int16_t _src_call_id); +struct alias_link * + FindPptpOutByCallId(struct in_addr _src_addr, + struct in_addr _dst_addr, u_int16_t _src_call_id); +struct alias_link * + FindPptpInByCallId(struct in_addr _dst_addr, + struct in_addr _alias_addr, u_int16_t _dst_call_id); +struct alias_link * + FindPptpOutByPeerCallId(struct in_addr _src_addr, + struct in_addr _dst_addr, u_int16_t _dst_call_id); +struct alias_link * + FindPptpInByPeerCallId(struct in_addr _dst_addr, + struct in_addr _alias_addr, u_int16_t _alias_call_id); +struct alias_link * + FindRtspOut(struct in_addr _src_addr, struct in_addr _dst_addr, + u_short _src_port, u_short _alias_port, u_char _proto); +struct in_addr + FindOriginalAddress(struct in_addr _alias_addr); +struct in_addr + FindAliasAddress(struct in_addr _original_addr); + +/* External data access/modification */ +int FindNewPortGroup(struct in_addr _dst_addr, struct in_addr _alias_addr, + u_short _src_port, u_short _dst_port, u_short _port_count, + u_char _proto, u_char _align); +void GetFragmentAddr(struct alias_link *_link, struct in_addr *_src_addr); +void SetFragmentAddr(struct alias_link *_link, struct in_addr _src_addr); +void GetFragmentPtr(struct alias_link *_link, char **_fptr); +void SetFragmentPtr(struct alias_link *_link, char *fptr); +void SetStateIn(struct alias_link *_link, int _state); +void SetStateOut(struct alias_link *_link, int _state); +int GetStateIn(struct alias_link *_link); +int GetStateOut(struct alias_link *_link); +struct in_addr + GetOriginalAddress(struct alias_link *_link); +struct in_addr + GetDestAddress(struct alias_link *_link); +struct in_addr + GetAliasAddress(struct alias_link *_link); +struct in_addr + GetDefaultAliasAddress(void); +void SetDefaultAliasAddress(struct in_addr _alias_addr); +u_short GetOriginalPort(struct alias_link *_link); +u_short GetAliasPort(struct alias_link *_link); +struct in_addr + GetProxyAddress(struct alias_link *_link); +void SetProxyAddress(struct alias_link *_link, struct in_addr _addr); +u_short GetProxyPort(struct alias_link *_link); +void SetProxyPort(struct alias_link *_link, u_short _port); +void SetAckModified(struct alias_link *_link); +int GetAckModified(struct alias_link *_link); +int GetDeltaAckIn(struct ip *_pip, struct alias_link *_link); +int GetDeltaSeqOut(struct ip *_pip, struct alias_link *_link); +void AddSeq(struct ip *_pip, struct alias_link *_link, int _delta); +void SetExpire(struct alias_link *_link, int _expire); +void ClearCheckNewLink(void); +void SetLastLineCrlfTermed(struct alias_link *_link, int _yes); +int GetLastLineCrlfTermed(struct alias_link *_link); +void SetDestCallId(struct alias_link *_link, u_int16_t _cid); +#ifndef NO_FW_PUNCH +void PunchFWHole(struct alias_link *_link); +#endif + +/* Housekeeping function */ +void HouseKeeping(void); + +/* Tcp specfic routines */ +/* lint -save -library Suppress flexelint warnings */ + +/* FTP routines */ +void AliasHandleFtpOut(struct ip *_pip, struct alias_link *_link, + int _maxpacketsize); + +/* IRC routines */ +void AliasHandleIrcOut(struct ip *_pip, struct alias_link *_link, + int _maxsize); + +/* RTSP routines */ +void AliasHandleRtspOut(struct ip *_pip, struct alias_link *_link, + int _maxpacketsize); + +/* PPTP routines */ +void AliasHandlePptpOut(struct ip *_pip, struct alias_link *_link); +void AliasHandlePptpIn(struct ip *_pip, struct alias_link *_link); +int AliasHandlePptpGreOut(struct ip *_pip); +int AliasHandlePptpGreIn(struct ip *_pip); + +/* NetBIOS routines */ +int AliasHandleUdpNbt(struct ip *_pip, struct alias_link *_link, + struct in_addr *_alias_address, u_short _alias_port); +int AliasHandleUdpNbtNS(struct ip *_pip, struct alias_link *_link, + struct in_addr *_alias_address, u_short *_alias_port, + struct in_addr *_original_address, u_short *_original_port); + +/* CUSeeMe routines */ +void AliasHandleCUSeeMeOut(struct ip *_pip, struct alias_link *_link); +void AliasHandleCUSeeMeIn(struct ip *_pip, struct in_addr _original_addr); + +/* Transparent proxy routines */ +int ProxyCheck(struct ip *_pip, struct in_addr *_proxy_server_addr, + u_short *_proxy_server_port); +void ProxyModify(struct alias_link *_link, struct ip *_pip, + int _maxpacketsize, int _proxy_type); + +enum alias_tcp_state { + ALIAS_TCP_STATE_NOT_CONNECTED, + ALIAS_TCP_STATE_CONNECTED, + ALIAS_TCP_STATE_DISCONNECTED +}; + +/*lint -restore */ + +#endif /* !_ALIAS_LOCAL_H_ */ diff --git a/sys/netinet/libalias/alias_nbt.c b/sys/netinet/libalias/alias_nbt.c new file mode 100644 index 0000000..74fe56e --- /dev/null +++ b/sys/netinet/libalias/alias_nbt.c @@ -0,0 +1,694 @@ +/* + * Written by Atsushi Murai <amurai@spec.co.jp> + * + * Copyright (C) 1998, System Planning and Engineering Co. All rights reserverd. + * + * Redistribution and use in source and binary forms are permitted + * provided that the above copyright notice and this paragraph are + * duplicated in all such forms and that any documentation, + * advertising materials, and other materials related to such + * distribution and use acknowledge that the software was developed + * by the System Planning and Engineering Co. The name of the + * SPEC may not be used to endorse or promote products derived + * from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED + * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. + * + * $FreeBSD$ + * + * TODO: + * oClean up. + * oConsidering for word alignment for other platform. + */ +/* + alias_nbt.c performs special processing for NetBios over TCP/IP + sessions by UDP. + + Initial version: May, 1998 (Atsushi Murai <amurai@spec.co.jp>) + + See HISTORY file for record of revisions. +*/ + +/* Includes */ +#include <ctype.h> +#include <stdio.h> +#include <string.h> +#include <sys/types.h> +#include <netinet/in_systm.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <netinet/ip.h> +#include <netinet/udp.h> +#include <netinet/tcp.h> + +#include "alias_local.h" + +typedef struct { + struct in_addr oldaddr; + u_short oldport; + struct in_addr newaddr; + u_short newport; + u_short *uh_sum; +} NBTArguments; + +typedef struct { + unsigned char type; + unsigned char flags; + u_short id; + struct in_addr source_ip; + u_short source_port; + u_short len; + u_short offset; +} NbtDataHeader; + +#define OpQuery 0 +#define OpUnknown 4 +#define OpRegist 5 +#define OpRelease 6 +#define OpWACK 7 +#define OpRefresh 8 +typedef struct { + u_short nametrid; + u_short dir:1, opcode:4, nmflags:7, rcode:4; + u_short qdcount; + u_short ancount; + u_short nscount; + u_short arcount; +} NbtNSHeader; + +#define FMT_ERR 0x1 +#define SRV_ERR 0x2 +#define IMP_ERR 0x4 +#define RFS_ERR 0x5 +#define ACT_ERR 0x6 +#define CFT_ERR 0x7 + + +#ifdef DEBUG +static void PrintRcode( u_char rcode ) { + + switch (rcode) { + case FMT_ERR: + printf("\nFormat Error."); + case SRV_ERR: + printf("\nSever failure."); + case IMP_ERR: + printf("\nUnsupported request error.\n"); + case RFS_ERR: + printf("\nRefused error.\n"); + case ACT_ERR: + printf("\nActive error.\n"); + case CFT_ERR: + printf("\nName in conflict error.\n"); + default: + printf("\n???=%0x\n", rcode ); + + } +} +#endif + + +/* Handling Name field */ +static u_char *AliasHandleName ( u_char *p, char *pmax ) { + + u_char *s; + u_char c; + int compress; + + /* Following length field */ + + if (p == NULL || (char *)p >= pmax) + return(NULL); + + if (*p & 0xc0 ) { + p = p + 2; + if ((char *)p > pmax) + return(NULL); + return ((u_char *)p); + } + while ( ( *p & 0x3f) != 0x00 ) { + s = p + 1; + if ( *p == 0x20 ) + compress = 1; + else + compress = 0; + + /* Get next length field */ + p = (u_char *)(p + (*p & 0x3f) + 1); + if ((char *)p > pmax) { + p = NULL; + break; + } +#ifdef DEBUG + printf(":"); +#endif + while (s < p) { + if ( compress == 1 ) { + c = (u_char )(((((*s & 0x0f) << 4) | (*(s+1) & 0x0f)) - 0x11)); +#ifdef DEBUG + if (isprint( c ) ) + printf("%c", c ); + else + printf("<0x%02x>", c ); +#endif + s +=2; + } else { +#ifdef DEBUG + printf("%c", *s); +#endif + s++; + } + } +#ifdef DEBUG + printf(":"); +#endif + fflush(stdout); + } + + /* Set up to out of Name field */ + if (p == NULL || (char *)p >= pmax) + p = NULL; + else + p++; + return ((u_char *)p); +} + +/* + * NetBios Datagram Handler (IP/UDP) + */ +#define DGM_DIRECT_UNIQ 0x10 +#define DGM_DIRECT_GROUP 0x11 +#define DGM_BROADCAST 0x12 +#define DGM_ERROR 0x13 +#define DGM_QUERY 0x14 +#define DGM_POSITIVE_RES 0x15 +#define DGM_NEGATIVE_RES 0x16 + +int AliasHandleUdpNbt( + struct ip *pip, /* IP packet to examine/patch */ + struct alias_link *link, + struct in_addr *alias_address, + u_short alias_port +) { + struct udphdr * uh; + NbtDataHeader *ndh; + u_char *p = NULL; + char *pmax; + + /* Calculate data length of UDP packet */ + uh = (struct udphdr *) ((char *) pip + (pip->ip_hl << 2)); + pmax = (char *)uh + ntohs( uh->uh_ulen ); + + ndh = (NbtDataHeader *)((char *)uh + (sizeof (struct udphdr))); + if ((char *)(ndh + 1) > pmax) + return(-1); +#ifdef DEBUG + printf("\nType=%02x,", ndh->type ); +#endif + switch ( ndh->type ) { + case DGM_DIRECT_UNIQ: + case DGM_DIRECT_GROUP: + case DGM_BROADCAST: + p = (u_char *)ndh + 14; + p = AliasHandleName ( p, pmax ); /* Source Name */ + p = AliasHandleName ( p, pmax ); /* Destination Name */ + break; + case DGM_ERROR: + p = (u_char *)ndh + 11; + break; + case DGM_QUERY: + case DGM_POSITIVE_RES: + case DGM_NEGATIVE_RES: + p = (u_char *)ndh + 10; + p = AliasHandleName ( p, pmax ); /* Destination Name */ + break; + } + if (p == NULL || (char *)p > pmax) + p = NULL; +#ifdef DEBUG + printf("%s:%d-->", inet_ntoa(ndh->source_ip), ntohs(ndh->source_port) ); +#endif + /* Doing a IP address and Port number Translation */ + if ( uh->uh_sum != 0 ) { + int acc; + u_short *sptr; + acc = ndh->source_port; + acc -= alias_port; + sptr = (u_short *) &(ndh->source_ip); + acc += *sptr++; + acc += *sptr; + sptr = (u_short *) alias_address; + acc -= *sptr++; + acc -= *sptr; + ADJUST_CHECKSUM(acc, uh->uh_sum); + } + ndh->source_ip = *alias_address; + ndh->source_port = alias_port; +#ifdef DEBUG + printf("%s:%d\n", inet_ntoa(ndh->source_ip), ntohs(ndh->source_port) ); + fflush(stdout); +#endif + return((p == NULL) ? -1 : 0); +} +/* Question Section */ +#define QS_TYPE_NB 0x0020 +#define QS_TYPE_NBSTAT 0x0021 +#define QS_CLAS_IN 0x0001 +typedef struct { + u_short type; /* The type of Request */ + u_short class; /* The class of Request */ +} NBTNsQuestion; + +static u_char * +AliasHandleQuestion( + u_short count, + NBTNsQuestion *q, + char *pmax, + NBTArguments *nbtarg) +{ + + while ( count != 0 ) { + /* Name Filed */ + q = (NBTNsQuestion *)AliasHandleName((u_char *)q, pmax); + + if (q == NULL || (char *)(q + 1) > pmax) { + q = NULL; + break; + } + + /* Type and Class filed */ + switch ( ntohs(q->type) ) { + case QS_TYPE_NB: + case QS_TYPE_NBSTAT: + q= q+1; + break; + default: +#ifdef DEBUG + printf("\nUnknown Type on Question %0x\n", ntohs(q->type) ); +#endif + break; + } + count--; + } + + /* Set up to out of Question Section */ + return ((u_char *)q); +} + +/* Resource Record */ +#define RR_TYPE_A 0x0001 +#define RR_TYPE_NS 0x0002 +#define RR_TYPE_NULL 0x000a +#define RR_TYPE_NB 0x0020 +#define RR_TYPE_NBSTAT 0x0021 +#define RR_CLAS_IN 0x0001 +#define SizeOfNsResource 8 +typedef struct { + u_short type; + u_short class; + unsigned int ttl; + u_short rdlen; +} NBTNsResource; + +#define SizeOfNsRNB 6 +typedef struct { + u_short g:1, ont:2, resv:13; + struct in_addr addr; +} NBTNsRNB; + +static u_char * +AliasHandleResourceNB( + NBTNsResource *q, + char *pmax, + NBTArguments *nbtarg) +{ + NBTNsRNB *nb; + u_short bcount; + + if (q == NULL || (char *)(q + 1) > pmax) + return(NULL); + /* Check out a length */ + bcount = ntohs(q->rdlen); + + /* Forward to Resource NB position */ + nb = (NBTNsRNB *)((u_char *)q + SizeOfNsResource); + + /* Processing all in_addr array */ +#ifdef DEBUG + printf("NB rec[%s", inet_ntoa(nbtarg->oldaddr)); + printf("->%s, %dbytes] ",inet_ntoa(nbtarg->newaddr ), bcount); +#endif + while ( nb != NULL && bcount != 0 ) { + if ((char *)(nb + 1) > pmax) { + nb = NULL; + break; + } +#ifdef DEBUG + printf("<%s>", inet_ntoa(nb->addr) ); +#endif + if (!bcmp(&nbtarg->oldaddr,&nb->addr, sizeof(struct in_addr) ) ) { + if ( *nbtarg->uh_sum != 0 ) { + int acc; + u_short *sptr; + + sptr = (u_short *) &(nb->addr); + acc = *sptr++; + acc += *sptr; + sptr = (u_short *) &(nbtarg->newaddr); + acc -= *sptr++; + acc -= *sptr; + ADJUST_CHECKSUM(acc, *nbtarg->uh_sum); + } + + nb->addr = nbtarg->newaddr; +#ifdef DEBUG + printf("O"); +#endif + } +#ifdef DEBUG + else { + printf("."); + } +#endif + nb=(NBTNsRNB *)((u_char *)nb + SizeOfNsRNB); + bcount -= SizeOfNsRNB; + } + if (nb == NULL || (char *)(nb + 1) > pmax) { + nb = NULL; + } + + return ((u_char *)nb); +} + +#define SizeOfResourceA 6 +typedef struct { + struct in_addr addr; +} NBTNsResourceA; + +static u_char * +AliasHandleResourceA( + NBTNsResource *q, + char *pmax, + NBTArguments *nbtarg) +{ + NBTNsResourceA *a; + u_short bcount; + + if (q == NULL || (char *)(q + 1) > pmax) + return(NULL); + + /* Forward to Resource A position */ + a = (NBTNsResourceA *)( (u_char *)q + sizeof(NBTNsResource) ); + + /* Check out of length */ + bcount = ntohs(q->rdlen); + + /* Processing all in_addr array */ +#ifdef DEBUG + printf("Arec [%s", inet_ntoa(nbtarg->oldaddr)); + printf("->%s]",inet_ntoa(nbtarg->newaddr )); +#endif + while ( bcount != 0 ) { + if (a == NULL || (char *)(a + 1) > pmax) + return(NULL); +#ifdef DEBUG + printf("..%s", inet_ntoa(a->addr) ); +#endif + if ( !bcmp(&nbtarg->oldaddr, &a->addr, sizeof(struct in_addr) ) ) { + if ( *nbtarg->uh_sum != 0 ) { + int acc; + u_short *sptr; + + sptr = (u_short *) &(a->addr); /* Old */ + acc = *sptr++; + acc += *sptr; + sptr = (u_short *) &nbtarg->newaddr; /* New */ + acc -= *sptr++; + acc -= *sptr; + ADJUST_CHECKSUM(acc, *nbtarg->uh_sum); + } + + a->addr = nbtarg->newaddr; + } + a++; /*XXXX*/ + bcount -= SizeOfResourceA; + } + if (a == NULL || (char *)(a + 1) > pmax) + a = NULL; + return ((u_char *)a); +} + +typedef struct { + u_short opcode:4, flags:8, resv:4; +} NBTNsResourceNULL; + +static u_char * +AliasHandleResourceNULL( + NBTNsResource *q, + char *pmax, + NBTArguments *nbtarg) +{ + NBTNsResourceNULL *n; + u_short bcount; + + if (q == NULL || (char *)(q + 1) > pmax) + return(NULL); + + /* Forward to Resource NULL position */ + n = (NBTNsResourceNULL *)( (u_char *)q + sizeof(NBTNsResource) ); + + /* Check out of length */ + bcount = ntohs(q->rdlen); + + /* Processing all in_addr array */ + while ( bcount != 0 ) { + if ((char *)(n + 1) > pmax) { + n = NULL; + break; + } + n++; + bcount -= sizeof(NBTNsResourceNULL); + } + if ((char *)(n + 1) > pmax) + n = NULL; + + return ((u_char *)n); +} + +static u_char * +AliasHandleResourceNS( + NBTNsResource *q, + char *pmax, + NBTArguments *nbtarg) +{ + NBTNsResourceNULL *n; + u_short bcount; + + if (q == NULL || (char *)(q + 1) > pmax) + return(NULL); + + /* Forward to Resource NULL position */ + n = (NBTNsResourceNULL *)( (u_char *)q + sizeof(NBTNsResource) ); + + /* Check out of length */ + bcount = ntohs(q->rdlen); + + /* Resource Record Name Filed */ + q = (NBTNsResource *)AliasHandleName( (u_char *)n, pmax ); /* XXX */ + + if (q == NULL || (char *)((u_char *)n + bcount) > pmax) + return(NULL); + else + return ((u_char *)n + bcount); +} + +typedef struct { + u_short numnames; +} NBTNsResourceNBSTAT; + +static u_char * +AliasHandleResourceNBSTAT( + NBTNsResource *q, + char *pmax, + NBTArguments *nbtarg) +{ + NBTNsResourceNBSTAT *n; + u_short bcount; + + if (q == NULL || (char *)(q + 1) > pmax) + return(NULL); + + /* Forward to Resource NBSTAT position */ + n = (NBTNsResourceNBSTAT *)( (u_char *)q + sizeof(NBTNsResource) ); + + /* Check out of length */ + bcount = ntohs(q->rdlen); + + if (q == NULL || (char *)((u_char *)n + bcount) > pmax) + return(NULL); + else + return ((u_char *)n + bcount); +} + +static u_char * +AliasHandleResource( + u_short count, + NBTNsResource *q, + char *pmax, + NBTArguments + *nbtarg) +{ + while ( count != 0 ) { + /* Resource Record Name Filed */ + q = (NBTNsResource *)AliasHandleName( (u_char *)q, pmax ); + + if (q == NULL || (char *)(q + 1) > pmax) + break; +#ifdef DEBUG + printf("type=%02x, count=%d\n", ntohs(q->type), count ); +#endif + + /* Type and Class filed */ + switch ( ntohs(q->type) ) { + case RR_TYPE_NB: + q = (NBTNsResource *)AliasHandleResourceNB( + q, + pmax, + nbtarg + ); + break; + case RR_TYPE_A: + q = (NBTNsResource *)AliasHandleResourceA( + q, + pmax, + nbtarg + ); + break; + case RR_TYPE_NS: + q = (NBTNsResource *)AliasHandleResourceNS( + q, + pmax, + nbtarg + ); + break; + case RR_TYPE_NULL: + q = (NBTNsResource *)AliasHandleResourceNULL( + q, + pmax, + nbtarg + ); + break; + case RR_TYPE_NBSTAT: + q = (NBTNsResource *)AliasHandleResourceNBSTAT( + q, + pmax, + nbtarg + ); + break; + default: +#ifdef DEBUG + printf( + "\nUnknown Type of Resource %0x\n", + ntohs(q->type) + ); +#endif + break; + } + count--; + } + fflush(stdout); + return ((u_char *)q); +} + +int AliasHandleUdpNbtNS( + struct ip *pip, /* IP packet to examine/patch */ + struct alias_link *link, + struct in_addr *alias_address, + u_short *alias_port, + struct in_addr *original_address, + u_short *original_port ) +{ + struct udphdr * uh; + NbtNSHeader * nsh; + u_char * p; + char *pmax; + NBTArguments nbtarg; + + /* Set up Common Parameter */ + nbtarg.oldaddr = *alias_address; + nbtarg.oldport = *alias_port; + nbtarg.newaddr = *original_address; + nbtarg.newport = *original_port; + + /* Calculate data length of UDP packet */ + uh = (struct udphdr *) ((char *) pip + (pip->ip_hl << 2)); + nbtarg.uh_sum = &(uh->uh_sum); + nsh = (NbtNSHeader *)((char *)uh + (sizeof(struct udphdr))); + p = (u_char *)(nsh + 1); + pmax = (char *)uh + ntohs( uh->uh_ulen ); + + if ((char *)(nsh + 1) > pmax) + return(-1); + +#ifdef DEBUG + printf(" [%s] ID=%02x, op=%01x, flag=%02x, rcode=%01x, qd=%04x" + ", an=%04x, ns=%04x, ar=%04x, [%d]-->", + nsh->dir ? "Response": "Request", + nsh->nametrid, + nsh->opcode, + nsh->nmflags, + nsh->rcode, + ntohs(nsh->qdcount), + ntohs(nsh->ancount), + ntohs(nsh->nscount), + ntohs(nsh->arcount), + (u_char *)p -(u_char *)nsh + ); +#endif + + /* Question Entries */ + if (ntohs(nsh->qdcount) !=0 ) { + p = AliasHandleQuestion( + ntohs(nsh->qdcount), + (NBTNsQuestion *)p, + pmax, + &nbtarg + ); + } + + /* Answer Resource Records */ + if (ntohs(nsh->ancount) !=0 ) { + p = AliasHandleResource( + ntohs(nsh->ancount), + (NBTNsResource *)p, + pmax, + &nbtarg + ); + } + + /* Authority Resource Recodrs */ + if (ntohs(nsh->nscount) !=0 ) { + p = AliasHandleResource( + ntohs(nsh->nscount), + (NBTNsResource *)p, + pmax, + &nbtarg + ); + } + + /* Additional Resource Recodrs */ + if (ntohs(nsh->arcount) !=0 ) { + p = AliasHandleResource( + ntohs(nsh->arcount), + (NBTNsResource *)p, + pmax, + &nbtarg + ); + } + +#ifdef DEBUG + PrintRcode(nsh->rcode); +#endif + return ((p == NULL) ? -1 : 0); +} diff --git a/sys/netinet/libalias/alias_pptp.c b/sys/netinet/libalias/alias_pptp.c new file mode 100644 index 0000000..dbab7a9 --- /dev/null +++ b/sys/netinet/libalias/alias_pptp.c @@ -0,0 +1,368 @@ +/* + * alias_pptp.c + * + * Copyright (c) 2000 Whistle Communications, Inc. + * All rights reserved. + * + * Subject to the following obligations and disclaimer of warranty, use and + * redistribution of this software, in source or object code forms, with or + * without modifications are expressly permitted by Whistle Communications; + * provided, however, that: + * 1. Any and all reproductions of the source or object code must include the + * copyright notice above and the following disclaimer of warranties; and + * 2. No rights are granted, in any manner or form, to use Whistle + * Communications, Inc. trademarks, including the mark "WHISTLE + * COMMUNICATIONS" on advertising, endorsements, or otherwise except as + * such appears in the above copyright notice or in the software. + * + * THIS SOFTWARE IS BEING PROVIDED BY WHISTLE COMMUNICATIONS "AS IS", AND + * TO THE MAXIMUM EXTENT PERMITTED BY LAW, WHISTLE COMMUNICATIONS MAKES NO + * REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED, REGARDING THIS SOFTWARE, + * INCLUDING WITHOUT LIMITATION, ANY AND ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. + * WHISTLE COMMUNICATIONS DOES NOT WARRANT, GUARANTEE, OR MAKE ANY + * REPRESENTATIONS REGARDING THE USE OF, OR THE RESULTS OF THE USE OF THIS + * SOFTWARE IN TERMS OF ITS CORRECTNESS, ACCURACY, RELIABILITY OR OTHERWISE. + * IN NO EVENT SHALL WHISTLE COMMUNICATIONS BE LIABLE FOR ANY DAMAGES + * RESULTING FROM OR ARISING OUT OF ANY USE OF THIS SOFTWARE, INCLUDING + * WITHOUT LIMITATION, ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, + * PUNITIVE, OR CONSEQUENTIAL DAMAGES, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES, LOSS OF USE, DATA OR PROFITS, HOWEVER CAUSED AND UNDER ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF WHISTLE COMMUNICATIONS IS ADVISED OF THE POSSIBILITY + * OF SUCH DAMAGE. + * + * Author: Erik Salander <erik@whistle.com> + * + * $FreeBSD$ + */ + +/* + Alias_pptp.c performs special processing for PPTP sessions under TCP. + Specifically, watch PPTP control messages and alias the Call ID or the + Peer's Call ID in the appropriate messages. Note, PPTP requires + "de-aliasing" of incoming packets, this is different than any other + TCP applications that are currently (ie. FTP, IRC and RTSP) aliased. + + For Call IDs encountered for the first time, a PPTP alias link is created. + The PPTP alias link uses the Call ID in place of the original port number. + An alias Call ID is created. + + For this routine to work, the PPTP control messages must fit entirely + into a single TCP packet. This is typically the case, but is not + required by the spec. + + Unlike some of the other TCP applications that are aliased (ie. FTP, + IRC and RTSP), the PPTP control messages that need to be aliased are + guaranteed to remain the same length. The aliased Call ID is a fixed + length field. + + Reference: RFC 2637 + + Initial version: May, 2000 (eds) + +*/ + +/* Includes */ +#include <sys/types.h> +#include <netinet/in_systm.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/tcp.h> + +#include <stdio.h> + +#include "alias_local.h" + +/* + * PPTP definitions + */ + +struct grehdr /* Enhanced GRE header. */ +{ + u_int16_t gh_flags; /* Flags. */ + u_int16_t gh_protocol; /* Protocol type. */ + u_int16_t gh_length; /* Payload length. */ + u_int16_t gh_call_id; /* Call ID. */ + u_int32_t gh_seq_no; /* Sequence number (optional). */ + u_int32_t gh_ack_no; /* Acknowledgment number (optional). */ +}; +typedef struct grehdr GreHdr; + +/* The PPTP protocol ID used in the GRE 'proto' field. */ +#define PPTP_GRE_PROTO 0x880b + +/* Bits that must be set a certain way in all PPTP/GRE packets. */ +#define PPTP_INIT_VALUE ((0x2001 << 16) | PPTP_GRE_PROTO) +#define PPTP_INIT_MASK 0xef7fffff + +#define PPTP_MAGIC 0x1a2b3c4d +#define PPTP_CTRL_MSG_TYPE 1 + +enum { + PPTP_StartCtrlConnRequest = 1, + PPTP_StartCtrlConnReply = 2, + PPTP_StopCtrlConnRequest = 3, + PPTP_StopCtrlConnReply = 4, + PPTP_EchoRequest = 5, + PPTP_EchoReply = 6, + PPTP_OutCallRequest = 7, + PPTP_OutCallReply = 8, + PPTP_InCallRequest = 9, + PPTP_InCallReply = 10, + PPTP_InCallConn = 11, + PPTP_CallClearRequest = 12, + PPTP_CallDiscNotify = 13, + PPTP_WanErrorNotify = 14, + PPTP_SetLinkInfo = 15 +}; + + /* Message structures */ + struct pptpMsgHead { + u_int16_t length; /* total length */ + u_int16_t msgType; /* PPTP message type */ + u_int32_t magic; /* magic cookie */ + u_int16_t type; /* control message type */ + u_int16_t resv0; /* reserved */ + }; + typedef struct pptpMsgHead *PptpMsgHead; + + struct pptpCodes { + u_int8_t resCode; /* Result Code */ + u_int8_t errCode; /* Error Code */ + }; + typedef struct pptpCodes *PptpCode; + + struct pptpCallIds { + u_int16_t cid1; /* Call ID field #1 */ + u_int16_t cid2; /* Call ID field #2 */ + }; + typedef struct pptpCallIds *PptpCallId; + +static PptpCallId AliasVerifyPptp(struct ip *, u_int16_t *); + + +void +AliasHandlePptpOut(struct ip *pip, /* IP packet to examine/patch */ + struct alias_link *link) /* The PPTP control link */ +{ + struct alias_link *pptp_link; + PptpCallId cptr; + PptpCode codes; + u_int16_t ctl_type; /* control message type */ + struct tcphdr *tc; + + /* Verify valid PPTP control message */ + if ((cptr = AliasVerifyPptp(pip, &ctl_type)) == NULL) + return; + + /* Modify certain PPTP messages */ + switch (ctl_type) { + case PPTP_OutCallRequest: + case PPTP_OutCallReply: + case PPTP_InCallRequest: + case PPTP_InCallReply: + /* Establish PPTP link for address and Call ID found in control message. */ + pptp_link = AddPptp(GetOriginalAddress(link), GetDestAddress(link), + GetAliasAddress(link), cptr->cid1); + break; + case PPTP_CallClearRequest: + case PPTP_CallDiscNotify: + /* Find PPTP link for address and Call ID found in control message. */ + pptp_link = FindPptpOutByCallId(GetOriginalAddress(link), + GetDestAddress(link), + cptr->cid1); + break; + default: + return; + } + + if (pptp_link != NULL) { + int accumulate = cptr->cid1; + + /* alias the Call Id */ + cptr->cid1 = GetAliasPort(pptp_link); + + /* Compute TCP checksum for revised packet */ + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + accumulate -= cptr->cid1; + ADJUST_CHECKSUM(accumulate, tc->th_sum); + + switch (ctl_type) { + case PPTP_OutCallReply: + case PPTP_InCallReply: + codes = (PptpCode)(cptr + 1); + if (codes->resCode == 1) /* Connection established, */ + SetDestCallId(pptp_link, /* note the Peer's Call ID. */ + cptr->cid2); + else + SetExpire(pptp_link, 0); /* Connection refused. */ + break; + case PPTP_CallDiscNotify: /* Connection closed. */ + SetExpire(pptp_link, 0); + break; + } + } +} + +void +AliasHandlePptpIn(struct ip *pip, /* IP packet to examine/patch */ + struct alias_link *link) /* The PPTP control link */ +{ + struct alias_link *pptp_link; + PptpCallId cptr; + u_int16_t *pcall_id; + u_int16_t ctl_type; /* control message type */ + struct tcphdr *tc; + + /* Verify valid PPTP control message */ + if ((cptr = AliasVerifyPptp(pip, &ctl_type)) == NULL) + return; + + /* Modify certain PPTP messages */ + switch (ctl_type) + { + case PPTP_InCallConn: + case PPTP_WanErrorNotify: + case PPTP_SetLinkInfo: + pcall_id = &cptr->cid1; + break; + case PPTP_OutCallReply: + case PPTP_InCallReply: + pcall_id = &cptr->cid2; + break; + case PPTP_CallDiscNotify: /* Connection closed. */ + pptp_link = FindPptpInByCallId(GetDestAddress(link), + GetAliasAddress(link), + cptr->cid1); + if (pptp_link != NULL) + SetExpire(pptp_link, 0); + return; + default: + return; + } + + /* Find PPTP link for address and Call ID found in PPTP Control Msg */ + pptp_link = FindPptpInByPeerCallId(GetDestAddress(link), + GetAliasAddress(link), + *pcall_id); + + if (pptp_link != NULL) { + int accumulate = *pcall_id; + + /* De-alias the Peer's Call Id. */ + *pcall_id = GetOriginalPort(pptp_link); + + /* Compute TCP checksum for modified packet */ + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + accumulate -= *pcall_id; + ADJUST_CHECKSUM(accumulate, tc->th_sum); + + if (ctl_type == PPTP_OutCallReply || ctl_type == PPTP_InCallReply) { + PptpCode codes = (PptpCode)(cptr + 1); + + if (codes->resCode == 1) /* Connection established, */ + SetDestCallId(pptp_link, /* note the Call ID. */ + cptr->cid1); + else + SetExpire(pptp_link, 0); /* Connection refused. */ + } + } +} + +static PptpCallId +AliasVerifyPptp(struct ip *pip, u_int16_t *ptype) /* IP packet to examine/patch */ +{ + int hlen, tlen, dlen; + PptpMsgHead hptr; + struct tcphdr *tc; + + /* Calculate some lengths */ + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + hlen = (pip->ip_hl + tc->th_off) << 2; + tlen = ntohs(pip->ip_len); + dlen = tlen - hlen; + + /* Verify data length */ + if (dlen < (sizeof(struct pptpMsgHead) + sizeof(struct pptpCallIds))) + return(NULL); + + /* Move up to PPTP message header */ + hptr = (PptpMsgHead)(((char *) pip) + hlen); + + /* Return the control message type */ + *ptype = ntohs(hptr->type); + + /* Verify PPTP Control Message */ + if ((ntohs(hptr->msgType) != PPTP_CTRL_MSG_TYPE) || + (ntohl(hptr->magic) != PPTP_MAGIC)) + return(NULL); + + /* Verify data length. */ + if ((*ptype == PPTP_OutCallReply || *ptype == PPTP_InCallReply) && + (dlen < sizeof(struct pptpMsgHead) + sizeof(struct pptpCallIds) + + sizeof(struct pptpCodes))) + return (NULL); + else + return (PptpCallId)(hptr + 1); +} + + +int +AliasHandlePptpGreOut(struct ip *pip) +{ + GreHdr *gr; + struct alias_link *link; + + gr = (GreHdr *)((char *)pip + (pip->ip_hl << 2)); + + /* Check GRE header bits. */ + if ((ntohl(*((u_int32_t *)gr)) & PPTP_INIT_MASK) != PPTP_INIT_VALUE) + return (-1); + + link = FindPptpOutByPeerCallId(pip->ip_src, pip->ip_dst, gr->gh_call_id); + if (link != NULL) { + struct in_addr alias_addr = GetAliasAddress(link); + + /* Change source IP address. */ + DifferentialChecksum(&pip->ip_sum, + (u_short *)&alias_addr, + (u_short *)&pip->ip_src, + 2); + pip->ip_src = alias_addr; + } + + return (0); +} + + +int +AliasHandlePptpGreIn(struct ip *pip) +{ + GreHdr *gr; + struct alias_link *link; + + gr = (GreHdr *)((char *)pip + (pip->ip_hl << 2)); + + /* Check GRE header bits. */ + if ((ntohl(*((u_int32_t *)gr)) & PPTP_INIT_MASK) != PPTP_INIT_VALUE) + return (-1); + + link = FindPptpInByPeerCallId(pip->ip_src, pip->ip_dst, gr->gh_call_id); + if (link != NULL) { + struct in_addr src_addr = GetOriginalAddress(link); + + /* De-alias the Peer's Call Id. */ + gr->gh_call_id = GetOriginalPort(link); + + /* Restore original IP address. */ + DifferentialChecksum(&pip->ip_sum, + (u_short *)&src_addr, + (u_short *)&pip->ip_dst, + 2); + pip->ip_dst = src_addr; + } + + return (0); +} diff --git a/sys/netinet/libalias/alias_proxy.c b/sys/netinet/libalias/alias_proxy.c new file mode 100644 index 0000000..75ffad7 --- /dev/null +++ b/sys/netinet/libalias/alias_proxy.c @@ -0,0 +1,810 @@ +/* file: alias_proxy.c + + This file encapsulates special operations related to transparent + proxy redirection. This is where packets with a particular destination, + usually tcp port 80, are redirected to a proxy server. + + When packets are proxied, the destination address and port are + modified. In certain cases, it is necessary to somehow encode + the original address/port info into the packet. Two methods are + presently supported: addition of a [DEST addr port] string at the + beginning a of tcp stream, or inclusion of an optional field + in the IP header. + + There is one public API function: + + PacketAliasProxyRule() -- Adds and deletes proxy + rules. + + Rules are stored in a linear linked list, so lookup efficiency + won't be too good for large lists. + + + Initial development: April, 1998 (cjm) + + $FreeBSD$ +*/ + + +/* System includes */ +#include <ctype.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <netdb.h> + +#include <sys/types.h> +#include <sys/socket.h> + +/* BSD IPV4 includes */ +#include <netinet/in_systm.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/tcp.h> + +#include <arpa/inet.h> + +#include "alias_local.h" /* Functions used by alias*.c */ +#include "alias.h" /* Public API functions for libalias */ + + + +/* + Data structures + */ + +/* + * A linked list of arbitrary length, based on struct proxy_entry is + * used to store proxy rules. + */ +struct proxy_entry +{ +#define PROXY_TYPE_ENCODE_NONE 1 +#define PROXY_TYPE_ENCODE_TCPSTREAM 2 +#define PROXY_TYPE_ENCODE_IPHDR 3 + int rule_index; + int proxy_type; + u_char proto; + u_short proxy_port; + u_short server_port; + + struct in_addr server_addr; + + struct in_addr src_addr; + struct in_addr src_mask; + + struct in_addr dst_addr; + struct in_addr dst_mask; + + struct proxy_entry *next; + struct proxy_entry *last; +}; + + + +/* + File scope variables +*/ + +static struct proxy_entry *proxyList; + + + +/* Local (static) functions: + + IpMask() -- Utility function for creating IP + masks from integer (1-32) specification. + IpAddr() -- Utility function for converting string + to IP address + IpPort() -- Utility function for converting string + to port number + RuleAdd() -- Adds an element to the rule list. + RuleDelete() -- Removes an element from the rule list. + RuleNumberDelete() -- Removes all elements from the rule list + having a certain rule number. + ProxyEncodeTcpStream() -- Adds [DEST x.x.x.x xxxx] to the beginning + of a TCP stream. + ProxyEncodeIpHeader() -- Adds an IP option indicating the true + destination of a proxied IP packet +*/ + +static int IpMask(int, struct in_addr *); +static int IpAddr(char *, struct in_addr *); +static int IpPort(char *, int, int *); +static void RuleAdd(struct proxy_entry *); +static void RuleDelete(struct proxy_entry *); +static int RuleNumberDelete(int); +static void ProxyEncodeTcpStream(struct alias_link *, struct ip *, int); +static void ProxyEncodeIpHeader(struct ip *, int); + +static int +IpMask(int nbits, struct in_addr *mask) +{ + int i; + u_int imask; + + if (nbits < 0 || nbits > 32) + return -1; + + imask = 0; + for (i=0; i<nbits; i++) + imask = (imask >> 1) + 0x80000000; + mask->s_addr = htonl(imask); + + return 0; +} + +static int +IpAddr(char *s, struct in_addr *addr) +{ + if (inet_aton(s, addr) == 0) + return -1; + else + return 0; +} + +static int +IpPort(char *s, int proto, int *port) +{ + int n; + + n = sscanf(s, "%d", port); + if (n != 1) + { + struct servent *se; + + if (proto == IPPROTO_TCP) + se = getservbyname(s, "tcp"); + else if (proto == IPPROTO_UDP) + se = getservbyname(s, "udp"); + else + return -1; + + if (se == NULL) + return -1; + + *port = (u_int) ntohs(se->s_port); + } + + return 0; +} + +void +RuleAdd(struct proxy_entry *entry) +{ + int rule_index; + struct proxy_entry *ptr; + struct proxy_entry *ptr_last; + + if (proxyList == NULL) + { + proxyList = entry; + entry->last = NULL; + entry->next = NULL; + return; + } + + rule_index = entry->rule_index; + ptr = proxyList; + ptr_last = NULL; + while (ptr != NULL) + { + if (ptr->rule_index >= rule_index) + { + if (ptr_last == NULL) + { + entry->next = proxyList; + entry->last = NULL; + proxyList->last = entry; + proxyList = entry; + return; + } + + ptr_last->next = entry; + ptr->last = entry; + entry->last = ptr->last; + entry->next = ptr; + return; + } + ptr_last = ptr; + ptr = ptr->next; + } + + ptr_last->next = entry; + entry->last = ptr_last; + entry->next = NULL; +} + +static void +RuleDelete(struct proxy_entry *entry) +{ + if (entry->last != NULL) + entry->last->next = entry->next; + else + proxyList = entry->next; + + if (entry->next != NULL) + entry->next->last = entry->last; + + free(entry); +} + +static int +RuleNumberDelete(int rule_index) +{ + int err; + struct proxy_entry *ptr; + + err = -1; + ptr = proxyList; + while (ptr != NULL) + { + struct proxy_entry *ptr_next; + + ptr_next = ptr->next; + if (ptr->rule_index == rule_index) + { + err = 0; + RuleDelete(ptr); + } + + ptr = ptr_next; + } + + return err; +} + +static void +ProxyEncodeTcpStream(struct alias_link *link, + struct ip *pip, + int maxpacketsize) +{ + int slen; + char buffer[40]; + struct tcphdr *tc; + +/* Compute pointer to tcp header */ + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + +/* Don't modify if once already modified */ + + if (GetAckModified (link)) + return; + +/* Translate destination address and port to string form */ + snprintf(buffer, sizeof(buffer) - 2, "[DEST %s %d]", + inet_ntoa(GetProxyAddress (link)), (u_int) ntohs(GetProxyPort (link))); + +/* Pad string out to a multiple of two in length */ + slen = strlen(buffer); + switch (slen % 2) + { + case 0: + strcat(buffer, " \n"); + slen += 2; + break; + case 1: + strcat(buffer, "\n"); + slen += 1; + } + +/* Check for packet overflow */ + if ((ntohs(pip->ip_len) + strlen(buffer)) > maxpacketsize) + return; + +/* Shift existing TCP data and insert destination string */ + { + int dlen; + int hlen; + u_char *p; + + hlen = (pip->ip_hl + tc->th_off) << 2; + dlen = ntohs (pip->ip_len) - hlen; + +/* Modify first packet that has data in it */ + + if (dlen == 0) + return; + + p = (char *) pip; + p += hlen; + + memmove(p + slen, p, dlen); + memcpy(p, buffer, slen); + } + +/* Save information about modfied sequence number */ + { + int delta; + + SetAckModified(link); + delta = GetDeltaSeqOut(pip, link); + AddSeq(pip, link, delta+slen); + } + +/* Update IP header packet length and checksum */ + { + int accumulate; + + accumulate = pip->ip_len; + pip->ip_len = htons(ntohs(pip->ip_len) + slen); + accumulate -= pip->ip_len; + + ADJUST_CHECKSUM(accumulate, pip->ip_sum); + } + +/* Update TCP checksum, Use TcpChecksum since so many things have + already changed. */ + + tc->th_sum = 0; + tc->th_sum = TcpChecksum (pip); +} + +static void +ProxyEncodeIpHeader(struct ip *pip, + int maxpacketsize) +{ +#define OPTION_LEN_BYTES 8 +#define OPTION_LEN_INT16 4 +#define OPTION_LEN_INT32 2 + u_char option[OPTION_LEN_BYTES]; + +#ifdef DEBUG + fprintf(stdout, " ip cksum 1 = %x\n", (u_int) IpChecksum(pip)); + fprintf(stdout, "tcp cksum 1 = %x\n", (u_int) TcpChecksum(pip)); +#endif + +/* Check to see that there is room to add an IP option */ + if (pip->ip_hl > (0x0f - OPTION_LEN_INT32)) + return; + +/* Build option and copy into packet */ + { + u_char *ptr; + struct tcphdr *tc; + + ptr = (u_char *) pip; + ptr += 20; + memcpy(ptr + OPTION_LEN_BYTES, ptr, ntohs(pip->ip_len) - 20); + + option[0] = 0x64; /* class: 3 (reserved), option 4 */ + option[1] = OPTION_LEN_BYTES; + + memcpy(&option[2], (u_char *) &pip->ip_dst, 4); + + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + memcpy(&option[6], (u_char *) &tc->th_sport, 2); + + memcpy(ptr, option, 8); + } + +/* Update checksum, header length and packet length */ + { + int i; + int accumulate; + u_short *sptr; + + sptr = (u_short *) option; + accumulate = 0; + for (i=0; i<OPTION_LEN_INT16; i++) + accumulate -= *(sptr++); + + sptr = (u_short *) pip; + accumulate += *sptr; + pip->ip_hl += OPTION_LEN_INT32; + accumulate -= *sptr; + + accumulate += pip->ip_len; + pip->ip_len = htons(ntohs(pip->ip_len) + OPTION_LEN_BYTES); + accumulate -= pip->ip_len; + + ADJUST_CHECKSUM(accumulate, pip->ip_sum); + } +#undef OPTION_LEN_BYTES +#undef OPTION_LEN_INT16 +#undef OPTION_LEN_INT32 +#ifdef DEBUG + fprintf(stdout, " ip cksum 2 = %x\n", (u_int) IpChecksum(pip)); + fprintf(stdout, "tcp cksum 2 = %x\n", (u_int) TcpChecksum(pip)); +#endif +} + + +/* Functions by other packet alias source files + + ProxyCheck() -- Checks whether an outgoing packet should + be proxied. + ProxyModify() -- Encodes the original destination address/port + for a packet which is to be redirected to + a proxy server. +*/ + +int +ProxyCheck(struct ip *pip, + struct in_addr *proxy_server_addr, + u_short *proxy_server_port) +{ + u_short dst_port; + struct in_addr src_addr; + struct in_addr dst_addr; + struct proxy_entry *ptr; + + src_addr = pip->ip_src; + dst_addr = pip->ip_dst; + dst_port = ((struct tcphdr *) ((char *) pip + (pip->ip_hl << 2))) + ->th_dport; + + ptr = proxyList; + while (ptr != NULL) + { + u_short proxy_port; + + proxy_port = ptr->proxy_port; + if ((dst_port == proxy_port || proxy_port == 0) + && pip->ip_p == ptr->proto + && src_addr.s_addr != ptr->server_addr.s_addr) + { + struct in_addr src_addr_masked; + struct in_addr dst_addr_masked; + + src_addr_masked.s_addr = src_addr.s_addr & ptr->src_mask.s_addr; + dst_addr_masked.s_addr = dst_addr.s_addr & ptr->dst_mask.s_addr; + + if ((src_addr_masked.s_addr == ptr->src_addr.s_addr) + && (dst_addr_masked.s_addr == ptr->dst_addr.s_addr)) + { + if ((*proxy_server_port = ptr->server_port) == 0) + *proxy_server_port = dst_port; + *proxy_server_addr = ptr->server_addr; + return ptr->proxy_type; + } + } + ptr = ptr->next; + } + + return 0; +} + +void +ProxyModify(struct alias_link *link, + struct ip *pip, + int maxpacketsize, + int proxy_type) +{ + switch (proxy_type) + { + case PROXY_TYPE_ENCODE_IPHDR: + ProxyEncodeIpHeader(pip, maxpacketsize); + break; + + case PROXY_TYPE_ENCODE_TCPSTREAM: + ProxyEncodeTcpStream(link, pip, maxpacketsize); + break; + } +} + + +/* + Public API functions +*/ + +int +PacketAliasProxyRule(const char *cmd) +{ +/* + * This function takes command strings of the form: + * + * server <addr>[:<port>] + * [port <port>] + * [rule n] + * [proto tcp|udp] + * [src <addr>[/n]] + * [dst <addr>[/n]] + * [type encode_tcp_stream|encode_ip_hdr|no_encode] + * + * delete <rule number> + * + * Subfields can be in arbitrary order. Port numbers and addresses + * must be in either numeric or symbolic form. An optional rule number + * is used to control the order in which rules are searched. If two + * rules have the same number, then search order cannot be guaranteed, + * and the rules should be disjoint. If no rule number is specified, + * then 0 is used, and group 0 rules are always checked before any + * others. + */ + int i, n, len; + int cmd_len; + int token_count; + int state; + char *token; + char buffer[256]; + char str_port[sizeof(buffer)]; + char str_server_port[sizeof(buffer)]; + char *res = buffer; + + int rule_index; + int proto; + int proxy_type; + int proxy_port; + int server_port; + struct in_addr server_addr; + struct in_addr src_addr, src_mask; + struct in_addr dst_addr, dst_mask; + struct proxy_entry *proxy_entry; + +/* Copy command line into a buffer */ + cmd += strspn(cmd, " \t"); + cmd_len = strlen(cmd); + if (cmd_len > (sizeof(buffer) - 1)) + return -1; + strcpy(buffer, cmd); + +/* Convert to lower case */ + len = strlen(buffer); + for (i=0; i<len; i++) + buffer[i] = tolower((unsigned char)buffer[i]); + +/* Set default proxy type */ + +/* Set up default values */ + rule_index = 0; + proxy_type = PROXY_TYPE_ENCODE_NONE; + proto = IPPROTO_TCP; + proxy_port = 0; + server_addr.s_addr = 0; + server_port = 0; + src_addr.s_addr = 0; + IpMask(0, &src_mask); + dst_addr.s_addr = 0; + IpMask(0, &dst_mask); + + str_port[0] = 0; + str_server_port[0] = 0; + +/* Parse command string with state machine */ +#define STATE_READ_KEYWORD 0 +#define STATE_READ_TYPE 1 +#define STATE_READ_PORT 2 +#define STATE_READ_SERVER 3 +#define STATE_READ_RULE 4 +#define STATE_READ_DELETE 5 +#define STATE_READ_PROTO 6 +#define STATE_READ_SRC 7 +#define STATE_READ_DST 8 + state = STATE_READ_KEYWORD; + token = strsep(&res, " \t"); + token_count = 0; + while (token != NULL) + { + token_count++; + switch (state) + { + case STATE_READ_KEYWORD: + if (strcmp(token, "type") == 0) + state = STATE_READ_TYPE; + else if (strcmp(token, "port") == 0) + state = STATE_READ_PORT; + else if (strcmp(token, "server") == 0) + state = STATE_READ_SERVER; + else if (strcmp(token, "rule") == 0) + state = STATE_READ_RULE; + else if (strcmp(token, "delete") == 0) + state = STATE_READ_DELETE; + else if (strcmp(token, "proto") == 0) + state = STATE_READ_PROTO; + else if (strcmp(token, "src") == 0) + state = STATE_READ_SRC; + else if (strcmp(token, "dst") == 0) + state = STATE_READ_DST; + else + return -1; + break; + + case STATE_READ_TYPE: + if (strcmp(token, "encode_ip_hdr") == 0) + proxy_type = PROXY_TYPE_ENCODE_IPHDR; + else if (strcmp(token, "encode_tcp_stream") == 0) + proxy_type = PROXY_TYPE_ENCODE_TCPSTREAM; + else if (strcmp(token, "no_encode") == 0) + proxy_type = PROXY_TYPE_ENCODE_NONE; + else + return -1; + state = STATE_READ_KEYWORD; + break; + + case STATE_READ_PORT: + strcpy(str_port, token); + state = STATE_READ_KEYWORD; + break; + + case STATE_READ_SERVER: + { + int err; + char *p; + char s[sizeof(buffer)]; + + p = token; + while (*p != ':' && *p != 0) + p++; + + if (*p != ':') + { + err = IpAddr(token, &server_addr); + if (err) + return -1; + } + else + { + *p = ' '; + + n = sscanf(token, "%s %s", s, str_server_port); + if (n != 2) + return -1; + + err = IpAddr(s, &server_addr); + if (err) + return -1; + } + } + state = STATE_READ_KEYWORD; + break; + + case STATE_READ_RULE: + n = sscanf(token, "%d", &rule_index); + if (n != 1 || rule_index < 0) + return -1; + state = STATE_READ_KEYWORD; + break; + + case STATE_READ_DELETE: + { + int err; + int rule_to_delete; + + if (token_count != 2) + return -1; + + n = sscanf(token, "%d", &rule_to_delete); + if (n != 1) + return -1; + err = RuleNumberDelete(rule_to_delete); + if (err) + return -1; + return 0; + } + + case STATE_READ_PROTO: + if (strcmp(token, "tcp") == 0) + proto = IPPROTO_TCP; + else if (strcmp(token, "udp") == 0) + proto = IPPROTO_UDP; + else + return -1; + state = STATE_READ_KEYWORD; + break; + + case STATE_READ_SRC: + case STATE_READ_DST: + { + int err; + char *p; + struct in_addr mask; + struct in_addr addr; + + p = token; + while (*p != '/' && *p != 0) + p++; + + if (*p != '/') + { + IpMask(32, &mask); + err = IpAddr(token, &addr); + if (err) + return -1; + } + else + { + int nbits; + char s[sizeof(buffer)]; + + *p = ' '; + n = sscanf(token, "%s %d", s, &nbits); + if (n != 2) + return -1; + + err = IpAddr(s, &addr); + if (err) + return -1; + + err = IpMask(nbits, &mask); + if (err) + return -1; + } + + if (state == STATE_READ_SRC) + { + src_addr = addr; + src_mask = mask; + } + else + { + dst_addr = addr; + dst_mask = mask; + } + } + state = STATE_READ_KEYWORD; + break; + + default: + return -1; + break; + } + + do { + token = strsep(&res, " \t"); + } while (token != NULL && !*token); + } +#undef STATE_READ_KEYWORD +#undef STATE_READ_TYPE +#undef STATE_READ_PORT +#undef STATE_READ_SERVER +#undef STATE_READ_RULE +#undef STATE_READ_DELETE +#undef STATE_READ_PROTO +#undef STATE_READ_SRC +#undef STATE_READ_DST + +/* Convert port strings to numbers. This needs to be done after + the string is parsed, because the prototype might not be designated + before the ports (which might be symbolic entries in /etc/services) */ + + if (strlen(str_port) != 0) + { + int err; + + err = IpPort(str_port, proto, &proxy_port); + if (err) + return -1; + } + else + { + proxy_port = 0; + } + + if (strlen(str_server_port) != 0) + { + int err; + + err = IpPort(str_server_port, proto, &server_port); + if (err) + return -1; + } + else + { + server_port = 0; + } + +/* Check that at least the server address has been defined */ + if (server_addr.s_addr == 0) + return -1; + +/* Add to linked list */ + proxy_entry = malloc(sizeof(struct proxy_entry)); + if (proxy_entry == NULL) + return -1; + + proxy_entry->proxy_type = proxy_type; + proxy_entry->rule_index = rule_index; + proxy_entry->proto = proto; + proxy_entry->proxy_port = htons(proxy_port); + proxy_entry->server_port = htons(server_port); + proxy_entry->server_addr = server_addr; + proxy_entry->src_addr.s_addr = src_addr.s_addr & src_mask.s_addr; + proxy_entry->dst_addr.s_addr = dst_addr.s_addr & dst_mask.s_addr; + proxy_entry->src_mask = src_mask; + proxy_entry->dst_mask = dst_mask; + + RuleAdd(proxy_entry); + + return 0; +} diff --git a/sys/netinet/libalias/alias_smedia.c b/sys/netinet/libalias/alias_smedia.c new file mode 100644 index 0000000..f4cf332 --- /dev/null +++ b/sys/netinet/libalias/alias_smedia.c @@ -0,0 +1,432 @@ +/* + * alias_smedia.c + * + * Copyright (c) 2000 Whistle Communications, Inc. + * All rights reserved. + * + * Subject to the following obligations and disclaimer of warranty, use and + * redistribution of this software, in source or object code forms, with or + * without modifications are expressly permitted by Whistle Communications; + * provided, however, that: + * 1. Any and all reproductions of the source or object code must include the + * copyright notice above and the following disclaimer of warranties; and + * 2. No rights are granted, in any manner or form, to use Whistle + * Communications, Inc. trademarks, including the mark "WHISTLE + * COMMUNICATIONS" on advertising, endorsements, or otherwise except as + * such appears in the above copyright notice or in the software. + * + * THIS SOFTWARE IS BEING PROVIDED BY WHISTLE COMMUNICATIONS "AS IS", AND + * TO THE MAXIMUM EXTENT PERMITTED BY LAW, WHISTLE COMMUNICATIONS MAKES NO + * REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED, REGARDING THIS SOFTWARE, + * INCLUDING WITHOUT LIMITATION, ANY AND ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. + * WHISTLE COMMUNICATIONS DOES NOT WARRANT, GUARANTEE, OR MAKE ANY + * REPRESENTATIONS REGARDING THE USE OF, OR THE RESULTS OF THE USE OF THIS + * SOFTWARE IN TERMS OF ITS CORRECTNESS, ACCURACY, RELIABILITY OR OTHERWISE. + * IN NO EVENT SHALL WHISTLE COMMUNICATIONS BE LIABLE FOR ANY DAMAGES + * RESULTING FROM OR ARISING OUT OF ANY USE OF THIS SOFTWARE, INCLUDING + * WITHOUT LIMITATION, ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, + * PUNITIVE, OR CONSEQUENTIAL DAMAGES, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES, LOSS OF USE, DATA OR PROFITS, HOWEVER CAUSED AND UNDER ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF WHISTLE COMMUNICATIONS IS ADVISED OF THE POSSIBILITY + * OF SUCH DAMAGE. + * + * Copyright (c) 2000 Junichi SATOH <junichi@astec.co.jp> + * <junichi@junichi.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Authors: Erik Salander <erik@whistle.com> + * Junichi SATOH <junichi@astec.co.jp> + * <junichi@junichi.org> + * + * $FreeBSD$ + */ + +/* + Alias_smedia.c is meant to contain the aliasing code for streaming media + protocols. It performs special processing for RSTP sessions under TCP. + Specifically, when a SETUP request is sent by a client, or a 200 reply + is sent by a server, it is intercepted and modified. The address is + changed to the gateway machine and an aliasing port is used. + + More specifically, the "client_port" configuration parameter is + parsed for SETUP requests. The "server_port" configuration parameter is + parsed for 200 replies eminating from a server. This is intended to handle + the unicast case. + + RTSP also allows a redirection of a stream to another client by using the + "destination" configuration parameter. The destination config parm would + indicate a different IP address. This function is NOT supported by the + RTSP translation code below. + + The RTSP multicast functions without any address translation intervention. + + For this routine to work, the SETUP/200 must fit entirely + into a single TCP packet. This is typically the case, but exceptions + can easily be envisioned under the actual specifications. + + Probably the most troubling aspect of the approach taken here is + that the new SETUP/200 will typically be a different length, and + this causes a certain amount of bookkeeping to keep track of the + changes of sequence and acknowledgment numbers, since the client + machine is totally unaware of the modification to the TCP stream. + + Initial version: May, 2000 (eds) +*/ + +#include <stdio.h> +#include <string.h> +#include <sys/types.h> +#include <netinet/in_systm.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/tcp.h> +#include <netinet/udp.h> + +#include "alias_local.h" + +#define RTSP_CONTROL_PORT_NUMBER_1 554 +#define RTSP_CONTROL_PORT_NUMBER_2 7070 +#define RTSP_PORT_GROUP 2 + +#define ISDIGIT(a) (((a) >= '0') && ((a) <= '9')) + +static int +search_string(char *data, int dlen, const char *search_str) +{ + int i, j, k; + int search_str_len; + + search_str_len = strlen(search_str); + for (i = 0; i < dlen - search_str_len; i++) { + for (j = i, k = 0; j < dlen - search_str_len; j++, k++) { + if (data[j] != search_str[k] && + data[j] != search_str[k] - ('a' - 'A')) { + break; + } + if (k == search_str_len - 1) { + return j + 1; + } + } + } + return -1; +} + +static int +alias_rtsp_out(struct ip *pip, + struct alias_link *link, + char *data, + const char *port_str) +{ + int hlen, tlen, dlen; + struct tcphdr *tc; + int i, j, pos, state, port_dlen, new_dlen, delta; + u_short p[2], new_len; + u_short sport, eport, base_port; + u_short salias = 0, ealias = 0, base_alias = 0; + const char *transport_str = "transport:"; + char newdata[2048], *port_data, *port_newdata, stemp[80]; + int links_created = 0, pkt_updated = 0; + struct alias_link *rtsp_link = NULL; + struct in_addr null_addr; + + /* Calculate data length of TCP packet */ + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + hlen = (pip->ip_hl + tc->th_off) << 2; + tlen = ntohs(pip->ip_len); + dlen = tlen - hlen; + + /* Find keyword, "Transport: " */ + pos = search_string(data, dlen, transport_str); + if (pos < 0) { + return -1; + } + port_data = data + pos; + port_dlen = dlen - pos; + + memcpy(newdata, data, pos); + port_newdata = newdata + pos; + + while (port_dlen > strlen(port_str)) { + /* Find keyword, appropriate port string */ + pos = search_string(port_data, port_dlen, port_str); + if (pos < 0) { + break; + } + + memcpy (port_newdata, port_data, pos + 1); + port_newdata += (pos + 1); + + p[0] = p[1] = 0; + sport = eport = 0; + state = 0; + for (i = pos; i < port_dlen; i++) { + switch(state) { + case 0: + if (port_data[i] == '=') { + state++; + } + break; + case 1: + if (ISDIGIT(port_data[i])) { + p[0] = p[0] * 10 + port_data[i] - '0'; + } else { + if (port_data[i] == ';') { + state = 3; + } + if (port_data[i] == '-') { + state++; + } + } + break; + case 2: + if (ISDIGIT(port_data[i])) { + p[1] = p[1] * 10 + port_data[i] - '0'; + } else { + state++; + } + break; + case 3: + base_port = p[0]; + sport = htons(p[0]); + eport = htons(p[1]); + + if (!links_created) { + + links_created = 1; + /* Find an even numbered port number base that + satisfies the contiguous number of ports we need */ + null_addr.s_addr = 0; + if (0 == (salias = FindNewPortGroup(null_addr, + FindAliasAddress(pip->ip_src), + sport, 0, + RTSP_PORT_GROUP, + IPPROTO_UDP, 1))) { +#ifdef DEBUG + fprintf(stderr, + "PacketAlias/RTSP: Cannot find contiguous RTSP data ports\n"); +#endif + } else { + + base_alias = ntohs(salias); + for (j = 0; j < RTSP_PORT_GROUP; j++) { + /* Establish link to port found in RTSP packet */ + rtsp_link = FindRtspOut(GetOriginalAddress(link), null_addr, + htons(base_port + j), htons(base_alias + j), + IPPROTO_UDP); + if (rtsp_link != NULL) { +#ifndef NO_FW_PUNCH + /* Punch hole in firewall */ + PunchFWHole(rtsp_link); +#endif + } else { +#ifdef DEBUG + fprintf(stderr, + "PacketAlias/RTSP: Cannot allocate RTSP data ports\n"); +#endif + break; + } + } + } + ealias = htons(base_alias + (RTSP_PORT_GROUP - 1)); + } + + if (salias && rtsp_link) { + + pkt_updated = 1; + + /* Copy into IP packet */ + sprintf(stemp, "%d", ntohs(salias)); + memcpy(port_newdata, stemp, strlen(stemp)); + port_newdata += strlen(stemp); + + if (eport != 0) { + *port_newdata = '-'; + port_newdata++; + + /* Copy into IP packet */ + sprintf(stemp, "%d", ntohs(ealias)); + memcpy(port_newdata, stemp, strlen(stemp)); + port_newdata += strlen(stemp); + } + + *port_newdata = ';'; + port_newdata++; + } + state++; + break; + } + if (state > 3) { + break; + } + } + port_data += i; + port_dlen -= i; + } + + if (!pkt_updated) + return -1; + + memcpy (port_newdata, port_data, port_dlen); + port_newdata += port_dlen; + *port_newdata = '\0'; + + /* Create new packet */ + new_dlen = port_newdata - newdata; + memcpy (data, newdata, new_dlen); + + SetAckModified(link); + delta = GetDeltaSeqOut(pip, link); + AddSeq(pip, link, delta + new_dlen - dlen); + + new_len = htons(hlen + new_dlen); + DifferentialChecksum(&pip->ip_sum, + &new_len, + &pip->ip_len, + 1); + pip->ip_len = new_len; + + tc->th_sum = 0; + tc->th_sum = TcpChecksum(pip); + + return 0; +} + +/* Support the protocol used by early versions of RealPlayer */ + +static int +alias_pna_out(struct ip *pip, + struct alias_link *link, + char *data, + int dlen) +{ + struct alias_link *pna_links; + u_short msg_id, msg_len; + char *work; + u_short alias_port, port; + struct tcphdr *tc; + + work = data; + work += 5; + while (work + 4 < data + dlen) { + memcpy(&msg_id, work, 2); + work += 2; + memcpy(&msg_len, work, 2); + work += 2; + if (ntohs(msg_id) == 0) { + /* end of options */ + return 0; + } + if ((ntohs(msg_id) == 1) || (ntohs(msg_id) == 7)) { + memcpy(&port, work, 2); + pna_links = FindUdpTcpOut(pip->ip_src, GetDestAddress(link), + port, 0, IPPROTO_UDP, 1); + if (pna_links != NULL) { +#ifndef NO_FW_PUNCH + /* Punch hole in firewall */ + PunchFWHole(pna_links); +#endif + tc = (struct tcphdr *) ((char *) pip + (pip->ip_hl << 2)); + alias_port = GetAliasPort(pna_links); + memcpy(work, &alias_port, 2); + + /* Compute TCP checksum for revised packet */ + tc->th_sum = 0; + tc->th_sum = TcpChecksum(pip); + } + } + work += ntohs(msg_len); + } + + return 0; +} + +void +AliasHandleRtspOut(struct ip *pip, struct alias_link *link, int maxpacketsize) +{ + int hlen, tlen, dlen; + struct tcphdr *tc; + char *data; + const char *setup = "SETUP", *pna = "PNA", *str200 = "200"; + const char *okstr = "OK", *client_port_str = "client_port"; + const char *server_port_str = "server_port"; + int i, parseOk; + + tc = (struct tcphdr *)((char *)pip + (pip->ip_hl << 2)); + hlen = (pip->ip_hl + tc->th_off) << 2; + tlen = ntohs(pip->ip_len); + dlen = tlen - hlen; + + data = (char*)pip; + data += hlen; + + /* When aliasing a client, check for the SETUP request */ + if ((ntohs(tc->th_dport) == RTSP_CONTROL_PORT_NUMBER_1) || + (ntohs(tc->th_dport) == RTSP_CONTROL_PORT_NUMBER_2)) { + + if (dlen >= strlen(setup)) { + if (memcmp(data, setup, strlen(setup)) == 0) { + alias_rtsp_out(pip, link, data, client_port_str); + return; + } + } + if (dlen >= strlen(pna)) { + if (memcmp(data, pna, strlen(pna)) == 0) { + alias_pna_out(pip, link, data, dlen); + } + } + + } else { + + /* When aliasing a server, check for the 200 reply + Accomodate varying number of blanks between 200 & OK */ + + if (dlen >= strlen(str200)) { + + for (parseOk = 0, i = 0; + i <= dlen - strlen(str200); + i++) { + if (memcmp(&data[i], str200, strlen(str200)) == 0) { + parseOk = 1; + break; + } + } + if (parseOk) { + + i += strlen(str200); /* skip string found */ + while(data[i] == ' ') /* skip blank(s) */ + i++; + + if ((dlen - i) >= strlen(okstr)) { + + if (memcmp(&data[i], okstr, strlen(okstr)) == 0) + alias_rtsp_out(pip, link, data, server_port_str); + + } + } + } + } +} diff --git a/sys/netinet/libalias/alias_util.c b/sys/netinet/libalias/alias_util.c new file mode 100644 index 0000000..b939428 --- /dev/null +++ b/sys/netinet/libalias/alias_util.c @@ -0,0 +1,141 @@ +/* + Alias_util.c contains general utilities used by other functions + in the packet aliasing module. At the moment, there are functions + for computing IP header and TCP packet checksums. + + The checksum routines are based upon example code in a Unix networking + text written by Stevens (sorry, I can't remember the title -- but + at least this is a good author). + + Initial Version: August, 1996 (cjm) + + Version 1.7: January 9, 1997 + Added differential checksum update function. + + $FreeBSD$ +*/ + +/* +Note: the checksum routines assume that the actual checksum word has +been zeroed out. If the checksum word is filled with the proper value, +then these routines will give a result of zero (useful for testing +purposes); +*/ + +#include <sys/types.h> +#include <netinet/in_systm.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/tcp.h> + +#include "alias.h" +#include "alias_local.h" + +u_short +PacketAliasInternetChecksum(u_short *ptr, int nbytes) +{ + int sum, oddbyte; + + sum = 0; + while (nbytes > 1) + { + sum += *ptr++; + nbytes -= 2; + } + if (nbytes == 1) + { + oddbyte = 0; + ((u_char *) &oddbyte)[0] = *(u_char *) ptr; + ((u_char *) &oddbyte)[1] = 0; + sum += oddbyte; + } + sum = (sum >> 16) + (sum & 0xffff); + sum += (sum >> 16); + return(~sum); +} + +u_short +IpChecksum(struct ip *pip) +{ + return( PacketAliasInternetChecksum((u_short *) pip, + (pip->ip_hl << 2)) ); + +} + +u_short +TcpChecksum(struct ip *pip) +{ + u_short *ptr; + struct tcphdr *tc; + int nhdr, ntcp, nbytes; + int sum, oddbyte; + + nhdr = pip->ip_hl << 2; + ntcp = ntohs(pip->ip_len) - nhdr; + + tc = (struct tcphdr *) ((char *) pip + nhdr); + ptr = (u_short *) tc; + +/* Add up TCP header and data */ + nbytes = ntcp; + sum = 0; + while (nbytes > 1) + { + sum += *ptr++; + nbytes -= 2; + } + if (nbytes == 1) + { + oddbyte = 0; + ((u_char *) &oddbyte)[0] = *(u_char *) ptr; + ((u_char *) &oddbyte)[1] = 0; + sum += oddbyte; + } + +/* "Pseudo-header" data */ + ptr = (u_short *) &(pip->ip_dst); + sum += *ptr++; + sum += *ptr; + ptr = (u_short *) &(pip->ip_src); + sum += *ptr++; + sum += *ptr; + sum += htons((u_short) ntcp); + sum += htons((u_short) pip->ip_p); + +/* Roll over carry bits */ + sum = (sum >> 16) + (sum & 0xffff); + sum += (sum >> 16); + +/* Return checksum */ + return((u_short) ~sum); +} + + +void +DifferentialChecksum(u_short *cksum, u_short *new, u_short *old, int n) +{ + int i; + int accumulate; + + accumulate = *cksum; + for (i=0; i<n; i++) + { + accumulate -= *new++; + accumulate += *old++; + } + + if (accumulate < 0) + { + accumulate = -accumulate; + accumulate = (accumulate >> 16) + (accumulate & 0xffff); + accumulate += accumulate >> 16; + *cksum = (u_short) ~accumulate; + } + else + { + accumulate = (accumulate >> 16) + (accumulate & 0xffff); + accumulate += accumulate >> 16; + *cksum = (u_short) accumulate; + } +} + diff --git a/sys/netinet/libalias/libalias.3 b/sys/netinet/libalias/libalias.3 new file mode 100644 index 0000000..587e3ca --- /dev/null +++ b/sys/netinet/libalias/libalias.3 @@ -0,0 +1,958 @@ +.\" $FreeBSD$ +.\" +.Dd April 13, 2000 +.Dt LIBALIAS 3 +.Os FreeBSD +.Sh NAME +.Nm libalias +.Nd packet aliasing library for masquerading and network address translation +.Sh SYNOPSIS +.Fd #include <sys/types.h> +.Fd #include <netinet/in.h> +.Fd #include <alias.h> +.Pp +Function prototypes are given in the main body of the text. +.Sh DESCRIPTION +The +.Nm +library is a collection of functions for aliasing and de-aliasing of IP +packets, intended for masquerading and network address translation (NAT). +.Sh INTRODUCTION +This library is a moderately portable set of functions designed to assist +in the process of IP masquerading and network address translation. +Outgoing packets from a local network with unregistered IP addresses can +be aliased to appear as if they came from an accessible IP address. +Incoming packets are then de-aliased so that they are sent to the correct +machine on the local network. +.Pp +A certain amount of flexibility is built into the packet aliasing engine. +In the simplest mode of operation, a many-to-one address mapping takes +place between local network and the packet aliasing host. +This is known as IP masquerading. +In addition, one-to-one mappings between local and public addresses can +also be implemented, which is known as static NAT. +In between these extremes, different groups of private addresses can be +linked to different public addresses, comprising several distinct +many-to-one mappings. +Also, a given public address and port can be statically redirected to a +private address/port. +.Pp +The packet aliasing engine was designed to operate in user space outside +of the kernel, without any access to private kernel data structure, but +the source code can also be ported to a kernel environment. +.Sh INITIALIZATION AND CONTROL +Two special functions, +.Fn PacketAliasInit +and +.Fn PacketAliasSetAddress , +must always be called before any packet handling may be performed. +In addition, the operating mode of the packet aliasing engine can be +customized by calling +.Fn PacketAliasSetMode . +.Pp +.Ft void +.Fn PacketAliasInit void +.Bd -ragged -offset indent +This function has no arguments or return value and is used to initialize +internal data structures. +The following mode bits are always set after calling +.Fn PacketAliasInit . +See the description of +.Fn PacketAliasSetMode +below for the meaning of these mode bits. +.Pp +.Bl -item -offset indent -compact +.It +.Dv PKT_ALIAS_SAME_PORTS +.It +.Dv PKT_ALIAS_USE_SOCKETS +.It +.Dv PKT_ALIAS_RESET_ON_ADDR_CHANGE +.El +.Pp +This function will always return the packet aliasing engine to the same +initial state. +.Fn PacketAliasSetAddress +must be called afterwards, and any desired changes from the default mode +bits listed above require a call to +.Fn PacketAliasSetMode . +.Pp +It is mandatory that this function be called at the beginning of a program +prior to any packet handling. +.Ed +.Pp +.Ft void +.Fn PacketAliasUninit void +.Bd -ragged -offset indent +This function has no arguments or return value and is used to clear any +resources attached to internal data structures. +.Pp +This functions should be called when a program stops using the aliasing +engine; it does, amongst other things, clear out any firewall holes. +To provide backwards compatibility and extra security, it is added to +the +.Xr atexit 3 +chain by +.Fn PacketAliasInit . +Calling it multiple times is harmless. +.Ed +.Pp +.Ft void +.Fn PacketAliasSetAddress "struct in_addr addr" +.Bd -ragged -offset indent +This function sets the source address to which outgoing packets from the +local area network are aliased. +All outgoing packets are re-mapped to this address unless overridden by a +static address mapping established by +.Fn PacketAliasRedirectAddr . +.Pp +If the +.Dv PKT_ALIAS_RESET_ON_ADDR_CHANGE +mode bit is set (the default mode of operation), then the internal aliasing +link tables will be reset any time the aliasing address changes. +This is useful for interfaces such as +.Xr ppp 8 , +where the IP +address may or may not change on successive dial-up attempts. +.Pp +If the +.Dv PKT_ALIAS_RESET_ON_ADDR_CHANGE +mode bit is set to zero, this function can also be used to dynamically change +the aliasing address on a packet to packet basis (it is a low overhead call). +.Pp +It is mandatory that this function be called prior to any packet handling. +.Ed +.Pp +.Ft unsigned int +.Fn PacketAliasSetMode "unsigned int flags" "unsigned int mask" +.Bd -ragged -offset indent +This function sets or clears mode bits +according to the value of +.Fa flags . +Only bits marked in +.Fa mask +are affected. +The following mode bits are defined in +.Aq Pa alias.h : +.Bl -tag -width indent +.It Dv PKT_ALIAS_LOG +Enables logging into +.Pa /var/log/alias.log . +Each time an aliasing link is created or deleted, the log file is appended +with the current number of ICMP, TCP and UDP links. +Mainly useful for debugging when the log file is viewed continuously with +.Xr tail 1 . +.It Dv PKT_ALIAS_DENY_INCOMING +If this mode bit is set, all incoming packets associated with new TCP +connections or new UDP transactions will be marked for being ignored +.Po +.Fn PacketAliasIn +returns +.Dv PKT_ALIAS_IGNORED +code +.Pc +by the calling program. +Response packets to connections or transactions initiated from the packet +aliasing host or local network will be unaffected. +This mode bit is useful for implementing a one-way firewall. +.It Dv PKT_ALIAS_SAME_PORTS +If this mode bit is set, the packet aliasing engine will attempt to leave +the alias port numbers unchanged from the actual local port numbers. +This can be done as long as the quintuple (proto, alias addr, alias port, +remote addr, remote port) is unique. +If a conflict exists, a new aliasing port number is chosen even if this +mode bit is set. +.It Dv PKT_ALIAS_USE_SOCKETS +This bit should be set when the packet aliasing host originates network +traffic as well as forwards it. +When the packet aliasing host is waiting for a connection from an unknown +host address or unknown port number (e.g. an FTP data connection), this +mode bit specifies that a socket be allocated as a place holder to prevent +port conflicts. +Once a connection is established, usually within a minute or so, the socket +is closed. +.It Dv PKT_ALIAS_UNREGISTERED_ONLY +If this mode bit is set, traffic on the local network which does not +originate from unregistered address spaces will be ignored. +Standard Class A, B and C unregistered addresses are: +.Bd -literal -offset indent +10.0.0.0 -> 10.255.255.255 (Class A subnet) +172.16.0.0 -> 172.31.255.255 (Class B subnets) +192.168.0.0 -> 192.168.255.255 (Class C subnets) +.Ed +.Pp +This option is useful in the case that packet aliasing host has both +registered and unregistered subnets on different interfaces. +The registered subnet is fully accessible to the outside world, so traffic +from it does not need to be passed through the packet aliasing engine. +.It Dv PKT_ALIAS_RESET_ON_ADDR_CHANGE +When this mode bit is set and +.Fn PacketAliasSetAddress +is called to change the aliasing address, the internal link table of the +packet aliasing engine will be cleared. +This operating mode is useful for +.Xr ppp 8 +links where the interface address can sometimes change or remain the same +between dial-up attempts. +If this mode bit is not set, the link table will never be reset in the event +of an address change. +.It Dv PKT_ALIAS_PUNCH_FW +This option makes +.Nm +`punch holes' in an +.Xr ipfirewall 4 +based firewall for FTP/IRC DCC connections. +The holes punched are bound by from/to IP address and port; it will not be +possible to use a hole for another connection. +A hole is removed when the connection that uses it dies. +To cater to unexpected death of a program using +.Nm +(e.g. kill -9), +changing the state of the flag will clear the entire firewall range +allocated for holes. +This will also happen on the initial call to +.Fn PacketAliasSetFWBase . +This call must happen prior to setting this flag. +.It Dv PKT_ALIAS_REVERSE +This option makes +.Nm +reverse the way it handles incoming and outgoing packets, allowing it +to be fed with data that passes through the internal interface rather +than the external one. +.It Dv PKT_ALIAS_PROXY_ONLY +This option tells +.Nm +to obey transparent proxy rules only. +Normal packet aliasing is not performed. +See +.Fn PacketAliasProxyRule +below for details. +.El +.Ed +.Pp +.Ft void +.Fn PacketAliasSetFWBase "unsigned int base" "unsigned int num" +.Bd -ragged -offset indent +Set firewall range allocated for punching firewall holes (with the +.Dv PKT_ALIAS_PUNCH_FW +flag). +The range will be cleared for all rules on initialization. +.Ed +.Sh PACKET HANDLING +The packet handling functions are used to modify incoming (remote to local) +and outgoing (local to remote) packets. +The calling program is responsible for receiving and sending packets via +network interfaces. +.Pp +Along with +.Fn PacketAliasInit +and +.Fn PacketAliasSetAddress , +the two packet handling functions, +.Fn PacketAliasIn +and +.Fn PacketAliasOut , +comprise minimal set of functions needed for a basic IP masquerading +implementation. +.Pp +.Ft int +.Fn PacketAliasIn "char *buffer" "int maxpacketsize" +.Bd -ragged -offset indent +An incoming packet coming from a remote machine to the local network is +de-aliased by this function. +The IP packet is pointed to by +.Fa buffer , +and +.Fa maxpacketsize +indicates the size of the data structure containing the packet and should +be at least as large as the actual packet size. +.Pp +Return codes: +.Bl -tag -width indent +.It Dv PKT_ALIAS_OK +The packet aliasing process was successful. +.It Dv PKT_ALIAS_IGNORED +The packet was ignored and not de-aliased. +This can happen if the protocol is unrecognized, possibly an ICMP message +type is not handled or if incoming packets for new connections are being +ignored (if +.Dv PKT_ALIAS_DENY_INCOMING +mode bit was set by +.Fn PacketAliasSetMode ) . +.It Dv PKT_ALIAS_UNRESOLVED_FRAGMENT +This is returned when a fragment cannot be resolved because the header +fragment has not been sent yet. +In this situation, fragments must be saved with +.Fn PacketAliasSaveFragment +until a header fragment is found. +.It Dv PKT_ALIAS_FOUND_HEADER_FRAGMENT +The packet aliasing process was successful, and a header fragment was found. +This is a signal to retrieve any unresolved fragments with +.Fn PacketAliasGetFragment +and de-alias them with +.Fn PacketAliasFragmentIn . +.It Dv PKT_ALIAS_ERROR +An internal error within the packet aliasing engine occurred. +.El +.Ed +.Pp +.Ft int +.Fn PacketAliasOut "char *buffer" "int maxpacketsize" +.Bd -ragged -offset indent +An outgoing packet coming from the local network to a remote machine is +aliased by this function. +The IP packet is pointed to by +.Fa buffer , +and +.Fa maxpacketsize +indicates the maximum packet size permissible should the packet length be +changed. +IP encoding protocols place address and port information in the encapsulated +data stream which has to be modified and can account for changes in packet +length. +Well known examples of such protocols are FTP and IRC DCC. +.Pp +Return codes: +.Bl -tag -width indent +.It Dv PKT_ALIAS_OK +The packet aliasing process was successful. +.It Dv PKT_ALIAS_IGNORED +The packet was ignored and not aliased. +This can happen if the protocol is unrecognized, or possibly an ICMP message +type is not handled. +.It Dv PKT_ALIAS_ERROR +An internal error within the packet aliasing engine occurred. +.El +.Ed +.Sh PORT AND ADDRESS REDIRECTION +The functions described in this section allow machines on the local network +to be accessible in some degree to new incoming connections from the external +network. +Individual ports can be re-mapped or static network address translations can +be designated. +.Pp +.Ft struct alias_link * +.Fo PacketAliasRedirectPort +.Fa "struct in_addr local_addr" +.Fa "u_short local_port" +.Fa "struct in_addr remote_addr" +.Fa "u_short remote_port" +.Fa "struct in_addr alias_addr" +.Fa "u_short alias_port" +.Fa "u_char proto" +.Fc +.Bd -ragged -offset indent +This function specifies that traffic from a given remote address/port to +an alias address/port be redirected to a specified local address/port. +The parameter +.Fa proto +can be either +.Dv IPPROTO_TCP +or +.Dv IPPROTO_UDP , +as defined in +.Aq Pa netinet/in.h . +.Pp +If +.Fa local_addr +or +.Fa alias_addr +is zero, this indicates that the packet aliasing address as established +by +.Fn PacketAliasSetAddress +is to be used. +Even if +.Fn PacketAliasSetAddress +is called to change the address after +.Fn PacketAliasRedirectPort +is called, a zero reference will track this change. +.Pp +If the link is further set up to operate for a load sharing, then +.Fa local_addr +and +.Fa local_port +are ignored, and are selected dynamically from the server pool, as described in +.Fn PacketAliasAddServer +below. +.Pp +If +.Fa remote_addr +is zero, this indicates to redirect packets from any remote address. +Likewise, if +.Fa remote_port +is zero, this indicates to redirect packets originating from any remote +port number. +Almost always, the remote port specification will be zero, but non-zero +remote addresses can sometimes be useful for firewalling. +If two calls to +.Fn PacketAliasRedirectPort +overlap in their address/port specifications, then the most recent call +will have precedence. +.Pp +This function returns a pointer which can subsequently be used by +.Fn PacketAliasRedirectDelete . +If +.Dv NULL +is returned, then the function call did not complete successfully. +.Pp +All port numbers should be in network address byte order, so it is necessary +to use +.Xr htons 3 +to convert these parameters from internally readable numbers to network byte +order. +Addresses are also in network byte order, which is implicit in the use of the +.Fa struct in_addr +data type. +.Ed +.Pp +.Ft struct alias_link * +.Fo PacketAliasRedirectAddr +.Fa "struct in_addr local_addr" +.Fa "struct in_addr alias_addr" +.Fc +.Bd -ragged -offset indent +This function designates that all incoming traffic to +.Fa alias_addr +be redirected to +.Fa local_addr . +Similarly, all outgoing traffic from +.Fa local_addr +is aliased to +.Fa alias_addr . +.Pp +If +.Fa local_addr +or +.Fa alias_addr +is zero, this indicates that the packet aliasing address as established by +.Fn PacketAliasSetAddress +is to be used. +Even if +.Fn PacketAliasSetAddress +is called to change the address after +.Fn PacketAliasRedirectAddr +is called, a zero reference will track this change. +.Pp +If the link is further set up to operate for a load sharing, then +.Fa local_addr +is ignored, and is selected dynamically from the server pool, as described in +.Fn PacketAliasAddServer +below. +.Pp +If subsequent calls to +.Fn PacketAliasRedirectAddr +use the same aliasing address, all new incoming traffic to this aliasing +address will be redirected to the local address made in the last function +call. +New traffic generated by any of the local machines, designated in the +several function calls, will be aliased to the same address. +Consider the following example: +.Bd -literal -offset indent +PacketAliasRedirectAddr(inet_aton("192.168.0.2"), + inet_aton("141.221.254.101")); +PacketAliasRedirectAddr(inet_aton("192.168.0.3"), + inet_aton("141.221.254.101")); +PacketAliasRedirectAddr(inet_aton("192.168.0.4"), + inet_aton("141.221.254.101")); +.Ed +.Pp +Any outgoing connections such as +.Xr telnet 1 +or +.Xr ftp 1 +from 192.168.0.2, 192.168.0.3 and 192.168.0.4 will appear to come from +141.221.254.101. +Any incoming connections to 141.221.254.101 will be directed to 192.168.0.4. +.Pp +Any calls to +.Fn PacketAliasRedirectPort +will have precedence over address mappings designated by +.Fn PacketAliasRedirectAddr . +.Pp +This function returns a pointer which can subsequently be used by +.Fn PacketAliasRedirectDelete . +If +.Dv NULL +is returned, then the function call did not complete successfully. +.Ed +.Pp +.Ft int +.Fo PacketAliasAddServer +.Fa "struct alias_link *link" +.Fa "struct in_addr addr" +.Fa "u_short port" +.Fc +.Bd -ragged -offset indent +This function sets the +.Fa link +up for Load Sharing using IP Network Address Translation (RFC 2391, LSNAT). +LSNAT operates as follows. +A client attempts to access a server by using the server virtual address. +The LSNAT router transparently redirects the request to one of the hosts +in server pool, selected using a real-time load sharing algorithm. +Multiple sessions may be initiated from the same client, and each session +could be directed to a different host based on load balance across server +pool hosts at the time. +If load share is desired for just a few specific services, the configuration +on LSNAT could be defined to restrict load share for just the services +desired. +.Pp +Currently, only the simplest selection algorithm is implemented, where a +host is selected on a round-robin basis only, without regard to load on +the host. +.Pp +First, the +.Fa link +is created by either +.Fn PacketAliasRedirectPort +or +.Fn PacketAliasRedirectAddr . +Then, +.Fn PacketAliasAddServer +is called multiple times to add entries to the +.Fa link Ns 's +server pool. +.Pp +For links created with +.Fn PacketAliasRedirectAddr , +the +.Fa port +argument is ignored and could have any value, e.g. htons(~0). +.Pp +This function returns 0 on success, -1 otherwise. +.Ed +.Pp +.Ft void +.Fn PacketAliasRedirectDelete "struct alias_link *link" +.Bd -ragged -offset indent +This function will delete a specific static redirect rule entered by +.Fn PacketAliasRedirectPort +or +.Fn PacketAliasRedirectAddr . +The parameter +.Fa link +is the pointer returned by either of the redirection functions. +If an invalid pointer is passed to +.Fn PacketAliasRedirectDelete , +then a program crash or unpredictable operation could result, so it is +necessary to be careful using this function. +.Ed +.Pp +.Ft int +.Fn PacketAliasProxyRule "const char *cmd" +.Bd -ragged -offset indent +The passed +.Fa cmd +string consists of one or more pairs of words. +The first word in each pair is a token and the second is the value that +should be applied for that token. +Tokens and their argument types are as follows: +.Bl -tag -width indent +.It Cm type encode_ip_hdr | encode_tcp_stream | no_encode +In order to support transparent proxying, it is necessary to somehow +pass the original address and port information into the new destination +server. +If +.Cm encode_ip_hdr +is specified, the original address and port is passed as an extra IP +option. +If +.Cm encode_tcp_stream +is specified, the original address and port is passed as the first +piece of data in the TCP stream in the format +.Dq DEST Ar IP port . +.It Cm port Ar portnum +Only packets with the destination port +.Ar portnum +are proxied. +.It Cm server Ar host Ns Xo +.Op : Ns Ar portnum +.Xc +This specifies the +.Ar host +and +.Ar portnum +that the data is to be redirected to. +.Ar host +must be an IP address rather than a DNS host name. +If +.Ar portnum +is not specified, the destination port number is not changed. +.Pp +The +.Ar server +specification is mandatory unless the +.Cm delete +command is being used. +.It Cm rule Ar index +Normally, each call to +.Fn PacketAliasProxyRule +inserts the next rule at the start of a linear list of rules. +If an +.Ar index +is specified, the new rule will be checked after all rules with lower +indices. +Calls to +.Fn PacketAliasProxyRule +that do not specify a rule are assigned rule 0. +.It Cm delete Ar index +This token and its argument MUST NOT be used with any other tokens. +When used, all existing rules with the given +.Ar index +are deleted. +.It Cm proto tcp | udp +If specified, only packets of the given protocol type are matched. +.It Cm src Ar IP Ns Xo +.Op / Ns Ar bits +.Xc +If specified, only packets with a source address matching the given +.Ar IP +are matched. +If +.Ar bits +is also specified, then the first +.Ar bits +bits of +.Ar IP +are taken as a network specification, and all IP addresses from that +network will be matched. +.It Cm dst Ar IP Ns Xo +.Op / Ns Ar bits +.Xc +If specified, only packets with a destination address matching the given +.Ar IP +are matched. +If +.Ar bits +is also specified, then the first +.Ar bits +bits of +.Ar IP +are taken as a network specification, and all IP addresses from that +network will be matched. +.El +.Pp +This function is usually used to redirect outgoing connections for +internal machines that are not permitted certain types of internet +access, or to restrict access to certain external machines. +.Ed +.Pp +.Ft struct alias_link * +.Fo PacketAliasRedirectProto +.Fa "struct in_addr local_addr" +.Fa "struct in_addr remote_addr" +.Fa "struct in_addr alias_addr" +.Fa "u_char proto" +.Fc +.Bd -ragged -offset indent +This function specifies that any IP packet with protocol number of +.Fa proto +from a given remote address to an alias address be +redirected to a specified local address. +.Pp +If +.Fa local_addr +or +.Fa alias_addr +is zero, this indicates that the packet aliasing address as established +by +.Fn PacketAliasSetAddress +is to be used. +Even if +.Fn PacketAliasSetAddress +is called to change the address after +.Fn PacketAliasRedirectProto +is called, a zero reference will track this change. +.Pp +If +.Fa remote_addr +is zero, this indicates to redirect packets from any remote address. +Non-zero remote addresses can sometimes be useful for firewalling. +.Pp +If two calls to +.Fn PacketAliasRedirectProto +overlap in their address specifications, then the most recent call +will have precedence. +.Pp +This function returns a pointer which can subsequently be used by +.Fn PacketAliasRedirectDelete . +If +.Dv NULL +is returned, then the function call did not complete successfully. +.Ed +.Sh FRAGMENT HANDLING +The functions in this section are used to deal with incoming fragments. +.Pp +Outgoing fragments are handled within +.Fn PacketAliasOut +by changing the address according to any applicable mapping set by +.Fn PacketAliasRedirectAddr , +or the default aliasing address set by +.Fn PacketAliasSetAddress . +.Pp +Incoming fragments are handled in one of two ways. +If the header of a fragmented IP packet has already been seen, then all +subsequent fragments will be re-mapped in the same manner the header +fragment was. +Fragments which arrive before the header are saved and then retrieved +once the header fragment has been resolved. +.Pp +.Ft int +.Fn PacketAliasSaveFragment "char *ptr" +.Bd -ragged -offset indent +When +.Fn PacketAliasIn +returns +.Dv PKT_ALIAS_UNRESOLVED_FRAGMENT , +this function can be used to save the pointer to the unresolved fragment. +.Pp +It is implicitly assumed that +.Fa ptr +points to a block of memory allocated by +.Xr malloc 3 . +If the fragment is never resolved, the packet aliasing engine will +automatically free the memory after a timeout period. +[Eventually this function should be modified so that a callback function +for freeing memory is passed as an argument.] +.Pp +This function returns +.Dv PKT_ALIAS_OK +if it was successful and +.Dv PKT_ALIAS_ERROR +if there was an error. +.Ed +.Pp +.Ft char * +.Fn PacketAliasGetFragment "char *buffer" +.Bd -ragged -offset indent +This function can be used to retrieve fragment pointers saved by +.Fn PacketAliasSaveFragment . +The IP header fragment pointed to by +.Fa buffer +is the header fragment indicated when +.Fn PacketAliasIn +returns +.Dv PKT_ALIAS_FOUND_HEADER_FRAGMENT . +Once a fragment pointer is retrieved, it becomes the calling program's +responsibility to free the dynamically allocated memory for the fragment. +.Pp +.Fn PacketAliasGetFragment +can be called sequentially until there are no more fragments available, +at which time it returns +.Dv NULL . +.Ed +.Pp +.Ft void +.Fn PacketAliasFragmentIn "char *header" "char *fragment" +.Bd -ragged -offset indent +When a fragment is retrieved with +.Fn PacketAliasGetFragment , +it can then be de-aliased with a call to +.Fn PacketAliasFragmentIn . +The +.Fa header +argument is the pointer to a header fragment used as a template, and +.Fa fragment +is the pointer to the packet to be de-aliased. +.Ed +.Sh MISCELLANEOUS FUNCTIONS +.Ft void +.Fn PacketAliasSetTarget "struct in_addr addr" +.Bd -ragged -offset indent +When an incoming packet not associated with any pre-existing aliasing link +arrives at the host machine, it will be sent to the address indicated by a +call to +.Fn PacketAliasSetTarget . +.Pp +If this function is called with an +.Dv INADDR_NONE +address argument, then all new incoming packets go to the address set by +.Fn PacketAliasSetAddress . +.Pp +If this function is not called, or is called with an +.Dv INADDR_ANY +address argument, then all new incoming packets go to the address specified +in the packet. +This allows external machines to talk directly to internal machines if they +can route packets to the machine in question. +.Ed +.Pp +.Ft int +.Fn PacketAliasCheckNewLink void +.Bd -ragged -offset indent +This function returns a non-zero value when a new aliasing link is created. +In circumstances where incoming traffic is being sequentially sent to +different local servers, this function can be used to trigger when +.Fn PacketAliasSetTarget +is called to change the default target address. +.Ed +.Pp +.Ft u_short +.Fn PacketAliasInternetChecksum "u_short *buffer" "int nbytes" +.Bd -ragged -offset indent +This is a utility function that does not seem to be available elsewhere and +is included as a convenience. +It computes the internet checksum, which is used in both IP and +protocol-specific headers (TCP, UDP, ICMP). +.Pp +The +.Fa buffer +argument points to the data block to be checksummed, and +.Fa nbytes +is the number of bytes. +The 16-bit checksum field should be zeroed before computing the checksum. +.Pp +Checksums can also be verified by operating on a block of data including +its checksum. +If the checksum is valid, +.Fn PacketAliasInternetChecksum +will return zero. +.Ed +.Pp +.Ft int +.Fn PacketUnaliasOut "char *buffer" "int maxpacketsize" +.Bd -ragged -offset indent +An outgoing packet, which has already been aliased, +has its private address/port information restored by this function. +The IP packet is pointed to by +.Fa buffer , +and +.Fa maxpacketsize +is provided for error checking purposes. +This function can be used if an already-aliased packet needs to have its +original IP header restored for further processing (eg. logging). +.Ed +.Sh BUGS +PPTP aliasing does not work when more than one internal client +connects to the same external server at the same time, because +PPTP requires a single TCP control connection to be established +between any two IP addresses. +.Sh AUTHORS +.An Charles Mott Aq cmott@scientech.com , +versions 1.0 - 1.8, 2.0 - 2.4. +.An Eivind Eklund Aq eivind@FreeBSD.org , +versions 1.8b, 1.9 and 2.5. +Added IRC DCC support as well as contributing a number of architectural +improvements; added the firewall bypass for FTP/IRC DCC. +.An Erik Salander Aq erik@whistle.com +added support for PPTP and RTSP. +.An Junichi Satoh Aq junichi@junichi.org +added support for RTSP/PNA. +.Sh ACKNOWLEDGMENTS +Listed below, in approximate chronological order, are individuals who +have provided valuable comments and/or debugging assistance. +.Pp +.Bd -ragged -offset indent +.An -split +.An Gary Roberts +.An Tom Torrance +.An Reto Burkhalter +.An Martin Renters +.An Brian Somers +.An Paul Traina +.An Ari Suutari +.An Dave Remien +.An J. Fortes +.An Andrzej Bialecki +.An Gordon Burditt +.Ed +.Sh CONCEPTUAL BACKGROUND +This section is intended for those who are planning to modify the source +code or want to create somewhat esoteric applications using the packet +aliasing functions. +.Pp +The conceptual framework under which the packet aliasing engine operates +is described here. +Central to the discussion is the idea of an +.Em aliasing link +which describes the relationship for a given packet transaction between +the local machine, aliased identity and remote machine. +It is discussed how such links come into existence and are destroyed. +.Ss ALIASING LINKS +There is a notion of an +.Em aliasing link , +which is a 7-tuple describing a specific translation: +.Bd -literal -offset indent +(local addr, local port, alias addr, alias port, + remote addr, remote port, protocol) +.Ed +.Pp +Outgoing packets have the local address and port number replaced with the +alias address and port number. +Incoming packets undergo the reverse process. +The packet aliasing engine attempts to match packets against an internal +table of aliasing links to determine how to modify a given IP packet. +Both the IP header and protocol dependent headers are modified as necessary. +Aliasing links are created and deleted as necessary according to network +traffic. +.Pp +Protocols can be TCP, UDP or even ICMP in certain circumstances. +(Some types of ICMP packets can be aliased according to sequence or ID +number which acts as an equivalent port number for identifying how +individual packets should be handled.) +.Pp +Each aliasing link must have a unique combination of the following five +quantities: alias address/port, remote address/port and protocol. +This ensures that several machines on a local network can share the +same aliasing IP address. +In cases where conflicts might arise, the aliasing port is chosen so that +uniqueness is maintained. +.Ss STATIC AND DYNAMIC LINKS +Aliasing links can either be static or dynamic. +Static links persist indefinitely and represent fixed rules for translating +IP packets. +Dynamic links come into existence for a specific TCP connection or UDP +transaction or ICMP ECHO sequence. +For the case of TCP, the connection can be monitored to see when the +associated aliasing link should be deleted. +Aliasing links for UDP transactions (and ICMP ECHO and TIMESTAMP requests) +work on a simple timeout rule. +When no activity is observed on a dynamic link for a certain amount of time +it is automatically deleted. +Timeout rules also apply to TCP connections which do not open or close +properly. +.Ss PARTIALLY SPECIFIED ALIASING LINKS +Aliasing links can be partially specified, meaning that the remote address +and/or remote port are unknown. +In this case, when a packet matching the incomplete specification is found, +a fully specified dynamic link is created. +If the original partially specified link is dynamic, it will be deleted +after the fully specified link is created, otherwise it will persist. +.Pp +For instance, a partially specified link might be +.Bd -literal -offset indent +(192.168.0.4, 23, 204.228.203.215, 8066, 0, 0, tcp) +.Ed +.Pp +The zeros denote unspecified components for the remote address and port. +If this link were static it would have the effect of redirecting all +incoming traffic from port 8066 of 204.228.203.215 to port 23 (telnet) +of machine 192.168.0.4 on the local network. +Each individual telnet connection would initiate the creation of a distinct +dynamic link. +.Ss DYNAMIC LINK CREATION +In addition to aliasing links, there are also address mappings that can be +stored within the internal data table of the packet aliasing mechanism. +.Bd -literal -offset indent +(local addr, alias addr) +.Ed +.Pp +Address mappings are searched when creating new dynamic links. +.Pp +All outgoing packets from the local network automatically create a dynamic +link if they do not match an already existing fully specified link. +If an address mapping exists for the outgoing packet, this determines +the alias address to be used. +If no mapping exists, then a default address, usually the address of the +packet aliasing host, is used. +If necessary, this default address can be changed as often as each individual +packet arrives. +.Pp +The aliasing port number is determined such that the new dynamic link does +not conflict with any existing links. +In the default operating mode, the packet aliasing engine attempts to set +the aliasing port equal to the local port number. +If this results in a conflict, then port numbers are randomly chosen until +a unique aliasing link can be established. +In an alternate operating mode, the first choice of an aliasing port is also +random and unrelated to the local port number. diff --git a/sys/netinet/mlfk_ipl.c b/sys/netinet/mlfk_ipl.c new file mode 100644 index 0000000..2a51d7d --- /dev/null +++ b/sys/netinet/mlfk_ipl.c @@ -0,0 +1,196 @@ +/* + * Copyright 1999 Guido van Rooij. All rights reserved. + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS + * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/conf.h> +#include <sys/socket.h> +#include <sys/sysctl.h> +#include <net/if.h> +#include <netinet/in_systm.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#if (__FreeBSD_version >= 199511) +# include <net/route.h> +# include <netinet/ip_var.h> +# include <netinet/tcp.h> +# include <netinet/tcpip.h> +#endif + + +#include <netinet/ipl.h> +#include <netinet/ip_compat.h> +#include <netinet/ip_fil.h> +#include <netinet/ip_state.h> +#include <netinet/ip_nat.h> +#include <netinet/ip_auth.h> +#include <netinet/ip_frag.h> +#include <netinet/ip_proxy.h> + +static dev_t ipf_devs[IPL_LOGMAX + 1]; + +SYSCTL_DECL(_net_inet); +SYSCTL_NODE(_net_inet, OID_AUTO, ipf, CTLFLAG_RW, 0, "IPF"); +SYSCTL_INT(_net_inet_ipf, OID_AUTO, fr_flags, CTLFLAG_RW, &fr_flags, 0, ""); +SYSCTL_INT(_net_inet_ipf, OID_AUTO, fr_pass, CTLFLAG_RW, &fr_pass, 0, ""); +SYSCTL_INT(_net_inet_ipf, OID_AUTO, fr_active, CTLFLAG_RD, &fr_active, 0, ""); +SYSCTL_INT(_net_inet_ipf, OID_AUTO, fr_tcpidletimeout, CTLFLAG_RW, + &fr_tcpidletimeout, 0, ""); +SYSCTL_INT(_net_inet_ipf, OID_AUTO, fr_tcpclosewait, CTLFLAG_RW, + &fr_tcpclosewait, 0, ""); +SYSCTL_INT(_net_inet_ipf, OID_AUTO, fr_tcplastack, CTLFLAG_RW, + &fr_tcplastack, 0, ""); +SYSCTL_INT(_net_inet_ipf, OID_AUTO, fr_tcptimeout, CTLFLAG_RW, + &fr_tcptimeout, 0, ""); +SYSCTL_INT(_net_inet_ipf, OID_AUTO, fr_tcpclosed, CTLFLAG_RW, + &fr_tcpclosed, 0, ""); +SYSCTL_INT(_net_inet_ipf, OID_AUTO, fr_tcphalfclosed, CTLFLAG_RW, + &fr_tcphalfclosed, 0, ""); +SYSCTL_INT(_net_inet_ipf, OID_AUTO, fr_udptimeout, CTLFLAG_RW, + &fr_udptimeout, 0, ""); +SYSCTL_INT(_net_inet_ipf, OID_AUTO, fr_icmptimeout, CTLFLAG_RW, + &fr_icmptimeout, 0, ""); +SYSCTL_INT(_net_inet_ipf, OID_AUTO, fr_defnatage, CTLFLAG_RW, + &fr_defnatage, 0, ""); +SYSCTL_INT(_net_inet_ipf, OID_AUTO, fr_ipfrttl, CTLFLAG_RW, + &fr_ipfrttl, 0, ""); +SYSCTL_INT(_net_inet_ipf, OID_AUTO, ipl_unreach, CTLFLAG_RW, + &ipl_unreach, 0, ""); +SYSCTL_INT(_net_inet_ipf, OID_AUTO, fr_running, CTLFLAG_RD, + &fr_running, 0, ""); +SYSCTL_INT(_net_inet_ipf, OID_AUTO, fr_authsize, CTLFLAG_RD, + &fr_authsize, 0, ""); +SYSCTL_INT(_net_inet_ipf, OID_AUTO, fr_authused, CTLFLAG_RD, + &fr_authused, 0, ""); +SYSCTL_INT(_net_inet_ipf, OID_AUTO, fr_defaultauthage, CTLFLAG_RW, + &fr_defaultauthage, 0, ""); +SYSCTL_INT(_net_inet_ipf, OID_AUTO, fr_chksrc, CTLFLAG_RW, &fr_chksrc, 0, ""); +SYSCTL_INT(_net_inet_ipf, OID_AUTO, ippr_ftp_pasvonly, CTLFLAG_RW, + &ippr_ftp_pasvonly, 0, ""); +SYSCTL_INT(_net_inet_ipf, OID_AUTO, fr_minttl, CTLFLAG_RW, &fr_minttl, 0, ""); +SYSCTL_INT(_net_inet_ipf, OID_AUTO, fr_minttllog, CTLFLAG_RW, + &fr_minttllog, 0, ""); + +#define CDEV_MAJOR 79 +static struct cdevsw ipl_cdevsw = { + /* open */ iplopen, + /* close */ iplclose, + /* read */ iplread, + /* write */ nowrite, + /* ioctl */ iplioctl, + /* poll */ nopoll, + /* mmap */ nommap, + /* strategy */ nostrategy, + /* name */ "ipl", + /* maj */ CDEV_MAJOR, + /* dump */ nodump, + /* psize */ nopsize, + /* flags */ 0, +}; + +static int +ipfilter_modevent(module_t mod, int type, void *unused) +{ + char *c; + int i, error = 0; + + switch (type) { + case MOD_LOAD : + + error = iplattach(); + if (error) + break; + + c = NULL; + for(i=strlen(IPL_NAME); i>0; i--) + if (IPL_NAME[i] == '/') { + c = &IPL_NAME[i+1]; + break; + } + if (!c) + c = IPL_NAME; + ipf_devs[IPL_LOGIPF] = + make_dev(&ipl_cdevsw, IPL_LOGIPF, 0, 0, 0600, c); + + c = NULL; + for(i=strlen(IPL_NAT); i>0; i--) + if (IPL_NAT[i] == '/') { + c = &IPL_NAT[i+1]; + break; + } + if (!c) + c = IPL_NAT; + ipf_devs[IPL_LOGNAT] = + make_dev(&ipl_cdevsw, IPL_LOGNAT, 0, 0, 0600, c); + + c = NULL; + for(i=strlen(IPL_STATE); i>0; i--) + if (IPL_STATE[i] == '/') { + c = &IPL_STATE[i+1]; + break; + } + if (!c) + c = IPL_STATE; + ipf_devs[IPL_LOGSTATE] = + make_dev(&ipl_cdevsw, IPL_LOGSTATE, 0, 0, 0600, c); + + c = NULL; + for(i=strlen(IPL_AUTH); i>0; i--) + if (IPL_AUTH[i] == '/') { + c = &IPL_AUTH[i+1]; + break; + } + if (!c) + c = IPL_AUTH; + ipf_devs[IPL_LOGAUTH] = + make_dev(&ipl_cdevsw, IPL_LOGAUTH, 0, 0, 0600, c); + + break; + case MOD_UNLOAD : + destroy_dev(ipf_devs[IPL_LOGIPF]); + destroy_dev(ipf_devs[IPL_LOGNAT]); + destroy_dev(ipf_devs[IPL_LOGSTATE]); + destroy_dev(ipf_devs[IPL_LOGAUTH]); + error = ipldetach(); + break; + default: + error = EINVAL; + break; + } + return error; +} + +static moduledata_t ipfiltermod = { + IPL_VERSION, + ipfilter_modevent, + 0 +}; +DECLARE_MODULE(ipfilter, ipfiltermod, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); diff --git a/sys/netinet/raw_ip.c b/sys/netinet/raw_ip.c new file mode 100644 index 0000000..50bbf32 --- /dev/null +++ b/sys/netinet/raw_ip.c @@ -0,0 +1,660 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)raw_ip.c 8.7 (Berkeley) 5/15/95 + * $FreeBSD$ + */ + +#include "opt_inet6.h" +#include "opt_ipsec.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/sysctl.h> + +#include <vm/vm_zone.h> + +#include <net/if.h> +#include <net/route.h> + +#define _IP_VHL +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/in_pcb.h> +#include <netinet/in_var.h> +#include <netinet/ip_var.h> +#include <netinet/ip_mroute.h> + +#include <netinet/ip_fw.h> + +#ifdef IPSEC +#include <netinet6/ipsec.h> +#endif /*IPSEC*/ + +#include "opt_ipdn.h" +#ifdef DUMMYNET +#include <netinet/ip_dummynet.h> +#endif + +struct inpcbhead ripcb; +struct inpcbinfo ripcbinfo; + +/* + * Nominal space allocated to a raw ip socket. + */ +#define RIPSNDQ 8192 +#define RIPRCVQ 8192 + +/* + * Raw interface to IP protocol. + */ + +/* + * Initialize raw connection block q. + */ +void +rip_init() +{ + LIST_INIT(&ripcb); + ripcbinfo.listhead = &ripcb; + /* + * XXX We don't use the hash list for raw IP, but it's easier + * to allocate a one entry hash list than it is to check all + * over the place for hashbase == NULL. + */ + ripcbinfo.hashbase = hashinit(1, M_PCB, &ripcbinfo.hashmask); + ripcbinfo.porthashbase = hashinit(1, M_PCB, &ripcbinfo.porthashmask); + ripcbinfo.ipi_zone = zinit("ripcb", sizeof(struct inpcb), + maxsockets, ZONE_INTERRUPT, 0); +} + +static struct sockaddr_in ripsrc = { sizeof(ripsrc), AF_INET }; +/* + * Setup generic address and protocol structures + * for raw_input routine, then pass them along with + * mbuf chain. + */ +void +rip_input(m, off, proto) + struct mbuf *m; + int off, proto; +{ + register struct ip *ip = mtod(m, struct ip *); + register struct inpcb *inp; + struct inpcb *last = 0; + struct mbuf *opts = 0; + + ripsrc.sin_addr = ip->ip_src; + LIST_FOREACH(inp, &ripcb, inp_list) { +#ifdef INET6 + if ((inp->inp_vflag & INP_IPV4) == 0) + continue; +#endif + if (inp->inp_ip_p && inp->inp_ip_p != proto) + continue; + if (inp->inp_laddr.s_addr && + inp->inp_laddr.s_addr != ip->ip_dst.s_addr) + continue; + if (inp->inp_faddr.s_addr && + inp->inp_faddr.s_addr != ip->ip_src.s_addr) + continue; + if (last) { + struct mbuf *n = m_copy(m, 0, (int)M_COPYALL); + if (n) { + if (last->inp_flags & INP_CONTROLOPTS || + last->inp_socket->so_options & SO_TIMESTAMP) + ip_savecontrol(last, &opts, ip, n); + if (sbappendaddr(&last->inp_socket->so_rcv, + (struct sockaddr *)&ripsrc, n, + opts) == 0) { + /* should notify about lost packet */ + m_freem(n); + if (opts) + m_freem(opts); + } else + sorwakeup(last->inp_socket); + opts = 0; + } + } + last = inp; + } + if (last) { + if (last->inp_flags & INP_CONTROLOPTS || + last->inp_socket->so_options & SO_TIMESTAMP) + ip_savecontrol(last, &opts, ip, m); + if (sbappendaddr(&last->inp_socket->so_rcv, + (struct sockaddr *)&ripsrc, m, opts) == 0) { + m_freem(m); + if (opts) + m_freem(opts); + } else + sorwakeup(last->inp_socket); + } else { + m_freem(m); + ipstat.ips_noproto++; + ipstat.ips_delivered--; + } +} + +/* + * Generate IP header and pass packet to ip_output. + * Tack on options user may have setup with control call. + */ +int +rip_output(m, so, dst) + struct mbuf *m; + struct socket *so; + u_long dst; +{ + register struct ip *ip; + register struct inpcb *inp = sotoinpcb(so); + int flags = (so->so_options & SO_DONTROUTE) | IP_ALLOWBROADCAST; + + /* + * If the user handed us a complete IP packet, use it. + * Otherwise, allocate an mbuf for a header and fill it in. + */ + if ((inp->inp_flags & INP_HDRINCL) == 0) { + if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) { + m_freem(m); + return(EMSGSIZE); + } + M_PREPEND(m, sizeof(struct ip), M_TRYWAIT); + ip = mtod(m, struct ip *); + ip->ip_tos = inp->inp_ip_tos; + ip->ip_off = 0; + ip->ip_p = inp->inp_ip_p; + ip->ip_len = m->m_pkthdr.len; + ip->ip_src = inp->inp_laddr; + ip->ip_dst.s_addr = dst; + ip->ip_ttl = inp->inp_ip_ttl; + } else { + if (m->m_pkthdr.len > IP_MAXPACKET) { + m_freem(m); + return(EMSGSIZE); + } + ip = mtod(m, struct ip *); + /* don't allow both user specified and setsockopt options, + and don't allow packet length sizes that will crash */ + if (((IP_VHL_HL(ip->ip_vhl) != (sizeof (*ip) >> 2)) + && inp->inp_options) + || (ip->ip_len > m->m_pkthdr.len) + || (ip->ip_len < (IP_VHL_HL(ip->ip_vhl) << 2))) { + m_freem(m); + return EINVAL; + } + if (ip->ip_id == 0) + ip->ip_id = htons(ip_id++); + /* XXX prevent ip_output from overwriting header fields */ + flags |= IP_RAWOUTPUT; + ipstat.ips_rawout++; + } + +#ifdef IPSEC + ipsec_setsocket(m, so); +#endif /*IPSEC*/ + + return (ip_output(m, inp->inp_options, &inp->inp_route, flags, + inp->inp_moptions)); +} + +/* + * Raw IP socket option processing. + */ +int +rip_ctloutput(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + struct inpcb *inp = sotoinpcb(so); + int error, optval; + + if (sopt->sopt_level != IPPROTO_IP) + return (EINVAL); + + error = 0; + + switch (sopt->sopt_dir) { + case SOPT_GET: + switch (sopt->sopt_name) { + case IP_HDRINCL: + optval = inp->inp_flags & INP_HDRINCL; + error = sooptcopyout(sopt, &optval, sizeof optval); + break; + + case IP_FW_ADD: + case IP_FW_GET: + if (ip_fw_ctl_ptr == 0) + error = ENOPROTOOPT; + else + error = ip_fw_ctl_ptr(sopt); + break; + +#ifdef DUMMYNET + case IP_DUMMYNET_GET: + if (ip_dn_ctl_ptr == NULL) + error = ENOPROTOOPT ; + else + error = ip_dn_ctl_ptr(sopt); + break ; +#endif /* DUMMYNET */ + + case MRT_INIT: + case MRT_DONE: + case MRT_ADD_VIF: + case MRT_DEL_VIF: + case MRT_ADD_MFC: + case MRT_DEL_MFC: + case MRT_VERSION: + case MRT_ASSERT: + error = ip_mrouter_get(so, sopt); + break; + + default: + error = ip_ctloutput(so, sopt); + break; + } + break; + + case SOPT_SET: + switch (sopt->sopt_name) { + case IP_HDRINCL: + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + break; + if (optval) + inp->inp_flags |= INP_HDRINCL; + else + inp->inp_flags &= ~INP_HDRINCL; + break; + + case IP_FW_ADD: + case IP_FW_DEL: + case IP_FW_FLUSH: + case IP_FW_ZERO: + case IP_FW_RESETLOG: + if (ip_fw_ctl_ptr == 0) + error = ENOPROTOOPT; + else + error = ip_fw_ctl_ptr(sopt); + break; + +#ifdef DUMMYNET + case IP_DUMMYNET_CONFIGURE: + case IP_DUMMYNET_DEL: + case IP_DUMMYNET_FLUSH: + if (ip_dn_ctl_ptr == NULL) + error = ENOPROTOOPT ; + else + error = ip_dn_ctl_ptr(sopt); + break ; +#endif + + case IP_RSVP_ON: + error = ip_rsvp_init(so); + break; + + case IP_RSVP_OFF: + error = ip_rsvp_done(); + break; + + /* XXX - should be combined */ + case IP_RSVP_VIF_ON: + error = ip_rsvp_vif_init(so, sopt); + break; + + case IP_RSVP_VIF_OFF: + error = ip_rsvp_vif_done(so, sopt); + break; + + case MRT_INIT: + case MRT_DONE: + case MRT_ADD_VIF: + case MRT_DEL_VIF: + case MRT_ADD_MFC: + case MRT_DEL_MFC: + case MRT_VERSION: + case MRT_ASSERT: + error = ip_mrouter_set(so, sopt); + break; + + default: + error = ip_ctloutput(so, sopt); + break; + } + break; + } + + return (error); +} + +/* + * This function exists solely to receive the PRC_IFDOWN messages which + * are sent by if_down(). It looks for an ifaddr whose ifa_addr is sa, + * and calls in_ifadown() to remove all routes corresponding to that address. + * It also receives the PRC_IFUP messages from if_up() and reinstalls the + * interface routes. + */ +void +rip_ctlinput(cmd, sa, vip) + int cmd; + struct sockaddr *sa; + void *vip; +{ + struct in_ifaddr *ia; + struct ifnet *ifp; + int err; + int flags; + + switch (cmd) { + case PRC_IFDOWN: + TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { + if (ia->ia_ifa.ifa_addr == sa + && (ia->ia_flags & IFA_ROUTE)) { + /* + * in_ifscrub kills the interface route. + */ + in_ifscrub(ia->ia_ifp, ia); + /* + * in_ifadown gets rid of all the rest of + * the routes. This is not quite the right + * thing to do, but at least if we are running + * a routing process they will come back. + */ + in_ifadown(&ia->ia_ifa, 0); + break; + } + } + break; + + case PRC_IFUP: + TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { + if (ia->ia_ifa.ifa_addr == sa) + break; + } + if (ia == 0 || (ia->ia_flags & IFA_ROUTE)) + return; + flags = RTF_UP; + ifp = ia->ia_ifa.ifa_ifp; + + if ((ifp->if_flags & IFF_LOOPBACK) + || (ifp->if_flags & IFF_POINTOPOINT)) + flags |= RTF_HOST; + + err = rtinit(&ia->ia_ifa, RTM_ADD, flags); + if (err == 0) + ia->ia_flags |= IFA_ROUTE; + break; + } +} + +u_long rip_sendspace = RIPSNDQ; +u_long rip_recvspace = RIPRCVQ; + +SYSCTL_INT(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW, + &rip_sendspace, 0, "Maximum outgoing raw IP datagram size"); +SYSCTL_INT(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW, + &rip_recvspace, 0, "Maximum incoming raw IP datagram size"); + +static int +rip_attach(struct socket *so, int proto, struct proc *p) +{ + struct inpcb *inp; + int error, s; + + inp = sotoinpcb(so); + if (inp) + panic("rip_attach"); + if (p && (error = suser(p)) != 0) + return error; + + error = soreserve(so, rip_sendspace, rip_recvspace); + if (error) + return error; + s = splnet(); + error = in_pcballoc(so, &ripcbinfo, p); + splx(s); + if (error) + return error; + inp = (struct inpcb *)so->so_pcb; + inp->inp_vflag |= INP_IPV4; + inp->inp_ip_p = proto; + inp->inp_ip_ttl = ip_defttl; +#ifdef IPSEC + error = ipsec_init_policy(so, &inp->inp_sp); + if (error != 0) { + in_pcbdetach(inp); + return error; + } +#endif /*IPSEC*/ + return 0; +} + +static int +rip_detach(struct socket *so) +{ + struct inpcb *inp; + + inp = sotoinpcb(so); + if (inp == 0) + panic("rip_detach"); + if (so == ip_mrouter) + ip_mrouter_done(); + ip_rsvp_force_done(so); + if (so == ip_rsvpd) + ip_rsvp_done(); + in_pcbdetach(inp); + return 0; +} + +static int +rip_abort(struct socket *so) +{ + soisdisconnected(so); + return rip_detach(so); +} + +static int +rip_disconnect(struct socket *so) +{ + if ((so->so_state & SS_ISCONNECTED) == 0) + return ENOTCONN; + return rip_abort(so); +} + +static int +rip_bind(struct socket *so, struct sockaddr *nam, struct proc *p) +{ + struct inpcb *inp = sotoinpcb(so); + struct sockaddr_in *addr = (struct sockaddr_in *)nam; + + if (nam->sa_len != sizeof(*addr)) + return EINVAL; + + if (TAILQ_EMPTY(&ifnet) || ((addr->sin_family != AF_INET) && + (addr->sin_family != AF_IMPLINK)) || + (addr->sin_addr.s_addr && + ifa_ifwithaddr((struct sockaddr *)addr) == 0)) + return EADDRNOTAVAIL; + inp->inp_laddr = addr->sin_addr; + return 0; +} + +static int +rip_connect(struct socket *so, struct sockaddr *nam, struct proc *p) +{ + struct inpcb *inp = sotoinpcb(so); + struct sockaddr_in *addr = (struct sockaddr_in *)nam; + + if (nam->sa_len != sizeof(*addr)) + return EINVAL; + if (TAILQ_EMPTY(&ifnet)) + return EADDRNOTAVAIL; + if ((addr->sin_family != AF_INET) && + (addr->sin_family != AF_IMPLINK)) + return EAFNOSUPPORT; + inp->inp_faddr = addr->sin_addr; + soisconnected(so); + return 0; +} + +static int +rip_shutdown(struct socket *so) +{ + socantsendmore(so); + return 0; +} + +static int +rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, + struct mbuf *control, struct proc *p) +{ + struct inpcb *inp = sotoinpcb(so); + register u_long dst; + + if (so->so_state & SS_ISCONNECTED) { + if (nam) { + m_freem(m); + return EISCONN; + } + dst = inp->inp_faddr.s_addr; + } else { + if (nam == NULL) { + m_freem(m); + return ENOTCONN; + } + dst = ((struct sockaddr_in *)nam)->sin_addr.s_addr; + } + return rip_output(m, so, dst); +} + +static int +rip_pcblist(SYSCTL_HANDLER_ARGS) +{ + int error, i, n, s; + struct inpcb *inp, **inp_list; + inp_gen_t gencnt; + struct xinpgen xig; + + /* + * The process of preparing the TCB list is too time-consuming and + * resource-intensive to repeat twice on every request. + */ + if (req->oldptr == 0) { + n = ripcbinfo.ipi_count; + req->oldidx = 2 * (sizeof xig) + + (n + n/8) * sizeof(struct xinpcb); + return 0; + } + + if (req->newptr != 0) + return EPERM; + + /* + * OK, now we're committed to doing something. + */ + s = splnet(); + gencnt = ripcbinfo.ipi_gencnt; + n = ripcbinfo.ipi_count; + splx(s); + + xig.xig_len = sizeof xig; + xig.xig_count = n; + xig.xig_gen = gencnt; + xig.xig_sogen = so_gencnt; + error = SYSCTL_OUT(req, &xig, sizeof xig); + if (error) + return error; + + inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); + if (inp_list == 0) + return ENOMEM; + + s = splnet(); + for (inp = LIST_FIRST(ripcbinfo.listhead), i = 0; inp && i < n; + inp = LIST_NEXT(inp, inp_list)) { + if (inp->inp_gencnt <= gencnt) + inp_list[i++] = inp; + } + splx(s); + n = i; + + error = 0; + for (i = 0; i < n; i++) { + inp = inp_list[i]; + if (inp->inp_gencnt <= gencnt) { + struct xinpcb xi; + xi.xi_len = sizeof xi; + /* XXX should avoid extra copy */ + bcopy(inp, &xi.xi_inp, sizeof *inp); + if (inp->inp_socket) + sotoxsocket(inp->inp_socket, &xi.xi_socket); + error = SYSCTL_OUT(req, &xi, sizeof xi); + } + } + if (!error) { + /* + * Give the user an updated idea of our state. + * If the generation differs from what we told + * her before, she knows that something happened + * while we were processing this request, and it + * might be necessary to retry. + */ + s = splnet(); + xig.xig_gen = ripcbinfo.ipi_gencnt; + xig.xig_sogen = so_gencnt; + xig.xig_count = ripcbinfo.ipi_count; + splx(s); + error = SYSCTL_OUT(req, &xig, sizeof xig); + } + free(inp_list, M_TEMP); + return error; +} + +SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist, CTLFLAG_RD, 0, 0, + rip_pcblist, "S,xinpcb", "List of active raw IP sockets"); + +struct pr_usrreqs rip_usrreqs = { + rip_abort, pru_accept_notsupp, rip_attach, rip_bind, rip_connect, + pru_connect2_notsupp, in_control, rip_detach, rip_disconnect, + pru_listen_notsupp, in_setpeeraddr, pru_rcvd_notsupp, + pru_rcvoob_notsupp, rip_send, pru_sense_null, rip_shutdown, + in_setsockaddr, sosend, soreceive, sopoll +}; diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h new file mode 100644 index 0000000..fee449f --- /dev/null +++ b/sys/netinet/tcp.h @@ -0,0 +1,137 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_TCP_H_ +#define _NETINET_TCP_H_ + +typedef u_int32_t tcp_seq; +typedef u_int32_t tcp_cc; /* connection count per rfc1644 */ + +#define tcp6_seq tcp_seq /* for KAME src sync over BSD*'s */ +#define tcp6hdr tcphdr /* for KAME src sync over BSD*'s */ + +/* + * TCP header. + * Per RFC 793, September, 1981. + */ +struct tcphdr { + u_short th_sport; /* source port */ + u_short th_dport; /* destination port */ + tcp_seq th_seq; /* sequence number */ + tcp_seq th_ack; /* acknowledgement number */ +#if BYTE_ORDER == LITTLE_ENDIAN + u_int th_x2:4, /* (unused) */ + th_off:4; /* data offset */ +#endif +#if BYTE_ORDER == BIG_ENDIAN + u_int th_off:4, /* data offset */ + th_x2:4; /* (unused) */ +#endif + u_char th_flags; +#define TH_FIN 0x01 +#define TH_SYN 0x02 +#define TH_RST 0x04 +#define TH_PUSH 0x08 +#define TH_ACK 0x10 +#define TH_URG 0x20 +#define TH_ECE 0x40 +#define TH_CWR 0x80 +#define TH_FLAGS (TH_FIN|TH_SYN|TH_RST|TH_ACK|TH_URG|TH_ECE|TH_CWR) + + u_short th_win; /* window */ + u_short th_sum; /* checksum */ + u_short th_urp; /* urgent pointer */ +}; + +#define TCPOPT_EOL 0 +#define TCPOPT_NOP 1 +#define TCPOPT_MAXSEG 2 +#define TCPOLEN_MAXSEG 4 +#define TCPOPT_WINDOW 3 +#define TCPOLEN_WINDOW 3 +#define TCPOPT_SACK_PERMITTED 4 /* Experimental */ +#define TCPOLEN_SACK_PERMITTED 2 +#define TCPOPT_SACK 5 /* Experimental */ +#define TCPOPT_TIMESTAMP 8 +#define TCPOLEN_TIMESTAMP 10 +#define TCPOLEN_TSTAMP_APPA (TCPOLEN_TIMESTAMP+2) /* appendix A */ +#define TCPOPT_TSTAMP_HDR \ + (TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP) + +#define TCPOPT_CC 11 /* CC options: RFC-1644 */ +#define TCPOPT_CCNEW 12 +#define TCPOPT_CCECHO 13 +#define TCPOLEN_CC 6 +#define TCPOLEN_CC_APPA (TCPOLEN_CC+2) +#define TCPOPT_CC_HDR(ccopt) \ + (TCPOPT_NOP<<24|TCPOPT_NOP<<16|(ccopt)<<8|TCPOLEN_CC) + +/* + * Default maximum segment size for TCP. + * With an IP MSS of 576, this is 536, + * but 512 is probably more convenient. + * This should be defined as MIN(512, IP_MSS - sizeof (struct tcpiphdr)). + */ +#define TCP_MSS 512 + +/* + * Default maximum segment size for TCP6. + * With an IP6 MSS of 1280, this is 1220, + * but 1024 is probably more convenient. (xxx kazu in doubt) + * This should be defined as MIN(1024, IP6_MSS - sizeof (struct tcpip6hdr)) + */ +#define TCP6_MSS 1024 + +#define TCP_MAXWIN 65535 /* largest value for (unscaled) window */ +#define TTCP_CLIENT_SND_WND 4096 /* dflt send window for T/TCP client */ + +#define TCP_MAX_WINSHIFT 14 /* maximum window shift */ + +#define TCP_MAXBURST 4 /* maximum segments in a burst */ + +#define TCP_MAXHLEN (0xf<<2) /* max length of header in bytes */ +#define TCP_MAXOLEN (TCP_MAXHLEN - sizeof(struct tcphdr)) + /* max space left for options */ + +/* + * User-settable options (used with setsockopt). + */ +#define TCP_NODELAY 0x01 /* don't delay send to coalesce packets */ +#define TCP_MAXSEG 0x02 /* set maximum segment size */ +#define TCP_NOPUSH 0x04 /* don't push last block of write */ +#define TCP_NOOPT 0x08 /* don't use TCP options */ + +#endif diff --git a/sys/netinet/tcp_debug.c b/sys/netinet/tcp_debug.c new file mode 100644 index 0000000..89e9d7c --- /dev/null +++ b/sys/netinet/tcp_debug.c @@ -0,0 +1,231 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_debug.c 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#include "opt_inet.h" +#include "opt_inet6.h" +#include "opt_tcpdebug.h" + +#ifndef INET +#error The option TCPDEBUG requires option INET. +#endif + +#ifdef TCPDEBUG +/* load symbolic names */ +#define PRUREQUESTS +#define TCPSTATES +#define TCPTIMERS +#define TANAMES +#endif + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/protosw.h> +#include <sys/socket.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#ifdef INET6 +#include <netinet/ip6.h> +#endif +#include <netinet/ip_var.h> +#include <netinet/tcp.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#include <netinet/tcpip.h> +#include <netinet/tcp_debug.h> + +#ifdef TCPDEBUG +static int tcpconsdebug = 0; +#endif + +static struct tcp_debug tcp_debug[TCP_NDEBUG]; +static int tcp_debx; + +/* + * Tcp debug routines + */ +void +tcp_trace(act, ostate, tp, ipgen, th, req) + short act, ostate; + struct tcpcb *tp; + void *ipgen; + struct tcphdr *th; + int req; +{ +#ifdef INET6 + int isipv6; +#endif /* INET6 */ + tcp_seq seq, ack; + int len, flags; + struct tcp_debug *td = &tcp_debug[tcp_debx++]; + +#ifdef INET6 + isipv6 = (ipgen != NULL && ((struct ip *)ipgen)->ip_v == 6) ? 1 : 0; +#endif /* INET6 */ + td->td_family = +#ifdef INET6 + (isipv6 != 0) ? AF_INET6 : +#endif + AF_INET; + if (tcp_debx == TCP_NDEBUG) + tcp_debx = 0; + td->td_time = iptime(); + td->td_act = act; + td->td_ostate = ostate; + td->td_tcb = (caddr_t)tp; + if (tp) + td->td_cb = *tp; + else + bzero((caddr_t)&td->td_cb, sizeof (*tp)); + if (ipgen) { + switch (td->td_family) { + case AF_INET: + bcopy((caddr_t)ipgen, (caddr_t)&td->td_ti.ti_i, + sizeof(td->td_ti.ti_i)); + bzero((caddr_t)td->td_ip6buf, sizeof(td->td_ip6buf)); + break; +#ifdef INET6 + case AF_INET6: + bcopy((caddr_t)ipgen, (caddr_t)td->td_ip6buf, + sizeof(td->td_ip6buf)); + bzero((caddr_t)&td->td_ti.ti_i, + sizeof(td->td_ti.ti_i)); + break; +#endif + default: + bzero((caddr_t)td->td_ip6buf, sizeof(td->td_ip6buf)); + bzero((caddr_t)&td->td_ti.ti_i, + sizeof(td->td_ti.ti_i)); + break; + } + } else { + bzero((caddr_t)&td->td_ti.ti_i, sizeof(td->td_ti.ti_i)); + bzero((caddr_t)td->td_ip6buf, sizeof(td->td_ip6buf)); + } + if (th) { + switch (td->td_family) { + case AF_INET: + td->td_ti.ti_t = *th; + bzero((caddr_t)&td->td_ti6.th, sizeof(td->td_ti6.th)); + break; +#ifdef INET6 + case AF_INET6: + td->td_ti6.th = *th; + bzero((caddr_t)&td->td_ti.ti_t, + sizeof(td->td_ti.ti_t)); + break; +#endif + default: + bzero((caddr_t)&td->td_ti.ti_t, + sizeof(td->td_ti.ti_t)); + bzero((caddr_t)&td->td_ti6.th, sizeof(td->td_ti6.th)); + break; + } + } else { + bzero((caddr_t)&td->td_ti.ti_t, sizeof(td->td_ti.ti_t)); + bzero((caddr_t)&td->td_ti6.th, sizeof(td->td_ti6.th)); + } + td->td_req = req; +#ifdef TCPDEBUG + if (tcpconsdebug == 0) + return; + if (tp) + printf("%p %s:", tp, tcpstates[ostate]); + else + printf("???????? "); + printf("%s ", tanames[act]); + switch (act) { + + case TA_INPUT: + case TA_OUTPUT: + case TA_DROP: + if (ipgen == NULL || th == NULL) + break; + seq = th->th_seq; + ack = th->th_ack; + len = +#ifdef INET6 + isipv6 ? ((struct ip6_hdr *)ipgen)->ip6_plen : +#endif + ((struct ip *)ipgen)->ip_len; + if (act == TA_OUTPUT) { + seq = ntohl(seq); + ack = ntohl(ack); + len = ntohs((u_short)len); + } + if (act == TA_OUTPUT) + len -= sizeof (struct tcphdr); + if (len) + printf("[%x..%x)", seq, seq+len); + else + printf("%x", seq); + printf("@%x, urp=%x", ack, th->th_urp); + flags = th->th_flags; + if (flags) { + char *cp = "<"; +#define pf(f) { \ + if (th->th_flags & TH_##f) { \ + printf("%s%s", cp, #f); \ + cp = ","; \ + } \ +} + pf(SYN); pf(ACK); pf(FIN); pf(RST); pf(PUSH); pf(URG); + printf(">"); + } + break; + + case TA_USER: + printf("%s", prurequests[req&0xff]); + if ((req & 0xff) == PRU_SLOWTIMO) + printf("<%s>", tcptimers[req>>8]); + break; + } + if (tp) + printf(" -> %s", tcpstates[tp->t_state]); + /* print out internal state of tp !?! */ + printf("\n"); + if (tp == 0) + return; + printf( + "\trcv_(nxt,wnd,up) (%lx,%lx,%lx) snd_(una,nxt,max) (%lx,%lx,%lx)\n", + (u_long)tp->rcv_nxt, tp->rcv_wnd, (u_long)tp->rcv_up, + (u_long)tp->snd_una, (u_long)tp->snd_nxt, (u_long)tp->snd_max); + printf("\tsnd_(wl1,wl2,wnd) (%lx,%lx,%lx)\n", + (u_long)tp->snd_wl1, (u_long)tp->snd_wl2, tp->snd_wnd); +#endif /* TCPDEBUG */ +} diff --git a/sys/netinet/tcp_debug.h b/sys/netinet/tcp_debug.h new file mode 100644 index 0000000..773d3e4 --- /dev/null +++ b/sys/netinet/tcp_debug.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_debug.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_TCP_DEBUG_H_ +#define _NETINET_TCP_DEBUG_H_ + +struct tcp_debug { + n_time td_time; + short td_act; + short td_ostate; + caddr_t td_tcb; + int td_family; + /* + * Co-existense of td_ti and td_ti6 below is ugly, but it is necessary + * to achieve backword compatibility to some extent. + */ + struct tcpiphdr td_ti; + struct { +#if !defined(_KERNEL) && defined(INET6) + struct ip6_hdr ip6; +#else + u_char ip6buf[40]; /* sizeof(struct ip6_hdr) */ +#endif + struct tcphdr th; + } td_ti6; +#define td_ip6buf td_ti6.ip6buf + short td_req; + struct tcpcb td_cb; +}; + +#define TA_INPUT 0 +#define TA_OUTPUT 1 +#define TA_USER 2 +#define TA_RESPOND 3 +#define TA_DROP 4 + +#ifdef TANAMES +static char *tanames[] = + { "input", "output", "user", "respond", "drop" }; +#endif + +#define TCP_NDEBUG 100 + +#ifndef _KERNEL +/* XXX common variables for broken applications. */ +struct tcp_debug tcp_debug[TCP_NDEBUG]; +int tcp_debx; +#endif + +#endif /* !_NETINET_TCP_DEBUG_H_ */ diff --git a/sys/netinet/tcp_fsm.h b/sys/netinet/tcp_fsm.h new file mode 100644 index 0000000..37752c4 --- /dev/null +++ b/sys/netinet/tcp_fsm.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_fsm.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_TCP_FSM_H_ +#define _NETINET_TCP_FSM_H_ + +/* + * TCP FSM state definitions. + * Per RFC793, September, 1981. + */ + +#define TCP_NSTATES 11 + +#define TCPS_CLOSED 0 /* closed */ +#define TCPS_LISTEN 1 /* listening for connection */ +#define TCPS_SYN_SENT 2 /* active, have sent syn */ +#define TCPS_SYN_RECEIVED 3 /* have send and received syn */ +/* states < TCPS_ESTABLISHED are those where connections not established */ +#define TCPS_ESTABLISHED 4 /* established */ +#define TCPS_CLOSE_WAIT 5 /* rcvd fin, waiting for close */ +/* states > TCPS_CLOSE_WAIT are those where user has closed */ +#define TCPS_FIN_WAIT_1 6 /* have closed, sent fin */ +#define TCPS_CLOSING 7 /* closed xchd FIN; await FIN ACK */ +#define TCPS_LAST_ACK 8 /* had fin and close; await FIN ACK */ +/* states > TCPS_CLOSE_WAIT && < TCPS_FIN_WAIT_2 await ACK of FIN */ +#define TCPS_FIN_WAIT_2 9 /* have closed, fin is acked */ +#define TCPS_TIME_WAIT 10 /* in 2*msl quiet wait after close */ + +/* for KAME src sync over BSD*'s */ +#define TCP6_NSTATES TCP_NSTATES +#define TCP6S_CLOSED TCPS_CLOSED +#define TCP6S_LISTEN TCPS_LISTEN +#define TCP6S_SYN_SENT TCPS_SYN_SENT +#define TCP6S_SYN_RECEIVED TCPS_SYN_RECEIVED +#define TCP6S_ESTABLISHED TCPS_ESTABLISHED +#define TCP6S_CLOSE_WAIT TCPS_CLOSE_WAIT +#define TCP6S_FIN_WAIT_1 TCPS_FIN_WAIT_1 +#define TCP6S_CLOSING TCPS_CLOSING +#define TCP6S_LAST_ACK TCPS_LAST_ACK +#define TCP6S_FIN_WAIT_2 TCPS_FIN_WAIT_2 +#define TCP6S_TIME_WAIT TCPS_TIME_WAIT + +#define TCPS_HAVERCVDSYN(s) ((s) >= TCPS_SYN_RECEIVED) +#define TCPS_HAVEESTABLISHED(s) ((s) >= TCPS_ESTABLISHED) +#define TCPS_HAVERCVDFIN(s) ((s) >= TCPS_TIME_WAIT) + +#ifdef TCPOUTFLAGS +/* + * Flags used when sending segments in tcp_output. + * Basic flags (TH_RST,TH_ACK,TH_SYN,TH_FIN) are totally + * determined by state, with the proviso that TH_FIN is sent only + * if all data queued for output is included in the segment. + */ +static u_char tcp_outflags[TCP_NSTATES] = { + TH_RST|TH_ACK, /* 0, CLOSED */ + 0, /* 1, LISTEN */ + TH_SYN, /* 2, SYN_SENT */ + TH_SYN|TH_ACK, /* 3, SYN_RECEIVED */ + TH_ACK, /* 4, ESTABLISHED */ + TH_ACK, /* 5, CLOSE_WAIT */ + TH_FIN|TH_ACK, /* 6, FIN_WAIT_1 */ + TH_FIN|TH_ACK, /* 7, CLOSING */ + TH_FIN|TH_ACK, /* 8, LAST_ACK */ + TH_ACK, /* 9, FIN_WAIT_2 */ + TH_ACK, /* 10, TIME_WAIT */ +}; +#endif + +#ifdef KPROF +int tcp_acounts[TCP_NSTATES][PRU_NREQ]; +#endif + +#ifdef TCPSTATES +char *tcpstates[] = { + "CLOSED", "LISTEN", "SYN_SENT", "SYN_RCVD", + "ESTABLISHED", "CLOSE_WAIT", "FIN_WAIT_1", "CLOSING", + "LAST_ACK", "FIN_WAIT_2", "TIME_WAIT", +}; +#endif + +#endif diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c new file mode 100644 index 0000000..bf578b7 --- /dev/null +++ b/sys/netinet/tcp_input.c @@ -0,0 +1,2885 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 + * $FreeBSD$ + */ + +#include "opt_ipfw.h" /* for ipfw_fwd */ +#include "opt_inet6.h" +#include "opt_ipsec.h" +#include "opt_tcpdebug.h" +#include "opt_tcp_input.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/proc.h> /* for proc0 declaration */ +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/syslog.h> + +#include <machine/cpu.h> /* before tcp_seq.h, for tcp_random18() */ + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/ip_icmp.h> /* for ICMP_BANDLIM */ +#include <netinet/in_var.h> +#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */ +#include <netinet/in_pcb.h> +#include <netinet/ip_var.h> +#ifdef INET6 +#include <netinet/ip6.h> +#include <netinet/icmp6.h> +#include <netinet6/nd6.h> +#include <netinet6/ip6_var.h> +#include <netinet6/in6_pcb.h> +#endif +#include <netinet/tcp.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_seq.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#ifdef INET6 +#include <netinet6/tcp6_var.h> +#endif +#include <netinet/tcpip.h> +#ifdef TCPDEBUG +#include <netinet/tcp_debug.h> + +u_char tcp_saveipgen[40]; /* the size must be of max ip header, now IPv6 */ +struct tcphdr tcp_savetcp; +#endif /* TCPDEBUG */ + +#ifdef IPSEC +#include <netinet6/ipsec.h> +#ifdef INET6 +#include <netinet6/ipsec6.h> +#endif +#include <netkey/key.h> +#endif /*IPSEC*/ + +#include <machine/in_cksum.h> + +MALLOC_DEFINE(M_TSEGQ, "tseg_qent", "TCP segment queue entry"); + +static int tcprexmtthresh = 3; +tcp_seq tcp_iss; +tcp_cc tcp_ccgen; + +struct tcpstat tcpstat; +SYSCTL_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RD, + &tcpstat , tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); + +static int log_in_vain = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW, + &log_in_vain, 0, "Log all incoming TCP connections"); + +static int blackhole = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW, + &blackhole, 0, "Do not send RST when dropping refused connections"); + +int tcp_delack_enabled = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW, + &tcp_delack_enabled, 0, + "Delay ACK to try and piggyback it onto a data packet"); + +int tcp_lq_overflow = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_lq_overflow, CTLFLAG_RW, + &tcp_lq_overflow, 0, + "Listen Queue Overflow"); + +#ifdef TCP_DROP_SYNFIN +static int drop_synfin = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW, + &drop_synfin, 0, "Drop TCP packets with SYN+FIN set"); +#endif + +struct inpcbhead tcb; +#define tcb6 tcb /* for KAME src sync over BSD*'s */ +struct inpcbinfo tcbinfo; + +static void tcp_dooptions __P((struct tcpcb *, + u_char *, int, struct tcphdr *, struct tcpopt *)); +static void tcp_pulloutofband __P((struct socket *, + struct tcphdr *, struct mbuf *, int)); +static int tcp_reass __P((struct tcpcb *, struct tcphdr *, int *, + struct mbuf *)); +static void tcp_xmit_timer __P((struct tcpcb *, int)); +static int tcp_newreno __P((struct tcpcb *, struct tcphdr *)); + +/* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */ +#ifdef INET6 +#define ND6_HINT(tp) \ +do { \ + if ((tp) && (tp)->t_inpcb && \ + ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0 && \ + (tp)->t_inpcb->in6p_route.ro_rt) \ + nd6_nud_hint((tp)->t_inpcb->in6p_route.ro_rt, NULL, 0); \ +} while (0) +#else +#define ND6_HINT(tp) +#endif + +/* + * Indicate whether this ack should be delayed. + */ +#define DELAY_ACK(tp) \ + (tcp_delack_enabled && !callout_pending(tp->tt_delack)) + +/* + * Insert segment which inludes th into reassembly queue of tcp with + * control block tp. Return TH_FIN if reassembly now includes + * a segment with FIN. The macro form does the common case inline + * (segment is the next to be received on an established connection, + * and the queue is empty), avoiding linkage into and removal + * from the queue and repetition of various conversions. + * Set DELACK for segments received in order, but ack immediately + * when segments are out of order (so fast retransmit can work). + */ +#define TCP_REASS(tp, th, tlenp, m, so, flags) { \ + if ((th)->th_seq == (tp)->rcv_nxt && \ + LIST_EMPTY(&(tp)->t_segq) && \ + TCPS_HAVEESTABLISHED((tp)->t_state)) { \ + if (DELAY_ACK(tp)) \ + callout_reset(tp->tt_delack, tcp_delacktime, \ + tcp_timer_delack, tp); \ + else \ + tp->t_flags |= TF_ACKNOW; \ + (tp)->rcv_nxt += *(tlenp); \ + flags = (th)->th_flags & TH_FIN; \ + tcpstat.tcps_rcvpack++;\ + tcpstat.tcps_rcvbyte += *(tlenp);\ + ND6_HINT(tp); \ + sbappend(&(so)->so_rcv, (m)); \ + sorwakeup(so); \ + } else { \ + (flags) = tcp_reass((tp), (th), (tlenp), (m)); \ + tp->t_flags |= TF_ACKNOW; \ + } \ +} + +static int +tcp_reass(tp, th, tlenp, m) + register struct tcpcb *tp; + register struct tcphdr *th; + int *tlenp; + struct mbuf *m; +{ + struct tseg_qent *q; + struct tseg_qent *p = NULL; + struct tseg_qent *nq; + struct tseg_qent *te; + struct socket *so = tp->t_inpcb->inp_socket; + int flags; + + /* + * Call with th==0 after become established to + * force pre-ESTABLISHED data up to user socket. + */ + if (th == 0) + goto present; + + /* Allocate a new queue entry. If we can't, just drop the pkt. XXX */ + MALLOC(te, struct tseg_qent *, sizeof (struct tseg_qent), M_TSEGQ, + M_NOWAIT); + if (te == NULL) { + tcpstat.tcps_rcvmemdrop++; + m_freem(m); + return (0); + } + + /* + * Find a segment which begins after this one does. + */ + LIST_FOREACH(q, &tp->t_segq, tqe_q) { + if (SEQ_GT(q->tqe_th->th_seq, th->th_seq)) + break; + p = q; + } + + /* + * If there is a preceding segment, it may provide some of + * our data already. If so, drop the data from the incoming + * segment. If it provides all of our data, drop us. + */ + if (p != NULL) { + register int i; + /* conversion to int (in i) handles seq wraparound */ + i = p->tqe_th->th_seq + p->tqe_len - th->th_seq; + if (i > 0) { + if (i >= *tlenp) { + tcpstat.tcps_rcvduppack++; + tcpstat.tcps_rcvdupbyte += *tlenp; + m_freem(m); + FREE(te, M_TSEGQ); + /* + * Try to present any queued data + * at the left window edge to the user. + * This is needed after the 3-WHS + * completes. + */ + goto present; /* ??? */ + } + m_adj(m, i); + *tlenp -= i; + th->th_seq += i; + } + } + tcpstat.tcps_rcvoopack++; + tcpstat.tcps_rcvoobyte += *tlenp; + + /* + * While we overlap succeeding segments trim them or, + * if they are completely covered, dequeue them. + */ + while (q) { + register int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq; + if (i <= 0) + break; + if (i < q->tqe_len) { + q->tqe_th->th_seq += i; + q->tqe_len -= i; + m_adj(q->tqe_m, i); + break; + } + + nq = LIST_NEXT(q, tqe_q); + LIST_REMOVE(q, tqe_q); + m_freem(q->tqe_m); + FREE(q, M_TSEGQ); + q = nq; + } + + /* Insert the new segment queue entry into place. */ + te->tqe_m = m; + te->tqe_th = th; + te->tqe_len = *tlenp; + + if (p == NULL) { + LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q); + } else { + LIST_INSERT_AFTER(p, te, tqe_q); + } + +present: + /* + * Present data to user, advancing rcv_nxt through + * completed sequence space. + */ + if (!TCPS_HAVEESTABLISHED(tp->t_state)) + return (0); + q = LIST_FIRST(&tp->t_segq); + if (!q || q->tqe_th->th_seq != tp->rcv_nxt) + return (0); + do { + tp->rcv_nxt += q->tqe_len; + flags = q->tqe_th->th_flags & TH_FIN; + nq = LIST_NEXT(q, tqe_q); + LIST_REMOVE(q, tqe_q); + if (so->so_state & SS_CANTRCVMORE) + m_freem(q->tqe_m); + else + sbappend(&so->so_rcv, q->tqe_m); + FREE(q, M_TSEGQ); + q = nq; + } while (q && q->tqe_th->th_seq == tp->rcv_nxt); + ND6_HINT(tp); + sorwakeup(so); + return (flags); +} + +/* + * TCP input routine, follows pages 65-76 of the + * protocol specification dated September, 1981 very closely. + */ +#ifdef INET6 +int +tcp6_input(mp, offp, proto) + struct mbuf **mp; + int *offp, proto; +{ + register struct mbuf *m = *mp; + + IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE); + + /* + * draft-itojun-ipv6-tcp-to-anycast + * better place to put this in? + */ + if (m->m_flags & M_ANYCAST6) { + struct ip6_hdr *ip6; + + ip6 = mtod(m, struct ip6_hdr *); + icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, + (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); + return IPPROTO_DONE; + } + + tcp_input(m, *offp, proto); + return IPPROTO_DONE; +} +#endif + +void +tcp_input(m, off0, proto) + register struct mbuf *m; + int off0, proto; +{ + register struct tcphdr *th; + register struct ip *ip = NULL; + register struct ipovly *ipov; + register struct inpcb *inp; + u_char *optp = NULL; + int optlen = 0; + int len, tlen, off; + int drop_hdrlen; + register struct tcpcb *tp = 0; + register int thflags; + struct socket *so = 0; + int todrop, acked, ourfinisacked, needoutput = 0; + struct in_addr laddr; +#ifdef INET6 + struct in6_addr laddr6; +#endif + int dropsocket = 0; + int iss = 0; + u_long tiwin; + struct tcpopt to; /* options in this segment */ + struct rmxp_tao *taop; /* pointer to our TAO cache entry */ + struct rmxp_tao tao_noncached; /* in case there's no cached entry */ +#ifdef TCPDEBUG + short ostate = 0; +#endif +#ifdef INET6 + struct ip6_hdr *ip6 = NULL; + int isipv6; +#endif /* INET6 */ + int rstreason; /* For badport_bandlim accounting purposes */ + +#ifdef INET6 + isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; +#endif + bzero((char *)&to, sizeof(to)); + + tcpstat.tcps_rcvtotal++; + +#ifdef INET6 + if (isipv6) { + /* IP6_EXTHDR_CHECK() is already done at tcp6_input() */ + ip6 = mtod(m, struct ip6_hdr *); + tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; + if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) { + tcpstat.tcps_rcvbadsum++; + goto drop; + } + th = (struct tcphdr *)((caddr_t)ip6 + off0); + } else +#endif /* INET6 */ + { + /* + * Get IP and TCP header together in first mbuf. + * Note: IP leaves IP header in first mbuf. + */ + if (off0 > sizeof (struct ip)) { + ip_stripoptions(m, (struct mbuf *)0); + off0 = sizeof(struct ip); + } + if (m->m_len < sizeof (struct tcpiphdr)) { + if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) { + tcpstat.tcps_rcvshort++; + return; + } + } + ip = mtod(m, struct ip *); + ipov = (struct ipovly *)ip; + th = (struct tcphdr *)((caddr_t)ip + off0); + tlen = ip->ip_len; + + if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { + if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) + th->th_sum = m->m_pkthdr.csum_data; + else + th->th_sum = in_pseudo(ip->ip_src.s_addr, + ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data + + ip->ip_len + IPPROTO_TCP)); + th->th_sum ^= 0xffff; + } else { + /* + * Checksum extended TCP header and data. + */ + len = sizeof (struct ip) + tlen; + bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); + ipov->ih_len = (u_short)tlen; + HTONS(ipov->ih_len); + th->th_sum = in_cksum(m, len); + } + if (th->th_sum) { + tcpstat.tcps_rcvbadsum++; + goto drop; + } +#ifdef INET6 + /* Re-initialization for later version check */ + ip->ip_v = IPVERSION; +#endif + } + + /* + * Check that TCP offset makes sense, + * pull out TCP options and adjust length. XXX + */ + off = th->th_off << 2; + if (off < sizeof (struct tcphdr) || off > tlen) { + tcpstat.tcps_rcvbadoff++; + goto drop; + } + tlen -= off; /* tlen is used instead of ti->ti_len */ + if (off > sizeof (struct tcphdr)) { +#ifdef INET6 + if (isipv6) { + IP6_EXTHDR_CHECK(m, off0, off, ); + ip6 = mtod(m, struct ip6_hdr *); + th = (struct tcphdr *)((caddr_t)ip6 + off0); + } else +#endif /* INET6 */ + { + if (m->m_len < sizeof(struct ip) + off) { + if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) { + tcpstat.tcps_rcvshort++; + return; + } + ip = mtod(m, struct ip *); + ipov = (struct ipovly *)ip; + th = (struct tcphdr *)((caddr_t)ip + off0); + } + } + optlen = off - sizeof (struct tcphdr); + optp = (u_char *)(th + 1); + } + thflags = th->th_flags; + +#ifdef TCP_DROP_SYNFIN + /* + * If the drop_synfin option is enabled, drop all packets with + * both the SYN and FIN bits set. This prevents e.g. nmap from + * identifying the TCP/IP stack. + * + * This is a violation of the TCP specification. + */ + if (drop_synfin && (thflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN)) + goto drop; +#endif + + /* + * Convert TCP protocol specific fields to host format. + */ + NTOHL(th->th_seq); + NTOHL(th->th_ack); + NTOHS(th->th_win); + NTOHS(th->th_urp); + + /* + * Delay droping TCP, IP headers, IPv6 ext headers, and TCP options, + * until after ip6_savecontrol() is called and before other functions + * which don't want those proto headers. + * Because ip6_savecontrol() is going to parse the mbuf to + * search for data to be passed up to user-land, it wants mbuf + * parameters to be unchanged. + */ + drop_hdrlen = off0 + off; + + /* + * Locate pcb for segment. + */ +findpcb: +#ifdef IPFIREWALL_FORWARD + if (ip_fw_fwd_addr != NULL +#ifdef INET6 + && isipv6 == NULL /* IPv6 support is not yet */ +#endif /* INET6 */ + ) { + /* + * Diverted. Pretend to be the destination. + * already got one like this? + */ + inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, + ip->ip_dst, th->th_dport, 0, m->m_pkthdr.rcvif); + if (!inp) { + /* + * No, then it's new. Try find the ambushing socket + */ + if (!ip_fw_fwd_addr->sin_port) { + inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, + th->th_sport, ip_fw_fwd_addr->sin_addr, + th->th_dport, 1, m->m_pkthdr.rcvif); + } else { + inp = in_pcblookup_hash(&tcbinfo, + ip->ip_src, th->th_sport, + ip_fw_fwd_addr->sin_addr, + ntohs(ip_fw_fwd_addr->sin_port), 1, + m->m_pkthdr.rcvif); + } + } + ip_fw_fwd_addr = NULL; + } else +#endif /* IPFIREWALL_FORWARD */ + { +#ifdef INET6 + if (isipv6) + inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_src, th->th_sport, + &ip6->ip6_dst, th->th_dport, 1, + m->m_pkthdr.rcvif); + else +#endif /* INET6 */ + inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, + ip->ip_dst, th->th_dport, 1, m->m_pkthdr.rcvif); + } + +#ifdef IPSEC +#ifdef INET6 + if (isipv6) { + if (inp != NULL && ipsec6_in_reject_so(m, inp->inp_socket)) { + ipsec6stat.in_polvio++; + goto drop; + } + } else +#endif /* INET6 */ + if (inp != NULL && ipsec4_in_reject_so(m, inp->inp_socket)) { + ipsecstat.in_polvio++; + goto drop; + } +#endif /*IPSEC*/ + + /* + * If the state is CLOSED (i.e., TCB does not exist) then + * all data in the incoming segment is discarded. + * If the TCB exists but is in CLOSED state, it is embryonic, + * but should either do a listen or a connect soon. + */ + if (inp == NULL) { + if (log_in_vain) { +#ifdef INET6 + char dbuf[INET6_ADDRSTRLEN], sbuf[INET6_ADDRSTRLEN]; +#else /* INET6 */ + char dbuf[4*sizeof "123"], sbuf[4*sizeof "123"]; +#endif /* INET6 */ + +#ifdef INET6 + if (isipv6) { + strcpy(dbuf, ip6_sprintf(&ip6->ip6_dst)); + strcpy(sbuf, ip6_sprintf(&ip6->ip6_src)); + } else +#endif + { + strcpy(dbuf, inet_ntoa(ip->ip_dst)); + strcpy(sbuf, inet_ntoa(ip->ip_src)); + } + switch (log_in_vain) { + case 1: + if(thflags & TH_SYN) + log(LOG_INFO, + "Connection attempt to TCP %s:%d from %s:%d\n", + dbuf, ntohs(th->th_dport), + sbuf, + ntohs(th->th_sport)); + break; + case 2: + log(LOG_INFO, + "Connection attempt to TCP %s:%d from %s:%d flags:0x%x\n", + dbuf, ntohs(th->th_dport), sbuf, + ntohs(th->th_sport), thflags); + break; + default: + break; + } + } + if (blackhole) { + switch (blackhole) { + case 1: + if (thflags & TH_SYN) + goto drop; + break; + case 2: + goto drop; + default: + goto drop; + } + } + rstreason = BANDLIM_RST_CLOSEDPORT; + goto dropwithreset; + } + tp = intotcpcb(inp); + if (tp == 0) { + rstreason = BANDLIM_RST_CLOSEDPORT; + goto dropwithreset; + } + if (tp->t_state == TCPS_CLOSED) + goto drop; + + /* Unscale the window into a 32-bit value. */ + if ((thflags & TH_SYN) == 0) + tiwin = th->th_win << tp->snd_scale; + else + tiwin = th->th_win; + +#ifdef INET6 + /* save packet options if user wanted */ + if (isipv6 && inp->in6p_flags & INP_CONTROLOPTS) { + if (inp->in6p_options) { + m_freem(inp->in6p_options); + inp->in6p_options = 0; + } + ip6_savecontrol(inp, &inp->in6p_options, ip6, m); + } + /* else, should also do ip_srcroute() here? */ +#endif /* INET6 */ + + so = inp->inp_socket; + if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) { +#ifdef TCPDEBUG + if (so->so_options & SO_DEBUG) { + ostate = tp->t_state; +#ifdef INET6 + if (isipv6) + bcopy((char *)ip6, (char *)tcp_saveipgen, + sizeof(*ip6)); + else +#endif /* INET6 */ + bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip)); + tcp_savetcp = *th; + } +#endif + if (so->so_options & SO_ACCEPTCONN) { + register struct tcpcb *tp0 = tp; + struct socket *so2; +#ifdef IPSEC + struct socket *oso; +#endif +#ifdef INET6 + struct inpcb *oinp = sotoinpcb(so); +#endif /* INET6 */ + +#ifndef IPSEC + /* + * Current IPsec implementation makes incorrect IPsec + * cache if this check is done here. + * So delay this until duplicated socket is created. + */ + if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { + /* + * Note: dropwithreset makes sure we don't + * send a RST in response to a RST. + */ + if (thflags & TH_ACK) { + tcpstat.tcps_badsyn++; + rstreason = BANDLIM_RST_OPENPORT; + goto dropwithreset; + } + goto drop; + } +#endif + so2 = sonewconn(so, 0); + if (so2 == 0) { + tcpstat.tcps_listendrop++; + so2 = sodropablereq(so); + if (so2) { + if (tcp_lq_overflow) + sototcpcb(so2)->t_flags |= + TF_LQ_OVERFLOW; + tcp_drop(sototcpcb(so2), ETIMEDOUT); + so2 = sonewconn(so, 0); + } + if (!so2) + goto drop; + } +#ifdef IPSEC + oso = so; +#endif + so = so2; + /* + * This is ugly, but .... + * + * Mark socket as temporary until we're + * committed to keeping it. The code at + * ``drop'' and ``dropwithreset'' check the + * flag dropsocket to see if the temporary + * socket created here should be discarded. + * We mark the socket as discardable until + * we're committed to it below in TCPS_LISTEN. + */ + dropsocket++; + inp = (struct inpcb *)so->so_pcb; +#ifdef INET6 + if (isipv6) + inp->in6p_laddr = ip6->ip6_dst; + else { + if ((inp->inp_flags & IN6P_BINDV6ONLY) == 0) { + inp->inp_vflag &= ~INP_IPV6; + inp->inp_vflag |= INP_IPV4; + } +#endif /* INET6 */ + inp->inp_laddr = ip->ip_dst; +#ifdef INET6 + } +#endif /* INET6 */ + inp->inp_lport = th->th_dport; + if (in_pcbinshash(inp) != 0) { + /* + * Undo the assignments above if we failed to + * put the PCB on the hash lists. + */ +#ifdef INET6 + if (isipv6) + inp->in6p_laddr = in6addr_any; + else +#endif /* INET6 */ + inp->inp_laddr.s_addr = INADDR_ANY; + inp->inp_lport = 0; + goto drop; + } +#ifdef IPSEC + /* + * To avoid creating incorrectly cached IPsec + * association, this is need to be done here. + * + * Subject: (KAME-snap 748) + * From: Wayne Knowles <w.knowles@niwa.cri.nz> + * ftp://ftp.kame.net/pub/mail-list/snap-users/748 + */ + if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { + /* + * Note: dropwithreset makes sure we don't + * send a RST in response to a RST. + */ + if (thflags & TH_ACK) { + tcpstat.tcps_badsyn++; + rstreason = BANDLIM_RST_OPENPORT; + goto dropwithreset; + } + goto drop; + } +#endif +#ifdef INET6 + if (isipv6) { + /* + * inherit socket options from the listening + * socket. + */ + inp->inp_flags |= + oinp->inp_flags & INP_CONTROLOPTS; + if (inp->inp_flags & INP_CONTROLOPTS) { + if (inp->in6p_options) { + m_freem(inp->in6p_options); + inp->in6p_options = 0; + } + ip6_savecontrol(inp, + &inp->in6p_options, + ip6, m); + } + } else +#endif /* INET6 */ + inp->inp_options = ip_srcroute(); +#ifdef IPSEC + /* copy old policy into new socket's */ + if (ipsec_copy_policy(sotoinpcb(oso)->inp_sp, + inp->inp_sp)) + printf("tcp_input: could not copy policy\n"); +#endif + tp = intotcpcb(inp); + tp->t_state = TCPS_LISTEN; + tp->t_flags |= tp0->t_flags & (TF_NOPUSH|TF_NOOPT); + + /* Compute proper scaling value from buffer space */ + while (tp->request_r_scale < TCP_MAX_WINSHIFT && + TCP_MAXWIN << tp->request_r_scale < + so->so_rcv.sb_hiwat) + tp->request_r_scale++; + } + } + + /* + * Segment received on connection. + * Reset idle time and keep-alive timer. + */ + tp->t_rcvtime = ticks; + if (TCPS_HAVEESTABLISHED(tp->t_state)) + callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); + + /* + * Process options if not in LISTEN state, + * else do it below (after getting remote address). + */ + if (tp->t_state != TCPS_LISTEN) + tcp_dooptions(tp, optp, optlen, th, &to); + + /* + * Header prediction: check for the two common cases + * of a uni-directional data xfer. If the packet has + * no control flags, is in-sequence, the window didn't + * change and we're not retransmitting, it's a + * candidate. If the length is zero and the ack moved + * forward, we're the sender side of the xfer. Just + * free the data acked & wake any higher level process + * that was blocked waiting for space. If the length + * is non-zero and the ack didn't move, we're the + * receiver side. If we're getting packets in-order + * (the reassembly queue is empty), add the data to + * the socket buffer and note that we need a delayed ack. + * Make sure that the hidden state-flags are also off. + * Since we check for TCPS_ESTABLISHED above, it can only + * be TH_NEEDSYN. + */ + if (tp->t_state == TCPS_ESTABLISHED && + (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && + ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && + ((to.to_flag & TOF_TS) == 0 || + TSTMP_GEQ(to.to_tsval, tp->ts_recent)) && + /* + * Using the CC option is compulsory if once started: + * the segment is OK if no T/TCP was negotiated or + * if the segment has a CC option equal to CCrecv + */ + ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) != (TF_REQ_CC|TF_RCVD_CC) || + ((to.to_flag & TOF_CC) != 0 && to.to_cc == tp->cc_recv)) && + th->th_seq == tp->rcv_nxt && + tiwin && tiwin == tp->snd_wnd && + tp->snd_nxt == tp->snd_max) { + + /* + * If last ACK falls within this segment's sequence numbers, + * record the timestamp. + * NOTE that the test is modified according to the latest + * proposal of the tcplw@cray.com list (Braden 1993/04/26). + */ + if ((to.to_flag & TOF_TS) != 0 && + SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { + tp->ts_recent_age = ticks; + tp->ts_recent = to.to_tsval; + } + + if (tlen == 0) { + if (SEQ_GT(th->th_ack, tp->snd_una) && + SEQ_LEQ(th->th_ack, tp->snd_max) && + tp->snd_cwnd >= tp->snd_wnd && + tp->t_dupacks < tcprexmtthresh) { + /* + * this is a pure ack for outstanding data. + */ + ++tcpstat.tcps_predack; + /* + * "bad retransmit" recovery + */ + if (tp->t_rxtshift == 1 && + ticks < tp->t_badrxtwin) { + tp->snd_cwnd = tp->snd_cwnd_prev; + tp->snd_ssthresh = + tp->snd_ssthresh_prev; + tp->snd_nxt = tp->snd_max; + tp->t_badrxtwin = 0; + } + if ((to.to_flag & TOF_TS) != 0) + tcp_xmit_timer(tp, + ticks - to.to_tsecr + 1); + else if (tp->t_rtttime && + SEQ_GT(th->th_ack, tp->t_rtseq)) + tcp_xmit_timer(tp, ticks - tp->t_rtttime); + acked = th->th_ack - tp->snd_una; + tcpstat.tcps_rcvackpack++; + tcpstat.tcps_rcvackbyte += acked; + sbdrop(&so->so_snd, acked); + tp->snd_una = th->th_ack; + m_freem(m); + ND6_HINT(tp); /* some progress has been done */ + + /* + * If all outstanding data are acked, stop + * retransmit timer, otherwise restart timer + * using current (possibly backed-off) value. + * If process is waiting for space, + * wakeup/selwakeup/signal. If data + * are ready to send, let tcp_output + * decide between more output or persist. + */ + if (tp->snd_una == tp->snd_max) + callout_stop(tp->tt_rexmt); + else if (!callout_active(tp->tt_persist)) + callout_reset(tp->tt_rexmt, + tp->t_rxtcur, + tcp_timer_rexmt, tp); + + sowwakeup(so); + if (so->so_snd.sb_cc) + (void) tcp_output(tp); + return; + } + } else if (th->th_ack == tp->snd_una && + LIST_EMPTY(&tp->t_segq) && + tlen <= sbspace(&so->so_rcv)) { + /* + * this is a pure, in-sequence data packet + * with nothing on the reassembly queue and + * we have enough buffer space to take it. + */ + ++tcpstat.tcps_preddat; + tp->rcv_nxt += tlen; + tcpstat.tcps_rcvpack++; + tcpstat.tcps_rcvbyte += tlen; + ND6_HINT(tp); /* some progress has been done */ + /* + * Add data to socket buffer. + */ + m_adj(m, drop_hdrlen); /* delayed header drop */ + sbappend(&so->so_rcv, m); + sorwakeup(so); + if (DELAY_ACK(tp)) { + callout_reset(tp->tt_delack, tcp_delacktime, + tcp_timer_delack, tp); + } else { + tp->t_flags |= TF_ACKNOW; + tcp_output(tp); + } + return; + } + } + + /* + * Calculate amount of space in receive window, + * and then do TCP input processing. + * Receive window is amount of space in rcv queue, + * but not less than advertised window. + */ + { int win; + + win = sbspace(&so->so_rcv); + if (win < 0) + win = 0; + tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); + } + + switch (tp->t_state) { + + /* + * If the state is LISTEN then ignore segment if it contains an RST. + * If the segment contains an ACK then it is bad and send a RST. + * If it does not contain a SYN then it is not interesting; drop it. + * If it is from this socket, drop it, it must be forged. + * Don't bother responding if the destination was a broadcast. + * Otherwise initialize tp->rcv_nxt, and tp->irs, select an initial + * tp->iss, and send a segment: + * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> + * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss. + * Fill in remote peer address fields if not previously specified. + * Enter SYN_RECEIVED state, and process any other fields of this + * segment in this state. + */ + case TCPS_LISTEN: { + register struct sockaddr_in *sin; +#ifdef INET6 + register struct sockaddr_in6 *sin6; +#endif + + if (thflags & TH_RST) + goto drop; + if (thflags & TH_ACK) { + rstreason = BANDLIM_RST_OPENPORT; + goto dropwithreset; + } + if ((thflags & TH_SYN) == 0) + goto drop; + if (th->th_dport == th->th_sport) { +#ifdef INET6 + if (isipv6) { + if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, + &ip6->ip6_src)) + goto drop; + } else +#endif /* INET6 */ + if (ip->ip_dst.s_addr == ip->ip_src.s_addr) + goto drop; + } + /* + * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN + * in_broadcast() should never return true on a received + * packet with M_BCAST not set. + * + * Packets with a multicast source address should also + * be discarded. + */ + if (m->m_flags & (M_BCAST|M_MCAST)) + goto drop; +#ifdef INET6 + if (isipv6) { + if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || + IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) + goto drop; + } else +#endif + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || + IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || + ip->ip_src.s_addr == htonl(INADDR_BROADCAST)) + goto drop; +#ifdef INET6 + if (isipv6) { + MALLOC(sin6, struct sockaddr_in6 *, sizeof *sin6, + M_SONAME, M_NOWAIT | M_ZERO); + if (sin6 == NULL) + goto drop; + sin6->sin6_family = AF_INET6; + sin6->sin6_len = sizeof(*sin6); + sin6->sin6_addr = ip6->ip6_src; + sin6->sin6_port = th->th_sport; + laddr6 = inp->in6p_laddr; + if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) + inp->in6p_laddr = ip6->ip6_dst; + if (in6_pcbconnect(inp, (struct sockaddr *)sin6, + &proc0)) { + inp->in6p_laddr = laddr6; + FREE(sin6, M_SONAME); + goto drop; + } + FREE(sin6, M_SONAME); + } else +#endif + { + MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME, + M_NOWAIT); + if (sin == NULL) + goto drop; + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + sin->sin_addr = ip->ip_src; + sin->sin_port = th->th_sport; + bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero)); + laddr = inp->inp_laddr; + if (inp->inp_laddr.s_addr == INADDR_ANY) + inp->inp_laddr = ip->ip_dst; + if (in_pcbconnect(inp, (struct sockaddr *)sin, &proc0)) { + inp->inp_laddr = laddr; + FREE(sin, M_SONAME); + goto drop; + } + FREE(sin, M_SONAME); + } + tp->t_template = tcp_template(tp); + if (tp->t_template == 0) { + tp = tcp_drop(tp, ENOBUFS); + dropsocket = 0; /* socket is already gone */ + goto drop; + } + if ((taop = tcp_gettaocache(inp)) == NULL) { + taop = &tao_noncached; + bzero(taop, sizeof(*taop)); + } + tcp_dooptions(tp, optp, optlen, th, &to); + if (iss) + tp->iss = iss; + else { + tp->iss = tcp_rndiss_next(); + } + tp->irs = th->th_seq; + tcp_sendseqinit(tp); + tcp_rcvseqinit(tp); + tp->snd_recover = tp->snd_una; + /* + * Initialization of the tcpcb for transaction; + * set SND.WND = SEG.WND, + * initialize CCsend and CCrecv. + */ + tp->snd_wnd = tiwin; /* initial send-window */ + tp->cc_send = CC_INC(tcp_ccgen); + tp->cc_recv = to.to_cc; + /* + * Perform TAO test on incoming CC (SEG.CC) option, if any. + * - compare SEG.CC against cached CC from the same host, + * if any. + * - if SEG.CC > chached value, SYN must be new and is accepted + * immediately: save new CC in the cache, mark the socket + * connected, enter ESTABLISHED state, turn on flag to + * send a SYN in the next segment. + * A virtual advertised window is set in rcv_adv to + * initialize SWS prevention. Then enter normal segment + * processing: drop SYN, process data and FIN. + * - otherwise do a normal 3-way handshake. + */ + if ((to.to_flag & TOF_CC) != 0) { + if (((tp->t_flags & TF_NOPUSH) != 0) && + taop->tao_cc != 0 && CC_GT(to.to_cc, taop->tao_cc)) { + + taop->tao_cc = to.to_cc; + tp->t_starttime = ticks; + tp->t_state = TCPS_ESTABLISHED; + + /* + * If there is a FIN, or if there is data and the + * connection is local, then delay SYN,ACK(SYN) in + * the hope of piggy-backing it on a response + * segment. Otherwise must send ACK now in case + * the other side is slow starting. + */ + if (DELAY_ACK(tp) && ((thflags & TH_FIN) || + (tlen != 0 && +#ifdef INET6 + ((isipv6 && in6_localaddr(&inp->in6p_faddr)) + || + (!isipv6 && +#endif + in_localaddr(inp->inp_faddr) +#ifdef INET6 + )) +#endif + ))) { + callout_reset(tp->tt_delack, tcp_delacktime, + tcp_timer_delack, tp); + tp->t_flags |= TF_NEEDSYN; + } else + tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); + + /* + * Limit the `virtual advertised window' to TCP_MAXWIN + * here. Even if we requested window scaling, it will + * become effective only later when our SYN is acked. + */ + tp->rcv_adv += min(tp->rcv_wnd, TCP_MAXWIN); + tcpstat.tcps_connects++; + soisconnected(so); + callout_reset(tp->tt_keep, tcp_keepinit, + tcp_timer_keep, tp); + dropsocket = 0; /* committed to socket */ + tcpstat.tcps_accepts++; + goto trimthenstep6; + } + /* else do standard 3-way handshake */ + } else { + /* + * No CC option, but maybe CC.NEW: + * invalidate cached value. + */ + taop->tao_cc = 0; + } + /* + * TAO test failed or there was no CC option, + * do a standard 3-way handshake. + */ + tp->t_flags |= TF_ACKNOW; + tp->t_state = TCPS_SYN_RECEIVED; + callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp); + dropsocket = 0; /* committed to socket */ + tcpstat.tcps_accepts++; + goto trimthenstep6; + } + + /* + * If the state is SYN_RECEIVED: + * if seg contains an ACK, but not for our SYN/ACK, send a RST. + */ + case TCPS_SYN_RECEIVED: + if ((thflags & TH_ACK) && + (SEQ_LEQ(th->th_ack, tp->snd_una) || + SEQ_GT(th->th_ack, tp->snd_max))) { + rstreason = BANDLIM_RST_OPENPORT; + goto dropwithreset; + } + break; + + /* + * If the state is SYN_SENT: + * if seg contains an ACK, but not for our SYN, drop the input. + * if seg contains a RST, then drop the connection. + * if seg does not contain SYN, then drop it. + * Otherwise this is an acceptable SYN segment + * initialize tp->rcv_nxt and tp->irs + * if seg contains ack then advance tp->snd_una + * if SYN has been acked change to ESTABLISHED else SYN_RCVD state + * arrange for segment to be acked (eventually) + * continue processing rest of data/controls, beginning with URG + */ + case TCPS_SYN_SENT: + if ((taop = tcp_gettaocache(inp)) == NULL) { + taop = &tao_noncached; + bzero(taop, sizeof(*taop)); + } + + if ((thflags & TH_ACK) && + (SEQ_LEQ(th->th_ack, tp->iss) || + SEQ_GT(th->th_ack, tp->snd_max))) { + /* + * If we have a cached CCsent for the remote host, + * hence we haven't just crashed and restarted, + * do not send a RST. This may be a retransmission + * from the other side after our earlier ACK was lost. + * Our new SYN, when it arrives, will serve as the + * needed ACK. + */ + if (taop->tao_ccsent != 0) + goto drop; + else { + rstreason = BANDLIM_UNLIMITED; + goto dropwithreset; + } + } + if (thflags & TH_RST) { + if (thflags & TH_ACK) + tp = tcp_drop(tp, ECONNREFUSED); + goto drop; + } + if ((thflags & TH_SYN) == 0) + goto drop; + tp->snd_wnd = th->th_win; /* initial send window */ + tp->cc_recv = to.to_cc; /* foreign CC */ + + tp->irs = th->th_seq; + tcp_rcvseqinit(tp); + if (thflags & TH_ACK) { + /* + * Our SYN was acked. If segment contains CC.ECHO + * option, check it to make sure this segment really + * matches our SYN. If not, just drop it as old + * duplicate, but send an RST if we're still playing + * by the old rules. If no CC.ECHO option, make sure + * we don't get fooled into using T/TCP. + */ + if (to.to_flag & TOF_CCECHO) { + if (tp->cc_send != to.to_ccecho) { + if (taop->tao_ccsent != 0) + goto drop; + else { + rstreason = BANDLIM_UNLIMITED; + goto dropwithreset; + } + } + } else + tp->t_flags &= ~TF_RCVD_CC; + tcpstat.tcps_connects++; + soisconnected(so); + /* Do window scaling on this connection? */ + if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == + (TF_RCVD_SCALE|TF_REQ_SCALE)) { + tp->snd_scale = tp->requested_s_scale; + tp->rcv_scale = tp->request_r_scale; + } + /* Segment is acceptable, update cache if undefined. */ + if (taop->tao_ccsent == 0) + taop->tao_ccsent = to.to_ccecho; + + tp->rcv_adv += tp->rcv_wnd; + tp->snd_una++; /* SYN is acked */ + /* + * If there's data, delay ACK; if there's also a FIN + * ACKNOW will be turned on later. + */ + if (DELAY_ACK(tp) && tlen != 0) + callout_reset(tp->tt_delack, tcp_delacktime, + tcp_timer_delack, tp); + else + tp->t_flags |= TF_ACKNOW; + /* + * Received <SYN,ACK> in SYN_SENT[*] state. + * Transitions: + * SYN_SENT --> ESTABLISHED + * SYN_SENT* --> FIN_WAIT_1 + */ + tp->t_starttime = ticks; + if (tp->t_flags & TF_NEEDFIN) { + tp->t_state = TCPS_FIN_WAIT_1; + tp->t_flags &= ~TF_NEEDFIN; + thflags &= ~TH_SYN; + } else { + tp->t_state = TCPS_ESTABLISHED; + callout_reset(tp->tt_keep, tcp_keepidle, + tcp_timer_keep, tp); + } + } else { + /* + * Received initial SYN in SYN-SENT[*] state => simul- + * taneous open. If segment contains CC option and there is + * a cached CC, apply TAO test; if it succeeds, connection is + * half-synchronized. Otherwise, do 3-way handshake: + * SYN-SENT -> SYN-RECEIVED + * SYN-SENT* -> SYN-RECEIVED* + * If there was no CC option, clear cached CC value. + */ + tp->t_flags |= TF_ACKNOW; + callout_stop(tp->tt_rexmt); + if (to.to_flag & TOF_CC) { + if (taop->tao_cc != 0 && + CC_GT(to.to_cc, taop->tao_cc)) { + /* + * update cache and make transition: + * SYN-SENT -> ESTABLISHED* + * SYN-SENT* -> FIN-WAIT-1* + */ + taop->tao_cc = to.to_cc; + tp->t_starttime = ticks; + if (tp->t_flags & TF_NEEDFIN) { + tp->t_state = TCPS_FIN_WAIT_1; + tp->t_flags &= ~TF_NEEDFIN; + } else { + tp->t_state = TCPS_ESTABLISHED; + callout_reset(tp->tt_keep, + tcp_keepidle, + tcp_timer_keep, + tp); + } + tp->t_flags |= TF_NEEDSYN; + } else + tp->t_state = TCPS_SYN_RECEIVED; + } else { + /* CC.NEW or no option => invalidate cache */ + taop->tao_cc = 0; + tp->t_state = TCPS_SYN_RECEIVED; + } + } + +trimthenstep6: + /* + * Advance th->th_seq to correspond to first data byte. + * If data, trim to stay within window, + * dropping FIN if necessary. + */ + th->th_seq++; + if (tlen > tp->rcv_wnd) { + todrop = tlen - tp->rcv_wnd; + m_adj(m, -todrop); + tlen = tp->rcv_wnd; + thflags &= ~TH_FIN; + tcpstat.tcps_rcvpackafterwin++; + tcpstat.tcps_rcvbyteafterwin += todrop; + } + tp->snd_wl1 = th->th_seq - 1; + tp->rcv_up = th->th_seq; + /* + * Client side of transaction: already sent SYN and data. + * If the remote host used T/TCP to validate the SYN, + * our data will be ACK'd; if so, enter normal data segment + * processing in the middle of step 5, ack processing. + * Otherwise, goto step 6. + */ + if (thflags & TH_ACK) + goto process_ACK; + goto step6; + /* + * If the state is LAST_ACK or CLOSING or TIME_WAIT: + * if segment contains a SYN and CC [not CC.NEW] option: + * if state == TIME_WAIT and connection duration > MSL, + * drop packet and send RST; + * + * if SEG.CC > CCrecv then is new SYN, and can implicitly + * ack the FIN (and data) in retransmission queue. + * Complete close and delete TCPCB. Then reprocess + * segment, hoping to find new TCPCB in LISTEN state; + * + * else must be old SYN; drop it. + * else do normal processing. + */ + case TCPS_LAST_ACK: + case TCPS_CLOSING: + case TCPS_TIME_WAIT: + if ((thflags & TH_SYN) && + (to.to_flag & TOF_CC) && tp->cc_recv != 0) { + if (tp->t_state == TCPS_TIME_WAIT && + (ticks - tp->t_starttime) > tcp_msl) { + rstreason = BANDLIM_UNLIMITED; + goto dropwithreset; + } + if (CC_GT(to.to_cc, tp->cc_recv)) { + tp = tcp_close(tp); + goto findpcb; + } + else + goto drop; + } + break; /* continue normal processing */ + } + + /* + * States other than LISTEN or SYN_SENT. + * First check the RST flag and sequence number since reset segments + * are exempt from the timestamp and connection count tests. This + * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix + * below which allowed reset segments in half the sequence space + * to fall though and be processed (which gives forged reset + * segments with a random sequence number a 50 percent chance of + * killing a connection). + * Then check timestamp, if present. + * Then check the connection count, if present. + * Then check that at least some bytes of segment are within + * receive window. If segment begins before rcv_nxt, + * drop leading data (and SYN); if nothing left, just ack. + * + * + * If the RST bit is set, check the sequence number to see + * if this is a valid reset segment. + * RFC 793 page 37: + * In all states except SYN-SENT, all reset (RST) segments + * are validated by checking their SEQ-fields. A reset is + * valid if its sequence number is in the window. + * Note: this does not take into account delayed ACKs, so + * we should test against last_ack_sent instead of rcv_nxt. + * The sequence number in the reset segment is normally an + * echo of our outgoing acknowlegement numbers, but some hosts + * send a reset with the sequence number at the rightmost edge + * of our receive window, and we have to handle this case. + * If we have multiple segments in flight, the intial reset + * segment sequence numbers will be to the left of last_ack_sent, + * but they will eventually catch up. + * In any case, it never made sense to trim reset segments to + * fit the receive window since RFC 1122 says: + * 4.2.2.12 RST Segment: RFC-793 Section 3.4 + * + * A TCP SHOULD allow a received RST segment to include data. + * + * DISCUSSION + * It has been suggested that a RST segment could contain + * ASCII text that encoded and explained the cause of the + * RST. No standard has yet been established for such + * data. + * + * If the reset segment passes the sequence number test examine + * the state: + * SYN_RECEIVED STATE: + * If passive open, return to LISTEN state. + * If active open, inform user that connection was refused. + * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES: + * Inform user that connection was reset, and close tcb. + * CLOSING, LAST_ACK STATES: + * Close the tcb. + * TIME_WAIT STATE: + * Drop the segment - see Stevens, vol. 2, p. 964 and + * RFC 1337. + */ + if (thflags & TH_RST) { + if (SEQ_GEQ(th->th_seq, tp->last_ack_sent) && + SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { + switch (tp->t_state) { + + case TCPS_SYN_RECEIVED: + so->so_error = ECONNREFUSED; + goto close; + + case TCPS_ESTABLISHED: + case TCPS_FIN_WAIT_1: + case TCPS_FIN_WAIT_2: + case TCPS_CLOSE_WAIT: + so->so_error = ECONNRESET; + close: + tp->t_state = TCPS_CLOSED; + tcpstat.tcps_drops++; + tp = tcp_close(tp); + break; + + case TCPS_CLOSING: + case TCPS_LAST_ACK: + tp = tcp_close(tp); + break; + + case TCPS_TIME_WAIT: + break; + } + } + goto drop; + } + + /* + * RFC 1323 PAWS: If we have a timestamp reply on this segment + * and it's less than ts_recent, drop it. + */ + if ((to.to_flag & TOF_TS) != 0 && tp->ts_recent && + TSTMP_LT(to.to_tsval, tp->ts_recent)) { + + /* Check to see if ts_recent is over 24 days old. */ + if ((int)(ticks - tp->ts_recent_age) > TCP_PAWS_IDLE) { + /* + * Invalidate ts_recent. If this segment updates + * ts_recent, the age will be reset later and ts_recent + * will get a valid value. If it does not, setting + * ts_recent to zero will at least satisfy the + * requirement that zero be placed in the timestamp + * echo reply when ts_recent isn't valid. The + * age isn't reset until we get a valid ts_recent + * because we don't want out-of-order segments to be + * dropped when ts_recent is old. + */ + tp->ts_recent = 0; + } else { + tcpstat.tcps_rcvduppack++; + tcpstat.tcps_rcvdupbyte += tlen; + tcpstat.tcps_pawsdrop++; + goto dropafterack; + } + } + + /* + * T/TCP mechanism + * If T/TCP was negotiated and the segment doesn't have CC, + * or if its CC is wrong then drop the segment. + * RST segments do not have to comply with this. + */ + if ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) == (TF_REQ_CC|TF_RCVD_CC) && + ((to.to_flag & TOF_CC) == 0 || tp->cc_recv != to.to_cc)) + goto dropafterack; + + /* + * In the SYN-RECEIVED state, validate that the packet belongs to + * this connection before trimming the data to fit the receive + * window. Check the sequence number versus IRS since we know + * the sequence numbers haven't wrapped. This is a partial fix + * for the "LAND" DoS attack. + */ + if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { + rstreason = BANDLIM_RST_OPENPORT; + goto dropwithreset; + } + + todrop = tp->rcv_nxt - th->th_seq; + if (todrop > 0) { + if (thflags & TH_SYN) { + thflags &= ~TH_SYN; + th->th_seq++; + if (th->th_urp > 1) + th->th_urp--; + else + thflags &= ~TH_URG; + todrop--; + } + /* + * Following if statement from Stevens, vol. 2, p. 960. + */ + if (todrop > tlen + || (todrop == tlen && (thflags & TH_FIN) == 0)) { + /* + * Any valid FIN must be to the left of the window. + * At this point the FIN must be a duplicate or out + * of sequence; drop it. + */ + thflags &= ~TH_FIN; + + /* + * Send an ACK to resynchronize and drop any data. + * But keep on processing for RST or ACK. + */ + tp->t_flags |= TF_ACKNOW; + todrop = tlen; + tcpstat.tcps_rcvduppack++; + tcpstat.tcps_rcvdupbyte += todrop; + } else { + tcpstat.tcps_rcvpartduppack++; + tcpstat.tcps_rcvpartdupbyte += todrop; + } + drop_hdrlen += todrop; /* drop from the top afterwards */ + th->th_seq += todrop; + tlen -= todrop; + if (th->th_urp > todrop) + th->th_urp -= todrop; + else { + thflags &= ~TH_URG; + th->th_urp = 0; + } + } + + /* + * If new data are received on a connection after the + * user processes are gone, then RST the other end. + */ + if ((so->so_state & SS_NOFDREF) && + tp->t_state > TCPS_CLOSE_WAIT && tlen) { + tp = tcp_close(tp); + tcpstat.tcps_rcvafterclose++; + rstreason = BANDLIM_UNLIMITED; + goto dropwithreset; + } + + /* + * If segment ends after window, drop trailing data + * (and PUSH and FIN); if nothing left, just ACK. + */ + todrop = (th->th_seq+tlen) - (tp->rcv_nxt+tp->rcv_wnd); + if (todrop > 0) { + tcpstat.tcps_rcvpackafterwin++; + if (todrop >= tlen) { + tcpstat.tcps_rcvbyteafterwin += tlen; + /* + * If a new connection request is received + * while in TIME_WAIT, drop the old connection + * and start over if the sequence numbers + * are above the previous ones. + */ + if (thflags & TH_SYN && + tp->t_state == TCPS_TIME_WAIT && + SEQ_GT(th->th_seq, tp->rcv_nxt)) { + iss = tcp_rndiss_next(); + tp = tcp_close(tp); + goto findpcb; + } + /* + * If window is closed can only take segments at + * window edge, and have to drop data and PUSH from + * incoming segments. Continue processing, but + * remember to ack. Otherwise, drop segment + * and ack. + */ + if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { + tp->t_flags |= TF_ACKNOW; + tcpstat.tcps_rcvwinprobe++; + } else + goto dropafterack; + } else + tcpstat.tcps_rcvbyteafterwin += todrop; + m_adj(m, -todrop); + tlen -= todrop; + thflags &= ~(TH_PUSH|TH_FIN); + } + + /* + * If last ACK falls within this segment's sequence numbers, + * record its timestamp. + * NOTE that the test is modified according to the latest + * proposal of the tcplw@cray.com list (Braden 1993/04/26). + */ + if ((to.to_flag & TOF_TS) != 0 && + SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { + tp->ts_recent_age = ticks; + tp->ts_recent = to.to_tsval; + } + + /* + * If a SYN is in the window, then this is an + * error and we send an RST and drop the connection. + */ + if (thflags & TH_SYN) { + tp = tcp_drop(tp, ECONNRESET); + rstreason = BANDLIM_UNLIMITED; + goto dropwithreset; + } + + /* + * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN + * flag is on (half-synchronized state), then queue data for + * later processing; else drop segment and return. + */ + if ((thflags & TH_ACK) == 0) { + if (tp->t_state == TCPS_SYN_RECEIVED || + (tp->t_flags & TF_NEEDSYN)) + goto step6; + else + goto drop; + } + + /* + * Ack processing. + */ + switch (tp->t_state) { + + /* + * In SYN_RECEIVED state, the ack ACKs our SYN, so enter + * ESTABLISHED state and continue processing. + * The ACK was checked above. + */ + case TCPS_SYN_RECEIVED: + + tcpstat.tcps_connects++; + soisconnected(so); + /* Do window scaling? */ + if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == + (TF_RCVD_SCALE|TF_REQ_SCALE)) { + tp->snd_scale = tp->requested_s_scale; + tp->rcv_scale = tp->request_r_scale; + } + /* + * Upon successful completion of 3-way handshake, + * update cache.CC if it was undefined, pass any queued + * data to the user, and advance state appropriately. + */ + if ((taop = tcp_gettaocache(inp)) != NULL && + taop->tao_cc == 0) + taop->tao_cc = tp->cc_recv; + + /* + * Make transitions: + * SYN-RECEIVED -> ESTABLISHED + * SYN-RECEIVED* -> FIN-WAIT-1 + */ + tp->t_starttime = ticks; + if (tp->t_flags & TF_NEEDFIN) { + tp->t_state = TCPS_FIN_WAIT_1; + tp->t_flags &= ~TF_NEEDFIN; + } else { + tp->t_state = TCPS_ESTABLISHED; + callout_reset(tp->tt_keep, tcp_keepidle, + tcp_timer_keep, tp); + } + /* + * If segment contains data or ACK, will call tcp_reass() + * later; if not, do so now to pass queued data to user. + */ + if (tlen == 0 && (thflags & TH_FIN) == 0) + (void) tcp_reass(tp, (struct tcphdr *)0, 0, + (struct mbuf *)0); + tp->snd_wl1 = th->th_seq - 1; + /* fall into ... */ + + /* + * In ESTABLISHED state: drop duplicate ACKs; ACK out of range + * ACKs. If the ack is in the range + * tp->snd_una < th->th_ack <= tp->snd_max + * then advance tp->snd_una to th->th_ack and drop + * data from the retransmission queue. If this ACK reflects + * more up to date window information we update our window information. + */ + case TCPS_ESTABLISHED: + case TCPS_FIN_WAIT_1: + case TCPS_FIN_WAIT_2: + case TCPS_CLOSE_WAIT: + case TCPS_CLOSING: + case TCPS_LAST_ACK: + case TCPS_TIME_WAIT: + + if (SEQ_LEQ(th->th_ack, tp->snd_una)) { + if (tlen == 0 && tiwin == tp->snd_wnd) { + tcpstat.tcps_rcvdupack++; + /* + * If we have outstanding data (other than + * a window probe), this is a completely + * duplicate ack (ie, window info didn't + * change), the ack is the biggest we've + * seen and we've seen exactly our rexmt + * threshhold of them, assume a packet + * has been dropped and retransmit it. + * Kludge snd_nxt & the congestion + * window so we send only this one + * packet. + * + * We know we're losing at the current + * window size so do congestion avoidance + * (set ssthresh to half the current window + * and pull our congestion window back to + * the new ssthresh). + * + * Dup acks mean that packets have left the + * network (they're now cached at the receiver) + * so bump cwnd by the amount in the receiver + * to keep a constant cwnd packets in the + * network. + */ + if (!callout_active(tp->tt_rexmt) || + th->th_ack != tp->snd_una) + tp->t_dupacks = 0; + else if (++tp->t_dupacks == tcprexmtthresh) { + tcp_seq onxt = tp->snd_nxt; + u_int win = + min(tp->snd_wnd, tp->snd_cwnd) / 2 / + tp->t_maxseg; + if (tcp_do_newreno && SEQ_LT(th->th_ack, + tp->snd_recover)) { + /* False retransmit, should not + * cut window + */ + tp->snd_cwnd += tp->t_maxseg; + tp->t_dupacks = 0; + (void) tcp_output(tp); + goto drop; + } + if (win < 2) + win = 2; + tp->snd_ssthresh = win * tp->t_maxseg; + tp->snd_recover = tp->snd_max; + callout_stop(tp->tt_rexmt); + tp->t_rtttime = 0; + tp->snd_nxt = th->th_ack; + tp->snd_cwnd = tp->t_maxseg; + (void) tcp_output(tp); + tp->snd_cwnd = tp->snd_ssthresh + + tp->t_maxseg * tp->t_dupacks; + if (SEQ_GT(onxt, tp->snd_nxt)) + tp->snd_nxt = onxt; + goto drop; + } else if (tp->t_dupacks > tcprexmtthresh) { + tp->snd_cwnd += tp->t_maxseg; + (void) tcp_output(tp); + goto drop; + } + } else + tp->t_dupacks = 0; + break; + } + /* + * If the congestion window was inflated to account + * for the other side's cached packets, retract it. + */ + if (tcp_do_newreno == 0) { + if (tp->t_dupacks >= tcprexmtthresh && + tp->snd_cwnd > tp->snd_ssthresh) + tp->snd_cwnd = tp->snd_ssthresh; + tp->t_dupacks = 0; + } else if (tp->t_dupacks >= tcprexmtthresh && + !tcp_newreno(tp, th)) { + /* + * Window inflation should have left us with approx. + * snd_ssthresh outstanding data. But in case we + * would be inclined to send a burst, better to do + * it via the slow start mechanism. + */ + if (SEQ_GT(th->th_ack + tp->snd_ssthresh, tp->snd_max)) + tp->snd_cwnd = + tp->snd_max - th->th_ack + tp->t_maxseg; + else + tp->snd_cwnd = tp->snd_ssthresh; + tp->t_dupacks = 0; + } + if (SEQ_GT(th->th_ack, tp->snd_max)) { + tcpstat.tcps_rcvacktoomuch++; + goto dropafterack; + } + /* + * If we reach this point, ACK is not a duplicate, + * i.e., it ACKs something we sent. + */ + if (tp->t_flags & TF_NEEDSYN) { + /* + * T/TCP: Connection was half-synchronized, and our + * SYN has been ACK'd (so connection is now fully + * synchronized). Go to non-starred state, + * increment snd_una for ACK of SYN, and check if + * we can do window scaling. + */ + tp->t_flags &= ~TF_NEEDSYN; + tp->snd_una++; + /* Do window scaling? */ + if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == + (TF_RCVD_SCALE|TF_REQ_SCALE)) { + tp->snd_scale = tp->requested_s_scale; + tp->rcv_scale = tp->request_r_scale; + } + } + +process_ACK: + acked = th->th_ack - tp->snd_una; + tcpstat.tcps_rcvackpack++; + tcpstat.tcps_rcvackbyte += acked; + + /* + * If we just performed our first retransmit, and the ACK + * arrives within our recovery window, then it was a mistake + * to do the retransmit in the first place. Recover our + * original cwnd and ssthresh, and proceed to transmit where + * we left off. + */ + if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) { + tp->snd_cwnd = tp->snd_cwnd_prev; + tp->snd_ssthresh = tp->snd_ssthresh_prev; + tp->snd_nxt = tp->snd_max; + tp->t_badrxtwin = 0; /* XXX probably not required */ + } + + /* + * If we have a timestamp reply, update smoothed + * round trip time. If no timestamp is present but + * transmit timer is running and timed sequence + * number was acked, update smoothed round trip time. + * Since we now have an rtt measurement, cancel the + * timer backoff (cf., Phil Karn's retransmit alg.). + * Recompute the initial retransmit timer. + */ + if (to.to_flag & TOF_TS) + tcp_xmit_timer(tp, ticks - to.to_tsecr + 1); + else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) + tcp_xmit_timer(tp, ticks - tp->t_rtttime); + + /* + * If all outstanding data is acked, stop retransmit + * timer and remember to restart (more output or persist). + * If there is more data to be acked, restart retransmit + * timer, using current (possibly backed-off) value. + */ + if (th->th_ack == tp->snd_max) { + callout_stop(tp->tt_rexmt); + needoutput = 1; + } else if (!callout_active(tp->tt_persist)) + callout_reset(tp->tt_rexmt, tp->t_rxtcur, + tcp_timer_rexmt, tp); + + /* + * If no data (only SYN) was ACK'd, + * skip rest of ACK processing. + */ + if (acked == 0) + goto step6; + + /* + * When new data is acked, open the congestion window. + * If the window gives us less than ssthresh packets + * in flight, open exponentially (maxseg per packet). + * Otherwise open linearly: maxseg per window + * (maxseg^2 / cwnd per packet). + */ + { + register u_int cw = tp->snd_cwnd; + register u_int incr = tp->t_maxseg; + + if (cw > tp->snd_ssthresh) + incr = incr * incr / cw; + /* + * If t_dupacks != 0 here, it indicates that we are still + * in NewReno fast recovery mode, so we leave the congestion + * window alone. + */ + if (tcp_do_newreno == 0 || tp->t_dupacks == 0) + tp->snd_cwnd = min(cw + incr,TCP_MAXWIN<<tp->snd_scale); + } + if (acked > so->so_snd.sb_cc) { + tp->snd_wnd -= so->so_snd.sb_cc; + sbdrop(&so->so_snd, (int)so->so_snd.sb_cc); + ourfinisacked = 1; + } else { + sbdrop(&so->so_snd, acked); + tp->snd_wnd -= acked; + ourfinisacked = 0; + } + sowwakeup(so); + tp->snd_una = th->th_ack; + if (SEQ_LT(tp->snd_nxt, tp->snd_una)) + tp->snd_nxt = tp->snd_una; + + switch (tp->t_state) { + + /* + * In FIN_WAIT_1 STATE in addition to the processing + * for the ESTABLISHED state if our FIN is now acknowledged + * then enter FIN_WAIT_2. + */ + case TCPS_FIN_WAIT_1: + if (ourfinisacked) { + /* + * If we can't receive any more + * data, then closing user can proceed. + * Starting the timer is contrary to the + * specification, but if we don't get a FIN + * we'll hang forever. + */ + if (so->so_state & SS_CANTRCVMORE) { + soisdisconnected(so); + callout_reset(tp->tt_2msl, tcp_maxidle, + tcp_timer_2msl, tp); + } + tp->t_state = TCPS_FIN_WAIT_2; + } + break; + + /* + * In CLOSING STATE in addition to the processing for + * the ESTABLISHED state if the ACK acknowledges our FIN + * then enter the TIME-WAIT state, otherwise ignore + * the segment. + */ + case TCPS_CLOSING: + if (ourfinisacked) { + tp->t_state = TCPS_TIME_WAIT; + tcp_canceltimers(tp); + /* Shorten TIME_WAIT [RFC-1644, p.28] */ + if (tp->cc_recv != 0 && + (ticks - tp->t_starttime) < tcp_msl) + callout_reset(tp->tt_2msl, + tp->t_rxtcur * + TCPTV_TWTRUNC, + tcp_timer_2msl, tp); + else + callout_reset(tp->tt_2msl, 2 * tcp_msl, + tcp_timer_2msl, tp); + soisdisconnected(so); + } + break; + + /* + * In LAST_ACK, we may still be waiting for data to drain + * and/or to be acked, as well as for the ack of our FIN. + * If our FIN is now acknowledged, delete the TCB, + * enter the closed state and return. + */ + case TCPS_LAST_ACK: + if (ourfinisacked) { + tp = tcp_close(tp); + goto drop; + } + break; + + /* + * In TIME_WAIT state the only thing that should arrive + * is a retransmission of the remote FIN. Acknowledge + * it and restart the finack timer. + */ + case TCPS_TIME_WAIT: + callout_reset(tp->tt_2msl, 2 * tcp_msl, + tcp_timer_2msl, tp); + goto dropafterack; + } + } + +step6: + /* + * Update window information. + * Don't look at window if no ACK: TAC's send garbage on first SYN. + */ + if ((thflags & TH_ACK) && + (SEQ_LT(tp->snd_wl1, th->th_seq) || + (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || + (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { + /* keep track of pure window updates */ + if (tlen == 0 && + tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) + tcpstat.tcps_rcvwinupd++; + tp->snd_wnd = tiwin; + tp->snd_wl1 = th->th_seq; + tp->snd_wl2 = th->th_ack; + if (tp->snd_wnd > tp->max_sndwnd) + tp->max_sndwnd = tp->snd_wnd; + needoutput = 1; + } + + /* + * Process segments with URG. + */ + if ((thflags & TH_URG) && th->th_urp && + TCPS_HAVERCVDFIN(tp->t_state) == 0) { + /* + * This is a kludge, but if we receive and accept + * random urgent pointers, we'll crash in + * soreceive. It's hard to imagine someone + * actually wanting to send this much urgent data. + */ + if (th->th_urp + so->so_rcv.sb_cc > sb_max) { + th->th_urp = 0; /* XXX */ + thflags &= ~TH_URG; /* XXX */ + goto dodata; /* XXX */ + } + /* + * If this segment advances the known urgent pointer, + * then mark the data stream. This should not happen + * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since + * a FIN has been received from the remote side. + * In these states we ignore the URG. + * + * According to RFC961 (Assigned Protocols), + * the urgent pointer points to the last octet + * of urgent data. We continue, however, + * to consider it to indicate the first octet + * of data past the urgent section as the original + * spec states (in one of two places). + */ + if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { + tp->rcv_up = th->th_seq + th->th_urp; + so->so_oobmark = so->so_rcv.sb_cc + + (tp->rcv_up - tp->rcv_nxt) - 1; + if (so->so_oobmark == 0) + so->so_state |= SS_RCVATMARK; + sohasoutofband(so); + tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); + } + /* + * Remove out of band data so doesn't get presented to user. + * This can happen independent of advancing the URG pointer, + * but if two URG's are pending at once, some out-of-band + * data may creep in... ick. + */ + if (th->th_urp <= (u_long)tlen +#ifdef SO_OOBINLINE + && (so->so_options & SO_OOBINLINE) == 0 +#endif + ) + tcp_pulloutofband(so, th, m, + drop_hdrlen); /* hdr drop is delayed */ + } else + /* + * If no out of band data is expected, + * pull receive urgent pointer along + * with the receive window. + */ + if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) + tp->rcv_up = tp->rcv_nxt; +dodata: /* XXX */ + + /* + * Process the segment text, merging it into the TCP sequencing queue, + * and arranging for acknowledgment of receipt if necessary. + * This process logically involves adjusting tp->rcv_wnd as data + * is presented to the user (this happens in tcp_usrreq.c, + * case PRU_RCVD). If a FIN has already been received on this + * connection then we just ignore the text. + */ + if ((tlen || (thflags&TH_FIN)) && + TCPS_HAVERCVDFIN(tp->t_state) == 0) { + m_adj(m, drop_hdrlen); /* delayed header drop */ + TCP_REASS(tp, th, &tlen, m, so, thflags); + /* + * Note the amount of data that peer has sent into + * our window, in order to estimate the sender's + * buffer size. + */ + len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); + } else { + m_freem(m); + thflags &= ~TH_FIN; + } + + /* + * If FIN is received ACK the FIN and let the user know + * that the connection is closing. + */ + if (thflags & TH_FIN) { + if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { + socantrcvmore(so); + /* + * If connection is half-synchronized + * (ie NEEDSYN flag on) then delay ACK, + * so it may be piggybacked when SYN is sent. + * Otherwise, since we received a FIN then no + * more input can be expected, send ACK now. + */ + if (DELAY_ACK(tp) && (tp->t_flags & TF_NEEDSYN)) + callout_reset(tp->tt_delack, tcp_delacktime, + tcp_timer_delack, tp); + else + tp->t_flags |= TF_ACKNOW; + tp->rcv_nxt++; + } + switch (tp->t_state) { + + /* + * In SYN_RECEIVED and ESTABLISHED STATES + * enter the CLOSE_WAIT state. + */ + case TCPS_SYN_RECEIVED: + tp->t_starttime = ticks; + /*FALLTHROUGH*/ + case TCPS_ESTABLISHED: + tp->t_state = TCPS_CLOSE_WAIT; + break; + + /* + * If still in FIN_WAIT_1 STATE FIN has not been acked so + * enter the CLOSING state. + */ + case TCPS_FIN_WAIT_1: + tp->t_state = TCPS_CLOSING; + break; + + /* + * In FIN_WAIT_2 state enter the TIME_WAIT state, + * starting the time-wait timer, turning off the other + * standard timers. + */ + case TCPS_FIN_WAIT_2: + tp->t_state = TCPS_TIME_WAIT; + tcp_canceltimers(tp); + /* Shorten TIME_WAIT [RFC-1644, p.28] */ + if (tp->cc_recv != 0 && + (ticks - tp->t_starttime) < tcp_msl) { + callout_reset(tp->tt_2msl, + tp->t_rxtcur * TCPTV_TWTRUNC, + tcp_timer_2msl, tp); + /* For transaction client, force ACK now. */ + tp->t_flags |= TF_ACKNOW; + } + else + callout_reset(tp->tt_2msl, 2 * tcp_msl, + tcp_timer_2msl, tp); + soisdisconnected(so); + break; + + /* + * In TIME_WAIT state restart the 2 MSL time_wait timer. + */ + case TCPS_TIME_WAIT: + callout_reset(tp->tt_2msl, 2 * tcp_msl, + tcp_timer_2msl, tp); + break; + } + } +#ifdef TCPDEBUG + if (so->so_options & SO_DEBUG) + tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, + &tcp_savetcp, 0); +#endif + + /* + * Return any desired output. + */ + if (needoutput || (tp->t_flags & TF_ACKNOW)) + (void) tcp_output(tp); + return; + +dropafterack: + /* + * Generate an ACK dropping incoming segment if it occupies + * sequence space, where the ACK reflects our state. + * + * We can now skip the test for the RST flag since all + * paths to this code happen after packets containing + * RST have been dropped. + * + * In the SYN-RECEIVED state, don't send an ACK unless the + * segment we received passes the SYN-RECEIVED ACK test. + * If it fails send a RST. This breaks the loop in the + * "LAND" DoS attack, and also prevents an ACK storm + * between two listening ports that have been sent forged + * SYN segments, each with the source address of the other. + */ + if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && + (SEQ_GT(tp->snd_una, th->th_ack) || + SEQ_GT(th->th_ack, tp->snd_max)) ) { + rstreason = BANDLIM_RST_OPENPORT; + goto dropwithreset; + } +#ifdef TCPDEBUG + if (so->so_options & SO_DEBUG) + tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, + &tcp_savetcp, 0); +#endif + m_freem(m); + tp->t_flags |= TF_ACKNOW; + (void) tcp_output(tp); + return; + +dropwithreset: + /* + * Generate a RST, dropping incoming segment. + * Make ACK acceptable to originator of segment. + * Don't bother to respond if destination was broadcast/multicast. + */ + if ((thflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST)) + goto drop; +#ifdef INET6 + if (isipv6) { + if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || + IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) + goto drop; + } else +#endif /* INET6 */ + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || + IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || + ip->ip_src.s_addr == htonl(INADDR_BROADCAST)) + goto drop; + /* IPv6 anycast check is done at tcp6_input() */ + + /* + * Perform bandwidth limiting. + */ + if (badport_bandlim(rstreason) < 0) + goto drop; + +#ifdef TCPDEBUG + if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) + tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, + &tcp_savetcp, 0); +#endif + if (thflags & TH_ACK) + /* mtod() below is safe as long as hdr dropping is delayed */ + tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack, + TH_RST); + else { + if (thflags & TH_SYN) + tlen++; + /* mtod() below is safe as long as hdr dropping is delayed */ + tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen, + (tcp_seq)0, TH_RST|TH_ACK); + } + /* destroy temporarily created socket */ + if (dropsocket) + (void) soabort(so); + return; + +drop: + /* + * Drop space held by incoming segment and return. + */ +#ifdef TCPDEBUG + if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) + tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, + &tcp_savetcp, 0); +#endif + m_freem(m); + /* destroy temporarily created socket */ + if (dropsocket) + (void) soabort(so); + return; +} + +static void +tcp_dooptions(tp, cp, cnt, th, to) + struct tcpcb *tp; + u_char *cp; + int cnt; + struct tcphdr *th; + struct tcpopt *to; +{ + u_short mss = 0; + int opt, optlen; + + for (; cnt > 0; cnt -= optlen, cp += optlen) { + opt = cp[0]; + if (opt == TCPOPT_EOL) + break; + if (opt == TCPOPT_NOP) + optlen = 1; + else { + if (cnt < 2) + break; + optlen = cp[1]; + if (optlen < 2 || optlen > cnt) + break; + } + switch (opt) { + + default: + continue; + + case TCPOPT_MAXSEG: + if (optlen != TCPOLEN_MAXSEG) + continue; + if (!(th->th_flags & TH_SYN)) + continue; + bcopy((char *) cp + 2, (char *) &mss, sizeof(mss)); + NTOHS(mss); + break; + + case TCPOPT_WINDOW: + if (optlen != TCPOLEN_WINDOW) + continue; + if (!(th->th_flags & TH_SYN)) + continue; + tp->t_flags |= TF_RCVD_SCALE; + tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); + break; + + case TCPOPT_TIMESTAMP: + if (optlen != TCPOLEN_TIMESTAMP) + continue; + to->to_flag |= TOF_TS; + bcopy((char *)cp + 2, + (char *)&to->to_tsval, sizeof(to->to_tsval)); + NTOHL(to->to_tsval); + bcopy((char *)cp + 6, + (char *)&to->to_tsecr, sizeof(to->to_tsecr)); + NTOHL(to->to_tsecr); + + /* + * A timestamp received in a SYN makes + * it ok to send timestamp requests and replies. + */ + if (th->th_flags & TH_SYN) { + tp->t_flags |= TF_RCVD_TSTMP; + tp->ts_recent = to->to_tsval; + tp->ts_recent_age = ticks; + } + break; + case TCPOPT_CC: + if (optlen != TCPOLEN_CC) + continue; + to->to_flag |= TOF_CC; + bcopy((char *)cp + 2, + (char *)&to->to_cc, sizeof(to->to_cc)); + NTOHL(to->to_cc); + /* + * A CC or CC.new option received in a SYN makes + * it ok to send CC in subsequent segments. + */ + if (th->th_flags & TH_SYN) + tp->t_flags |= TF_RCVD_CC; + break; + case TCPOPT_CCNEW: + if (optlen != TCPOLEN_CC) + continue; + if (!(th->th_flags & TH_SYN)) + continue; + to->to_flag |= TOF_CCNEW; + bcopy((char *)cp + 2, + (char *)&to->to_cc, sizeof(to->to_cc)); + NTOHL(to->to_cc); + /* + * A CC or CC.new option received in a SYN makes + * it ok to send CC in subsequent segments. + */ + tp->t_flags |= TF_RCVD_CC; + break; + case TCPOPT_CCECHO: + if (optlen != TCPOLEN_CC) + continue; + if (!(th->th_flags & TH_SYN)) + continue; + to->to_flag |= TOF_CCECHO; + bcopy((char *)cp + 2, + (char *)&to->to_ccecho, sizeof(to->to_ccecho)); + NTOHL(to->to_ccecho); + break; + } + } + if (th->th_flags & TH_SYN) + tcp_mss(tp, mss); /* sets t_maxseg */ +} + +/* + * Pull out of band byte out of a segment so + * it doesn't appear in the user's data queue. + * It is still reflected in the segment length for + * sequencing purposes. + */ +static void +tcp_pulloutofband(so, th, m, off) + struct socket *so; + struct tcphdr *th; + register struct mbuf *m; + int off; /* delayed to be droped hdrlen */ +{ + int cnt = off + th->th_urp - 1; + + while (cnt >= 0) { + if (m->m_len > cnt) { + char *cp = mtod(m, caddr_t) + cnt; + struct tcpcb *tp = sototcpcb(so); + + tp->t_iobc = *cp; + tp->t_oobflags |= TCPOOB_HAVEDATA; + bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); + m->m_len--; + if (m->m_flags & M_PKTHDR) + m->m_pkthdr.len--; + return; + } + cnt -= m->m_len; + m = m->m_next; + if (m == 0) + break; + } + panic("tcp_pulloutofband"); +} + +/* + * Collect new round-trip time estimate + * and update averages and current timeout. + */ +static void +tcp_xmit_timer(tp, rtt) + register struct tcpcb *tp; + int rtt; +{ + register int delta; + + tcpstat.tcps_rttupdated++; + tp->t_rttupdated++; + if (tp->t_srtt != 0) { + /* + * srtt is stored as fixed point with 5 bits after the + * binary point (i.e., scaled by 8). The following magic + * is equivalent to the smoothing algorithm in rfc793 with + * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed + * point). Adjust rtt to origin 0. + */ + delta = ((rtt - 1) << TCP_DELTA_SHIFT) + - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); + + if ((tp->t_srtt += delta) <= 0) + tp->t_srtt = 1; + + /* + * We accumulate a smoothed rtt variance (actually, a + * smoothed mean difference), then set the retransmit + * timer to smoothed rtt + 4 times the smoothed variance. + * rttvar is stored as fixed point with 4 bits after the + * binary point (scaled by 16). The following is + * equivalent to rfc793 smoothing with an alpha of .75 + * (rttvar = rttvar*3/4 + |delta| / 4). This replaces + * rfc793's wired-in beta. + */ + if (delta < 0) + delta = -delta; + delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); + if ((tp->t_rttvar += delta) <= 0) + tp->t_rttvar = 1; + } else { + /* + * No rtt measurement yet - use the unsmoothed rtt. + * Set the variance to half the rtt (so our first + * retransmit happens at 3*rtt). + */ + tp->t_srtt = rtt << TCP_RTT_SHIFT; + tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); + } + tp->t_rtttime = 0; + tp->t_rxtshift = 0; + + /* + * the retransmit should happen at rtt + 4 * rttvar. + * Because of the way we do the smoothing, srtt and rttvar + * will each average +1/2 tick of bias. When we compute + * the retransmit timer, we want 1/2 tick of rounding and + * 1 extra tick because of +-1/2 tick uncertainty in the + * firing of the timer. The bias will give us exactly the + * 1.5 tick we need. But, because the bias is + * statistical, we have to test that we don't drop below + * the minimum feasible timer (which is 2 ticks). + */ + TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), + max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); + + /* + * We received an ack for a packet that wasn't retransmitted; + * it is probably safe to discard any error indications we've + * received recently. This isn't quite right, but close enough + * for now (a route might have failed after we sent a segment, + * and the return path might not be symmetrical). + */ + tp->t_softerror = 0; +} + +/* + * Determine a reasonable value for maxseg size. + * If the route is known, check route for mtu. + * If none, use an mss that can be handled on the outgoing + * interface without forcing IP to fragment; if bigger than + * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES + * to utilize large mbufs. If no route is found, route has no mtu, + * or the destination isn't local, use a default, hopefully conservative + * size (usually 512 or the default IP max size, but no more than the mtu + * of the interface), as we can't discover anything about intervening + * gateways or networks. We also initialize the congestion/slow start + * window to be a single segment if the destination isn't local. + * While looking at the routing entry, we also initialize other path-dependent + * parameters from pre-set or cached values in the routing entry. + * + * Also take into account the space needed for options that we + * send regularly. Make maxseg shorter by that amount to assure + * that we can send maxseg amount of data even when the options + * are present. Store the upper limit of the length of options plus + * data in maxopd. + * + * NOTE that this routine is only called when we process an incoming + * segment, for outgoing segments only tcp_mssopt is called. + * + * In case of T/TCP, we call this routine during implicit connection + * setup as well (offer = -1), to initialize maxseg from the cached + * MSS of our peer. + */ +void +tcp_mss(tp, offer) + struct tcpcb *tp; + int offer; +{ + register struct rtentry *rt; + struct ifnet *ifp; + register int rtt, mss; + u_long bufsize; + struct inpcb *inp; + struct socket *so; + struct rmxp_tao *taop; + int origoffer = offer; +#ifdef INET6 + int isipv6; + int min_protoh; +#endif + + inp = tp->t_inpcb; +#ifdef INET6 + isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; + min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) + : sizeof (struct tcpiphdr); +#else +#define min_protoh (sizeof (struct tcpiphdr)) +#endif +#ifdef INET6 + if (isipv6) + rt = tcp_rtlookup6(inp); + else +#endif + rt = tcp_rtlookup(inp); + if (rt == NULL) { + tp->t_maxopd = tp->t_maxseg = +#ifdef INET6 + isipv6 ? tcp_v6mssdflt : +#endif /* INET6 */ + tcp_mssdflt; + return; + } + ifp = rt->rt_ifp; + so = inp->inp_socket; + + taop = rmx_taop(rt->rt_rmx); + /* + * Offer == -1 means that we didn't receive SYN yet, + * use cached value in that case; + */ + if (offer == -1) + offer = taop->tao_mssopt; + /* + * Offer == 0 means that there was no MSS on the SYN segment, + * in this case we use tcp_mssdflt. + */ + if (offer == 0) + offer = +#ifdef INET6 + isipv6 ? tcp_v6mssdflt : +#endif /* INET6 */ + tcp_mssdflt; + else + /* + * Sanity check: make sure that maxopd will be large + * enough to allow some data on segments even is the + * all the option space is used (40bytes). Otherwise + * funny things may happen in tcp_output. + */ + offer = max(offer, 64); + taop->tao_mssopt = offer; + + /* + * While we're here, check if there's an initial rtt + * or rttvar. Convert from the route-table units + * to scaled multiples of the slow timeout timer. + */ + if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) { + /* + * XXX the lock bit for RTT indicates that the value + * is also a minimum value; this is subject to time. + */ + if (rt->rt_rmx.rmx_locks & RTV_RTT) + tp->t_rttmin = rtt / (RTM_RTTUNIT / hz); + tp->t_srtt = rtt / (RTM_RTTUNIT / (hz * TCP_RTT_SCALE)); + tcpstat.tcps_usedrtt++; + if (rt->rt_rmx.rmx_rttvar) { + tp->t_rttvar = rt->rt_rmx.rmx_rttvar / + (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE)); + tcpstat.tcps_usedrttvar++; + } else { + /* default variation is +- 1 rtt */ + tp->t_rttvar = + tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; + } + TCPT_RANGESET(tp->t_rxtcur, + ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, + tp->t_rttmin, TCPTV_REXMTMAX); + } + /* + * if there's an mtu associated with the route, use it + * else, use the link mtu. + */ + if (rt->rt_rmx.rmx_mtu) + mss = rt->rt_rmx.rmx_mtu - min_protoh; + else + { + mss = +#ifdef INET6 + (isipv6 ? nd_ifinfo[rt->rt_ifp->if_index].linkmtu : +#endif + ifp->if_mtu +#ifdef INET6 + ) +#endif + - min_protoh; +#ifdef INET6 + if (isipv6) { + if (!in6_localaddr(&inp->in6p_faddr)) + mss = min(mss, tcp_v6mssdflt); + } else +#endif + if (!in_localaddr(inp->inp_faddr)) + mss = min(mss, tcp_mssdflt); + } + mss = min(mss, offer); + /* + * maxopd stores the maximum length of data AND options + * in a segment; maxseg is the amount of data in a normal + * segment. We need to store this value (maxopd) apart + * from maxseg, because now every segment carries options + * and thus we normally have somewhat less data in segments. + */ + tp->t_maxopd = mss; + + /* + * In case of T/TCP, origoffer==-1 indicates, that no segments + * were received yet. In this case we just guess, otherwise + * we do the same as before T/TCP. + */ + if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && + (origoffer == -1 || + (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)) + mss -= TCPOLEN_TSTAMP_APPA; + if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && + (origoffer == -1 || + (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC)) + mss -= TCPOLEN_CC_APPA; + +#if (MCLBYTES & (MCLBYTES - 1)) == 0 + if (mss > MCLBYTES) + mss &= ~(MCLBYTES-1); +#else + if (mss > MCLBYTES) + mss = mss / MCLBYTES * MCLBYTES; +#endif + /* + * If there's a pipesize, change the socket buffer + * to that size. Make the socket buffers an integral + * number of mss units; if the mss is larger than + * the socket buffer, decrease the mss. + */ +#ifdef RTV_SPIPE + if ((bufsize = rt->rt_rmx.rmx_sendpipe) == 0) +#endif + bufsize = so->so_snd.sb_hiwat; + if (bufsize < mss) + mss = bufsize; + else { + bufsize = roundup(bufsize, mss); + if (bufsize > sb_max) + bufsize = sb_max; + (void)sbreserve(&so->so_snd, bufsize, so, NULL); + } + tp->t_maxseg = mss; + +#ifdef RTV_RPIPE + if ((bufsize = rt->rt_rmx.rmx_recvpipe) == 0) +#endif + bufsize = so->so_rcv.sb_hiwat; + if (bufsize > mss) { + bufsize = roundup(bufsize, mss); + if (bufsize > sb_max) + bufsize = sb_max; + (void)sbreserve(&so->so_rcv, bufsize, so, NULL); + } + + /* + * Set the slow-start flight size depending on whether this + * is a local network or not. + */ + if ( +#ifdef INET6 + (isipv6 && in6_localaddr(&inp->in6p_faddr)) || + (!isipv6 && +#endif + in_localaddr(inp->inp_faddr) +#ifdef INET6 + ) +#endif + ) + tp->snd_cwnd = mss * ss_fltsz_local; + else + tp->snd_cwnd = mss * ss_fltsz; + + if (rt->rt_rmx.rmx_ssthresh) { + /* + * There's some sort of gateway or interface + * buffer limit on the path. Use this to set + * the slow start threshhold, but set the + * threshold to no less than 2*mss. + */ + tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh); + tcpstat.tcps_usedssthresh++; + } +} + +/* + * Determine the MSS option to send on an outgoing SYN. + */ +int +tcp_mssopt(tp) + struct tcpcb *tp; +{ + struct rtentry *rt; +#ifdef INET6 + int isipv6; + int min_protoh; +#endif + +#ifdef INET6 + isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) ? 1 : 0; + min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) + : sizeof (struct tcpiphdr); +#else +#define min_protoh (sizeof (struct tcpiphdr)) +#endif +#ifdef INET6 + if (isipv6) + rt = tcp_rtlookup6(tp->t_inpcb); + else +#endif /* INET6 */ + rt = tcp_rtlookup(tp->t_inpcb); + if (rt == NULL) + return +#ifdef INET6 + isipv6 ? tcp_v6mssdflt : +#endif /* INET6 */ + tcp_mssdflt; + + return rt->rt_ifp->if_mtu - min_protoh; +} + + +/* + * Checks for partial ack. If partial ack arrives, force the retransmission + * of the next unacknowledged segment, do not clear tp->t_dupacks, and return + * 1. By setting snd_nxt to ti_ack, this forces retransmission timer to + * be started again. If the ack advances at least to tp->snd_recover, return 0. + */ +static int +tcp_newreno(tp, th) + struct tcpcb *tp; + struct tcphdr *th; +{ + if (SEQ_LT(th->th_ack, tp->snd_recover)) { + tcp_seq onxt = tp->snd_nxt; + u_long ocwnd = tp->snd_cwnd; + + callout_stop(tp->tt_rexmt); + tp->t_rtttime = 0; + tp->snd_nxt = th->th_ack; + /* + * Set snd_cwnd to one segment beyond acknowledged offset + * (tp->snd_una has not yet been updated when this function + * is called) + */ + tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); + (void) tcp_output(tp); + tp->snd_cwnd = ocwnd; + if (SEQ_GT(onxt, tp->snd_nxt)) + tp->snd_nxt = onxt; + /* + * Partial window deflation. Relies on fact that tp->snd_una + * not updated yet. + */ + tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_maxseg); + return (1); + } + return (0); +} diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c new file mode 100644 index 0000000..286b420 --- /dev/null +++ b/sys/netinet/tcp_output.c @@ -0,0 +1,943 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_output.c 8.4 (Berkeley) 5/24/95 + * $FreeBSD$ + */ + +#include "opt_inet6.h" +#include "opt_ipsec.h" +#include "opt_tcpdebug.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/domain.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mbuf.h> +#include <sys/mutex.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/sysctl.h> + +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/in_pcb.h> +#include <netinet/ip_var.h> +#ifdef INET6 +#include <netinet6/in6_pcb.h> +#include <netinet/ip6.h> +#include <netinet6/ip6_var.h> +#endif +#include <netinet/tcp.h> +#define TCPOUTFLAGS +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_seq.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#include <netinet/tcpip.h> +#ifdef TCPDEBUG +#include <netinet/tcp_debug.h> +#endif + +#ifdef IPSEC +#include <netinet6/ipsec.h> +#endif /*IPSEC*/ + +#include <machine/in_cksum.h> + +#ifdef notyet +extern struct mbuf *m_copypack(); +#endif + +static int path_mtu_discovery = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW, + &path_mtu_discovery, 1, "Enable Path MTU Discovery"); + +int ss_fltsz = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowstart_flightsize, CTLFLAG_RW, + &ss_fltsz, 1, "Slow start flight size"); + +int ss_fltsz_local = TCP_MAXWIN; /* something large */ +SYSCTL_INT(_net_inet_tcp, OID_AUTO, local_slowstart_flightsize, CTLFLAG_RW, + &ss_fltsz_local, 1, "Slow start flight size for local networks"); + +int tcp_do_newreno = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW, &tcp_do_newreno, + 0, "Enable NewReno Algorithms"); +/* + * Tcp output routine: figure out what should be sent and send it. + */ +int +tcp_output(tp) + register struct tcpcb *tp; +{ + register struct socket *so = tp->t_inpcb->inp_socket; + register long len, win; + int off, flags, error; + register struct mbuf *m; + struct ip *ip = NULL; + register struct ipovly *ipov = NULL; +#ifdef INET6 + struct ip6_hdr *ip6 = NULL; +#endif /* INET6 */ + register struct tcphdr *th; + u_char opt[TCP_MAXOLEN]; + unsigned ipoptlen, optlen, hdrlen; + int idle, sendalot; + int maxburst = TCP_MAXBURST; + struct rmxp_tao *taop; + struct rmxp_tao tao_noncached; +#ifdef INET6 + int isipv6; +#endif + +#ifdef INET6 + isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; +#endif + + /* + * Determine length of data that should be transmitted, + * and flags that will be used. + * If there is some data or critical controls (SYN, RST) + * to send, then transmit; otherwise, investigate further. + */ + idle = (tp->snd_max == tp->snd_una); + if (idle && (ticks - tp->t_rcvtime) >= tp->t_rxtcur) { + /* + * We have been idle for "a while" and no acks are + * expected to clock out any data we send -- + * slow start to get ack "clock" running again. + * + * Set the slow-start flight size depending on whether + * this is a local network or not. + */ + if ( +#ifdef INET6 + (isipv6 && in6_localaddr(&tp->t_inpcb->in6p_faddr)) || + (!isipv6 && +#endif + in_localaddr(tp->t_inpcb->inp_faddr) +#ifdef INET6 + ) +#endif + ) + tp->snd_cwnd = tp->t_maxseg * ss_fltsz_local; + else + tp->snd_cwnd = tp->t_maxseg * ss_fltsz; + } +again: + sendalot = 0; + off = tp->snd_nxt - tp->snd_una; + win = min(tp->snd_wnd, tp->snd_cwnd); + + flags = tcp_outflags[tp->t_state]; + /* + * Get standard flags, and add SYN or FIN if requested by 'hidden' + * state flags. + */ + if (tp->t_flags & TF_NEEDFIN) + flags |= TH_FIN; + if (tp->t_flags & TF_NEEDSYN) + flags |= TH_SYN; + + /* + * If in persist timeout with window of 0, send 1 byte. + * Otherwise, if window is small but nonzero + * and timer expired, we will send what we can + * and go to transmit state. + */ + if (tp->t_force) { + if (win == 0) { + /* + * If we still have some data to send, then + * clear the FIN bit. Usually this would + * happen below when it realizes that we + * aren't sending all the data. However, + * if we have exactly 1 byte of unsent data, + * then it won't clear the FIN bit below, + * and if we are in persist state, we wind + * up sending the packet without recording + * that we sent the FIN bit. + * + * We can't just blindly clear the FIN bit, + * because if we don't have any more data + * to send then the probe will be the FIN + * itself. + */ + if (off < so->so_snd.sb_cc) + flags &= ~TH_FIN; + win = 1; + } else { + callout_stop(tp->tt_persist); + tp->t_rxtshift = 0; + } + } + + len = (long)ulmin(so->so_snd.sb_cc, win) - off; + + if ((taop = tcp_gettaocache(tp->t_inpcb)) == NULL) { + taop = &tao_noncached; + bzero(taop, sizeof(*taop)); + } + + /* + * Lop off SYN bit if it has already been sent. However, if this + * is SYN-SENT state and if segment contains data and if we don't + * know that foreign host supports TAO, suppress sending segment. + */ + if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { + flags &= ~TH_SYN; + off--, len++; + if (len > 0 && tp->t_state == TCPS_SYN_SENT && + taop->tao_ccsent == 0) + return 0; + } + + /* + * Be careful not to send data and/or FIN on SYN segments + * in cases when no CC option will be sent. + * This measure is needed to prevent interoperability problems + * with not fully conformant TCP implementations. + */ + if ((flags & TH_SYN) && + ((tp->t_flags & TF_NOOPT) || !(tp->t_flags & TF_REQ_CC) || + ((flags & TH_ACK) && !(tp->t_flags & TF_RCVD_CC)))) { + len = 0; + flags &= ~TH_FIN; + } + + if (len < 0) { + /* + * If FIN has been sent but not acked, + * but we haven't been called to retransmit, + * len will be -1. Otherwise, window shrank + * after we sent into it. If window shrank to 0, + * cancel pending retransmit, pull snd_nxt back + * to (closed) window, and set the persist timer + * if it isn't already going. If the window didn't + * close completely, just wait for an ACK. + */ + len = 0; + if (win == 0) { + callout_stop(tp->tt_rexmt); + tp->t_rxtshift = 0; + tp->snd_nxt = tp->snd_una; + if (!callout_active(tp->tt_persist)) + tcp_setpersist(tp); + } + } + if (len > tp->t_maxseg) { + len = tp->t_maxseg; + sendalot = 1; + } + if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) + flags &= ~TH_FIN; + + win = sbspace(&so->so_rcv); + + /* + * Sender silly window avoidance. If connection is idle + * and can send all data, a maximum segment, + * at least a maximum default-size segment do it, + * or are forced, do it; otherwise don't bother. + * If peer's buffer is tiny, then send + * when window is at least half open. + * If retransmitting (possibly after persist timer forced us + * to send into a small window), then must resend. + */ + if (len) { + if (len == tp->t_maxseg) + goto send; + if (!(tp->t_flags & TF_MORETOCOME) && + (idle || tp->t_flags & TF_NODELAY) && + (tp->t_flags & TF_NOPUSH) == 0 && + len + off >= so->so_snd.sb_cc) + goto send; + if (tp->t_force) + goto send; + if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) + goto send; + if (SEQ_LT(tp->snd_nxt, tp->snd_max)) + goto send; + } + + /* + * Compare available window to amount of window + * known to peer (as advertised window less + * next expected input). If the difference is at least two + * max size segments, or at least 50% of the maximum possible + * window, then want to send a window update to peer. + */ + if (win > 0) { + /* + * "adv" is the amount we can increase the window, + * taking into account that we are limited by + * TCP_MAXWIN << tp->rcv_scale. + */ + long adv = min(win, (long)TCP_MAXWIN << tp->rcv_scale) - + (tp->rcv_adv - tp->rcv_nxt); + + if (adv >= (long) (2 * tp->t_maxseg)) + goto send; + if (2 * adv >= (long) so->so_rcv.sb_hiwat) + goto send; + } + + /* + * Send if we owe peer an ACK. + */ + if (tp->t_flags & TF_ACKNOW) + goto send; + if ((flags & TH_RST) || + ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) + goto send; + if (SEQ_GT(tp->snd_up, tp->snd_una)) + goto send; + /* + * If our state indicates that FIN should be sent + * and we have not yet done so, or we're retransmitting the FIN, + * then we need to send. + */ + if (flags & TH_FIN && + ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) + goto send; + + /* + * TCP window updates are not reliable, rather a polling protocol + * using ``persist'' packets is used to insure receipt of window + * updates. The three ``states'' for the output side are: + * idle not doing retransmits or persists + * persisting to move a small or zero window + * (re)transmitting and thereby not persisting + * + * callout_active(tp->tt_persist) + * is true when we are in persist state. + * tp->t_force + * is set when we are called to send a persist packet. + * callout_active(tp->tt_rexmt) + * is set when we are retransmitting + * The output side is idle when both timers are zero. + * + * If send window is too small, there is data to transmit, and no + * retransmit or persist is pending, then go to persist state. + * If nothing happens soon, send when timer expires: + * if window is nonzero, transmit what we can, + * otherwise force out a byte. + */ + if (so->so_snd.sb_cc && !callout_active(tp->tt_rexmt) && + !callout_active(tp->tt_persist)) { + tp->t_rxtshift = 0; + tcp_setpersist(tp); + } + + /* + * No reason to send a segment, just return. + */ + return (0); + +send: + /* + * Before ESTABLISHED, force sending of initial options + * unless TCP set not to do any options. + * NOTE: we assume that the IP/TCP header plus TCP options + * always fit in a single mbuf, leaving room for a maximum + * link header, i.e. + * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MHLEN + */ + optlen = 0; +#ifdef INET6 + if (isipv6) + hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr); + else +#endif + hdrlen = sizeof (struct tcpiphdr); + if (flags & TH_SYN) { + tp->snd_nxt = tp->iss; + if ((tp->t_flags & TF_NOOPT) == 0) { + u_short mss; + + opt[0] = TCPOPT_MAXSEG; + opt[1] = TCPOLEN_MAXSEG; + mss = htons((u_short) tcp_mssopt(tp)); + (void)memcpy(opt + 2, &mss, sizeof(mss)); + optlen = TCPOLEN_MAXSEG; + + if ((tp->t_flags & TF_REQ_SCALE) && + ((flags & TH_ACK) == 0 || + (tp->t_flags & TF_RCVD_SCALE))) { + *((u_int32_t *)(opt + optlen)) = htonl( + TCPOPT_NOP << 24 | + TCPOPT_WINDOW << 16 | + TCPOLEN_WINDOW << 8 | + tp->request_r_scale); + optlen += 4; + } + } + } + + /* + * Send a timestamp and echo-reply if this is a SYN and our side + * wants to use timestamps (TF_REQ_TSTMP is set) or both our side + * and our peer have sent timestamps in our SYN's. + */ + if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && + (flags & TH_RST) == 0 && + ((flags & TH_ACK) == 0 || + (tp->t_flags & TF_RCVD_TSTMP))) { + u_int32_t *lp = (u_int32_t *)(opt + optlen); + + /* Form timestamp option as shown in appendix A of RFC 1323. */ + *lp++ = htonl(TCPOPT_TSTAMP_HDR); + *lp++ = htonl(ticks); + *lp = htonl(tp->ts_recent); + optlen += TCPOLEN_TSTAMP_APPA; + } + + /* + * Send `CC-family' options if our side wants to use them (TF_REQ_CC), + * options are allowed (!TF_NOOPT) and it's not a RST. + */ + if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && + (flags & TH_RST) == 0) { + switch (flags & (TH_SYN|TH_ACK)) { + /* + * This is a normal ACK, send CC if we received CC before + * from our peer. + */ + case TH_ACK: + if (!(tp->t_flags & TF_RCVD_CC)) + break; + /*FALLTHROUGH*/ + + /* + * We can only get here in T/TCP's SYN_SENT* state, when + * we're a sending a non-SYN segment without waiting for + * the ACK of our SYN. A check above assures that we only + * do this if our peer understands T/TCP. + */ + case 0: + opt[optlen++] = TCPOPT_NOP; + opt[optlen++] = TCPOPT_NOP; + opt[optlen++] = TCPOPT_CC; + opt[optlen++] = TCPOLEN_CC; + *(u_int32_t *)&opt[optlen] = htonl(tp->cc_send); + + optlen += 4; + break; + + /* + * This is our initial SYN, check whether we have to use + * CC or CC.new. + */ + case TH_SYN: + opt[optlen++] = TCPOPT_NOP; + opt[optlen++] = TCPOPT_NOP; + opt[optlen++] = tp->t_flags & TF_SENDCCNEW ? + TCPOPT_CCNEW : TCPOPT_CC; + opt[optlen++] = TCPOLEN_CC; + *(u_int32_t *)&opt[optlen] = htonl(tp->cc_send); + optlen += 4; + break; + + /* + * This is a SYN,ACK; send CC and CC.echo if we received + * CC from our peer. + */ + case (TH_SYN|TH_ACK): + if (tp->t_flags & TF_RCVD_CC) { + opt[optlen++] = TCPOPT_NOP; + opt[optlen++] = TCPOPT_NOP; + opt[optlen++] = TCPOPT_CC; + opt[optlen++] = TCPOLEN_CC; + *(u_int32_t *)&opt[optlen] = + htonl(tp->cc_send); + optlen += 4; + opt[optlen++] = TCPOPT_NOP; + opt[optlen++] = TCPOPT_NOP; + opt[optlen++] = TCPOPT_CCECHO; + opt[optlen++] = TCPOLEN_CC; + *(u_int32_t *)&opt[optlen] = + htonl(tp->cc_recv); + optlen += 4; + } + break; + } + } + + hdrlen += optlen; + +#ifdef INET6 + if (isipv6) + ipoptlen = ip6_optlen(tp->t_inpcb); + else +#endif + { + if (tp->t_inpcb->inp_options) { + ipoptlen = tp->t_inpcb->inp_options->m_len - + offsetof(struct ipoption, ipopt_list); + } else { + ipoptlen = 0; + } + } +#ifdef IPSEC + ipoptlen += ipsec_hdrsiz_tcp(tp); +#endif + + /* + * Adjust data length if insertion of options will + * bump the packet length beyond the t_maxopd length. + * Clear the FIN bit because we cut off the tail of + * the segment. + */ + if (len + optlen + ipoptlen > tp->t_maxopd) { + /* + * If there is still more to send, don't close the connection. + */ + flags &= ~TH_FIN; + len = tp->t_maxopd - optlen - ipoptlen; + sendalot = 1; + } + +/*#ifdef DIAGNOSTIC*/ +#ifdef INET6 + if (max_linkhdr + hdrlen > MCLBYTES) + panic("tcphdr too big"); +#else + if (max_linkhdr + hdrlen > MHLEN) + panic("tcphdr too big"); +#endif +/*#endif*/ + + /* + * Grab a header mbuf, attaching a copy of data to + * be transmitted, and initialize the header from + * the template for sends on this connection. + */ + if (len) { + if (tp->t_force && len == 1) + tcpstat.tcps_sndprobe++; + else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { + tcpstat.tcps_sndrexmitpack++; + tcpstat.tcps_sndrexmitbyte += len; + } else { + tcpstat.tcps_sndpack++; + tcpstat.tcps_sndbyte += len; + } +#ifdef notyet + if ((m = m_copypack(so->so_snd.sb_mb, off, + (int)len, max_linkhdr + hdrlen)) == 0) { + error = ENOBUFS; + goto out; + } + /* + * m_copypack left space for our hdr; use it. + */ + m->m_len += hdrlen; + m->m_data -= hdrlen; +#else + MGETHDR(m, M_DONTWAIT, MT_HEADER); + if (m == NULL) { + error = ENOBUFS; + goto out; + } +#ifdef INET6 + if (MHLEN < hdrlen + max_linkhdr) { + MCLGET(m, M_DONTWAIT); + if ((m->m_flags & M_EXT) == 0) { + m_freem(m); + error = ENOBUFS; + goto out; + } + } +#endif + m->m_data += max_linkhdr; + m->m_len = hdrlen; + if (len <= MHLEN - hdrlen - max_linkhdr) { + m_copydata(so->so_snd.sb_mb, off, (int) len, + mtod(m, caddr_t) + hdrlen); + m->m_len += len; + } else { + m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len); + if (m->m_next == 0) { + (void) m_free(m); + error = ENOBUFS; + goto out; + } + } +#endif + /* + * If we're sending everything we've got, set PUSH. + * (This will keep happy those implementations which only + * give data to the user when a buffer fills or + * a PUSH comes in.) + */ + if (off + len == so->so_snd.sb_cc) + flags |= TH_PUSH; + } else { + if (tp->t_flags & TF_ACKNOW) + tcpstat.tcps_sndacks++; + else if (flags & (TH_SYN|TH_FIN|TH_RST)) + tcpstat.tcps_sndctrl++; + else if (SEQ_GT(tp->snd_up, tp->snd_una)) + tcpstat.tcps_sndurg++; + else + tcpstat.tcps_sndwinup++; + + MGETHDR(m, M_DONTWAIT, MT_HEADER); + if (m == NULL) { + error = ENOBUFS; + goto out; + } +#ifdef INET6 + if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && + MHLEN >= hdrlen) { + MH_ALIGN(m, hdrlen); + } else +#endif + m->m_data += max_linkhdr; + m->m_len = hdrlen; + } + m->m_pkthdr.rcvif = (struct ifnet *)0; + if (tp->t_template == 0) + panic("tcp_output"); +#ifdef INET6 + if (isipv6) { + ip6 = mtod(m, struct ip6_hdr *); + th = (struct tcphdr *)(ip6 + 1); + bcopy((caddr_t)tp->t_template->tt_ipgen, (caddr_t)ip6, + sizeof(struct ip6_hdr)); + bcopy((caddr_t)&tp->t_template->tt_t, (caddr_t)th, + sizeof(struct tcphdr)); + } else +#endif /* INET6 */ + { + ip = mtod(m, struct ip *); + ipov = (struct ipovly *)ip; + th = (struct tcphdr *)(ip + 1); + /* this picks up the pseudo header (w/o the length) */ + bcopy((caddr_t)tp->t_template->tt_ipgen, (caddr_t)ip, + sizeof(struct ip)); + bcopy((caddr_t)&tp->t_template->tt_t, (caddr_t)th, + sizeof(struct tcphdr)); + } + + /* + * Fill in fields, remembering maximum advertised + * window for use in delaying messages about window sizes. + * If resending a FIN, be sure not to use a new sequence number. + */ + if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && + tp->snd_nxt == tp->snd_max) + tp->snd_nxt--; + /* + * If we are doing retransmissions, then snd_nxt will + * not reflect the first unsent octet. For ACK only + * packets, we do not want the sequence number of the + * retransmitted packet, we want the sequence number + * of the next unsent octet. So, if there is no data + * (and no SYN or FIN), use snd_max instead of snd_nxt + * when filling in ti_seq. But if we are in persist + * state, snd_max might reflect one byte beyond the + * right edge of the window, so use snd_nxt in that + * case, since we know we aren't doing a retransmission. + * (retransmit and persist are mutually exclusive...) + */ + if (len || (flags & (TH_SYN|TH_FIN)) + || callout_active(tp->tt_persist)) + th->th_seq = htonl(tp->snd_nxt); + else + th->th_seq = htonl(tp->snd_max); + th->th_ack = htonl(tp->rcv_nxt); + if (optlen) { + bcopy(opt, th + 1, optlen); + th->th_off = (sizeof (struct tcphdr) + optlen) >> 2; + } + th->th_flags = flags; + /* + * Calculate receive window. Don't shrink window, + * but avoid silly window syndrome. + */ + if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg) + win = 0; + if (win < (long)(tp->rcv_adv - tp->rcv_nxt)) + win = (long)(tp->rcv_adv - tp->rcv_nxt); + if (win > (long)TCP_MAXWIN << tp->rcv_scale) + win = (long)TCP_MAXWIN << tp->rcv_scale; + th->th_win = htons((u_short) (win>>tp->rcv_scale)); + if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { + th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); + th->th_flags |= TH_URG; + } else + /* + * If no urgent pointer to send, then we pull + * the urgent pointer to the left edge of the send window + * so that it doesn't drift into the send window on sequence + * number wraparound. + */ + tp->snd_up = tp->snd_una; /* drag it along */ + + /* + * Put TCP length in extended header, and then + * checksum extended header and data. + */ + m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ +#ifdef INET6 + if (isipv6) + /* + * ip6_plen is not need to be filled now, and will be filled + * in ip6_output. + */ + th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), + sizeof(struct tcphdr) + optlen + len); + else +#endif /* INET6 */ + { + m->m_pkthdr.csum_flags = CSUM_TCP; + m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); + if (len + optlen) + th->th_sum = in_addword(th->th_sum, + htons((u_short)(optlen + len))); + + /* IP version must be set here for ipv4/ipv6 checking later */ + KASSERT(ip->ip_v == IPVERSION, + ("%s: IP version incorrect: %d", __FUNCTION__, ip->ip_v)); + } + + /* + * In transmit state, time the transmission and arrange for + * the retransmit. In persist state, just set snd_max. + */ + if (tp->t_force == 0 || !callout_active(tp->tt_persist)) { + tcp_seq startseq = tp->snd_nxt; + + /* + * Advance snd_nxt over sequence space of this segment. + */ + if (flags & (TH_SYN|TH_FIN)) { + if (flags & TH_SYN) + tp->snd_nxt++; + if (flags & TH_FIN) { + tp->snd_nxt++; + tp->t_flags |= TF_SENTFIN; + } + } + tp->snd_nxt += len; + if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { + tp->snd_max = tp->snd_nxt; + /* + * Time this transmission if not a retransmission and + * not currently timing anything. + */ + if (tp->t_rtttime == 0) { + tp->t_rtttime = ticks; + tp->t_rtseq = startseq; + tcpstat.tcps_segstimed++; + } + } + + /* + * Set retransmit timer if not currently set, + * and not doing an ack or a keep-alive probe. + * Initial value for retransmit timer is smoothed + * round-trip time + 2 * round-trip time variance. + * Initialize shift counter which is used for backoff + * of retransmit time. + */ + if (!callout_active(tp->tt_rexmt) && + tp->snd_nxt != tp->snd_una) { + if (callout_active(tp->tt_persist)) { + callout_stop(tp->tt_persist); + tp->t_rxtshift = 0; + } + callout_reset(tp->tt_rexmt, tp->t_rxtcur, + tcp_timer_rexmt, tp); + } + } else + if (SEQ_GT(tp->snd_nxt + len, tp->snd_max)) + tp->snd_max = tp->snd_nxt + len; + +#ifdef TCPDEBUG + /* + * Trace. + */ + if (so->so_options & SO_DEBUG) + tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); +#endif + + /* + * Fill in IP length and desired time to live and + * send to IP level. There should be a better way + * to handle ttl and tos; we could keep them in + * the template, but need a way to checksum without them. + */ + /* + * m->m_pkthdr.len should have been set before cksum calcuration, + * because in6_cksum() need it. + */ +#ifdef INET6 + if (isipv6) { + /* + * we separately set hoplimit for every segment, since the + * user might want to change the value via setsockopt. + * Also, desired default hop limit might be changed via + * Neighbor Discovery. + */ + ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb, + tp->t_inpcb->in6p_route.ro_rt ? + tp->t_inpcb->in6p_route.ro_rt->rt_ifp + : NULL); + + /* TODO: IPv6 IP6TOS_ECT bit on */ +#ifdef IPSEC + ipsec_setsocket(m, so); +#endif /*IPSEC*/ + error = ip6_output(m, + tp->t_inpcb->in6p_outputopts, + &tp->t_inpcb->in6p_route, + (so->so_options & SO_DONTROUTE), NULL, NULL); + } else +#endif /* INET6 */ + { + struct rtentry *rt; + ip->ip_len = m->m_pkthdr.len; +#ifdef INET6 + if (INP_CHECK_SOCKAF(so, AF_INET6)) + ip->ip_ttl = in6_selecthlim(tp->t_inpcb, + tp->t_inpcb->in6p_route.ro_rt ? + tp->t_inpcb->in6p_route.ro_rt->rt_ifp + : NULL); + else +#endif /* INET6 */ + ip->ip_ttl = tp->t_inpcb->inp_ip_ttl; /* XXX */ + ip->ip_tos = tp->t_inpcb->inp_ip_tos; /* XXX */ + /* + * See if we should do MTU discovery. We do it only if the following + * are true: + * 1) we have a valid route to the destination + * 2) the MTU is not locked (if it is, then discovery has been + * disabled) + */ + if (path_mtu_discovery + && (rt = tp->t_inpcb->inp_route.ro_rt) + && rt->rt_flags & RTF_UP + && !(rt->rt_rmx.rmx_locks & RTV_MTU)) { + ip->ip_off |= IP_DF; + } +#ifdef IPSEC + ipsec_setsocket(m, so); +#endif /*IPSEC*/ + error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route, + (so->so_options & SO_DONTROUTE), 0); + } + if (error) { + + /* + * We know that the packet was lost, so back out the + * sequence number advance, if any. + */ + if (tp->t_force == 0 || !callout_active(tp->tt_persist)) { + /* + * No need to check for TH_FIN here because + * the TF_SENTFIN flag handles that case. + */ + if ((flags & TH_SYN) == 0) + tp->snd_nxt -= len; + } + +out: + if (error == ENOBUFS) { + if (!callout_active(tp->tt_rexmt) && + !callout_active(tp->tt_persist)) + callout_reset(tp->tt_rexmt, tp->t_rxtcur, + tcp_timer_rexmt, tp); + tcp_quench(tp->t_inpcb, 0); + return (0); + } + if (error == EMSGSIZE) { + /* + * ip_output() will have already fixed the route + * for us. tcp_mtudisc() will, as its last action, + * initiate retransmission, so it is important to + * not do so here. + */ + tcp_mtudisc(tp->t_inpcb, 0); + return 0; + } + if ((error == EHOSTUNREACH || error == ENETDOWN) + && TCPS_HAVERCVDSYN(tp->t_state)) { + tp->t_softerror = error; + return (0); + } + return (error); + } + tcpstat.tcps_sndtotal++; + + /* + * Data sent (as far as we can tell). + * If this advertises a larger window than any other segment, + * then remember the size of the advertised window. + * Any pending ACK has now been sent. + */ + if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv)) + tp->rcv_adv = tp->rcv_nxt + win; + tp->last_ack_sent = tp->rcv_nxt; + tp->t_flags &= ~TF_ACKNOW; + if (tcp_delack_enabled) + callout_stop(tp->tt_delack); + if (sendalot && (!tcp_do_newreno || --maxburst)) + goto again; + return (0); +} + +void +tcp_setpersist(tp) + register struct tcpcb *tp; +{ + int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; + int tt; + + if (callout_active(tp->tt_rexmt)) + panic("tcp_setpersist: retransmit pending"); + /* + * Start/restart persistance timer. + */ + TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], + TCPTV_PERSMIN, TCPTV_PERSMAX); + callout_reset(tp->tt_persist, tt, tcp_timer_persist, tp); + if (tp->t_rxtshift < TCP_MAXRXTSHIFT) + tp->t_rxtshift++; +} diff --git a/sys/netinet/tcp_reass.c b/sys/netinet/tcp_reass.c new file mode 100644 index 0000000..bf578b7 --- /dev/null +++ b/sys/netinet/tcp_reass.c @@ -0,0 +1,2885 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 + * $FreeBSD$ + */ + +#include "opt_ipfw.h" /* for ipfw_fwd */ +#include "opt_inet6.h" +#include "opt_ipsec.h" +#include "opt_tcpdebug.h" +#include "opt_tcp_input.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/proc.h> /* for proc0 declaration */ +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/syslog.h> + +#include <machine/cpu.h> /* before tcp_seq.h, for tcp_random18() */ + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/ip_icmp.h> /* for ICMP_BANDLIM */ +#include <netinet/in_var.h> +#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */ +#include <netinet/in_pcb.h> +#include <netinet/ip_var.h> +#ifdef INET6 +#include <netinet/ip6.h> +#include <netinet/icmp6.h> +#include <netinet6/nd6.h> +#include <netinet6/ip6_var.h> +#include <netinet6/in6_pcb.h> +#endif +#include <netinet/tcp.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_seq.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#ifdef INET6 +#include <netinet6/tcp6_var.h> +#endif +#include <netinet/tcpip.h> +#ifdef TCPDEBUG +#include <netinet/tcp_debug.h> + +u_char tcp_saveipgen[40]; /* the size must be of max ip header, now IPv6 */ +struct tcphdr tcp_savetcp; +#endif /* TCPDEBUG */ + +#ifdef IPSEC +#include <netinet6/ipsec.h> +#ifdef INET6 +#include <netinet6/ipsec6.h> +#endif +#include <netkey/key.h> +#endif /*IPSEC*/ + +#include <machine/in_cksum.h> + +MALLOC_DEFINE(M_TSEGQ, "tseg_qent", "TCP segment queue entry"); + +static int tcprexmtthresh = 3; +tcp_seq tcp_iss; +tcp_cc tcp_ccgen; + +struct tcpstat tcpstat; +SYSCTL_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RD, + &tcpstat , tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); + +static int log_in_vain = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW, + &log_in_vain, 0, "Log all incoming TCP connections"); + +static int blackhole = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW, + &blackhole, 0, "Do not send RST when dropping refused connections"); + +int tcp_delack_enabled = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW, + &tcp_delack_enabled, 0, + "Delay ACK to try and piggyback it onto a data packet"); + +int tcp_lq_overflow = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_lq_overflow, CTLFLAG_RW, + &tcp_lq_overflow, 0, + "Listen Queue Overflow"); + +#ifdef TCP_DROP_SYNFIN +static int drop_synfin = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW, + &drop_synfin, 0, "Drop TCP packets with SYN+FIN set"); +#endif + +struct inpcbhead tcb; +#define tcb6 tcb /* for KAME src sync over BSD*'s */ +struct inpcbinfo tcbinfo; + +static void tcp_dooptions __P((struct tcpcb *, + u_char *, int, struct tcphdr *, struct tcpopt *)); +static void tcp_pulloutofband __P((struct socket *, + struct tcphdr *, struct mbuf *, int)); +static int tcp_reass __P((struct tcpcb *, struct tcphdr *, int *, + struct mbuf *)); +static void tcp_xmit_timer __P((struct tcpcb *, int)); +static int tcp_newreno __P((struct tcpcb *, struct tcphdr *)); + +/* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */ +#ifdef INET6 +#define ND6_HINT(tp) \ +do { \ + if ((tp) && (tp)->t_inpcb && \ + ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0 && \ + (tp)->t_inpcb->in6p_route.ro_rt) \ + nd6_nud_hint((tp)->t_inpcb->in6p_route.ro_rt, NULL, 0); \ +} while (0) +#else +#define ND6_HINT(tp) +#endif + +/* + * Indicate whether this ack should be delayed. + */ +#define DELAY_ACK(tp) \ + (tcp_delack_enabled && !callout_pending(tp->tt_delack)) + +/* + * Insert segment which inludes th into reassembly queue of tcp with + * control block tp. Return TH_FIN if reassembly now includes + * a segment with FIN. The macro form does the common case inline + * (segment is the next to be received on an established connection, + * and the queue is empty), avoiding linkage into and removal + * from the queue and repetition of various conversions. + * Set DELACK for segments received in order, but ack immediately + * when segments are out of order (so fast retransmit can work). + */ +#define TCP_REASS(tp, th, tlenp, m, so, flags) { \ + if ((th)->th_seq == (tp)->rcv_nxt && \ + LIST_EMPTY(&(tp)->t_segq) && \ + TCPS_HAVEESTABLISHED((tp)->t_state)) { \ + if (DELAY_ACK(tp)) \ + callout_reset(tp->tt_delack, tcp_delacktime, \ + tcp_timer_delack, tp); \ + else \ + tp->t_flags |= TF_ACKNOW; \ + (tp)->rcv_nxt += *(tlenp); \ + flags = (th)->th_flags & TH_FIN; \ + tcpstat.tcps_rcvpack++;\ + tcpstat.tcps_rcvbyte += *(tlenp);\ + ND6_HINT(tp); \ + sbappend(&(so)->so_rcv, (m)); \ + sorwakeup(so); \ + } else { \ + (flags) = tcp_reass((tp), (th), (tlenp), (m)); \ + tp->t_flags |= TF_ACKNOW; \ + } \ +} + +static int +tcp_reass(tp, th, tlenp, m) + register struct tcpcb *tp; + register struct tcphdr *th; + int *tlenp; + struct mbuf *m; +{ + struct tseg_qent *q; + struct tseg_qent *p = NULL; + struct tseg_qent *nq; + struct tseg_qent *te; + struct socket *so = tp->t_inpcb->inp_socket; + int flags; + + /* + * Call with th==0 after become established to + * force pre-ESTABLISHED data up to user socket. + */ + if (th == 0) + goto present; + + /* Allocate a new queue entry. If we can't, just drop the pkt. XXX */ + MALLOC(te, struct tseg_qent *, sizeof (struct tseg_qent), M_TSEGQ, + M_NOWAIT); + if (te == NULL) { + tcpstat.tcps_rcvmemdrop++; + m_freem(m); + return (0); + } + + /* + * Find a segment which begins after this one does. + */ + LIST_FOREACH(q, &tp->t_segq, tqe_q) { + if (SEQ_GT(q->tqe_th->th_seq, th->th_seq)) + break; + p = q; + } + + /* + * If there is a preceding segment, it may provide some of + * our data already. If so, drop the data from the incoming + * segment. If it provides all of our data, drop us. + */ + if (p != NULL) { + register int i; + /* conversion to int (in i) handles seq wraparound */ + i = p->tqe_th->th_seq + p->tqe_len - th->th_seq; + if (i > 0) { + if (i >= *tlenp) { + tcpstat.tcps_rcvduppack++; + tcpstat.tcps_rcvdupbyte += *tlenp; + m_freem(m); + FREE(te, M_TSEGQ); + /* + * Try to present any queued data + * at the left window edge to the user. + * This is needed after the 3-WHS + * completes. + */ + goto present; /* ??? */ + } + m_adj(m, i); + *tlenp -= i; + th->th_seq += i; + } + } + tcpstat.tcps_rcvoopack++; + tcpstat.tcps_rcvoobyte += *tlenp; + + /* + * While we overlap succeeding segments trim them or, + * if they are completely covered, dequeue them. + */ + while (q) { + register int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq; + if (i <= 0) + break; + if (i < q->tqe_len) { + q->tqe_th->th_seq += i; + q->tqe_len -= i; + m_adj(q->tqe_m, i); + break; + } + + nq = LIST_NEXT(q, tqe_q); + LIST_REMOVE(q, tqe_q); + m_freem(q->tqe_m); + FREE(q, M_TSEGQ); + q = nq; + } + + /* Insert the new segment queue entry into place. */ + te->tqe_m = m; + te->tqe_th = th; + te->tqe_len = *tlenp; + + if (p == NULL) { + LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q); + } else { + LIST_INSERT_AFTER(p, te, tqe_q); + } + +present: + /* + * Present data to user, advancing rcv_nxt through + * completed sequence space. + */ + if (!TCPS_HAVEESTABLISHED(tp->t_state)) + return (0); + q = LIST_FIRST(&tp->t_segq); + if (!q || q->tqe_th->th_seq != tp->rcv_nxt) + return (0); + do { + tp->rcv_nxt += q->tqe_len; + flags = q->tqe_th->th_flags & TH_FIN; + nq = LIST_NEXT(q, tqe_q); + LIST_REMOVE(q, tqe_q); + if (so->so_state & SS_CANTRCVMORE) + m_freem(q->tqe_m); + else + sbappend(&so->so_rcv, q->tqe_m); + FREE(q, M_TSEGQ); + q = nq; + } while (q && q->tqe_th->th_seq == tp->rcv_nxt); + ND6_HINT(tp); + sorwakeup(so); + return (flags); +} + +/* + * TCP input routine, follows pages 65-76 of the + * protocol specification dated September, 1981 very closely. + */ +#ifdef INET6 +int +tcp6_input(mp, offp, proto) + struct mbuf **mp; + int *offp, proto; +{ + register struct mbuf *m = *mp; + + IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE); + + /* + * draft-itojun-ipv6-tcp-to-anycast + * better place to put this in? + */ + if (m->m_flags & M_ANYCAST6) { + struct ip6_hdr *ip6; + + ip6 = mtod(m, struct ip6_hdr *); + icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, + (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); + return IPPROTO_DONE; + } + + tcp_input(m, *offp, proto); + return IPPROTO_DONE; +} +#endif + +void +tcp_input(m, off0, proto) + register struct mbuf *m; + int off0, proto; +{ + register struct tcphdr *th; + register struct ip *ip = NULL; + register struct ipovly *ipov; + register struct inpcb *inp; + u_char *optp = NULL; + int optlen = 0; + int len, tlen, off; + int drop_hdrlen; + register struct tcpcb *tp = 0; + register int thflags; + struct socket *so = 0; + int todrop, acked, ourfinisacked, needoutput = 0; + struct in_addr laddr; +#ifdef INET6 + struct in6_addr laddr6; +#endif + int dropsocket = 0; + int iss = 0; + u_long tiwin; + struct tcpopt to; /* options in this segment */ + struct rmxp_tao *taop; /* pointer to our TAO cache entry */ + struct rmxp_tao tao_noncached; /* in case there's no cached entry */ +#ifdef TCPDEBUG + short ostate = 0; +#endif +#ifdef INET6 + struct ip6_hdr *ip6 = NULL; + int isipv6; +#endif /* INET6 */ + int rstreason; /* For badport_bandlim accounting purposes */ + +#ifdef INET6 + isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; +#endif + bzero((char *)&to, sizeof(to)); + + tcpstat.tcps_rcvtotal++; + +#ifdef INET6 + if (isipv6) { + /* IP6_EXTHDR_CHECK() is already done at tcp6_input() */ + ip6 = mtod(m, struct ip6_hdr *); + tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; + if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) { + tcpstat.tcps_rcvbadsum++; + goto drop; + } + th = (struct tcphdr *)((caddr_t)ip6 + off0); + } else +#endif /* INET6 */ + { + /* + * Get IP and TCP header together in first mbuf. + * Note: IP leaves IP header in first mbuf. + */ + if (off0 > sizeof (struct ip)) { + ip_stripoptions(m, (struct mbuf *)0); + off0 = sizeof(struct ip); + } + if (m->m_len < sizeof (struct tcpiphdr)) { + if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) { + tcpstat.tcps_rcvshort++; + return; + } + } + ip = mtod(m, struct ip *); + ipov = (struct ipovly *)ip; + th = (struct tcphdr *)((caddr_t)ip + off0); + tlen = ip->ip_len; + + if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { + if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) + th->th_sum = m->m_pkthdr.csum_data; + else + th->th_sum = in_pseudo(ip->ip_src.s_addr, + ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data + + ip->ip_len + IPPROTO_TCP)); + th->th_sum ^= 0xffff; + } else { + /* + * Checksum extended TCP header and data. + */ + len = sizeof (struct ip) + tlen; + bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); + ipov->ih_len = (u_short)tlen; + HTONS(ipov->ih_len); + th->th_sum = in_cksum(m, len); + } + if (th->th_sum) { + tcpstat.tcps_rcvbadsum++; + goto drop; + } +#ifdef INET6 + /* Re-initialization for later version check */ + ip->ip_v = IPVERSION; +#endif + } + + /* + * Check that TCP offset makes sense, + * pull out TCP options and adjust length. XXX + */ + off = th->th_off << 2; + if (off < sizeof (struct tcphdr) || off > tlen) { + tcpstat.tcps_rcvbadoff++; + goto drop; + } + tlen -= off; /* tlen is used instead of ti->ti_len */ + if (off > sizeof (struct tcphdr)) { +#ifdef INET6 + if (isipv6) { + IP6_EXTHDR_CHECK(m, off0, off, ); + ip6 = mtod(m, struct ip6_hdr *); + th = (struct tcphdr *)((caddr_t)ip6 + off0); + } else +#endif /* INET6 */ + { + if (m->m_len < sizeof(struct ip) + off) { + if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) { + tcpstat.tcps_rcvshort++; + return; + } + ip = mtod(m, struct ip *); + ipov = (struct ipovly *)ip; + th = (struct tcphdr *)((caddr_t)ip + off0); + } + } + optlen = off - sizeof (struct tcphdr); + optp = (u_char *)(th + 1); + } + thflags = th->th_flags; + +#ifdef TCP_DROP_SYNFIN + /* + * If the drop_synfin option is enabled, drop all packets with + * both the SYN and FIN bits set. This prevents e.g. nmap from + * identifying the TCP/IP stack. + * + * This is a violation of the TCP specification. + */ + if (drop_synfin && (thflags & (TH_SYN|TH_FIN)) == (TH_SYN|TH_FIN)) + goto drop; +#endif + + /* + * Convert TCP protocol specific fields to host format. + */ + NTOHL(th->th_seq); + NTOHL(th->th_ack); + NTOHS(th->th_win); + NTOHS(th->th_urp); + + /* + * Delay droping TCP, IP headers, IPv6 ext headers, and TCP options, + * until after ip6_savecontrol() is called and before other functions + * which don't want those proto headers. + * Because ip6_savecontrol() is going to parse the mbuf to + * search for data to be passed up to user-land, it wants mbuf + * parameters to be unchanged. + */ + drop_hdrlen = off0 + off; + + /* + * Locate pcb for segment. + */ +findpcb: +#ifdef IPFIREWALL_FORWARD + if (ip_fw_fwd_addr != NULL +#ifdef INET6 + && isipv6 == NULL /* IPv6 support is not yet */ +#endif /* INET6 */ + ) { + /* + * Diverted. Pretend to be the destination. + * already got one like this? + */ + inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, + ip->ip_dst, th->th_dport, 0, m->m_pkthdr.rcvif); + if (!inp) { + /* + * No, then it's new. Try find the ambushing socket + */ + if (!ip_fw_fwd_addr->sin_port) { + inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, + th->th_sport, ip_fw_fwd_addr->sin_addr, + th->th_dport, 1, m->m_pkthdr.rcvif); + } else { + inp = in_pcblookup_hash(&tcbinfo, + ip->ip_src, th->th_sport, + ip_fw_fwd_addr->sin_addr, + ntohs(ip_fw_fwd_addr->sin_port), 1, + m->m_pkthdr.rcvif); + } + } + ip_fw_fwd_addr = NULL; + } else +#endif /* IPFIREWALL_FORWARD */ + { +#ifdef INET6 + if (isipv6) + inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_src, th->th_sport, + &ip6->ip6_dst, th->th_dport, 1, + m->m_pkthdr.rcvif); + else +#endif /* INET6 */ + inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, + ip->ip_dst, th->th_dport, 1, m->m_pkthdr.rcvif); + } + +#ifdef IPSEC +#ifdef INET6 + if (isipv6) { + if (inp != NULL && ipsec6_in_reject_so(m, inp->inp_socket)) { + ipsec6stat.in_polvio++; + goto drop; + } + } else +#endif /* INET6 */ + if (inp != NULL && ipsec4_in_reject_so(m, inp->inp_socket)) { + ipsecstat.in_polvio++; + goto drop; + } +#endif /*IPSEC*/ + + /* + * If the state is CLOSED (i.e., TCB does not exist) then + * all data in the incoming segment is discarded. + * If the TCB exists but is in CLOSED state, it is embryonic, + * but should either do a listen or a connect soon. + */ + if (inp == NULL) { + if (log_in_vain) { +#ifdef INET6 + char dbuf[INET6_ADDRSTRLEN], sbuf[INET6_ADDRSTRLEN]; +#else /* INET6 */ + char dbuf[4*sizeof "123"], sbuf[4*sizeof "123"]; +#endif /* INET6 */ + +#ifdef INET6 + if (isipv6) { + strcpy(dbuf, ip6_sprintf(&ip6->ip6_dst)); + strcpy(sbuf, ip6_sprintf(&ip6->ip6_src)); + } else +#endif + { + strcpy(dbuf, inet_ntoa(ip->ip_dst)); + strcpy(sbuf, inet_ntoa(ip->ip_src)); + } + switch (log_in_vain) { + case 1: + if(thflags & TH_SYN) + log(LOG_INFO, + "Connection attempt to TCP %s:%d from %s:%d\n", + dbuf, ntohs(th->th_dport), + sbuf, + ntohs(th->th_sport)); + break; + case 2: + log(LOG_INFO, + "Connection attempt to TCP %s:%d from %s:%d flags:0x%x\n", + dbuf, ntohs(th->th_dport), sbuf, + ntohs(th->th_sport), thflags); + break; + default: + break; + } + } + if (blackhole) { + switch (blackhole) { + case 1: + if (thflags & TH_SYN) + goto drop; + break; + case 2: + goto drop; + default: + goto drop; + } + } + rstreason = BANDLIM_RST_CLOSEDPORT; + goto dropwithreset; + } + tp = intotcpcb(inp); + if (tp == 0) { + rstreason = BANDLIM_RST_CLOSEDPORT; + goto dropwithreset; + } + if (tp->t_state == TCPS_CLOSED) + goto drop; + + /* Unscale the window into a 32-bit value. */ + if ((thflags & TH_SYN) == 0) + tiwin = th->th_win << tp->snd_scale; + else + tiwin = th->th_win; + +#ifdef INET6 + /* save packet options if user wanted */ + if (isipv6 && inp->in6p_flags & INP_CONTROLOPTS) { + if (inp->in6p_options) { + m_freem(inp->in6p_options); + inp->in6p_options = 0; + } + ip6_savecontrol(inp, &inp->in6p_options, ip6, m); + } + /* else, should also do ip_srcroute() here? */ +#endif /* INET6 */ + + so = inp->inp_socket; + if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) { +#ifdef TCPDEBUG + if (so->so_options & SO_DEBUG) { + ostate = tp->t_state; +#ifdef INET6 + if (isipv6) + bcopy((char *)ip6, (char *)tcp_saveipgen, + sizeof(*ip6)); + else +#endif /* INET6 */ + bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip)); + tcp_savetcp = *th; + } +#endif + if (so->so_options & SO_ACCEPTCONN) { + register struct tcpcb *tp0 = tp; + struct socket *so2; +#ifdef IPSEC + struct socket *oso; +#endif +#ifdef INET6 + struct inpcb *oinp = sotoinpcb(so); +#endif /* INET6 */ + +#ifndef IPSEC + /* + * Current IPsec implementation makes incorrect IPsec + * cache if this check is done here. + * So delay this until duplicated socket is created. + */ + if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { + /* + * Note: dropwithreset makes sure we don't + * send a RST in response to a RST. + */ + if (thflags & TH_ACK) { + tcpstat.tcps_badsyn++; + rstreason = BANDLIM_RST_OPENPORT; + goto dropwithreset; + } + goto drop; + } +#endif + so2 = sonewconn(so, 0); + if (so2 == 0) { + tcpstat.tcps_listendrop++; + so2 = sodropablereq(so); + if (so2) { + if (tcp_lq_overflow) + sototcpcb(so2)->t_flags |= + TF_LQ_OVERFLOW; + tcp_drop(sototcpcb(so2), ETIMEDOUT); + so2 = sonewconn(so, 0); + } + if (!so2) + goto drop; + } +#ifdef IPSEC + oso = so; +#endif + so = so2; + /* + * This is ugly, but .... + * + * Mark socket as temporary until we're + * committed to keeping it. The code at + * ``drop'' and ``dropwithreset'' check the + * flag dropsocket to see if the temporary + * socket created here should be discarded. + * We mark the socket as discardable until + * we're committed to it below in TCPS_LISTEN. + */ + dropsocket++; + inp = (struct inpcb *)so->so_pcb; +#ifdef INET6 + if (isipv6) + inp->in6p_laddr = ip6->ip6_dst; + else { + if ((inp->inp_flags & IN6P_BINDV6ONLY) == 0) { + inp->inp_vflag &= ~INP_IPV6; + inp->inp_vflag |= INP_IPV4; + } +#endif /* INET6 */ + inp->inp_laddr = ip->ip_dst; +#ifdef INET6 + } +#endif /* INET6 */ + inp->inp_lport = th->th_dport; + if (in_pcbinshash(inp) != 0) { + /* + * Undo the assignments above if we failed to + * put the PCB on the hash lists. + */ +#ifdef INET6 + if (isipv6) + inp->in6p_laddr = in6addr_any; + else +#endif /* INET6 */ + inp->inp_laddr.s_addr = INADDR_ANY; + inp->inp_lport = 0; + goto drop; + } +#ifdef IPSEC + /* + * To avoid creating incorrectly cached IPsec + * association, this is need to be done here. + * + * Subject: (KAME-snap 748) + * From: Wayne Knowles <w.knowles@niwa.cri.nz> + * ftp://ftp.kame.net/pub/mail-list/snap-users/748 + */ + if ((thflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { + /* + * Note: dropwithreset makes sure we don't + * send a RST in response to a RST. + */ + if (thflags & TH_ACK) { + tcpstat.tcps_badsyn++; + rstreason = BANDLIM_RST_OPENPORT; + goto dropwithreset; + } + goto drop; + } +#endif +#ifdef INET6 + if (isipv6) { + /* + * inherit socket options from the listening + * socket. + */ + inp->inp_flags |= + oinp->inp_flags & INP_CONTROLOPTS; + if (inp->inp_flags & INP_CONTROLOPTS) { + if (inp->in6p_options) { + m_freem(inp->in6p_options); + inp->in6p_options = 0; + } + ip6_savecontrol(inp, + &inp->in6p_options, + ip6, m); + } + } else +#endif /* INET6 */ + inp->inp_options = ip_srcroute(); +#ifdef IPSEC + /* copy old policy into new socket's */ + if (ipsec_copy_policy(sotoinpcb(oso)->inp_sp, + inp->inp_sp)) + printf("tcp_input: could not copy policy\n"); +#endif + tp = intotcpcb(inp); + tp->t_state = TCPS_LISTEN; + tp->t_flags |= tp0->t_flags & (TF_NOPUSH|TF_NOOPT); + + /* Compute proper scaling value from buffer space */ + while (tp->request_r_scale < TCP_MAX_WINSHIFT && + TCP_MAXWIN << tp->request_r_scale < + so->so_rcv.sb_hiwat) + tp->request_r_scale++; + } + } + + /* + * Segment received on connection. + * Reset idle time and keep-alive timer. + */ + tp->t_rcvtime = ticks; + if (TCPS_HAVEESTABLISHED(tp->t_state)) + callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); + + /* + * Process options if not in LISTEN state, + * else do it below (after getting remote address). + */ + if (tp->t_state != TCPS_LISTEN) + tcp_dooptions(tp, optp, optlen, th, &to); + + /* + * Header prediction: check for the two common cases + * of a uni-directional data xfer. If the packet has + * no control flags, is in-sequence, the window didn't + * change and we're not retransmitting, it's a + * candidate. If the length is zero and the ack moved + * forward, we're the sender side of the xfer. Just + * free the data acked & wake any higher level process + * that was blocked waiting for space. If the length + * is non-zero and the ack didn't move, we're the + * receiver side. If we're getting packets in-order + * (the reassembly queue is empty), add the data to + * the socket buffer and note that we need a delayed ack. + * Make sure that the hidden state-flags are also off. + * Since we check for TCPS_ESTABLISHED above, it can only + * be TH_NEEDSYN. + */ + if (tp->t_state == TCPS_ESTABLISHED && + (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && + ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && + ((to.to_flag & TOF_TS) == 0 || + TSTMP_GEQ(to.to_tsval, tp->ts_recent)) && + /* + * Using the CC option is compulsory if once started: + * the segment is OK if no T/TCP was negotiated or + * if the segment has a CC option equal to CCrecv + */ + ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) != (TF_REQ_CC|TF_RCVD_CC) || + ((to.to_flag & TOF_CC) != 0 && to.to_cc == tp->cc_recv)) && + th->th_seq == tp->rcv_nxt && + tiwin && tiwin == tp->snd_wnd && + tp->snd_nxt == tp->snd_max) { + + /* + * If last ACK falls within this segment's sequence numbers, + * record the timestamp. + * NOTE that the test is modified according to the latest + * proposal of the tcplw@cray.com list (Braden 1993/04/26). + */ + if ((to.to_flag & TOF_TS) != 0 && + SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { + tp->ts_recent_age = ticks; + tp->ts_recent = to.to_tsval; + } + + if (tlen == 0) { + if (SEQ_GT(th->th_ack, tp->snd_una) && + SEQ_LEQ(th->th_ack, tp->snd_max) && + tp->snd_cwnd >= tp->snd_wnd && + tp->t_dupacks < tcprexmtthresh) { + /* + * this is a pure ack for outstanding data. + */ + ++tcpstat.tcps_predack; + /* + * "bad retransmit" recovery + */ + if (tp->t_rxtshift == 1 && + ticks < tp->t_badrxtwin) { + tp->snd_cwnd = tp->snd_cwnd_prev; + tp->snd_ssthresh = + tp->snd_ssthresh_prev; + tp->snd_nxt = tp->snd_max; + tp->t_badrxtwin = 0; + } + if ((to.to_flag & TOF_TS) != 0) + tcp_xmit_timer(tp, + ticks - to.to_tsecr + 1); + else if (tp->t_rtttime && + SEQ_GT(th->th_ack, tp->t_rtseq)) + tcp_xmit_timer(tp, ticks - tp->t_rtttime); + acked = th->th_ack - tp->snd_una; + tcpstat.tcps_rcvackpack++; + tcpstat.tcps_rcvackbyte += acked; + sbdrop(&so->so_snd, acked); + tp->snd_una = th->th_ack; + m_freem(m); + ND6_HINT(tp); /* some progress has been done */ + + /* + * If all outstanding data are acked, stop + * retransmit timer, otherwise restart timer + * using current (possibly backed-off) value. + * If process is waiting for space, + * wakeup/selwakeup/signal. If data + * are ready to send, let tcp_output + * decide between more output or persist. + */ + if (tp->snd_una == tp->snd_max) + callout_stop(tp->tt_rexmt); + else if (!callout_active(tp->tt_persist)) + callout_reset(tp->tt_rexmt, + tp->t_rxtcur, + tcp_timer_rexmt, tp); + + sowwakeup(so); + if (so->so_snd.sb_cc) + (void) tcp_output(tp); + return; + } + } else if (th->th_ack == tp->snd_una && + LIST_EMPTY(&tp->t_segq) && + tlen <= sbspace(&so->so_rcv)) { + /* + * this is a pure, in-sequence data packet + * with nothing on the reassembly queue and + * we have enough buffer space to take it. + */ + ++tcpstat.tcps_preddat; + tp->rcv_nxt += tlen; + tcpstat.tcps_rcvpack++; + tcpstat.tcps_rcvbyte += tlen; + ND6_HINT(tp); /* some progress has been done */ + /* + * Add data to socket buffer. + */ + m_adj(m, drop_hdrlen); /* delayed header drop */ + sbappend(&so->so_rcv, m); + sorwakeup(so); + if (DELAY_ACK(tp)) { + callout_reset(tp->tt_delack, tcp_delacktime, + tcp_timer_delack, tp); + } else { + tp->t_flags |= TF_ACKNOW; + tcp_output(tp); + } + return; + } + } + + /* + * Calculate amount of space in receive window, + * and then do TCP input processing. + * Receive window is amount of space in rcv queue, + * but not less than advertised window. + */ + { int win; + + win = sbspace(&so->so_rcv); + if (win < 0) + win = 0; + tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); + } + + switch (tp->t_state) { + + /* + * If the state is LISTEN then ignore segment if it contains an RST. + * If the segment contains an ACK then it is bad and send a RST. + * If it does not contain a SYN then it is not interesting; drop it. + * If it is from this socket, drop it, it must be forged. + * Don't bother responding if the destination was a broadcast. + * Otherwise initialize tp->rcv_nxt, and tp->irs, select an initial + * tp->iss, and send a segment: + * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> + * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss. + * Fill in remote peer address fields if not previously specified. + * Enter SYN_RECEIVED state, and process any other fields of this + * segment in this state. + */ + case TCPS_LISTEN: { + register struct sockaddr_in *sin; +#ifdef INET6 + register struct sockaddr_in6 *sin6; +#endif + + if (thflags & TH_RST) + goto drop; + if (thflags & TH_ACK) { + rstreason = BANDLIM_RST_OPENPORT; + goto dropwithreset; + } + if ((thflags & TH_SYN) == 0) + goto drop; + if (th->th_dport == th->th_sport) { +#ifdef INET6 + if (isipv6) { + if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, + &ip6->ip6_src)) + goto drop; + } else +#endif /* INET6 */ + if (ip->ip_dst.s_addr == ip->ip_src.s_addr) + goto drop; + } + /* + * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN + * in_broadcast() should never return true on a received + * packet with M_BCAST not set. + * + * Packets with a multicast source address should also + * be discarded. + */ + if (m->m_flags & (M_BCAST|M_MCAST)) + goto drop; +#ifdef INET6 + if (isipv6) { + if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || + IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) + goto drop; + } else +#endif + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || + IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || + ip->ip_src.s_addr == htonl(INADDR_BROADCAST)) + goto drop; +#ifdef INET6 + if (isipv6) { + MALLOC(sin6, struct sockaddr_in6 *, sizeof *sin6, + M_SONAME, M_NOWAIT | M_ZERO); + if (sin6 == NULL) + goto drop; + sin6->sin6_family = AF_INET6; + sin6->sin6_len = sizeof(*sin6); + sin6->sin6_addr = ip6->ip6_src; + sin6->sin6_port = th->th_sport; + laddr6 = inp->in6p_laddr; + if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) + inp->in6p_laddr = ip6->ip6_dst; + if (in6_pcbconnect(inp, (struct sockaddr *)sin6, + &proc0)) { + inp->in6p_laddr = laddr6; + FREE(sin6, M_SONAME); + goto drop; + } + FREE(sin6, M_SONAME); + } else +#endif + { + MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME, + M_NOWAIT); + if (sin == NULL) + goto drop; + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + sin->sin_addr = ip->ip_src; + sin->sin_port = th->th_sport; + bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero)); + laddr = inp->inp_laddr; + if (inp->inp_laddr.s_addr == INADDR_ANY) + inp->inp_laddr = ip->ip_dst; + if (in_pcbconnect(inp, (struct sockaddr *)sin, &proc0)) { + inp->inp_laddr = laddr; + FREE(sin, M_SONAME); + goto drop; + } + FREE(sin, M_SONAME); + } + tp->t_template = tcp_template(tp); + if (tp->t_template == 0) { + tp = tcp_drop(tp, ENOBUFS); + dropsocket = 0; /* socket is already gone */ + goto drop; + } + if ((taop = tcp_gettaocache(inp)) == NULL) { + taop = &tao_noncached; + bzero(taop, sizeof(*taop)); + } + tcp_dooptions(tp, optp, optlen, th, &to); + if (iss) + tp->iss = iss; + else { + tp->iss = tcp_rndiss_next(); + } + tp->irs = th->th_seq; + tcp_sendseqinit(tp); + tcp_rcvseqinit(tp); + tp->snd_recover = tp->snd_una; + /* + * Initialization of the tcpcb for transaction; + * set SND.WND = SEG.WND, + * initialize CCsend and CCrecv. + */ + tp->snd_wnd = tiwin; /* initial send-window */ + tp->cc_send = CC_INC(tcp_ccgen); + tp->cc_recv = to.to_cc; + /* + * Perform TAO test on incoming CC (SEG.CC) option, if any. + * - compare SEG.CC against cached CC from the same host, + * if any. + * - if SEG.CC > chached value, SYN must be new and is accepted + * immediately: save new CC in the cache, mark the socket + * connected, enter ESTABLISHED state, turn on flag to + * send a SYN in the next segment. + * A virtual advertised window is set in rcv_adv to + * initialize SWS prevention. Then enter normal segment + * processing: drop SYN, process data and FIN. + * - otherwise do a normal 3-way handshake. + */ + if ((to.to_flag & TOF_CC) != 0) { + if (((tp->t_flags & TF_NOPUSH) != 0) && + taop->tao_cc != 0 && CC_GT(to.to_cc, taop->tao_cc)) { + + taop->tao_cc = to.to_cc; + tp->t_starttime = ticks; + tp->t_state = TCPS_ESTABLISHED; + + /* + * If there is a FIN, or if there is data and the + * connection is local, then delay SYN,ACK(SYN) in + * the hope of piggy-backing it on a response + * segment. Otherwise must send ACK now in case + * the other side is slow starting. + */ + if (DELAY_ACK(tp) && ((thflags & TH_FIN) || + (tlen != 0 && +#ifdef INET6 + ((isipv6 && in6_localaddr(&inp->in6p_faddr)) + || + (!isipv6 && +#endif + in_localaddr(inp->inp_faddr) +#ifdef INET6 + )) +#endif + ))) { + callout_reset(tp->tt_delack, tcp_delacktime, + tcp_timer_delack, tp); + tp->t_flags |= TF_NEEDSYN; + } else + tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); + + /* + * Limit the `virtual advertised window' to TCP_MAXWIN + * here. Even if we requested window scaling, it will + * become effective only later when our SYN is acked. + */ + tp->rcv_adv += min(tp->rcv_wnd, TCP_MAXWIN); + tcpstat.tcps_connects++; + soisconnected(so); + callout_reset(tp->tt_keep, tcp_keepinit, + tcp_timer_keep, tp); + dropsocket = 0; /* committed to socket */ + tcpstat.tcps_accepts++; + goto trimthenstep6; + } + /* else do standard 3-way handshake */ + } else { + /* + * No CC option, but maybe CC.NEW: + * invalidate cached value. + */ + taop->tao_cc = 0; + } + /* + * TAO test failed or there was no CC option, + * do a standard 3-way handshake. + */ + tp->t_flags |= TF_ACKNOW; + tp->t_state = TCPS_SYN_RECEIVED; + callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp); + dropsocket = 0; /* committed to socket */ + tcpstat.tcps_accepts++; + goto trimthenstep6; + } + + /* + * If the state is SYN_RECEIVED: + * if seg contains an ACK, but not for our SYN/ACK, send a RST. + */ + case TCPS_SYN_RECEIVED: + if ((thflags & TH_ACK) && + (SEQ_LEQ(th->th_ack, tp->snd_una) || + SEQ_GT(th->th_ack, tp->snd_max))) { + rstreason = BANDLIM_RST_OPENPORT; + goto dropwithreset; + } + break; + + /* + * If the state is SYN_SENT: + * if seg contains an ACK, but not for our SYN, drop the input. + * if seg contains a RST, then drop the connection. + * if seg does not contain SYN, then drop it. + * Otherwise this is an acceptable SYN segment + * initialize tp->rcv_nxt and tp->irs + * if seg contains ack then advance tp->snd_una + * if SYN has been acked change to ESTABLISHED else SYN_RCVD state + * arrange for segment to be acked (eventually) + * continue processing rest of data/controls, beginning with URG + */ + case TCPS_SYN_SENT: + if ((taop = tcp_gettaocache(inp)) == NULL) { + taop = &tao_noncached; + bzero(taop, sizeof(*taop)); + } + + if ((thflags & TH_ACK) && + (SEQ_LEQ(th->th_ack, tp->iss) || + SEQ_GT(th->th_ack, tp->snd_max))) { + /* + * If we have a cached CCsent for the remote host, + * hence we haven't just crashed and restarted, + * do not send a RST. This may be a retransmission + * from the other side after our earlier ACK was lost. + * Our new SYN, when it arrives, will serve as the + * needed ACK. + */ + if (taop->tao_ccsent != 0) + goto drop; + else { + rstreason = BANDLIM_UNLIMITED; + goto dropwithreset; + } + } + if (thflags & TH_RST) { + if (thflags & TH_ACK) + tp = tcp_drop(tp, ECONNREFUSED); + goto drop; + } + if ((thflags & TH_SYN) == 0) + goto drop; + tp->snd_wnd = th->th_win; /* initial send window */ + tp->cc_recv = to.to_cc; /* foreign CC */ + + tp->irs = th->th_seq; + tcp_rcvseqinit(tp); + if (thflags & TH_ACK) { + /* + * Our SYN was acked. If segment contains CC.ECHO + * option, check it to make sure this segment really + * matches our SYN. If not, just drop it as old + * duplicate, but send an RST if we're still playing + * by the old rules. If no CC.ECHO option, make sure + * we don't get fooled into using T/TCP. + */ + if (to.to_flag & TOF_CCECHO) { + if (tp->cc_send != to.to_ccecho) { + if (taop->tao_ccsent != 0) + goto drop; + else { + rstreason = BANDLIM_UNLIMITED; + goto dropwithreset; + } + } + } else + tp->t_flags &= ~TF_RCVD_CC; + tcpstat.tcps_connects++; + soisconnected(so); + /* Do window scaling on this connection? */ + if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == + (TF_RCVD_SCALE|TF_REQ_SCALE)) { + tp->snd_scale = tp->requested_s_scale; + tp->rcv_scale = tp->request_r_scale; + } + /* Segment is acceptable, update cache if undefined. */ + if (taop->tao_ccsent == 0) + taop->tao_ccsent = to.to_ccecho; + + tp->rcv_adv += tp->rcv_wnd; + tp->snd_una++; /* SYN is acked */ + /* + * If there's data, delay ACK; if there's also a FIN + * ACKNOW will be turned on later. + */ + if (DELAY_ACK(tp) && tlen != 0) + callout_reset(tp->tt_delack, tcp_delacktime, + tcp_timer_delack, tp); + else + tp->t_flags |= TF_ACKNOW; + /* + * Received <SYN,ACK> in SYN_SENT[*] state. + * Transitions: + * SYN_SENT --> ESTABLISHED + * SYN_SENT* --> FIN_WAIT_1 + */ + tp->t_starttime = ticks; + if (tp->t_flags & TF_NEEDFIN) { + tp->t_state = TCPS_FIN_WAIT_1; + tp->t_flags &= ~TF_NEEDFIN; + thflags &= ~TH_SYN; + } else { + tp->t_state = TCPS_ESTABLISHED; + callout_reset(tp->tt_keep, tcp_keepidle, + tcp_timer_keep, tp); + } + } else { + /* + * Received initial SYN in SYN-SENT[*] state => simul- + * taneous open. If segment contains CC option and there is + * a cached CC, apply TAO test; if it succeeds, connection is + * half-synchronized. Otherwise, do 3-way handshake: + * SYN-SENT -> SYN-RECEIVED + * SYN-SENT* -> SYN-RECEIVED* + * If there was no CC option, clear cached CC value. + */ + tp->t_flags |= TF_ACKNOW; + callout_stop(tp->tt_rexmt); + if (to.to_flag & TOF_CC) { + if (taop->tao_cc != 0 && + CC_GT(to.to_cc, taop->tao_cc)) { + /* + * update cache and make transition: + * SYN-SENT -> ESTABLISHED* + * SYN-SENT* -> FIN-WAIT-1* + */ + taop->tao_cc = to.to_cc; + tp->t_starttime = ticks; + if (tp->t_flags & TF_NEEDFIN) { + tp->t_state = TCPS_FIN_WAIT_1; + tp->t_flags &= ~TF_NEEDFIN; + } else { + tp->t_state = TCPS_ESTABLISHED; + callout_reset(tp->tt_keep, + tcp_keepidle, + tcp_timer_keep, + tp); + } + tp->t_flags |= TF_NEEDSYN; + } else + tp->t_state = TCPS_SYN_RECEIVED; + } else { + /* CC.NEW or no option => invalidate cache */ + taop->tao_cc = 0; + tp->t_state = TCPS_SYN_RECEIVED; + } + } + +trimthenstep6: + /* + * Advance th->th_seq to correspond to first data byte. + * If data, trim to stay within window, + * dropping FIN if necessary. + */ + th->th_seq++; + if (tlen > tp->rcv_wnd) { + todrop = tlen - tp->rcv_wnd; + m_adj(m, -todrop); + tlen = tp->rcv_wnd; + thflags &= ~TH_FIN; + tcpstat.tcps_rcvpackafterwin++; + tcpstat.tcps_rcvbyteafterwin += todrop; + } + tp->snd_wl1 = th->th_seq - 1; + tp->rcv_up = th->th_seq; + /* + * Client side of transaction: already sent SYN and data. + * If the remote host used T/TCP to validate the SYN, + * our data will be ACK'd; if so, enter normal data segment + * processing in the middle of step 5, ack processing. + * Otherwise, goto step 6. + */ + if (thflags & TH_ACK) + goto process_ACK; + goto step6; + /* + * If the state is LAST_ACK or CLOSING or TIME_WAIT: + * if segment contains a SYN and CC [not CC.NEW] option: + * if state == TIME_WAIT and connection duration > MSL, + * drop packet and send RST; + * + * if SEG.CC > CCrecv then is new SYN, and can implicitly + * ack the FIN (and data) in retransmission queue. + * Complete close and delete TCPCB. Then reprocess + * segment, hoping to find new TCPCB in LISTEN state; + * + * else must be old SYN; drop it. + * else do normal processing. + */ + case TCPS_LAST_ACK: + case TCPS_CLOSING: + case TCPS_TIME_WAIT: + if ((thflags & TH_SYN) && + (to.to_flag & TOF_CC) && tp->cc_recv != 0) { + if (tp->t_state == TCPS_TIME_WAIT && + (ticks - tp->t_starttime) > tcp_msl) { + rstreason = BANDLIM_UNLIMITED; + goto dropwithreset; + } + if (CC_GT(to.to_cc, tp->cc_recv)) { + tp = tcp_close(tp); + goto findpcb; + } + else + goto drop; + } + break; /* continue normal processing */ + } + + /* + * States other than LISTEN or SYN_SENT. + * First check the RST flag and sequence number since reset segments + * are exempt from the timestamp and connection count tests. This + * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix + * below which allowed reset segments in half the sequence space + * to fall though and be processed (which gives forged reset + * segments with a random sequence number a 50 percent chance of + * killing a connection). + * Then check timestamp, if present. + * Then check the connection count, if present. + * Then check that at least some bytes of segment are within + * receive window. If segment begins before rcv_nxt, + * drop leading data (and SYN); if nothing left, just ack. + * + * + * If the RST bit is set, check the sequence number to see + * if this is a valid reset segment. + * RFC 793 page 37: + * In all states except SYN-SENT, all reset (RST) segments + * are validated by checking their SEQ-fields. A reset is + * valid if its sequence number is in the window. + * Note: this does not take into account delayed ACKs, so + * we should test against last_ack_sent instead of rcv_nxt. + * The sequence number in the reset segment is normally an + * echo of our outgoing acknowlegement numbers, but some hosts + * send a reset with the sequence number at the rightmost edge + * of our receive window, and we have to handle this case. + * If we have multiple segments in flight, the intial reset + * segment sequence numbers will be to the left of last_ack_sent, + * but they will eventually catch up. + * In any case, it never made sense to trim reset segments to + * fit the receive window since RFC 1122 says: + * 4.2.2.12 RST Segment: RFC-793 Section 3.4 + * + * A TCP SHOULD allow a received RST segment to include data. + * + * DISCUSSION + * It has been suggested that a RST segment could contain + * ASCII text that encoded and explained the cause of the + * RST. No standard has yet been established for such + * data. + * + * If the reset segment passes the sequence number test examine + * the state: + * SYN_RECEIVED STATE: + * If passive open, return to LISTEN state. + * If active open, inform user that connection was refused. + * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES: + * Inform user that connection was reset, and close tcb. + * CLOSING, LAST_ACK STATES: + * Close the tcb. + * TIME_WAIT STATE: + * Drop the segment - see Stevens, vol. 2, p. 964 and + * RFC 1337. + */ + if (thflags & TH_RST) { + if (SEQ_GEQ(th->th_seq, tp->last_ack_sent) && + SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { + switch (tp->t_state) { + + case TCPS_SYN_RECEIVED: + so->so_error = ECONNREFUSED; + goto close; + + case TCPS_ESTABLISHED: + case TCPS_FIN_WAIT_1: + case TCPS_FIN_WAIT_2: + case TCPS_CLOSE_WAIT: + so->so_error = ECONNRESET; + close: + tp->t_state = TCPS_CLOSED; + tcpstat.tcps_drops++; + tp = tcp_close(tp); + break; + + case TCPS_CLOSING: + case TCPS_LAST_ACK: + tp = tcp_close(tp); + break; + + case TCPS_TIME_WAIT: + break; + } + } + goto drop; + } + + /* + * RFC 1323 PAWS: If we have a timestamp reply on this segment + * and it's less than ts_recent, drop it. + */ + if ((to.to_flag & TOF_TS) != 0 && tp->ts_recent && + TSTMP_LT(to.to_tsval, tp->ts_recent)) { + + /* Check to see if ts_recent is over 24 days old. */ + if ((int)(ticks - tp->ts_recent_age) > TCP_PAWS_IDLE) { + /* + * Invalidate ts_recent. If this segment updates + * ts_recent, the age will be reset later and ts_recent + * will get a valid value. If it does not, setting + * ts_recent to zero will at least satisfy the + * requirement that zero be placed in the timestamp + * echo reply when ts_recent isn't valid. The + * age isn't reset until we get a valid ts_recent + * because we don't want out-of-order segments to be + * dropped when ts_recent is old. + */ + tp->ts_recent = 0; + } else { + tcpstat.tcps_rcvduppack++; + tcpstat.tcps_rcvdupbyte += tlen; + tcpstat.tcps_pawsdrop++; + goto dropafterack; + } + } + + /* + * T/TCP mechanism + * If T/TCP was negotiated and the segment doesn't have CC, + * or if its CC is wrong then drop the segment. + * RST segments do not have to comply with this. + */ + if ((tp->t_flags & (TF_REQ_CC|TF_RCVD_CC)) == (TF_REQ_CC|TF_RCVD_CC) && + ((to.to_flag & TOF_CC) == 0 || tp->cc_recv != to.to_cc)) + goto dropafterack; + + /* + * In the SYN-RECEIVED state, validate that the packet belongs to + * this connection before trimming the data to fit the receive + * window. Check the sequence number versus IRS since we know + * the sequence numbers haven't wrapped. This is a partial fix + * for the "LAND" DoS attack. + */ + if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { + rstreason = BANDLIM_RST_OPENPORT; + goto dropwithreset; + } + + todrop = tp->rcv_nxt - th->th_seq; + if (todrop > 0) { + if (thflags & TH_SYN) { + thflags &= ~TH_SYN; + th->th_seq++; + if (th->th_urp > 1) + th->th_urp--; + else + thflags &= ~TH_URG; + todrop--; + } + /* + * Following if statement from Stevens, vol. 2, p. 960. + */ + if (todrop > tlen + || (todrop == tlen && (thflags & TH_FIN) == 0)) { + /* + * Any valid FIN must be to the left of the window. + * At this point the FIN must be a duplicate or out + * of sequence; drop it. + */ + thflags &= ~TH_FIN; + + /* + * Send an ACK to resynchronize and drop any data. + * But keep on processing for RST or ACK. + */ + tp->t_flags |= TF_ACKNOW; + todrop = tlen; + tcpstat.tcps_rcvduppack++; + tcpstat.tcps_rcvdupbyte += todrop; + } else { + tcpstat.tcps_rcvpartduppack++; + tcpstat.tcps_rcvpartdupbyte += todrop; + } + drop_hdrlen += todrop; /* drop from the top afterwards */ + th->th_seq += todrop; + tlen -= todrop; + if (th->th_urp > todrop) + th->th_urp -= todrop; + else { + thflags &= ~TH_URG; + th->th_urp = 0; + } + } + + /* + * If new data are received on a connection after the + * user processes are gone, then RST the other end. + */ + if ((so->so_state & SS_NOFDREF) && + tp->t_state > TCPS_CLOSE_WAIT && tlen) { + tp = tcp_close(tp); + tcpstat.tcps_rcvafterclose++; + rstreason = BANDLIM_UNLIMITED; + goto dropwithreset; + } + + /* + * If segment ends after window, drop trailing data + * (and PUSH and FIN); if nothing left, just ACK. + */ + todrop = (th->th_seq+tlen) - (tp->rcv_nxt+tp->rcv_wnd); + if (todrop > 0) { + tcpstat.tcps_rcvpackafterwin++; + if (todrop >= tlen) { + tcpstat.tcps_rcvbyteafterwin += tlen; + /* + * If a new connection request is received + * while in TIME_WAIT, drop the old connection + * and start over if the sequence numbers + * are above the previous ones. + */ + if (thflags & TH_SYN && + tp->t_state == TCPS_TIME_WAIT && + SEQ_GT(th->th_seq, tp->rcv_nxt)) { + iss = tcp_rndiss_next(); + tp = tcp_close(tp); + goto findpcb; + } + /* + * If window is closed can only take segments at + * window edge, and have to drop data and PUSH from + * incoming segments. Continue processing, but + * remember to ack. Otherwise, drop segment + * and ack. + */ + if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { + tp->t_flags |= TF_ACKNOW; + tcpstat.tcps_rcvwinprobe++; + } else + goto dropafterack; + } else + tcpstat.tcps_rcvbyteafterwin += todrop; + m_adj(m, -todrop); + tlen -= todrop; + thflags &= ~(TH_PUSH|TH_FIN); + } + + /* + * If last ACK falls within this segment's sequence numbers, + * record its timestamp. + * NOTE that the test is modified according to the latest + * proposal of the tcplw@cray.com list (Braden 1993/04/26). + */ + if ((to.to_flag & TOF_TS) != 0 && + SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { + tp->ts_recent_age = ticks; + tp->ts_recent = to.to_tsval; + } + + /* + * If a SYN is in the window, then this is an + * error and we send an RST and drop the connection. + */ + if (thflags & TH_SYN) { + tp = tcp_drop(tp, ECONNRESET); + rstreason = BANDLIM_UNLIMITED; + goto dropwithreset; + } + + /* + * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN + * flag is on (half-synchronized state), then queue data for + * later processing; else drop segment and return. + */ + if ((thflags & TH_ACK) == 0) { + if (tp->t_state == TCPS_SYN_RECEIVED || + (tp->t_flags & TF_NEEDSYN)) + goto step6; + else + goto drop; + } + + /* + * Ack processing. + */ + switch (tp->t_state) { + + /* + * In SYN_RECEIVED state, the ack ACKs our SYN, so enter + * ESTABLISHED state and continue processing. + * The ACK was checked above. + */ + case TCPS_SYN_RECEIVED: + + tcpstat.tcps_connects++; + soisconnected(so); + /* Do window scaling? */ + if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == + (TF_RCVD_SCALE|TF_REQ_SCALE)) { + tp->snd_scale = tp->requested_s_scale; + tp->rcv_scale = tp->request_r_scale; + } + /* + * Upon successful completion of 3-way handshake, + * update cache.CC if it was undefined, pass any queued + * data to the user, and advance state appropriately. + */ + if ((taop = tcp_gettaocache(inp)) != NULL && + taop->tao_cc == 0) + taop->tao_cc = tp->cc_recv; + + /* + * Make transitions: + * SYN-RECEIVED -> ESTABLISHED + * SYN-RECEIVED* -> FIN-WAIT-1 + */ + tp->t_starttime = ticks; + if (tp->t_flags & TF_NEEDFIN) { + tp->t_state = TCPS_FIN_WAIT_1; + tp->t_flags &= ~TF_NEEDFIN; + } else { + tp->t_state = TCPS_ESTABLISHED; + callout_reset(tp->tt_keep, tcp_keepidle, + tcp_timer_keep, tp); + } + /* + * If segment contains data or ACK, will call tcp_reass() + * later; if not, do so now to pass queued data to user. + */ + if (tlen == 0 && (thflags & TH_FIN) == 0) + (void) tcp_reass(tp, (struct tcphdr *)0, 0, + (struct mbuf *)0); + tp->snd_wl1 = th->th_seq - 1; + /* fall into ... */ + + /* + * In ESTABLISHED state: drop duplicate ACKs; ACK out of range + * ACKs. If the ack is in the range + * tp->snd_una < th->th_ack <= tp->snd_max + * then advance tp->snd_una to th->th_ack and drop + * data from the retransmission queue. If this ACK reflects + * more up to date window information we update our window information. + */ + case TCPS_ESTABLISHED: + case TCPS_FIN_WAIT_1: + case TCPS_FIN_WAIT_2: + case TCPS_CLOSE_WAIT: + case TCPS_CLOSING: + case TCPS_LAST_ACK: + case TCPS_TIME_WAIT: + + if (SEQ_LEQ(th->th_ack, tp->snd_una)) { + if (tlen == 0 && tiwin == tp->snd_wnd) { + tcpstat.tcps_rcvdupack++; + /* + * If we have outstanding data (other than + * a window probe), this is a completely + * duplicate ack (ie, window info didn't + * change), the ack is the biggest we've + * seen and we've seen exactly our rexmt + * threshhold of them, assume a packet + * has been dropped and retransmit it. + * Kludge snd_nxt & the congestion + * window so we send only this one + * packet. + * + * We know we're losing at the current + * window size so do congestion avoidance + * (set ssthresh to half the current window + * and pull our congestion window back to + * the new ssthresh). + * + * Dup acks mean that packets have left the + * network (they're now cached at the receiver) + * so bump cwnd by the amount in the receiver + * to keep a constant cwnd packets in the + * network. + */ + if (!callout_active(tp->tt_rexmt) || + th->th_ack != tp->snd_una) + tp->t_dupacks = 0; + else if (++tp->t_dupacks == tcprexmtthresh) { + tcp_seq onxt = tp->snd_nxt; + u_int win = + min(tp->snd_wnd, tp->snd_cwnd) / 2 / + tp->t_maxseg; + if (tcp_do_newreno && SEQ_LT(th->th_ack, + tp->snd_recover)) { + /* False retransmit, should not + * cut window + */ + tp->snd_cwnd += tp->t_maxseg; + tp->t_dupacks = 0; + (void) tcp_output(tp); + goto drop; + } + if (win < 2) + win = 2; + tp->snd_ssthresh = win * tp->t_maxseg; + tp->snd_recover = tp->snd_max; + callout_stop(tp->tt_rexmt); + tp->t_rtttime = 0; + tp->snd_nxt = th->th_ack; + tp->snd_cwnd = tp->t_maxseg; + (void) tcp_output(tp); + tp->snd_cwnd = tp->snd_ssthresh + + tp->t_maxseg * tp->t_dupacks; + if (SEQ_GT(onxt, tp->snd_nxt)) + tp->snd_nxt = onxt; + goto drop; + } else if (tp->t_dupacks > tcprexmtthresh) { + tp->snd_cwnd += tp->t_maxseg; + (void) tcp_output(tp); + goto drop; + } + } else + tp->t_dupacks = 0; + break; + } + /* + * If the congestion window was inflated to account + * for the other side's cached packets, retract it. + */ + if (tcp_do_newreno == 0) { + if (tp->t_dupacks >= tcprexmtthresh && + tp->snd_cwnd > tp->snd_ssthresh) + tp->snd_cwnd = tp->snd_ssthresh; + tp->t_dupacks = 0; + } else if (tp->t_dupacks >= tcprexmtthresh && + !tcp_newreno(tp, th)) { + /* + * Window inflation should have left us with approx. + * snd_ssthresh outstanding data. But in case we + * would be inclined to send a burst, better to do + * it via the slow start mechanism. + */ + if (SEQ_GT(th->th_ack + tp->snd_ssthresh, tp->snd_max)) + tp->snd_cwnd = + tp->snd_max - th->th_ack + tp->t_maxseg; + else + tp->snd_cwnd = tp->snd_ssthresh; + tp->t_dupacks = 0; + } + if (SEQ_GT(th->th_ack, tp->snd_max)) { + tcpstat.tcps_rcvacktoomuch++; + goto dropafterack; + } + /* + * If we reach this point, ACK is not a duplicate, + * i.e., it ACKs something we sent. + */ + if (tp->t_flags & TF_NEEDSYN) { + /* + * T/TCP: Connection was half-synchronized, and our + * SYN has been ACK'd (so connection is now fully + * synchronized). Go to non-starred state, + * increment snd_una for ACK of SYN, and check if + * we can do window scaling. + */ + tp->t_flags &= ~TF_NEEDSYN; + tp->snd_una++; + /* Do window scaling? */ + if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == + (TF_RCVD_SCALE|TF_REQ_SCALE)) { + tp->snd_scale = tp->requested_s_scale; + tp->rcv_scale = tp->request_r_scale; + } + } + +process_ACK: + acked = th->th_ack - tp->snd_una; + tcpstat.tcps_rcvackpack++; + tcpstat.tcps_rcvackbyte += acked; + + /* + * If we just performed our first retransmit, and the ACK + * arrives within our recovery window, then it was a mistake + * to do the retransmit in the first place. Recover our + * original cwnd and ssthresh, and proceed to transmit where + * we left off. + */ + if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) { + tp->snd_cwnd = tp->snd_cwnd_prev; + tp->snd_ssthresh = tp->snd_ssthresh_prev; + tp->snd_nxt = tp->snd_max; + tp->t_badrxtwin = 0; /* XXX probably not required */ + } + + /* + * If we have a timestamp reply, update smoothed + * round trip time. If no timestamp is present but + * transmit timer is running and timed sequence + * number was acked, update smoothed round trip time. + * Since we now have an rtt measurement, cancel the + * timer backoff (cf., Phil Karn's retransmit alg.). + * Recompute the initial retransmit timer. + */ + if (to.to_flag & TOF_TS) + tcp_xmit_timer(tp, ticks - to.to_tsecr + 1); + else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) + tcp_xmit_timer(tp, ticks - tp->t_rtttime); + + /* + * If all outstanding data is acked, stop retransmit + * timer and remember to restart (more output or persist). + * If there is more data to be acked, restart retransmit + * timer, using current (possibly backed-off) value. + */ + if (th->th_ack == tp->snd_max) { + callout_stop(tp->tt_rexmt); + needoutput = 1; + } else if (!callout_active(tp->tt_persist)) + callout_reset(tp->tt_rexmt, tp->t_rxtcur, + tcp_timer_rexmt, tp); + + /* + * If no data (only SYN) was ACK'd, + * skip rest of ACK processing. + */ + if (acked == 0) + goto step6; + + /* + * When new data is acked, open the congestion window. + * If the window gives us less than ssthresh packets + * in flight, open exponentially (maxseg per packet). + * Otherwise open linearly: maxseg per window + * (maxseg^2 / cwnd per packet). + */ + { + register u_int cw = tp->snd_cwnd; + register u_int incr = tp->t_maxseg; + + if (cw > tp->snd_ssthresh) + incr = incr * incr / cw; + /* + * If t_dupacks != 0 here, it indicates that we are still + * in NewReno fast recovery mode, so we leave the congestion + * window alone. + */ + if (tcp_do_newreno == 0 || tp->t_dupacks == 0) + tp->snd_cwnd = min(cw + incr,TCP_MAXWIN<<tp->snd_scale); + } + if (acked > so->so_snd.sb_cc) { + tp->snd_wnd -= so->so_snd.sb_cc; + sbdrop(&so->so_snd, (int)so->so_snd.sb_cc); + ourfinisacked = 1; + } else { + sbdrop(&so->so_snd, acked); + tp->snd_wnd -= acked; + ourfinisacked = 0; + } + sowwakeup(so); + tp->snd_una = th->th_ack; + if (SEQ_LT(tp->snd_nxt, tp->snd_una)) + tp->snd_nxt = tp->snd_una; + + switch (tp->t_state) { + + /* + * In FIN_WAIT_1 STATE in addition to the processing + * for the ESTABLISHED state if our FIN is now acknowledged + * then enter FIN_WAIT_2. + */ + case TCPS_FIN_WAIT_1: + if (ourfinisacked) { + /* + * If we can't receive any more + * data, then closing user can proceed. + * Starting the timer is contrary to the + * specification, but if we don't get a FIN + * we'll hang forever. + */ + if (so->so_state & SS_CANTRCVMORE) { + soisdisconnected(so); + callout_reset(tp->tt_2msl, tcp_maxidle, + tcp_timer_2msl, tp); + } + tp->t_state = TCPS_FIN_WAIT_2; + } + break; + + /* + * In CLOSING STATE in addition to the processing for + * the ESTABLISHED state if the ACK acknowledges our FIN + * then enter the TIME-WAIT state, otherwise ignore + * the segment. + */ + case TCPS_CLOSING: + if (ourfinisacked) { + tp->t_state = TCPS_TIME_WAIT; + tcp_canceltimers(tp); + /* Shorten TIME_WAIT [RFC-1644, p.28] */ + if (tp->cc_recv != 0 && + (ticks - tp->t_starttime) < tcp_msl) + callout_reset(tp->tt_2msl, + tp->t_rxtcur * + TCPTV_TWTRUNC, + tcp_timer_2msl, tp); + else + callout_reset(tp->tt_2msl, 2 * tcp_msl, + tcp_timer_2msl, tp); + soisdisconnected(so); + } + break; + + /* + * In LAST_ACK, we may still be waiting for data to drain + * and/or to be acked, as well as for the ack of our FIN. + * If our FIN is now acknowledged, delete the TCB, + * enter the closed state and return. + */ + case TCPS_LAST_ACK: + if (ourfinisacked) { + tp = tcp_close(tp); + goto drop; + } + break; + + /* + * In TIME_WAIT state the only thing that should arrive + * is a retransmission of the remote FIN. Acknowledge + * it and restart the finack timer. + */ + case TCPS_TIME_WAIT: + callout_reset(tp->tt_2msl, 2 * tcp_msl, + tcp_timer_2msl, tp); + goto dropafterack; + } + } + +step6: + /* + * Update window information. + * Don't look at window if no ACK: TAC's send garbage on first SYN. + */ + if ((thflags & TH_ACK) && + (SEQ_LT(tp->snd_wl1, th->th_seq) || + (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || + (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { + /* keep track of pure window updates */ + if (tlen == 0 && + tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) + tcpstat.tcps_rcvwinupd++; + tp->snd_wnd = tiwin; + tp->snd_wl1 = th->th_seq; + tp->snd_wl2 = th->th_ack; + if (tp->snd_wnd > tp->max_sndwnd) + tp->max_sndwnd = tp->snd_wnd; + needoutput = 1; + } + + /* + * Process segments with URG. + */ + if ((thflags & TH_URG) && th->th_urp && + TCPS_HAVERCVDFIN(tp->t_state) == 0) { + /* + * This is a kludge, but if we receive and accept + * random urgent pointers, we'll crash in + * soreceive. It's hard to imagine someone + * actually wanting to send this much urgent data. + */ + if (th->th_urp + so->so_rcv.sb_cc > sb_max) { + th->th_urp = 0; /* XXX */ + thflags &= ~TH_URG; /* XXX */ + goto dodata; /* XXX */ + } + /* + * If this segment advances the known urgent pointer, + * then mark the data stream. This should not happen + * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since + * a FIN has been received from the remote side. + * In these states we ignore the URG. + * + * According to RFC961 (Assigned Protocols), + * the urgent pointer points to the last octet + * of urgent data. We continue, however, + * to consider it to indicate the first octet + * of data past the urgent section as the original + * spec states (in one of two places). + */ + if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { + tp->rcv_up = th->th_seq + th->th_urp; + so->so_oobmark = so->so_rcv.sb_cc + + (tp->rcv_up - tp->rcv_nxt) - 1; + if (so->so_oobmark == 0) + so->so_state |= SS_RCVATMARK; + sohasoutofband(so); + tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); + } + /* + * Remove out of band data so doesn't get presented to user. + * This can happen independent of advancing the URG pointer, + * but if two URG's are pending at once, some out-of-band + * data may creep in... ick. + */ + if (th->th_urp <= (u_long)tlen +#ifdef SO_OOBINLINE + && (so->so_options & SO_OOBINLINE) == 0 +#endif + ) + tcp_pulloutofband(so, th, m, + drop_hdrlen); /* hdr drop is delayed */ + } else + /* + * If no out of band data is expected, + * pull receive urgent pointer along + * with the receive window. + */ + if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) + tp->rcv_up = tp->rcv_nxt; +dodata: /* XXX */ + + /* + * Process the segment text, merging it into the TCP sequencing queue, + * and arranging for acknowledgment of receipt if necessary. + * This process logically involves adjusting tp->rcv_wnd as data + * is presented to the user (this happens in tcp_usrreq.c, + * case PRU_RCVD). If a FIN has already been received on this + * connection then we just ignore the text. + */ + if ((tlen || (thflags&TH_FIN)) && + TCPS_HAVERCVDFIN(tp->t_state) == 0) { + m_adj(m, drop_hdrlen); /* delayed header drop */ + TCP_REASS(tp, th, &tlen, m, so, thflags); + /* + * Note the amount of data that peer has sent into + * our window, in order to estimate the sender's + * buffer size. + */ + len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); + } else { + m_freem(m); + thflags &= ~TH_FIN; + } + + /* + * If FIN is received ACK the FIN and let the user know + * that the connection is closing. + */ + if (thflags & TH_FIN) { + if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { + socantrcvmore(so); + /* + * If connection is half-synchronized + * (ie NEEDSYN flag on) then delay ACK, + * so it may be piggybacked when SYN is sent. + * Otherwise, since we received a FIN then no + * more input can be expected, send ACK now. + */ + if (DELAY_ACK(tp) && (tp->t_flags & TF_NEEDSYN)) + callout_reset(tp->tt_delack, tcp_delacktime, + tcp_timer_delack, tp); + else + tp->t_flags |= TF_ACKNOW; + tp->rcv_nxt++; + } + switch (tp->t_state) { + + /* + * In SYN_RECEIVED and ESTABLISHED STATES + * enter the CLOSE_WAIT state. + */ + case TCPS_SYN_RECEIVED: + tp->t_starttime = ticks; + /*FALLTHROUGH*/ + case TCPS_ESTABLISHED: + tp->t_state = TCPS_CLOSE_WAIT; + break; + + /* + * If still in FIN_WAIT_1 STATE FIN has not been acked so + * enter the CLOSING state. + */ + case TCPS_FIN_WAIT_1: + tp->t_state = TCPS_CLOSING; + break; + + /* + * In FIN_WAIT_2 state enter the TIME_WAIT state, + * starting the time-wait timer, turning off the other + * standard timers. + */ + case TCPS_FIN_WAIT_2: + tp->t_state = TCPS_TIME_WAIT; + tcp_canceltimers(tp); + /* Shorten TIME_WAIT [RFC-1644, p.28] */ + if (tp->cc_recv != 0 && + (ticks - tp->t_starttime) < tcp_msl) { + callout_reset(tp->tt_2msl, + tp->t_rxtcur * TCPTV_TWTRUNC, + tcp_timer_2msl, tp); + /* For transaction client, force ACK now. */ + tp->t_flags |= TF_ACKNOW; + } + else + callout_reset(tp->tt_2msl, 2 * tcp_msl, + tcp_timer_2msl, tp); + soisdisconnected(so); + break; + + /* + * In TIME_WAIT state restart the 2 MSL time_wait timer. + */ + case TCPS_TIME_WAIT: + callout_reset(tp->tt_2msl, 2 * tcp_msl, + tcp_timer_2msl, tp); + break; + } + } +#ifdef TCPDEBUG + if (so->so_options & SO_DEBUG) + tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, + &tcp_savetcp, 0); +#endif + + /* + * Return any desired output. + */ + if (needoutput || (tp->t_flags & TF_ACKNOW)) + (void) tcp_output(tp); + return; + +dropafterack: + /* + * Generate an ACK dropping incoming segment if it occupies + * sequence space, where the ACK reflects our state. + * + * We can now skip the test for the RST flag since all + * paths to this code happen after packets containing + * RST have been dropped. + * + * In the SYN-RECEIVED state, don't send an ACK unless the + * segment we received passes the SYN-RECEIVED ACK test. + * If it fails send a RST. This breaks the loop in the + * "LAND" DoS attack, and also prevents an ACK storm + * between two listening ports that have been sent forged + * SYN segments, each with the source address of the other. + */ + if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && + (SEQ_GT(tp->snd_una, th->th_ack) || + SEQ_GT(th->th_ack, tp->snd_max)) ) { + rstreason = BANDLIM_RST_OPENPORT; + goto dropwithreset; + } +#ifdef TCPDEBUG + if (so->so_options & SO_DEBUG) + tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, + &tcp_savetcp, 0); +#endif + m_freem(m); + tp->t_flags |= TF_ACKNOW; + (void) tcp_output(tp); + return; + +dropwithreset: + /* + * Generate a RST, dropping incoming segment. + * Make ACK acceptable to originator of segment. + * Don't bother to respond if destination was broadcast/multicast. + */ + if ((thflags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST)) + goto drop; +#ifdef INET6 + if (isipv6) { + if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || + IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) + goto drop; + } else +#endif /* INET6 */ + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || + IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || + ip->ip_src.s_addr == htonl(INADDR_BROADCAST)) + goto drop; + /* IPv6 anycast check is done at tcp6_input() */ + + /* + * Perform bandwidth limiting. + */ + if (badport_bandlim(rstreason) < 0) + goto drop; + +#ifdef TCPDEBUG + if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) + tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, + &tcp_savetcp, 0); +#endif + if (thflags & TH_ACK) + /* mtod() below is safe as long as hdr dropping is delayed */ + tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack, + TH_RST); + else { + if (thflags & TH_SYN) + tlen++; + /* mtod() below is safe as long as hdr dropping is delayed */ + tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen, + (tcp_seq)0, TH_RST|TH_ACK); + } + /* destroy temporarily created socket */ + if (dropsocket) + (void) soabort(so); + return; + +drop: + /* + * Drop space held by incoming segment and return. + */ +#ifdef TCPDEBUG + if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) + tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, + &tcp_savetcp, 0); +#endif + m_freem(m); + /* destroy temporarily created socket */ + if (dropsocket) + (void) soabort(so); + return; +} + +static void +tcp_dooptions(tp, cp, cnt, th, to) + struct tcpcb *tp; + u_char *cp; + int cnt; + struct tcphdr *th; + struct tcpopt *to; +{ + u_short mss = 0; + int opt, optlen; + + for (; cnt > 0; cnt -= optlen, cp += optlen) { + opt = cp[0]; + if (opt == TCPOPT_EOL) + break; + if (opt == TCPOPT_NOP) + optlen = 1; + else { + if (cnt < 2) + break; + optlen = cp[1]; + if (optlen < 2 || optlen > cnt) + break; + } + switch (opt) { + + default: + continue; + + case TCPOPT_MAXSEG: + if (optlen != TCPOLEN_MAXSEG) + continue; + if (!(th->th_flags & TH_SYN)) + continue; + bcopy((char *) cp + 2, (char *) &mss, sizeof(mss)); + NTOHS(mss); + break; + + case TCPOPT_WINDOW: + if (optlen != TCPOLEN_WINDOW) + continue; + if (!(th->th_flags & TH_SYN)) + continue; + tp->t_flags |= TF_RCVD_SCALE; + tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT); + break; + + case TCPOPT_TIMESTAMP: + if (optlen != TCPOLEN_TIMESTAMP) + continue; + to->to_flag |= TOF_TS; + bcopy((char *)cp + 2, + (char *)&to->to_tsval, sizeof(to->to_tsval)); + NTOHL(to->to_tsval); + bcopy((char *)cp + 6, + (char *)&to->to_tsecr, sizeof(to->to_tsecr)); + NTOHL(to->to_tsecr); + + /* + * A timestamp received in a SYN makes + * it ok to send timestamp requests and replies. + */ + if (th->th_flags & TH_SYN) { + tp->t_flags |= TF_RCVD_TSTMP; + tp->ts_recent = to->to_tsval; + tp->ts_recent_age = ticks; + } + break; + case TCPOPT_CC: + if (optlen != TCPOLEN_CC) + continue; + to->to_flag |= TOF_CC; + bcopy((char *)cp + 2, + (char *)&to->to_cc, sizeof(to->to_cc)); + NTOHL(to->to_cc); + /* + * A CC or CC.new option received in a SYN makes + * it ok to send CC in subsequent segments. + */ + if (th->th_flags & TH_SYN) + tp->t_flags |= TF_RCVD_CC; + break; + case TCPOPT_CCNEW: + if (optlen != TCPOLEN_CC) + continue; + if (!(th->th_flags & TH_SYN)) + continue; + to->to_flag |= TOF_CCNEW; + bcopy((char *)cp + 2, + (char *)&to->to_cc, sizeof(to->to_cc)); + NTOHL(to->to_cc); + /* + * A CC or CC.new option received in a SYN makes + * it ok to send CC in subsequent segments. + */ + tp->t_flags |= TF_RCVD_CC; + break; + case TCPOPT_CCECHO: + if (optlen != TCPOLEN_CC) + continue; + if (!(th->th_flags & TH_SYN)) + continue; + to->to_flag |= TOF_CCECHO; + bcopy((char *)cp + 2, + (char *)&to->to_ccecho, sizeof(to->to_ccecho)); + NTOHL(to->to_ccecho); + break; + } + } + if (th->th_flags & TH_SYN) + tcp_mss(tp, mss); /* sets t_maxseg */ +} + +/* + * Pull out of band byte out of a segment so + * it doesn't appear in the user's data queue. + * It is still reflected in the segment length for + * sequencing purposes. + */ +static void +tcp_pulloutofband(so, th, m, off) + struct socket *so; + struct tcphdr *th; + register struct mbuf *m; + int off; /* delayed to be droped hdrlen */ +{ + int cnt = off + th->th_urp - 1; + + while (cnt >= 0) { + if (m->m_len > cnt) { + char *cp = mtod(m, caddr_t) + cnt; + struct tcpcb *tp = sototcpcb(so); + + tp->t_iobc = *cp; + tp->t_oobflags |= TCPOOB_HAVEDATA; + bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); + m->m_len--; + if (m->m_flags & M_PKTHDR) + m->m_pkthdr.len--; + return; + } + cnt -= m->m_len; + m = m->m_next; + if (m == 0) + break; + } + panic("tcp_pulloutofband"); +} + +/* + * Collect new round-trip time estimate + * and update averages and current timeout. + */ +static void +tcp_xmit_timer(tp, rtt) + register struct tcpcb *tp; + int rtt; +{ + register int delta; + + tcpstat.tcps_rttupdated++; + tp->t_rttupdated++; + if (tp->t_srtt != 0) { + /* + * srtt is stored as fixed point with 5 bits after the + * binary point (i.e., scaled by 8). The following magic + * is equivalent to the smoothing algorithm in rfc793 with + * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed + * point). Adjust rtt to origin 0. + */ + delta = ((rtt - 1) << TCP_DELTA_SHIFT) + - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); + + if ((tp->t_srtt += delta) <= 0) + tp->t_srtt = 1; + + /* + * We accumulate a smoothed rtt variance (actually, a + * smoothed mean difference), then set the retransmit + * timer to smoothed rtt + 4 times the smoothed variance. + * rttvar is stored as fixed point with 4 bits after the + * binary point (scaled by 16). The following is + * equivalent to rfc793 smoothing with an alpha of .75 + * (rttvar = rttvar*3/4 + |delta| / 4). This replaces + * rfc793's wired-in beta. + */ + if (delta < 0) + delta = -delta; + delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); + if ((tp->t_rttvar += delta) <= 0) + tp->t_rttvar = 1; + } else { + /* + * No rtt measurement yet - use the unsmoothed rtt. + * Set the variance to half the rtt (so our first + * retransmit happens at 3*rtt). + */ + tp->t_srtt = rtt << TCP_RTT_SHIFT; + tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); + } + tp->t_rtttime = 0; + tp->t_rxtshift = 0; + + /* + * the retransmit should happen at rtt + 4 * rttvar. + * Because of the way we do the smoothing, srtt and rttvar + * will each average +1/2 tick of bias. When we compute + * the retransmit timer, we want 1/2 tick of rounding and + * 1 extra tick because of +-1/2 tick uncertainty in the + * firing of the timer. The bias will give us exactly the + * 1.5 tick we need. But, because the bias is + * statistical, we have to test that we don't drop below + * the minimum feasible timer (which is 2 ticks). + */ + TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), + max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); + + /* + * We received an ack for a packet that wasn't retransmitted; + * it is probably safe to discard any error indications we've + * received recently. This isn't quite right, but close enough + * for now (a route might have failed after we sent a segment, + * and the return path might not be symmetrical). + */ + tp->t_softerror = 0; +} + +/* + * Determine a reasonable value for maxseg size. + * If the route is known, check route for mtu. + * If none, use an mss that can be handled on the outgoing + * interface without forcing IP to fragment; if bigger than + * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES + * to utilize large mbufs. If no route is found, route has no mtu, + * or the destination isn't local, use a default, hopefully conservative + * size (usually 512 or the default IP max size, but no more than the mtu + * of the interface), as we can't discover anything about intervening + * gateways or networks. We also initialize the congestion/slow start + * window to be a single segment if the destination isn't local. + * While looking at the routing entry, we also initialize other path-dependent + * parameters from pre-set or cached values in the routing entry. + * + * Also take into account the space needed for options that we + * send regularly. Make maxseg shorter by that amount to assure + * that we can send maxseg amount of data even when the options + * are present. Store the upper limit of the length of options plus + * data in maxopd. + * + * NOTE that this routine is only called when we process an incoming + * segment, for outgoing segments only tcp_mssopt is called. + * + * In case of T/TCP, we call this routine during implicit connection + * setup as well (offer = -1), to initialize maxseg from the cached + * MSS of our peer. + */ +void +tcp_mss(tp, offer) + struct tcpcb *tp; + int offer; +{ + register struct rtentry *rt; + struct ifnet *ifp; + register int rtt, mss; + u_long bufsize; + struct inpcb *inp; + struct socket *so; + struct rmxp_tao *taop; + int origoffer = offer; +#ifdef INET6 + int isipv6; + int min_protoh; +#endif + + inp = tp->t_inpcb; +#ifdef INET6 + isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; + min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) + : sizeof (struct tcpiphdr); +#else +#define min_protoh (sizeof (struct tcpiphdr)) +#endif +#ifdef INET6 + if (isipv6) + rt = tcp_rtlookup6(inp); + else +#endif + rt = tcp_rtlookup(inp); + if (rt == NULL) { + tp->t_maxopd = tp->t_maxseg = +#ifdef INET6 + isipv6 ? tcp_v6mssdflt : +#endif /* INET6 */ + tcp_mssdflt; + return; + } + ifp = rt->rt_ifp; + so = inp->inp_socket; + + taop = rmx_taop(rt->rt_rmx); + /* + * Offer == -1 means that we didn't receive SYN yet, + * use cached value in that case; + */ + if (offer == -1) + offer = taop->tao_mssopt; + /* + * Offer == 0 means that there was no MSS on the SYN segment, + * in this case we use tcp_mssdflt. + */ + if (offer == 0) + offer = +#ifdef INET6 + isipv6 ? tcp_v6mssdflt : +#endif /* INET6 */ + tcp_mssdflt; + else + /* + * Sanity check: make sure that maxopd will be large + * enough to allow some data on segments even is the + * all the option space is used (40bytes). Otherwise + * funny things may happen in tcp_output. + */ + offer = max(offer, 64); + taop->tao_mssopt = offer; + + /* + * While we're here, check if there's an initial rtt + * or rttvar. Convert from the route-table units + * to scaled multiples of the slow timeout timer. + */ + if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) { + /* + * XXX the lock bit for RTT indicates that the value + * is also a minimum value; this is subject to time. + */ + if (rt->rt_rmx.rmx_locks & RTV_RTT) + tp->t_rttmin = rtt / (RTM_RTTUNIT / hz); + tp->t_srtt = rtt / (RTM_RTTUNIT / (hz * TCP_RTT_SCALE)); + tcpstat.tcps_usedrtt++; + if (rt->rt_rmx.rmx_rttvar) { + tp->t_rttvar = rt->rt_rmx.rmx_rttvar / + (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE)); + tcpstat.tcps_usedrttvar++; + } else { + /* default variation is +- 1 rtt */ + tp->t_rttvar = + tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; + } + TCPT_RANGESET(tp->t_rxtcur, + ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, + tp->t_rttmin, TCPTV_REXMTMAX); + } + /* + * if there's an mtu associated with the route, use it + * else, use the link mtu. + */ + if (rt->rt_rmx.rmx_mtu) + mss = rt->rt_rmx.rmx_mtu - min_protoh; + else + { + mss = +#ifdef INET6 + (isipv6 ? nd_ifinfo[rt->rt_ifp->if_index].linkmtu : +#endif + ifp->if_mtu +#ifdef INET6 + ) +#endif + - min_protoh; +#ifdef INET6 + if (isipv6) { + if (!in6_localaddr(&inp->in6p_faddr)) + mss = min(mss, tcp_v6mssdflt); + } else +#endif + if (!in_localaddr(inp->inp_faddr)) + mss = min(mss, tcp_mssdflt); + } + mss = min(mss, offer); + /* + * maxopd stores the maximum length of data AND options + * in a segment; maxseg is the amount of data in a normal + * segment. We need to store this value (maxopd) apart + * from maxseg, because now every segment carries options + * and thus we normally have somewhat less data in segments. + */ + tp->t_maxopd = mss; + + /* + * In case of T/TCP, origoffer==-1 indicates, that no segments + * were received yet. In this case we just guess, otherwise + * we do the same as before T/TCP. + */ + if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && + (origoffer == -1 || + (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)) + mss -= TCPOLEN_TSTAMP_APPA; + if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && + (origoffer == -1 || + (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC)) + mss -= TCPOLEN_CC_APPA; + +#if (MCLBYTES & (MCLBYTES - 1)) == 0 + if (mss > MCLBYTES) + mss &= ~(MCLBYTES-1); +#else + if (mss > MCLBYTES) + mss = mss / MCLBYTES * MCLBYTES; +#endif + /* + * If there's a pipesize, change the socket buffer + * to that size. Make the socket buffers an integral + * number of mss units; if the mss is larger than + * the socket buffer, decrease the mss. + */ +#ifdef RTV_SPIPE + if ((bufsize = rt->rt_rmx.rmx_sendpipe) == 0) +#endif + bufsize = so->so_snd.sb_hiwat; + if (bufsize < mss) + mss = bufsize; + else { + bufsize = roundup(bufsize, mss); + if (bufsize > sb_max) + bufsize = sb_max; + (void)sbreserve(&so->so_snd, bufsize, so, NULL); + } + tp->t_maxseg = mss; + +#ifdef RTV_RPIPE + if ((bufsize = rt->rt_rmx.rmx_recvpipe) == 0) +#endif + bufsize = so->so_rcv.sb_hiwat; + if (bufsize > mss) { + bufsize = roundup(bufsize, mss); + if (bufsize > sb_max) + bufsize = sb_max; + (void)sbreserve(&so->so_rcv, bufsize, so, NULL); + } + + /* + * Set the slow-start flight size depending on whether this + * is a local network or not. + */ + if ( +#ifdef INET6 + (isipv6 && in6_localaddr(&inp->in6p_faddr)) || + (!isipv6 && +#endif + in_localaddr(inp->inp_faddr) +#ifdef INET6 + ) +#endif + ) + tp->snd_cwnd = mss * ss_fltsz_local; + else + tp->snd_cwnd = mss * ss_fltsz; + + if (rt->rt_rmx.rmx_ssthresh) { + /* + * There's some sort of gateway or interface + * buffer limit on the path. Use this to set + * the slow start threshhold, but set the + * threshold to no less than 2*mss. + */ + tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh); + tcpstat.tcps_usedssthresh++; + } +} + +/* + * Determine the MSS option to send on an outgoing SYN. + */ +int +tcp_mssopt(tp) + struct tcpcb *tp; +{ + struct rtentry *rt; +#ifdef INET6 + int isipv6; + int min_protoh; +#endif + +#ifdef INET6 + isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) ? 1 : 0; + min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) + : sizeof (struct tcpiphdr); +#else +#define min_protoh (sizeof (struct tcpiphdr)) +#endif +#ifdef INET6 + if (isipv6) + rt = tcp_rtlookup6(tp->t_inpcb); + else +#endif /* INET6 */ + rt = tcp_rtlookup(tp->t_inpcb); + if (rt == NULL) + return +#ifdef INET6 + isipv6 ? tcp_v6mssdflt : +#endif /* INET6 */ + tcp_mssdflt; + + return rt->rt_ifp->if_mtu - min_protoh; +} + + +/* + * Checks for partial ack. If partial ack arrives, force the retransmission + * of the next unacknowledged segment, do not clear tp->t_dupacks, and return + * 1. By setting snd_nxt to ti_ack, this forces retransmission timer to + * be started again. If the ack advances at least to tp->snd_recover, return 0. + */ +static int +tcp_newreno(tp, th) + struct tcpcb *tp; + struct tcphdr *th; +{ + if (SEQ_LT(th->th_ack, tp->snd_recover)) { + tcp_seq onxt = tp->snd_nxt; + u_long ocwnd = tp->snd_cwnd; + + callout_stop(tp->tt_rexmt); + tp->t_rtttime = 0; + tp->snd_nxt = th->th_ack; + /* + * Set snd_cwnd to one segment beyond acknowledged offset + * (tp->snd_una has not yet been updated when this function + * is called) + */ + tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); + (void) tcp_output(tp); + tp->snd_cwnd = ocwnd; + if (SEQ_GT(onxt, tp->snd_nxt)) + tp->snd_nxt = onxt; + /* + * Partial window deflation. Relies on fact that tp->snd_una + * not updated yet. + */ + tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_maxseg); + return (1); + } + return (0); +} diff --git a/sys/netinet/tcp_seq.h b/sys/netinet/tcp_seq.h new file mode 100644 index 0000000..9d4adc8 --- /dev/null +++ b/sys/netinet/tcp_seq.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 1982, 1986, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_seq.h 8.3 (Berkeley) 6/21/95 + * $FreeBSD$ + */ + +#ifndef _NETINET_TCP_SEQ_H_ +#define _NETINET_TCP_SEQ_H_ +/* + * TCP sequence numbers are 32 bit integers operated + * on with modular arithmetic. These macros can be + * used to compare such integers. + */ +#define SEQ_LT(a,b) ((int)((a)-(b)) < 0) +#define SEQ_LEQ(a,b) ((int)((a)-(b)) <= 0) +#define SEQ_GT(a,b) ((int)((a)-(b)) > 0) +#define SEQ_GEQ(a,b) ((int)((a)-(b)) >= 0) + +/* for modulo comparisons of timestamps */ +#define TSTMP_LT(a,b) ((int)((a)-(b)) < 0) +#define TSTMP_GEQ(a,b) ((int)((a)-(b)) >= 0) + +/* + * TCP connection counts are 32 bit integers operated + * on with modular arithmetic. These macros can be + * used to compare such integers. + */ +#define CC_LT(a,b) ((int)((a)-(b)) < 0) +#define CC_LEQ(a,b) ((int)((a)-(b)) <= 0) +#define CC_GT(a,b) ((int)((a)-(b)) > 0) +#define CC_GEQ(a,b) ((int)((a)-(b)) >= 0) + +/* Macro to increment a CC: skip 0 which has a special meaning */ +#define CC_INC(c) (++(c) == 0 ? ++(c) : (c)) + +/* + * Macros to initialize tcp sequence numbers for + * send and receive from initial send and receive + * sequence numbers. + */ +#define tcp_rcvseqinit(tp) \ + (tp)->rcv_adv = (tp)->rcv_nxt = (tp)->irs + 1 + +#define tcp_sendseqinit(tp) \ + (tp)->snd_una = (tp)->snd_nxt = (tp)->snd_max = (tp)->snd_up = \ + (tp)->iss + +#define TCP_PAWS_IDLE (24 * 24 * 60 * 60 * hz) + /* timestamp wrap-around time */ + +#ifdef _KERNEL +extern tcp_cc tcp_ccgen; /* global connection count */ + +#else +#define TCP_ISSINCR (250*1024) /* increment for tcp_iss each second */ +#endif /* _KERNEL */ +#endif /* _NETINET_TCP_SEQ_H_ */ diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c new file mode 100644 index 0000000..4089551 --- /dev/null +++ b/sys/netinet/tcp_subr.c @@ -0,0 +1,1424 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 + * $FreeBSD$ + */ + +#include "opt_compat.h" +#include "opt_inet6.h" +#include "opt_ipsec.h" +#include "opt_tcpdebug.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/callout.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#ifdef INET6 +#include <sys/domain.h> +#endif +#include <sys/proc.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/protosw.h> +#include <sys/random.h> + +#include <vm/vm_zone.h> + +#include <net/route.h> +#include <net/if.h> + +#define _IP_VHL +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#ifdef INET6 +#include <netinet/ip6.h> +#endif +#include <netinet/in_pcb.h> +#ifdef INET6 +#include <netinet6/in6_pcb.h> +#endif +#include <netinet/in_var.h> +#include <netinet/ip_var.h> +#ifdef INET6 +#include <netinet6/ip6_var.h> +#endif +#include <netinet/tcp.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_seq.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#ifdef INET6 +#include <netinet6/tcp6_var.h> +#endif +#include <netinet/tcpip.h> +#ifdef TCPDEBUG +#include <netinet/tcp_debug.h> +#endif +#include <netinet6/ip6protosw.h> + +#ifdef IPSEC +#include <netinet6/ipsec.h> +#ifdef INET6 +#include <netinet6/ipsec6.h> +#endif +#endif /*IPSEC*/ + +#include <machine/in_cksum.h> + +int tcp_mssdflt = TCP_MSS; +SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW, + &tcp_mssdflt , 0, "Default TCP Maximum Segment Size"); + +#ifdef INET6 +int tcp_v6mssdflt = TCP6_MSS; +SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, + CTLFLAG_RW, &tcp_v6mssdflt , 0, + "Default TCP Maximum Segment Size for IPv6"); +#endif + +#if 0 +static int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ; +SYSCTL_INT(_net_inet_tcp, TCPCTL_RTTDFLT, rttdflt, CTLFLAG_RW, + &tcp_rttdflt , 0, "Default maximum TCP Round Trip Time"); +#endif + +static int tcp_do_rfc1323 = 1; +SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW, + &tcp_do_rfc1323 , 0, "Enable rfc1323 (high performance TCP) extensions"); + +static int tcp_do_rfc1644 = 0; +SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW, + &tcp_do_rfc1644 , 0, "Enable rfc1644 (TTCP) extensions"); + +static int tcp_tcbhashsize = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD, + &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); + +static int do_tcpdrain = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0, + "Enable tcp_drain routine for extra help when low on mbufs"); + +SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD, + &tcbinfo.ipi_count, 0, "Number of active PCBs"); + +static int icmp_may_rst = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, &icmp_may_rst, 0, + "Certain ICMP unreachable messages may abort connections in SYN_SENT"); + +static void tcp_cleartaocache __P((void)); +static void tcp_notify __P((struct inpcb *, int)); + +/* + * Target size of TCP PCB hash tables. Must be a power of two. + * + * Note that this can be overridden by the kernel environment + * variable net.inet.tcp.tcbhashsize + */ +#ifndef TCBHASHSIZE +#define TCBHASHSIZE 512 +#endif + +/* + * This is the actual shape of what we allocate using the zone + * allocator. Doing it this way allows us to protect both structures + * using the same generation count, and also eliminates the overhead + * of allocating tcpcbs separately. By hiding the structure here, + * we avoid changing most of the rest of the code (although it needs + * to be changed, eventually, for greater efficiency). + */ +#define ALIGNMENT 32 +#define ALIGNM1 (ALIGNMENT - 1) +struct inp_tp { + union { + struct inpcb inp; + char align[(sizeof(struct inpcb) + ALIGNM1) & ~ALIGNM1]; + } inp_tp_u; + struct tcpcb tcb; + struct callout inp_tp_rexmt, inp_tp_persist, inp_tp_keep, inp_tp_2msl; + struct callout inp_tp_delack; +}; +#undef ALIGNMENT +#undef ALIGNM1 + +/* + * Tcp initialization + */ +void +tcp_init() +{ + int hashsize; + + tcp_ccgen = 1; + tcp_cleartaocache(); + + tcp_delacktime = TCPTV_DELACK; + tcp_keepinit = TCPTV_KEEP_INIT; + tcp_keepidle = TCPTV_KEEP_IDLE; + tcp_keepintvl = TCPTV_KEEPINTVL; + tcp_maxpersistidle = TCPTV_KEEP_IDLE; + tcp_msl = TCPTV_MSL; + + LIST_INIT(&tcb); + tcbinfo.listhead = &tcb; + TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", TCBHASHSIZE, hashsize); + if (!powerof2(hashsize)) { + printf("WARNING: TCB hash size not a power of 2\n"); + hashsize = 512; /* safe default */ + } + tcp_tcbhashsize = hashsize; + tcbinfo.hashbase = hashinit(hashsize, M_PCB, &tcbinfo.hashmask); + tcbinfo.porthashbase = hashinit(hashsize, M_PCB, + &tcbinfo.porthashmask); + tcbinfo.ipi_zone = zinit("tcpcb", sizeof(struct inp_tp), maxsockets, + ZONE_INTERRUPT, 0); +#ifdef INET6 +#define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) +#else /* INET6 */ +#define TCP_MINPROTOHDR (sizeof(struct tcpiphdr)) +#endif /* INET6 */ + if (max_protohdr < TCP_MINPROTOHDR) + max_protohdr = TCP_MINPROTOHDR; + if (max_linkhdr + TCP_MINPROTOHDR > MHLEN) + panic("tcp_init"); +#undef TCP_MINPROTOHDR +} + +/* + * Create template to be used to send tcp packets on a connection. + * Call after host entry created, allocates an mbuf and fills + * in a skeletal tcp/ip header, minimizing the amount of work + * necessary when the connection is used. + */ +struct tcptemp * +tcp_template(tp) + struct tcpcb *tp; +{ + register struct inpcb *inp = tp->t_inpcb; + register struct mbuf *m; + register struct tcptemp *n; + + if ((n = tp->t_template) == 0) { + m = m_get(M_DONTWAIT, MT_HEADER); + if (m == NULL) + return (0); + m->m_len = sizeof (struct tcptemp); + n = mtod(m, struct tcptemp *); + } +#ifdef INET6 + if ((inp->inp_vflag & INP_IPV6) != 0) { + register struct ip6_hdr *ip6; + + ip6 = (struct ip6_hdr *)n->tt_ipgen; + ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | + (inp->in6p_flowinfo & IPV6_FLOWINFO_MASK); + ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | + (IPV6_VERSION & IPV6_VERSION_MASK); + ip6->ip6_nxt = IPPROTO_TCP; + ip6->ip6_plen = sizeof(struct tcphdr); + ip6->ip6_src = inp->in6p_laddr; + ip6->ip6_dst = inp->in6p_faddr; + n->tt_t.th_sum = 0; + } else +#endif + { + struct ip *ip = (struct ip *)n->tt_ipgen; + + bzero(ip, sizeof(struct ip)); /* XXX overkill? */ + ip->ip_vhl = IP_VHL_BORING; + ip->ip_p = IPPROTO_TCP; + ip->ip_src = inp->inp_laddr; + ip->ip_dst = inp->inp_faddr; + n->tt_t.th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, + htons(sizeof(struct tcphdr) + IPPROTO_TCP)); + } + n->tt_t.th_sport = inp->inp_lport; + n->tt_t.th_dport = inp->inp_fport; + n->tt_t.th_seq = 0; + n->tt_t.th_ack = 0; + n->tt_t.th_x2 = 0; + n->tt_t.th_off = 5; + n->tt_t.th_flags = 0; + n->tt_t.th_win = 0; + n->tt_t.th_urp = 0; + return (n); +} + +/* + * Send a single message to the TCP at address specified by + * the given TCP/IP header. If m == 0, then we make a copy + * of the tcpiphdr at ti and send directly to the addressed host. + * This is used to force keep alive messages out using the TCP + * template for a connection tp->t_template. If flags are given + * then we send a message back to the TCP which originated the + * segment ti, and discard the mbuf containing it and any other + * attached mbufs. + * + * In any case the ack and sequence number of the transmitted + * segment are as specified by the parameters. + * + * NOTE: If m != NULL, then ti must point to *inside* the mbuf. + */ +void +tcp_respond(tp, ipgen, th, m, ack, seq, flags) + struct tcpcb *tp; + void *ipgen; + register struct tcphdr *th; + register struct mbuf *m; + tcp_seq ack, seq; + int flags; +{ + register int tlen; + int win = 0; + struct route *ro = 0; + struct route sro; + struct ip *ip; + struct tcphdr *nth; +#ifdef INET6 + struct route_in6 *ro6 = 0; + struct route_in6 sro6; + struct ip6_hdr *ip6; + int isipv6; +#endif /* INET6 */ + int ipflags = 0; + +#ifdef INET6 + isipv6 = IP_VHL_V(((struct ip *)ipgen)->ip_vhl) == 6; + ip6 = ipgen; +#endif /* INET6 */ + ip = ipgen; + + if (tp) { + if (!(flags & TH_RST)) { + win = sbspace(&tp->t_inpcb->inp_socket->so_rcv); + if (win > (long)TCP_MAXWIN << tp->rcv_scale) + win = (long)TCP_MAXWIN << tp->rcv_scale; + } +#ifdef INET6 + if (isipv6) + ro6 = &tp->t_inpcb->in6p_route; + else +#endif /* INET6 */ + ro = &tp->t_inpcb->inp_route; + } else { +#ifdef INET6 + if (isipv6) { + ro6 = &sro6; + bzero(ro6, sizeof *ro6); + } else +#endif /* INET6 */ + { + ro = &sro; + bzero(ro, sizeof *ro); + } + } + if (m == 0) { + m = m_gethdr(M_DONTWAIT, MT_HEADER); + if (m == NULL) + return; + tlen = 0; + m->m_data += max_linkhdr; +#ifdef INET6 + if (isipv6) { + bcopy((caddr_t)ip6, mtod(m, caddr_t), + sizeof(struct ip6_hdr)); + ip6 = mtod(m, struct ip6_hdr *); + nth = (struct tcphdr *)(ip6 + 1); + } else +#endif /* INET6 */ + { + bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); + ip = mtod(m, struct ip *); + nth = (struct tcphdr *)(ip + 1); + } + bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); + flags = TH_ACK; + } else { + m_freem(m->m_next); + m->m_next = 0; + m->m_data = (caddr_t)ipgen; + /* m_len is set later */ + tlen = 0; +#define xchg(a,b,type) { type t; t=a; a=b; b=t; } +#ifdef INET6 + if (isipv6) { + xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); + nth = (struct tcphdr *)(ip6 + 1); + } else +#endif /* INET6 */ + { + xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long); + nth = (struct tcphdr *)(ip + 1); + } + if (th != nth) { + /* + * this is usually a case when an extension header + * exists between the IPv6 header and the + * TCP header. + */ + nth->th_sport = th->th_sport; + nth->th_dport = th->th_dport; + } + xchg(nth->th_dport, nth->th_sport, n_short); +#undef xchg + } +#ifdef INET6 + if (isipv6) { + ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) + + tlen)); + tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr); + } else +#endif + { + tlen += sizeof (struct tcpiphdr); + ip->ip_len = tlen; + ip->ip_ttl = ip_defttl; + } + m->m_len = tlen; + m->m_pkthdr.len = tlen; + m->m_pkthdr.rcvif = (struct ifnet *) 0; + nth->th_seq = htonl(seq); + nth->th_ack = htonl(ack); + nth->th_x2 = 0; + nth->th_off = sizeof (struct tcphdr) >> 2; + nth->th_flags = flags; + if (tp) + nth->th_win = htons((u_short) (win >> tp->rcv_scale)); + else + nth->th_win = htons((u_short)win); + nth->th_urp = 0; +#ifdef INET6 + if (isipv6) { + nth->th_sum = 0; + nth->th_sum = in6_cksum(m, IPPROTO_TCP, + sizeof(struct ip6_hdr), + tlen - sizeof(struct ip6_hdr)); + ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL, + ro6 && ro6->ro_rt ? + ro6->ro_rt->rt_ifp : + NULL); + } else +#endif /* INET6 */ + { + nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, + htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p))); + m->m_pkthdr.csum_flags = CSUM_TCP; + m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); + } +#ifdef TCPDEBUG + if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) + tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0); +#endif +#ifdef IPSEC + ipsec_setsocket(m, tp ? tp->t_inpcb->inp_socket : NULL); +#endif +#ifdef INET6 + if (isipv6) { + (void)ip6_output(m, NULL, ro6, ipflags, NULL, NULL); + if (ro6 == &sro6 && ro6->ro_rt) { + RTFREE(ro6->ro_rt); + ro6->ro_rt = NULL; + } + } else +#endif /* INET6 */ + { + (void) ip_output(m, NULL, ro, ipflags, NULL); + if (ro == &sro && ro->ro_rt) { + RTFREE(ro->ro_rt); + ro->ro_rt = NULL; + } + } +} + +/* + * Create a new TCP control block, making an + * empty reassembly queue and hooking it to the argument + * protocol control block. The `inp' parameter must have + * come from the zone allocator set up in tcp_init(). + */ +struct tcpcb * +tcp_newtcpcb(inp) + struct inpcb *inp; +{ + struct inp_tp *it; + register struct tcpcb *tp; +#ifdef INET6 + int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; +#endif /* INET6 */ + + it = (struct inp_tp *)inp; + tp = &it->tcb; + bzero((char *) tp, sizeof(struct tcpcb)); + LIST_INIT(&tp->t_segq); + tp->t_maxseg = tp->t_maxopd = +#ifdef INET6 + isipv6 ? tcp_v6mssdflt : +#endif /* INET6 */ + tcp_mssdflt; + + /* Set up our timeouts. */ + callout_init(tp->tt_rexmt = &it->inp_tp_rexmt, 0); + callout_init(tp->tt_persist = &it->inp_tp_persist, 0); + callout_init(tp->tt_keep = &it->inp_tp_keep, 0); + callout_init(tp->tt_2msl = &it->inp_tp_2msl, 0); + callout_init(tp->tt_delack = &it->inp_tp_delack, 0); + + if (tcp_do_rfc1323) + tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); + if (tcp_do_rfc1644) + tp->t_flags |= TF_REQ_CC; + tp->t_inpcb = inp; /* XXX */ + /* + * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no + * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives + * reasonable initial retransmit time. + */ + tp->t_srtt = TCPTV_SRTTBASE; + tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; + tp->t_rttmin = TCPTV_MIN; + tp->t_rxtcur = TCPTV_RTOBASE; + tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; + tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; + tp->t_rcvtime = ticks; + /* + * IPv4 TTL initialization is necessary for an IPv6 socket as well, + * because the socket may be bound to an IPv6 wildcard address, + * which may match an IPv4-mapped IPv6 address. + */ + inp->inp_ip_ttl = ip_defttl; + inp->inp_ppcb = (caddr_t)tp; + return (tp); /* XXX */ +} + +/* + * Drop a TCP connection, reporting + * the specified error. If connection is synchronized, + * then send a RST to peer. + */ +struct tcpcb * +tcp_drop(tp, errno) + register struct tcpcb *tp; + int errno; +{ + struct socket *so = tp->t_inpcb->inp_socket; + + if (TCPS_HAVERCVDSYN(tp->t_state)) { + tp->t_state = TCPS_CLOSED; + (void) tcp_output(tp); + tcpstat.tcps_drops++; + } else + tcpstat.tcps_conndrops++; + if (errno == ETIMEDOUT && tp->t_softerror) + errno = tp->t_softerror; + so->so_error = errno; + return (tcp_close(tp)); +} + +/* + * Close a TCP control block: + * discard all space held by the tcp + * discard internet protocol block + * wake up any sleepers + */ +struct tcpcb * +tcp_close(tp) + register struct tcpcb *tp; +{ + register struct tseg_qent *q; + struct inpcb *inp = tp->t_inpcb; + struct socket *so = inp->inp_socket; +#ifdef INET6 + int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; +#endif /* INET6 */ + register struct rtentry *rt; + int dosavessthresh; + + /* + * Make sure that all of our timers are stopped before we + * delete the PCB. + */ + callout_stop(tp->tt_rexmt); + callout_stop(tp->tt_persist); + callout_stop(tp->tt_keep); + callout_stop(tp->tt_2msl); + callout_stop(tp->tt_delack); + + /* + * If we got enough samples through the srtt filter, + * save the rtt and rttvar in the routing entry. + * 'Enough' is arbitrarily defined as the 16 samples. + * 16 samples is enough for the srtt filter to converge + * to within 5% of the correct value; fewer samples and + * we could save a very bogus rtt. + * + * Don't update the default route's characteristics and don't + * update anything that the user "locked". + */ + if (tp->t_rttupdated >= 16) { + register u_long i = 0; +#ifdef INET6 + if (isipv6) { + struct sockaddr_in6 *sin6; + + if ((rt = inp->in6p_route.ro_rt) == NULL) + goto no_valid_rt; + sin6 = (struct sockaddr_in6 *)rt_key(rt); + if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) + goto no_valid_rt; + } + else +#endif /* INET6 */ + if ((rt = inp->inp_route.ro_rt) == NULL || + ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr + == INADDR_ANY) + goto no_valid_rt; + + if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) { + i = tp->t_srtt * + (RTM_RTTUNIT / (hz * TCP_RTT_SCALE)); + if (rt->rt_rmx.rmx_rtt && i) + /* + * filter this update to half the old & half + * the new values, converting scale. + * See route.h and tcp_var.h for a + * description of the scaling constants. + */ + rt->rt_rmx.rmx_rtt = + (rt->rt_rmx.rmx_rtt + i) / 2; + else + rt->rt_rmx.rmx_rtt = i; + tcpstat.tcps_cachedrtt++; + } + if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) { + i = tp->t_rttvar * + (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE)); + if (rt->rt_rmx.rmx_rttvar && i) + rt->rt_rmx.rmx_rttvar = + (rt->rt_rmx.rmx_rttvar + i) / 2; + else + rt->rt_rmx.rmx_rttvar = i; + tcpstat.tcps_cachedrttvar++; + } + /* + * The old comment here said: + * update the pipelimit (ssthresh) if it has been updated + * already or if a pipesize was specified & the threshhold + * got below half the pipesize. I.e., wait for bad news + * before we start updating, then update on both good + * and bad news. + * + * But we want to save the ssthresh even if no pipesize is + * specified explicitly in the route, because such + * connections still have an implicit pipesize specified + * by the global tcp_sendspace. In the absence of a reliable + * way to calculate the pipesize, it will have to do. + */ + i = tp->snd_ssthresh; + if (rt->rt_rmx.rmx_sendpipe != 0) + dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2); + else + dosavessthresh = (i < so->so_snd.sb_hiwat / 2); + if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 && + i != 0 && rt->rt_rmx.rmx_ssthresh != 0) + || dosavessthresh) { + /* + * convert the limit from user data bytes to + * packets then to packet data bytes. + */ + i = (i + tp->t_maxseg / 2) / tp->t_maxseg; + if (i < 2) + i = 2; + i *= (u_long)(tp->t_maxseg + +#ifdef INET6 + (isipv6 ? sizeof (struct ip6_hdr) + + sizeof (struct tcphdr) : +#endif + sizeof (struct tcpiphdr) +#ifdef INET6 + ) +#endif + ); + if (rt->rt_rmx.rmx_ssthresh) + rt->rt_rmx.rmx_ssthresh = + (rt->rt_rmx.rmx_ssthresh + i) / 2; + else + rt->rt_rmx.rmx_ssthresh = i; + tcpstat.tcps_cachedssthresh++; + } + } + rt = inp->inp_route.ro_rt; + if (rt) { + /* + * mark route for deletion if no information is + * cached. + */ + if ((tp->t_flags & TF_LQ_OVERFLOW) && + ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0)){ + if (rt->rt_rmx.rmx_rtt == 0) + rt->rt_flags |= RTF_DELCLONE; + } + } + no_valid_rt: + /* free the reassembly queue, if any */ + while((q = LIST_FIRST(&tp->t_segq)) != NULL) { + LIST_REMOVE(q, tqe_q); + m_freem(q->tqe_m); + FREE(q, M_TSEGQ); + } + if (tp->t_template) + (void) m_free(dtom(tp->t_template)); + inp->inp_ppcb = NULL; + soisdisconnected(so); +#ifdef INET6 + if (INP_CHECK_SOCKAF(so, AF_INET6)) + in6_pcbdetach(inp); + else +#endif /* INET6 */ + in_pcbdetach(inp); + tcpstat.tcps_closed++; + return ((struct tcpcb *)0); +} + +void +tcp_drain() +{ + if (do_tcpdrain) + { + struct inpcb *inpb; + struct tcpcb *tcpb; + struct tseg_qent *te; + + /* + * Walk the tcpbs, if existing, and flush the reassembly queue, + * if there is one... + * XXX: The "Net/3" implementation doesn't imply that the TCP + * reassembly queue should be flushed, but in a situation + * where we're really low on mbufs, this is potentially + * usefull. + */ + LIST_FOREACH(inpb, tcbinfo.listhead, inp_list) { + if ((tcpb = intotcpcb(inpb))) { + while ((te = LIST_FIRST(&tcpb->t_segq)) + != NULL) { + LIST_REMOVE(te, tqe_q); + m_freem(te->tqe_m); + FREE(te, M_TSEGQ); + } + } + } + } +} + +/* + * Notify a tcp user of an asynchronous error; + * store error as soft error, but wake up user + * (for now, won't do anything until can select for soft error). + * + * Do not wake up user since there currently is no mechanism for + * reporting soft errors (yet - a kqueue filter may be added). + */ +static void +tcp_notify(inp, error) + struct inpcb *inp; + int error; +{ + struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb; + + /* + * Ignore some errors if we are hooked up. + * If connection hasn't completed, has retransmitted several times, + * and receives a second error, give up now. This is better + * than waiting a long time to establish a connection that + * can never complete. + */ + if (tp->t_state == TCPS_ESTABLISHED && + (error == EHOSTUNREACH || error == ENETUNREACH || + error == EHOSTDOWN)) { + return; + } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && + tp->t_softerror) + tcp_drop(tp, error); + else + tp->t_softerror = error; +#if 0 + wakeup((caddr_t) &so->so_timeo); + sorwakeup(so); + sowwakeup(so); +#endif +} + +static int +tcp_pcblist(SYSCTL_HANDLER_ARGS) +{ + int error, i, n, s; + struct inpcb *inp, **inp_list; + inp_gen_t gencnt; + struct xinpgen xig; + + /* + * The process of preparing the TCB list is too time-consuming and + * resource-intensive to repeat twice on every request. + */ + if (req->oldptr == 0) { + n = tcbinfo.ipi_count; + req->oldidx = 2 * (sizeof xig) + + (n + n/8) * sizeof(struct xtcpcb); + return 0; + } + + if (req->newptr != 0) + return EPERM; + + /* + * OK, now we're committed to doing something. + */ + s = splnet(); + gencnt = tcbinfo.ipi_gencnt; + n = tcbinfo.ipi_count; + splx(s); + + xig.xig_len = sizeof xig; + xig.xig_count = n; + xig.xig_gen = gencnt; + xig.xig_sogen = so_gencnt; + error = SYSCTL_OUT(req, &xig, sizeof xig); + if (error) + return error; + + inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); + if (inp_list == 0) + return ENOMEM; + + s = splnet(); + for (inp = LIST_FIRST(tcbinfo.listhead), i = 0; inp && i < n; + inp = LIST_NEXT(inp, inp_list)) { + if (inp->inp_gencnt <= gencnt && !prison_xinpcb(req->p, inp)) + inp_list[i++] = inp; + } + splx(s); + n = i; + + error = 0; + for (i = 0; i < n; i++) { + inp = inp_list[i]; + if (inp->inp_gencnt <= gencnt) { + struct xtcpcb xt; + caddr_t inp_ppcb; + xt.xt_len = sizeof xt; + /* XXX should avoid extra copy */ + bcopy(inp, &xt.xt_inp, sizeof *inp); + inp_ppcb = inp->inp_ppcb; + if (inp_ppcb != NULL) + bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp); + else + bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); + if (inp->inp_socket) + sotoxsocket(inp->inp_socket, &xt.xt_socket); + error = SYSCTL_OUT(req, &xt, sizeof xt); + } + } + if (!error) { + /* + * Give the user an updated idea of our state. + * If the generation differs from what we told + * her before, she knows that something happened + * while we were processing this request, and it + * might be necessary to retry. + */ + s = splnet(); + xig.xig_gen = tcbinfo.ipi_gencnt; + xig.xig_sogen = so_gencnt; + xig.xig_count = tcbinfo.ipi_count; + splx(s); + error = SYSCTL_OUT(req, &xig, sizeof xig); + } + free(inp_list, M_TEMP); + return error; +} + +SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0, + tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); + +static int +tcp_getcred(SYSCTL_HANDLER_ARGS) +{ + struct xucred xuc; + struct sockaddr_in addrs[2]; + struct inpcb *inp; + int error, s; + + error = suser(req->p); + if (error) + return (error); + error = SYSCTL_IN(req, addrs, sizeof(addrs)); + if (error) + return (error); + s = splnet(); + inp = in_pcblookup_hash(&tcbinfo, addrs[1].sin_addr, addrs[1].sin_port, + addrs[0].sin_addr, addrs[0].sin_port, 0, NULL); + if (inp == NULL || inp->inp_socket == NULL) { + error = ENOENT; + goto out; + } + bzero(&xuc, sizeof(xuc)); + xuc.cr_uid = inp->inp_socket->so_cred->cr_uid; + xuc.cr_ngroups = inp->inp_socket->so_cred->cr_ngroups; + bcopy(inp->inp_socket->so_cred->cr_groups, xuc.cr_groups, + sizeof(xuc.cr_groups)); + error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); +out: + splx(s); + return (error); +} + +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW, + 0, 0, tcp_getcred, "S,xucred", "Get the xucred of a TCP connection"); + +#ifdef INET6 +static int +tcp6_getcred(SYSCTL_HANDLER_ARGS) +{ + struct xucred xuc; + struct sockaddr_in6 addrs[2]; + struct inpcb *inp; + int error, s, mapped = 0; + + error = suser(req->p); + if (error) + return (error); + error = SYSCTL_IN(req, addrs, sizeof(addrs)); + if (error) + return (error); + if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) { + if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr)) + mapped = 1; + else + return (EINVAL); + } + s = splnet(); + if (mapped == 1) + inp = in_pcblookup_hash(&tcbinfo, + *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12], + addrs[1].sin6_port, + *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12], + addrs[0].sin6_port, + 0, NULL); + else + inp = in6_pcblookup_hash(&tcbinfo, &addrs[1].sin6_addr, + addrs[1].sin6_port, + &addrs[0].sin6_addr, addrs[0].sin6_port, + 0, NULL); + if (inp == NULL || inp->inp_socket == NULL) { + error = ENOENT; + goto out; + } + bzero(&xuc, sizeof(xuc)); + xuc.cr_uid = inp->inp_socket->so_cred->cr_uid; + xuc.cr_ngroups = inp->inp_socket->so_cred->cr_ngroups; + bcopy(inp->inp_socket->so_cred->cr_groups, xuc.cr_groups, + sizeof(xuc.cr_groups)); + error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); +out: + splx(s); + return (error); +} + +SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW, + 0, 0, + tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection"); +#endif + + +void +tcp_ctlinput(cmd, sa, vip) + int cmd; + struct sockaddr *sa; + void *vip; +{ + struct ip *ip = vip; + struct tcphdr *th; + struct in_addr faddr; + struct inpcb *inp; + struct tcpcb *tp; + void (*notify) __P((struct inpcb *, int)) = tcp_notify; + tcp_seq icmp_seq; + int s; + + faddr = ((struct sockaddr_in *)sa)->sin_addr; + if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) + return; + + if (cmd == PRC_QUENCH) + notify = tcp_quench; + else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || + cmd == PRC_UNREACH_PORT) && ip) + notify = tcp_drop_syn_sent; + else if (cmd == PRC_MSGSIZE) + notify = tcp_mtudisc; + else if (PRC_IS_REDIRECT(cmd)) { + ip = 0; + notify = in_rtchange; + } else if (cmd == PRC_HOSTDEAD) + ip = 0; + else if ((unsigned)cmd > PRC_NCMDS || inetctlerrmap[cmd] == 0) + return; + if (ip) { + s = splnet(); + th = (struct tcphdr *)((caddr_t)ip + + (IP_VHL_HL(ip->ip_vhl) << 2)); + inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport, + ip->ip_src, th->th_sport, 0, NULL); + if (inp != NULL && inp->inp_socket != NULL) { + icmp_seq = htonl(th->th_seq); + tp = intotcpcb(inp); + if (SEQ_GEQ(icmp_seq, tp->snd_una) && + SEQ_LT(icmp_seq, tp->snd_max)) + (*notify)(inp, inetctlerrmap[cmd]); + } + splx(s); + } else + in_pcbnotifyall(&tcb, faddr, inetctlerrmap[cmd], notify); +} + +#ifdef INET6 +void +tcp6_ctlinput(cmd, sa, d) + int cmd; + struct sockaddr *sa; + void *d; +{ + register struct tcphdr *thp; + struct tcphdr th; + void (*notify) __P((struct inpcb *, int)) = tcp_notify; + struct sockaddr_in6 sa6; + struct ip6_hdr *ip6; + struct mbuf *m; + int off; + + if (sa->sa_family != AF_INET6 || + sa->sa_len != sizeof(struct sockaddr_in6)) + return; + + if (cmd == PRC_QUENCH) + notify = tcp_quench; + else if (cmd == PRC_MSGSIZE) + notify = tcp_mtudisc; + else if (!PRC_IS_REDIRECT(cmd) && + ((unsigned)cmd > PRC_NCMDS || inet6ctlerrmap[cmd] == 0)) + return; + + /* if the parameter is from icmp6, decode it. */ + if (d != NULL) { + struct ip6ctlparam *ip6cp = (struct ip6ctlparam *)d; + m = ip6cp->ip6c_m; + ip6 = ip6cp->ip6c_ip6; + off = ip6cp->ip6c_off; + } else { + m = NULL; + ip6 = NULL; + off = 0; /* fool gcc */ + } + + /* + * Translate addresses into internal form. + * Sa check if it is AF_INET6 is done at the top of this funciton. + */ + sa6 = *(struct sockaddr_in6 *)sa; + if (IN6_IS_ADDR_LINKLOCAL(&sa6.sin6_addr) != 0 && m != NULL && + m->m_pkthdr.rcvif != NULL) + sa6.sin6_addr.s6_addr16[1] = htons(m->m_pkthdr.rcvif->if_index); + + if (ip6) { + /* + * XXX: We assume that when IPV6 is non NULL, + * M and OFF are valid. + */ + struct in6_addr s; + + /* translate addresses into internal form */ + memcpy(&s, &ip6->ip6_src, sizeof(s)); + if (IN6_IS_ADDR_LINKLOCAL(&s) != 0 && m != NULL && + m->m_pkthdr.rcvif != NULL) + s.s6_addr16[1] = htons(m->m_pkthdr.rcvif->if_index); + + /* check if we can safely examine src and dst ports */ + if (m->m_pkthdr.len < off + sizeof(th)) + return; + + if (m->m_len < off + sizeof(th)) { + /* + * this should be rare case + * because now MINCLSIZE is "(MHLEN + 1)", + * so we compromise on this copy... + */ + m_copydata(m, off, sizeof(th), (caddr_t)&th); + thp = &th; + } else + thp = (struct tcphdr *)(mtod(m, caddr_t) + off); + in6_pcbnotify(&tcb, (struct sockaddr *)&sa6, thp->th_dport, + &s, thp->th_sport, cmd, notify); + } else + in6_pcbnotify(&tcb, (struct sockaddr *)&sa6, 0, &zeroin6_addr, + 0, cmd, notify); +} +#endif /* INET6 */ + +#define TCP_RNDISS_ROUNDS 16 +#define TCP_RNDISS_OUT 7200 +#define TCP_RNDISS_MAX 30000 + +u_int8_t tcp_rndiss_sbox[128]; +u_int16_t tcp_rndiss_msb; +u_int16_t tcp_rndiss_cnt; +long tcp_rndiss_reseed; + +u_int16_t +tcp_rndiss_encrypt(val) + u_int16_t val; +{ + u_int16_t sum = 0, i; + + for (i = 0; i < TCP_RNDISS_ROUNDS; i++) { + sum += 0x79b9; + val ^= ((u_int16_t)tcp_rndiss_sbox[(val^sum) & 0x7f]) << 7; + val = ((val & 0xff) << 7) | (val >> 8); + } + + return val; +} + +void +tcp_rndiss_init() +{ + struct timeval time; + + getmicrotime(&time); + read_random(tcp_rndiss_sbox, sizeof(tcp_rndiss_sbox)); + + tcp_rndiss_reseed = time.tv_sec + TCP_RNDISS_OUT; + tcp_rndiss_msb = tcp_rndiss_msb == 0x8000 ? 0 : 0x8000; + tcp_rndiss_cnt = 0; +} + +tcp_seq +tcp_rndiss_next() +{ + u_int16_t tmp; + struct timeval time; + + getmicrotime(&time); + + if (tcp_rndiss_cnt >= TCP_RNDISS_MAX || + time.tv_sec > tcp_rndiss_reseed) + tcp_rndiss_init(); + + read_random(&tmp, sizeof(tmp)); + + /* (tmp & 0x7fff) ensures a 32768 byte gap between ISS */ + return ((tcp_rndiss_encrypt(tcp_rndiss_cnt++) | tcp_rndiss_msb) <<16) | + (tmp & 0x7fff); +} + + +/* + * When a source quench is received, close congestion window + * to one segment. We will gradually open it again as we proceed. + */ +void +tcp_quench(inp, errno) + struct inpcb *inp; + int errno; +{ + struct tcpcb *tp = intotcpcb(inp); + + if (tp) + tp->snd_cwnd = tp->t_maxseg; +} + +/* + * When a specific ICMP unreachable message is received and the + * connection state is SYN-SENT, drop the connection. This behavior + * is controlled by the icmp_may_rst sysctl. + */ +void +tcp_drop_syn_sent(inp, errno) + struct inpcb *inp; + int errno; +{ + struct tcpcb *tp = intotcpcb(inp); + + if (tp && tp->t_state == TCPS_SYN_SENT) + tcp_drop(tp, errno); +} + +/* + * When `need fragmentation' ICMP is received, update our idea of the MSS + * based on the new value in the route. Also nudge TCP to send something, + * since we know the packet we just sent was dropped. + * This duplicates some code in the tcp_mss() function in tcp_input.c. + */ +void +tcp_mtudisc(inp, errno) + struct inpcb *inp; + int errno; +{ + struct tcpcb *tp = intotcpcb(inp); + struct rtentry *rt; + struct rmxp_tao *taop; + struct socket *so = inp->inp_socket; + int offered; + int mss; +#ifdef INET6 + int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; +#endif /* INET6 */ + + if (tp) { +#ifdef INET6 + if (isipv6) + rt = tcp_rtlookup6(inp); + else +#endif /* INET6 */ + rt = tcp_rtlookup(inp); + if (!rt || !rt->rt_rmx.rmx_mtu) { + tp->t_maxopd = tp->t_maxseg = +#ifdef INET6 + isipv6 ? tcp_v6mssdflt : +#endif /* INET6 */ + tcp_mssdflt; + return; + } + taop = rmx_taop(rt->rt_rmx); + offered = taop->tao_mssopt; + mss = rt->rt_rmx.rmx_mtu - +#ifdef INET6 + (isipv6 ? + sizeof(struct ip6_hdr) + sizeof(struct tcphdr) : +#endif /* INET6 */ + sizeof(struct tcpiphdr) +#ifdef INET6 + ) +#endif /* INET6 */ + ; + + if (offered) + mss = min(mss, offered); + /* + * XXX - The above conditional probably violates the TCP + * spec. The problem is that, since we don't know the + * other end's MSS, we are supposed to use a conservative + * default. But, if we do that, then MTU discovery will + * never actually take place, because the conservative + * default is much less than the MTUs typically seen + * on the Internet today. For the moment, we'll sweep + * this under the carpet. + * + * The conservative default might not actually be a problem + * if the only case this occurs is when sending an initial + * SYN with options and data to a host we've never talked + * to before. Then, they will reply with an MSS value which + * will get recorded and the new parameters should get + * recomputed. For Further Study. + */ + if (tp->t_maxopd <= mss) + return; + tp->t_maxopd = mss; + + if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && + (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) + mss -= TCPOLEN_TSTAMP_APPA; + if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && + (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC) + mss -= TCPOLEN_CC_APPA; +#if (MCLBYTES & (MCLBYTES - 1)) == 0 + if (mss > MCLBYTES) + mss &= ~(MCLBYTES-1); +#else + if (mss > MCLBYTES) + mss = mss / MCLBYTES * MCLBYTES; +#endif + if (so->so_snd.sb_hiwat < mss) + mss = so->so_snd.sb_hiwat; + + tp->t_maxseg = mss; + + tcpstat.tcps_mturesent++; + tp->t_rtttime = 0; + tp->snd_nxt = tp->snd_una; + tcp_output(tp); + } +} + +/* + * Look-up the routing entry to the peer of this inpcb. If no route + * is found and it cannot be allocated the return NULL. This routine + * is called by TCP routines that access the rmx structure and by tcp_mss + * to get the interface MTU. + */ +struct rtentry * +tcp_rtlookup(inp) + struct inpcb *inp; +{ + struct route *ro; + struct rtentry *rt; + + ro = &inp->inp_route; + rt = ro->ro_rt; + if (rt == NULL || !(rt->rt_flags & RTF_UP)) { + /* No route yet, so try to acquire one */ + if (inp->inp_faddr.s_addr != INADDR_ANY) { + ro->ro_dst.sa_family = AF_INET; + ro->ro_dst.sa_len = sizeof(ro->ro_dst); + ((struct sockaddr_in *) &ro->ro_dst)->sin_addr = + inp->inp_faddr; + rtalloc(ro); + rt = ro->ro_rt; + } + } + return rt; +} + +#ifdef INET6 +struct rtentry * +tcp_rtlookup6(inp) + struct inpcb *inp; +{ + struct route_in6 *ro6; + struct rtentry *rt; + + ro6 = &inp->in6p_route; + rt = ro6->ro_rt; + if (rt == NULL || !(rt->rt_flags & RTF_UP)) { + /* No route yet, so try to acquire one */ + if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { + ro6->ro_dst.sin6_family = AF_INET6; + ro6->ro_dst.sin6_len = sizeof(ro6->ro_dst); + ro6->ro_dst.sin6_addr = inp->in6p_faddr; + rtalloc((struct route *)ro6); + rt = ro6->ro_rt; + } + } + return rt; +} +#endif /* INET6 */ + +#ifdef IPSEC +/* compute ESP/AH header size for TCP, including outer IP header. */ +size_t +ipsec_hdrsiz_tcp(tp) + struct tcpcb *tp; +{ + struct inpcb *inp; + struct mbuf *m; + size_t hdrsiz; + struct ip *ip; +#ifdef INET6 + struct ip6_hdr *ip6; +#endif /* INET6 */ + struct tcphdr *th; + + if (!tp || !tp->t_template || !(inp = tp->t_inpcb)) + return 0; + MGETHDR(m, M_DONTWAIT, MT_DATA); + if (!m) + return 0; + +#ifdef INET6 + if ((inp->inp_vflag & INP_IPV6) != 0) { + ip6 = mtod(m, struct ip6_hdr *); + th = (struct tcphdr *)(ip6 + 1); + m->m_pkthdr.len = m->m_len = + sizeof(struct ip6_hdr) + sizeof(struct tcphdr); + bcopy((caddr_t)tp->t_template->tt_ipgen, (caddr_t)ip6, + sizeof(struct ip6_hdr)); + bcopy((caddr_t)&tp->t_template->tt_t, (caddr_t)th, + sizeof(struct tcphdr)); + hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); + } else +#endif /* INET6 */ + { + ip = mtod(m, struct ip *); + th = (struct tcphdr *)(ip + 1); + m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr); + bcopy((caddr_t)tp->t_template->tt_ipgen, (caddr_t)ip, + sizeof(struct ip)); + bcopy((caddr_t)&tp->t_template->tt_t, (caddr_t)th, + sizeof(struct tcphdr)); + hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); + } + + m_free(m); + return hdrsiz; +} +#endif /*IPSEC*/ + +/* + * Return a pointer to the cached information about the remote host. + * The cached information is stored in the protocol specific part of + * the route metrics. + */ +struct rmxp_tao * +tcp_gettaocache(inp) + struct inpcb *inp; +{ + struct rtentry *rt; + +#ifdef INET6 + if ((inp->inp_vflag & INP_IPV6) != 0) + rt = tcp_rtlookup6(inp); + else +#endif /* INET6 */ + rt = tcp_rtlookup(inp); + + /* Make sure this is a host route and is up. */ + if (rt == NULL || + (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST)) + return NULL; + + return rmx_taop(rt->rt_rmx); +} + +/* + * Clear all the TAO cache entries, called from tcp_init. + * + * XXX + * This routine is just an empty one, because we assume that the routing + * routing tables are initialized at the same time when TCP, so there is + * nothing in the cache left over. + */ +static void +tcp_cleartaocache() +{ +} diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c new file mode 100644 index 0000000..0a85cf8 --- /dev/null +++ b/sys/netinet/tcp_timer.c @@ -0,0 +1,463 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95 + * $FreeBSD$ + */ + +#include "opt_compat.h" +#include "opt_inet6.h" +#include "opt_tcpdebug.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/protosw.h> + +#include <machine/cpu.h> /* before tcp_seq.h, for tcp_random18() */ + +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/in_pcb.h> +#ifdef INET6 +#include <netinet6/in6_pcb.h> +#endif +#include <netinet/ip_var.h> +#include <netinet/tcp.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_seq.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#include <netinet/tcpip.h> +#ifdef TCPDEBUG +#include <netinet/tcp_debug.h> +#endif + +static int +sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS) +{ + int error, s, tt; + + tt = *(int *)oidp->oid_arg1; + s = tt * 1000 / hz; + + error = sysctl_handle_int(oidp, &s, 0, req); + if (error || !req->newptr) + return (error); + + tt = s * hz / 1000; + if (tt < 1) + return (EINVAL); + + *(int *)oidp->oid_arg1 = tt; + return (0); +} + +int tcp_keepinit; +SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW, + &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", ""); + +int tcp_keepidle; +SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW, + &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", ""); + +int tcp_keepintvl; +SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW, + &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", ""); + +int tcp_delacktime; +SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, + CTLTYPE_INT|CTLFLAG_RW, &tcp_delacktime, 0, sysctl_msec_to_ticks, "I", + "Time before a delayed ACK is sent"); + +int tcp_msl; +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW, + &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime"); + +static int always_keepalive = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW, + &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections"); + +static int tcp_keepcnt = TCPTV_KEEPCNT; + /* max idle probes */ +int tcp_maxpersistidle; + /* max idle time in persist */ +int tcp_maxidle; + +/* + * Tcp protocol timeout routine called every 500 ms. + * Updates timestamps used for TCP + * causes finite state machine actions if timers expire. + */ +void +tcp_slowtimo() +{ + int s; + + s = splnet(); + + tcp_maxidle = tcp_keepcnt * tcp_keepintvl; + + splx(s); +} + +/* + * Cancel all timers for TCP tp. + */ +void +tcp_canceltimers(tp) + struct tcpcb *tp; +{ + callout_stop(tp->tt_2msl); + callout_stop(tp->tt_persist); + callout_stop(tp->tt_keep); + callout_stop(tp->tt_rexmt); +} + +int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] = + { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 }; + +int tcp_backoff[TCP_MAXRXTSHIFT + 1] = + { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 }; + +static int tcp_totbackoff = 511; /* sum of tcp_backoff[] */ + +/* + * TCP timer processing. + */ +void +tcp_timer_delack(xtp) + void *xtp; +{ + struct tcpcb *tp = xtp; + int s; + + s = splnet(); + if (callout_pending(tp->tt_delack) || !callout_active(tp->tt_delack)) { + splx(s); + return; + } + callout_deactivate(tp->tt_delack); + + tp->t_flags |= TF_ACKNOW; + tcpstat.tcps_delack++; + (void) tcp_output(tp); + splx(s); +} + +void +tcp_timer_2msl(xtp) + void *xtp; +{ + struct tcpcb *tp = xtp; + int s; +#ifdef TCPDEBUG + int ostate; + + ostate = tp->t_state; +#endif + s = splnet(); + if (callout_pending(tp->tt_2msl) || !callout_active(tp->tt_2msl)) { + splx(s); + return; + } + callout_deactivate(tp->tt_2msl); + /* + * 2 MSL timeout in shutdown went off. If we're closed but + * still waiting for peer to close and connection has been idle + * too long, or if 2MSL time is up from TIME_WAIT, delete connection + * control block. Otherwise, check again in a bit. + */ + if (tp->t_state != TCPS_TIME_WAIT && + (ticks - tp->t_rcvtime) <= tcp_maxidle) + callout_reset(tp->tt_2msl, tcp_keepintvl, + tcp_timer_2msl, tp); + else + tp = tcp_close(tp); + +#ifdef TCPDEBUG + if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) + tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, + PRU_SLOWTIMO); +#endif + splx(s); +} + +void +tcp_timer_keep(xtp) + void *xtp; +{ + struct tcpcb *tp = xtp; + int s; +#ifdef TCPDEBUG + int ostate; + + ostate = tp->t_state; +#endif + s = splnet(); + if (callout_pending(tp->tt_keep) || !callout_active(tp->tt_keep)) { + splx(s); + return; + } + callout_deactivate(tp->tt_keep); + /* + * Keep-alive timer went off; send something + * or drop connection if idle for too long. + */ + tcpstat.tcps_keeptimeo++; + if (tp->t_state < TCPS_ESTABLISHED) + goto dropit; + if ((always_keepalive || + tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE) && + tp->t_state <= TCPS_CLOSING) { + if ((ticks - tp->t_rcvtime) >= tcp_keepidle + tcp_maxidle) + goto dropit; + /* + * Send a packet designed to force a response + * if the peer is up and reachable: + * either an ACK if the connection is still alive, + * or an RST if the peer has closed the connection + * due to timeout or reboot. + * Using sequence number tp->snd_una-1 + * causes the transmitted zero-length segment + * to lie outside the receive window; + * by the protocol spec, this requires the + * correspondent TCP to respond. + */ + tcpstat.tcps_keepprobe++; + tcp_respond(tp, tp->t_template->tt_ipgen, + &tp->t_template->tt_t, (struct mbuf *)NULL, + tp->rcv_nxt, tp->snd_una - 1, 0); + callout_reset(tp->tt_keep, tcp_keepintvl, tcp_timer_keep, tp); + } else + callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); + +#ifdef TCPDEBUG + if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG) + tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, + PRU_SLOWTIMO); +#endif + splx(s); + return; + +dropit: + tcpstat.tcps_keepdrops++; + tp = tcp_drop(tp, ETIMEDOUT); + +#ifdef TCPDEBUG + if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) + tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, + PRU_SLOWTIMO); +#endif + splx(s); +} + +void +tcp_timer_persist(xtp) + void *xtp; +{ + struct tcpcb *tp = xtp; + int s; +#ifdef TCPDEBUG + int ostate; + + ostate = tp->t_state; +#endif + s = splnet(); + if (callout_pending(tp->tt_persist) || !callout_active(tp->tt_persist)){ + splx(s); + return; + } + callout_deactivate(tp->tt_persist); + /* + * Persistance timer into zero window. + * Force a byte to be output, if possible. + */ + tcpstat.tcps_persisttimeo++; + /* + * Hack: if the peer is dead/unreachable, we do not + * time out if the window is closed. After a full + * backoff, drop the connection if the idle time + * (no responses to probes) reaches the maximum + * backoff that we would use if retransmitting. + */ + if (tp->t_rxtshift == TCP_MAXRXTSHIFT && + ((ticks - tp->t_rcvtime) >= tcp_maxpersistidle || + (ticks - tp->t_rcvtime) >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { + tcpstat.tcps_persistdrop++; + tp = tcp_drop(tp, ETIMEDOUT); + goto out; + } + tcp_setpersist(tp); + tp->t_force = 1; + (void) tcp_output(tp); + tp->t_force = 0; + +out: +#ifdef TCPDEBUG + if (tp && tp->t_inpcb->inp_socket->so_options & SO_DEBUG) + tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, + PRU_SLOWTIMO); +#endif + splx(s); +} + +void +tcp_timer_rexmt(xtp) + void *xtp; +{ + struct tcpcb *tp = xtp; + int s; + int rexmt; +#ifdef TCPDEBUG + int ostate; + + ostate = tp->t_state; +#endif + s = splnet(); + if (callout_pending(tp->tt_rexmt) || !callout_active(tp->tt_rexmt)) { + splx(s); + return; + } + callout_deactivate(tp->tt_rexmt); + /* + * Retransmission timer went off. Message has not + * been acked within retransmit interval. Back off + * to a longer retransmit interval and retransmit one segment. + */ + if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { + tp->t_rxtshift = TCP_MAXRXTSHIFT; + tcpstat.tcps_timeoutdrop++; + tp = tcp_drop(tp, tp->t_softerror ? + tp->t_softerror : ETIMEDOUT); + goto out; + } + if (tp->t_rxtshift == 1) { + /* + * first retransmit; record ssthresh and cwnd so they can + * be recovered if this turns out to be a "bad" retransmit. + * A retransmit is considered "bad" if an ACK for this + * segment is received within RTT/2 interval; the assumption + * here is that the ACK was already in flight. See + * "On Estimating End-to-End Network Path Properties" by + * Allman and Paxson for more details. + */ + tp->snd_cwnd_prev = tp->snd_cwnd; + tp->snd_ssthresh_prev = tp->snd_ssthresh; + tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); + } + tcpstat.tcps_rexmttimeo++; + if (tp->t_state == TCPS_SYN_SENT) + rexmt = TCP_REXMTVAL(tp) * tcp_syn_backoff[tp->t_rxtshift]; + else + rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; + TCPT_RANGESET(tp->t_rxtcur, rexmt, + tp->t_rttmin, TCPTV_REXMTMAX); + /* + * If losing, let the lower level know and try for + * a better route. Also, if we backed off this far, + * our srtt estimate is probably bogus. Clobber it + * so we'll take the next rtt measurement as our srtt; + * move the current srtt into rttvar to keep the current + * retransmit times until then. + */ + if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { +#ifdef INET6 + if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) + in6_losing(tp->t_inpcb); + else +#endif + in_losing(tp->t_inpcb); + tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); + tp->t_srtt = 0; + } + tp->snd_nxt = tp->snd_una; + /* + * Note: We overload snd_recover to function also as the + * snd_last variable described in RFC 2582 + */ + tp->snd_recover = tp->snd_max; + /* + * Force a segment to be sent. + */ + tp->t_flags |= TF_ACKNOW; + /* + * If timing a segment in this window, stop the timer. + */ + tp->t_rtttime = 0; + /* + * Close the congestion window down to one segment + * (we'll open it by one segment for each ack we get). + * Since we probably have a window's worth of unacked + * data accumulated, this "slow start" keeps us from + * dumping all that data as back-to-back packets (which + * might overwhelm an intermediate gateway). + * + * There are two phases to the opening: Initially we + * open by one mss on each ack. This makes the window + * size increase exponentially with time. If the + * window is larger than the path can handle, this + * exponential growth results in dropped packet(s) + * almost immediately. To get more time between + * drops but still "push" the network to take advantage + * of improving conditions, we switch from exponential + * to linear window opening at some threshhold size. + * For a threshhold, we use half the current window + * size, truncated to a multiple of the mss. + * + * (the minimum cwnd that will give us exponential + * growth is 2 mss. We don't allow the threshhold + * to go below this.) + */ + { + u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; + if (win < 2) + win = 2; + tp->snd_cwnd = tp->t_maxseg; + tp->snd_ssthresh = win * tp->t_maxseg; + tp->t_dupacks = 0; + } + (void) tcp_output(tp); + +out: +#ifdef TCPDEBUG + if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) + tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, + PRU_SLOWTIMO); +#endif + splx(s); +} diff --git a/sys/netinet/tcp_timer.h b/sys/netinet/tcp_timer.h new file mode 100644 index 0000000..b2bcb43 --- /dev/null +++ b/sys/netinet/tcp_timer.h @@ -0,0 +1,139 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_timer.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_TCP_TIMER_H_ +#define _NETINET_TCP_TIMER_H_ + +/* + * The TCPT_REXMT timer is used to force retransmissions. + * The TCP has the TCPT_REXMT timer set whenever segments + * have been sent for which ACKs are expected but not yet + * received. If an ACK is received which advances tp->snd_una, + * then the retransmit timer is cleared (if there are no more + * outstanding segments) or reset to the base value (if there + * are more ACKs expected). Whenever the retransmit timer goes off, + * we retransmit one unacknowledged segment, and do a backoff + * on the retransmit timer. + * + * The TCPT_PERSIST timer is used to keep window size information + * flowing even if the window goes shut. If all previous transmissions + * have been acknowledged (so that there are no retransmissions in progress), + * and the window is too small to bother sending anything, then we start + * the TCPT_PERSIST timer. When it expires, if the window is nonzero, + * we go to transmit state. Otherwise, at intervals send a single byte + * into the peer's window to force him to update our window information. + * We do this at most as often as TCPT_PERSMIN time intervals, + * but no more frequently than the current estimate of round-trip + * packet time. The TCPT_PERSIST timer is cleared whenever we receive + * a window update from the peer. + * + * The TCPT_KEEP timer is used to keep connections alive. If an + * connection is idle (no segments received) for TCPTV_KEEP_INIT amount of time, + * but not yet established, then we drop the connection. Once the connection + * is established, if the connection is idle for TCPTV_KEEP_IDLE time + * (and keepalives have been enabled on the socket), we begin to probe + * the connection. We force the peer to send us a segment by sending: + * <SEQ=SND.UNA-1><ACK=RCV.NXT><CTL=ACK> + * This segment is (deliberately) outside the window, and should elicit + * an ack segment in response from the peer. If, despite the TCPT_KEEP + * initiated segments we cannot elicit a response from a peer in TCPT_MAXIDLE + * amount of time probing, then we drop the connection. + */ + +/* + * Time constants. + */ +#define TCPTV_MSL ( 30*hz) /* max seg lifetime (hah!) */ +#define TCPTV_SRTTBASE 0 /* base roundtrip time; + if 0, no idea yet */ +#define TCPTV_RTOBASE ( 3*hz) /* assumed RTO if no info */ +#define TCPTV_SRTTDFLT ( 3*hz) /* assumed RTT if no info */ + +#define TCPTV_PERSMIN ( 5*hz) /* retransmit persistence */ +#define TCPTV_PERSMAX ( 60*hz) /* maximum persist interval */ + +#define TCPTV_KEEP_INIT ( 75*hz) /* initial connect keepalive */ +#define TCPTV_KEEP_IDLE (120*60*hz) /* dflt time before probing */ +#define TCPTV_KEEPINTVL ( 75*hz) /* default probe interval */ +#define TCPTV_KEEPCNT 8 /* max probes before drop */ + +#define TCPTV_MIN ( 1*hz) /* minimum allowable value */ +#define TCPTV_REXMTMAX ( 64*hz) /* max allowable REXMT value */ + +#define TCPTV_TWTRUNC 8 /* RTO factor to truncate TW */ + +#define TCP_LINGERTIME 120 /* linger at most 2 minutes */ + +#define TCP_MAXRXTSHIFT 12 /* maximum retransmits */ + +#define TCPTV_DELACK (hz / PR_FASTHZ / 2) /* 100ms timeout */ + +#ifdef TCPTIMERS +static char *tcptimers[] = + { "REXMT", "PERSIST", "KEEP", "2MSL" }; +#endif + +/* + * Force a time value to be in a certain range. + */ +#define TCPT_RANGESET(tv, value, tvmin, tvmax) do { \ + (tv) = (value); \ + if ((u_long)(tv) < (u_long)(tvmin)) \ + (tv) = (tvmin); \ + else if ((u_long)(tv) > (u_long)(tvmax)) \ + (tv) = (tvmax); \ +} while(0) + +#ifdef _KERNEL +extern int tcp_keepinit; /* time to establish connection */ +extern int tcp_keepidle; /* time before keepalive probes begin */ +extern int tcp_keepintvl; /* time between keepalive probes */ +extern int tcp_maxidle; /* time to drop after starting probes */ +extern int tcp_delacktime; /* time before sending a delayed ACK */ +extern int tcp_maxpersistidle; +extern int tcp_msl; +extern int tcp_ttl; /* time to live for TCP segs */ +extern int tcp_backoff[]; + +void tcp_timer_2msl __P((void *xtp)); +void tcp_timer_keep __P((void *xtp)); +void tcp_timer_persist __P((void *xtp)); +void tcp_timer_rexmt __P((void *xtp)); +void tcp_timer_delack __P((void *xtp)); + +#endif /* _KERNEL */ + +#endif /* !_NETINET_TCP_TIMER_H_ */ diff --git a/sys/netinet/tcp_timewait.c b/sys/netinet/tcp_timewait.c new file mode 100644 index 0000000..4089551 --- /dev/null +++ b/sys/netinet/tcp_timewait.c @@ -0,0 +1,1424 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 + * $FreeBSD$ + */ + +#include "opt_compat.h" +#include "opt_inet6.h" +#include "opt_ipsec.h" +#include "opt_tcpdebug.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/callout.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#ifdef INET6 +#include <sys/domain.h> +#endif +#include <sys/proc.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/protosw.h> +#include <sys/random.h> + +#include <vm/vm_zone.h> + +#include <net/route.h> +#include <net/if.h> + +#define _IP_VHL +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#ifdef INET6 +#include <netinet/ip6.h> +#endif +#include <netinet/in_pcb.h> +#ifdef INET6 +#include <netinet6/in6_pcb.h> +#endif +#include <netinet/in_var.h> +#include <netinet/ip_var.h> +#ifdef INET6 +#include <netinet6/ip6_var.h> +#endif +#include <netinet/tcp.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_seq.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#ifdef INET6 +#include <netinet6/tcp6_var.h> +#endif +#include <netinet/tcpip.h> +#ifdef TCPDEBUG +#include <netinet/tcp_debug.h> +#endif +#include <netinet6/ip6protosw.h> + +#ifdef IPSEC +#include <netinet6/ipsec.h> +#ifdef INET6 +#include <netinet6/ipsec6.h> +#endif +#endif /*IPSEC*/ + +#include <machine/in_cksum.h> + +int tcp_mssdflt = TCP_MSS; +SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW, + &tcp_mssdflt , 0, "Default TCP Maximum Segment Size"); + +#ifdef INET6 +int tcp_v6mssdflt = TCP6_MSS; +SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, + CTLFLAG_RW, &tcp_v6mssdflt , 0, + "Default TCP Maximum Segment Size for IPv6"); +#endif + +#if 0 +static int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ; +SYSCTL_INT(_net_inet_tcp, TCPCTL_RTTDFLT, rttdflt, CTLFLAG_RW, + &tcp_rttdflt , 0, "Default maximum TCP Round Trip Time"); +#endif + +static int tcp_do_rfc1323 = 1; +SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW, + &tcp_do_rfc1323 , 0, "Enable rfc1323 (high performance TCP) extensions"); + +static int tcp_do_rfc1644 = 0; +SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW, + &tcp_do_rfc1644 , 0, "Enable rfc1644 (TTCP) extensions"); + +static int tcp_tcbhashsize = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD, + &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); + +static int do_tcpdrain = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0, + "Enable tcp_drain routine for extra help when low on mbufs"); + +SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD, + &tcbinfo.ipi_count, 0, "Number of active PCBs"); + +static int icmp_may_rst = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, &icmp_may_rst, 0, + "Certain ICMP unreachable messages may abort connections in SYN_SENT"); + +static void tcp_cleartaocache __P((void)); +static void tcp_notify __P((struct inpcb *, int)); + +/* + * Target size of TCP PCB hash tables. Must be a power of two. + * + * Note that this can be overridden by the kernel environment + * variable net.inet.tcp.tcbhashsize + */ +#ifndef TCBHASHSIZE +#define TCBHASHSIZE 512 +#endif + +/* + * This is the actual shape of what we allocate using the zone + * allocator. Doing it this way allows us to protect both structures + * using the same generation count, and also eliminates the overhead + * of allocating tcpcbs separately. By hiding the structure here, + * we avoid changing most of the rest of the code (although it needs + * to be changed, eventually, for greater efficiency). + */ +#define ALIGNMENT 32 +#define ALIGNM1 (ALIGNMENT - 1) +struct inp_tp { + union { + struct inpcb inp; + char align[(sizeof(struct inpcb) + ALIGNM1) & ~ALIGNM1]; + } inp_tp_u; + struct tcpcb tcb; + struct callout inp_tp_rexmt, inp_tp_persist, inp_tp_keep, inp_tp_2msl; + struct callout inp_tp_delack; +}; +#undef ALIGNMENT +#undef ALIGNM1 + +/* + * Tcp initialization + */ +void +tcp_init() +{ + int hashsize; + + tcp_ccgen = 1; + tcp_cleartaocache(); + + tcp_delacktime = TCPTV_DELACK; + tcp_keepinit = TCPTV_KEEP_INIT; + tcp_keepidle = TCPTV_KEEP_IDLE; + tcp_keepintvl = TCPTV_KEEPINTVL; + tcp_maxpersistidle = TCPTV_KEEP_IDLE; + tcp_msl = TCPTV_MSL; + + LIST_INIT(&tcb); + tcbinfo.listhead = &tcb; + TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", TCBHASHSIZE, hashsize); + if (!powerof2(hashsize)) { + printf("WARNING: TCB hash size not a power of 2\n"); + hashsize = 512; /* safe default */ + } + tcp_tcbhashsize = hashsize; + tcbinfo.hashbase = hashinit(hashsize, M_PCB, &tcbinfo.hashmask); + tcbinfo.porthashbase = hashinit(hashsize, M_PCB, + &tcbinfo.porthashmask); + tcbinfo.ipi_zone = zinit("tcpcb", sizeof(struct inp_tp), maxsockets, + ZONE_INTERRUPT, 0); +#ifdef INET6 +#define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) +#else /* INET6 */ +#define TCP_MINPROTOHDR (sizeof(struct tcpiphdr)) +#endif /* INET6 */ + if (max_protohdr < TCP_MINPROTOHDR) + max_protohdr = TCP_MINPROTOHDR; + if (max_linkhdr + TCP_MINPROTOHDR > MHLEN) + panic("tcp_init"); +#undef TCP_MINPROTOHDR +} + +/* + * Create template to be used to send tcp packets on a connection. + * Call after host entry created, allocates an mbuf and fills + * in a skeletal tcp/ip header, minimizing the amount of work + * necessary when the connection is used. + */ +struct tcptemp * +tcp_template(tp) + struct tcpcb *tp; +{ + register struct inpcb *inp = tp->t_inpcb; + register struct mbuf *m; + register struct tcptemp *n; + + if ((n = tp->t_template) == 0) { + m = m_get(M_DONTWAIT, MT_HEADER); + if (m == NULL) + return (0); + m->m_len = sizeof (struct tcptemp); + n = mtod(m, struct tcptemp *); + } +#ifdef INET6 + if ((inp->inp_vflag & INP_IPV6) != 0) { + register struct ip6_hdr *ip6; + + ip6 = (struct ip6_hdr *)n->tt_ipgen; + ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | + (inp->in6p_flowinfo & IPV6_FLOWINFO_MASK); + ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | + (IPV6_VERSION & IPV6_VERSION_MASK); + ip6->ip6_nxt = IPPROTO_TCP; + ip6->ip6_plen = sizeof(struct tcphdr); + ip6->ip6_src = inp->in6p_laddr; + ip6->ip6_dst = inp->in6p_faddr; + n->tt_t.th_sum = 0; + } else +#endif + { + struct ip *ip = (struct ip *)n->tt_ipgen; + + bzero(ip, sizeof(struct ip)); /* XXX overkill? */ + ip->ip_vhl = IP_VHL_BORING; + ip->ip_p = IPPROTO_TCP; + ip->ip_src = inp->inp_laddr; + ip->ip_dst = inp->inp_faddr; + n->tt_t.th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, + htons(sizeof(struct tcphdr) + IPPROTO_TCP)); + } + n->tt_t.th_sport = inp->inp_lport; + n->tt_t.th_dport = inp->inp_fport; + n->tt_t.th_seq = 0; + n->tt_t.th_ack = 0; + n->tt_t.th_x2 = 0; + n->tt_t.th_off = 5; + n->tt_t.th_flags = 0; + n->tt_t.th_win = 0; + n->tt_t.th_urp = 0; + return (n); +} + +/* + * Send a single message to the TCP at address specified by + * the given TCP/IP header. If m == 0, then we make a copy + * of the tcpiphdr at ti and send directly to the addressed host. + * This is used to force keep alive messages out using the TCP + * template for a connection tp->t_template. If flags are given + * then we send a message back to the TCP which originated the + * segment ti, and discard the mbuf containing it and any other + * attached mbufs. + * + * In any case the ack and sequence number of the transmitted + * segment are as specified by the parameters. + * + * NOTE: If m != NULL, then ti must point to *inside* the mbuf. + */ +void +tcp_respond(tp, ipgen, th, m, ack, seq, flags) + struct tcpcb *tp; + void *ipgen; + register struct tcphdr *th; + register struct mbuf *m; + tcp_seq ack, seq; + int flags; +{ + register int tlen; + int win = 0; + struct route *ro = 0; + struct route sro; + struct ip *ip; + struct tcphdr *nth; +#ifdef INET6 + struct route_in6 *ro6 = 0; + struct route_in6 sro6; + struct ip6_hdr *ip6; + int isipv6; +#endif /* INET6 */ + int ipflags = 0; + +#ifdef INET6 + isipv6 = IP_VHL_V(((struct ip *)ipgen)->ip_vhl) == 6; + ip6 = ipgen; +#endif /* INET6 */ + ip = ipgen; + + if (tp) { + if (!(flags & TH_RST)) { + win = sbspace(&tp->t_inpcb->inp_socket->so_rcv); + if (win > (long)TCP_MAXWIN << tp->rcv_scale) + win = (long)TCP_MAXWIN << tp->rcv_scale; + } +#ifdef INET6 + if (isipv6) + ro6 = &tp->t_inpcb->in6p_route; + else +#endif /* INET6 */ + ro = &tp->t_inpcb->inp_route; + } else { +#ifdef INET6 + if (isipv6) { + ro6 = &sro6; + bzero(ro6, sizeof *ro6); + } else +#endif /* INET6 */ + { + ro = &sro; + bzero(ro, sizeof *ro); + } + } + if (m == 0) { + m = m_gethdr(M_DONTWAIT, MT_HEADER); + if (m == NULL) + return; + tlen = 0; + m->m_data += max_linkhdr; +#ifdef INET6 + if (isipv6) { + bcopy((caddr_t)ip6, mtod(m, caddr_t), + sizeof(struct ip6_hdr)); + ip6 = mtod(m, struct ip6_hdr *); + nth = (struct tcphdr *)(ip6 + 1); + } else +#endif /* INET6 */ + { + bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); + ip = mtod(m, struct ip *); + nth = (struct tcphdr *)(ip + 1); + } + bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); + flags = TH_ACK; + } else { + m_freem(m->m_next); + m->m_next = 0; + m->m_data = (caddr_t)ipgen; + /* m_len is set later */ + tlen = 0; +#define xchg(a,b,type) { type t; t=a; a=b; b=t; } +#ifdef INET6 + if (isipv6) { + xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); + nth = (struct tcphdr *)(ip6 + 1); + } else +#endif /* INET6 */ + { + xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long); + nth = (struct tcphdr *)(ip + 1); + } + if (th != nth) { + /* + * this is usually a case when an extension header + * exists between the IPv6 header and the + * TCP header. + */ + nth->th_sport = th->th_sport; + nth->th_dport = th->th_dport; + } + xchg(nth->th_dport, nth->th_sport, n_short); +#undef xchg + } +#ifdef INET6 + if (isipv6) { + ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) + + tlen)); + tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr); + } else +#endif + { + tlen += sizeof (struct tcpiphdr); + ip->ip_len = tlen; + ip->ip_ttl = ip_defttl; + } + m->m_len = tlen; + m->m_pkthdr.len = tlen; + m->m_pkthdr.rcvif = (struct ifnet *) 0; + nth->th_seq = htonl(seq); + nth->th_ack = htonl(ack); + nth->th_x2 = 0; + nth->th_off = sizeof (struct tcphdr) >> 2; + nth->th_flags = flags; + if (tp) + nth->th_win = htons((u_short) (win >> tp->rcv_scale)); + else + nth->th_win = htons((u_short)win); + nth->th_urp = 0; +#ifdef INET6 + if (isipv6) { + nth->th_sum = 0; + nth->th_sum = in6_cksum(m, IPPROTO_TCP, + sizeof(struct ip6_hdr), + tlen - sizeof(struct ip6_hdr)); + ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL, + ro6 && ro6->ro_rt ? + ro6->ro_rt->rt_ifp : + NULL); + } else +#endif /* INET6 */ + { + nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, + htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p))); + m->m_pkthdr.csum_flags = CSUM_TCP; + m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); + } +#ifdef TCPDEBUG + if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) + tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0); +#endif +#ifdef IPSEC + ipsec_setsocket(m, tp ? tp->t_inpcb->inp_socket : NULL); +#endif +#ifdef INET6 + if (isipv6) { + (void)ip6_output(m, NULL, ro6, ipflags, NULL, NULL); + if (ro6 == &sro6 && ro6->ro_rt) { + RTFREE(ro6->ro_rt); + ro6->ro_rt = NULL; + } + } else +#endif /* INET6 */ + { + (void) ip_output(m, NULL, ro, ipflags, NULL); + if (ro == &sro && ro->ro_rt) { + RTFREE(ro->ro_rt); + ro->ro_rt = NULL; + } + } +} + +/* + * Create a new TCP control block, making an + * empty reassembly queue and hooking it to the argument + * protocol control block. The `inp' parameter must have + * come from the zone allocator set up in tcp_init(). + */ +struct tcpcb * +tcp_newtcpcb(inp) + struct inpcb *inp; +{ + struct inp_tp *it; + register struct tcpcb *tp; +#ifdef INET6 + int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; +#endif /* INET6 */ + + it = (struct inp_tp *)inp; + tp = &it->tcb; + bzero((char *) tp, sizeof(struct tcpcb)); + LIST_INIT(&tp->t_segq); + tp->t_maxseg = tp->t_maxopd = +#ifdef INET6 + isipv6 ? tcp_v6mssdflt : +#endif /* INET6 */ + tcp_mssdflt; + + /* Set up our timeouts. */ + callout_init(tp->tt_rexmt = &it->inp_tp_rexmt, 0); + callout_init(tp->tt_persist = &it->inp_tp_persist, 0); + callout_init(tp->tt_keep = &it->inp_tp_keep, 0); + callout_init(tp->tt_2msl = &it->inp_tp_2msl, 0); + callout_init(tp->tt_delack = &it->inp_tp_delack, 0); + + if (tcp_do_rfc1323) + tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); + if (tcp_do_rfc1644) + tp->t_flags |= TF_REQ_CC; + tp->t_inpcb = inp; /* XXX */ + /* + * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no + * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives + * reasonable initial retransmit time. + */ + tp->t_srtt = TCPTV_SRTTBASE; + tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; + tp->t_rttmin = TCPTV_MIN; + tp->t_rxtcur = TCPTV_RTOBASE; + tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; + tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; + tp->t_rcvtime = ticks; + /* + * IPv4 TTL initialization is necessary for an IPv6 socket as well, + * because the socket may be bound to an IPv6 wildcard address, + * which may match an IPv4-mapped IPv6 address. + */ + inp->inp_ip_ttl = ip_defttl; + inp->inp_ppcb = (caddr_t)tp; + return (tp); /* XXX */ +} + +/* + * Drop a TCP connection, reporting + * the specified error. If connection is synchronized, + * then send a RST to peer. + */ +struct tcpcb * +tcp_drop(tp, errno) + register struct tcpcb *tp; + int errno; +{ + struct socket *so = tp->t_inpcb->inp_socket; + + if (TCPS_HAVERCVDSYN(tp->t_state)) { + tp->t_state = TCPS_CLOSED; + (void) tcp_output(tp); + tcpstat.tcps_drops++; + } else + tcpstat.tcps_conndrops++; + if (errno == ETIMEDOUT && tp->t_softerror) + errno = tp->t_softerror; + so->so_error = errno; + return (tcp_close(tp)); +} + +/* + * Close a TCP control block: + * discard all space held by the tcp + * discard internet protocol block + * wake up any sleepers + */ +struct tcpcb * +tcp_close(tp) + register struct tcpcb *tp; +{ + register struct tseg_qent *q; + struct inpcb *inp = tp->t_inpcb; + struct socket *so = inp->inp_socket; +#ifdef INET6 + int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; +#endif /* INET6 */ + register struct rtentry *rt; + int dosavessthresh; + + /* + * Make sure that all of our timers are stopped before we + * delete the PCB. + */ + callout_stop(tp->tt_rexmt); + callout_stop(tp->tt_persist); + callout_stop(tp->tt_keep); + callout_stop(tp->tt_2msl); + callout_stop(tp->tt_delack); + + /* + * If we got enough samples through the srtt filter, + * save the rtt and rttvar in the routing entry. + * 'Enough' is arbitrarily defined as the 16 samples. + * 16 samples is enough for the srtt filter to converge + * to within 5% of the correct value; fewer samples and + * we could save a very bogus rtt. + * + * Don't update the default route's characteristics and don't + * update anything that the user "locked". + */ + if (tp->t_rttupdated >= 16) { + register u_long i = 0; +#ifdef INET6 + if (isipv6) { + struct sockaddr_in6 *sin6; + + if ((rt = inp->in6p_route.ro_rt) == NULL) + goto no_valid_rt; + sin6 = (struct sockaddr_in6 *)rt_key(rt); + if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) + goto no_valid_rt; + } + else +#endif /* INET6 */ + if ((rt = inp->inp_route.ro_rt) == NULL || + ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr + == INADDR_ANY) + goto no_valid_rt; + + if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) { + i = tp->t_srtt * + (RTM_RTTUNIT / (hz * TCP_RTT_SCALE)); + if (rt->rt_rmx.rmx_rtt && i) + /* + * filter this update to half the old & half + * the new values, converting scale. + * See route.h and tcp_var.h for a + * description of the scaling constants. + */ + rt->rt_rmx.rmx_rtt = + (rt->rt_rmx.rmx_rtt + i) / 2; + else + rt->rt_rmx.rmx_rtt = i; + tcpstat.tcps_cachedrtt++; + } + if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) { + i = tp->t_rttvar * + (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE)); + if (rt->rt_rmx.rmx_rttvar && i) + rt->rt_rmx.rmx_rttvar = + (rt->rt_rmx.rmx_rttvar + i) / 2; + else + rt->rt_rmx.rmx_rttvar = i; + tcpstat.tcps_cachedrttvar++; + } + /* + * The old comment here said: + * update the pipelimit (ssthresh) if it has been updated + * already or if a pipesize was specified & the threshhold + * got below half the pipesize. I.e., wait for bad news + * before we start updating, then update on both good + * and bad news. + * + * But we want to save the ssthresh even if no pipesize is + * specified explicitly in the route, because such + * connections still have an implicit pipesize specified + * by the global tcp_sendspace. In the absence of a reliable + * way to calculate the pipesize, it will have to do. + */ + i = tp->snd_ssthresh; + if (rt->rt_rmx.rmx_sendpipe != 0) + dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2); + else + dosavessthresh = (i < so->so_snd.sb_hiwat / 2); + if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 && + i != 0 && rt->rt_rmx.rmx_ssthresh != 0) + || dosavessthresh) { + /* + * convert the limit from user data bytes to + * packets then to packet data bytes. + */ + i = (i + tp->t_maxseg / 2) / tp->t_maxseg; + if (i < 2) + i = 2; + i *= (u_long)(tp->t_maxseg + +#ifdef INET6 + (isipv6 ? sizeof (struct ip6_hdr) + + sizeof (struct tcphdr) : +#endif + sizeof (struct tcpiphdr) +#ifdef INET6 + ) +#endif + ); + if (rt->rt_rmx.rmx_ssthresh) + rt->rt_rmx.rmx_ssthresh = + (rt->rt_rmx.rmx_ssthresh + i) / 2; + else + rt->rt_rmx.rmx_ssthresh = i; + tcpstat.tcps_cachedssthresh++; + } + } + rt = inp->inp_route.ro_rt; + if (rt) { + /* + * mark route for deletion if no information is + * cached. + */ + if ((tp->t_flags & TF_LQ_OVERFLOW) && + ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0)){ + if (rt->rt_rmx.rmx_rtt == 0) + rt->rt_flags |= RTF_DELCLONE; + } + } + no_valid_rt: + /* free the reassembly queue, if any */ + while((q = LIST_FIRST(&tp->t_segq)) != NULL) { + LIST_REMOVE(q, tqe_q); + m_freem(q->tqe_m); + FREE(q, M_TSEGQ); + } + if (tp->t_template) + (void) m_free(dtom(tp->t_template)); + inp->inp_ppcb = NULL; + soisdisconnected(so); +#ifdef INET6 + if (INP_CHECK_SOCKAF(so, AF_INET6)) + in6_pcbdetach(inp); + else +#endif /* INET6 */ + in_pcbdetach(inp); + tcpstat.tcps_closed++; + return ((struct tcpcb *)0); +} + +void +tcp_drain() +{ + if (do_tcpdrain) + { + struct inpcb *inpb; + struct tcpcb *tcpb; + struct tseg_qent *te; + + /* + * Walk the tcpbs, if existing, and flush the reassembly queue, + * if there is one... + * XXX: The "Net/3" implementation doesn't imply that the TCP + * reassembly queue should be flushed, but in a situation + * where we're really low on mbufs, this is potentially + * usefull. + */ + LIST_FOREACH(inpb, tcbinfo.listhead, inp_list) { + if ((tcpb = intotcpcb(inpb))) { + while ((te = LIST_FIRST(&tcpb->t_segq)) + != NULL) { + LIST_REMOVE(te, tqe_q); + m_freem(te->tqe_m); + FREE(te, M_TSEGQ); + } + } + } + } +} + +/* + * Notify a tcp user of an asynchronous error; + * store error as soft error, but wake up user + * (for now, won't do anything until can select for soft error). + * + * Do not wake up user since there currently is no mechanism for + * reporting soft errors (yet - a kqueue filter may be added). + */ +static void +tcp_notify(inp, error) + struct inpcb *inp; + int error; +{ + struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb; + + /* + * Ignore some errors if we are hooked up. + * If connection hasn't completed, has retransmitted several times, + * and receives a second error, give up now. This is better + * than waiting a long time to establish a connection that + * can never complete. + */ + if (tp->t_state == TCPS_ESTABLISHED && + (error == EHOSTUNREACH || error == ENETUNREACH || + error == EHOSTDOWN)) { + return; + } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && + tp->t_softerror) + tcp_drop(tp, error); + else + tp->t_softerror = error; +#if 0 + wakeup((caddr_t) &so->so_timeo); + sorwakeup(so); + sowwakeup(so); +#endif +} + +static int +tcp_pcblist(SYSCTL_HANDLER_ARGS) +{ + int error, i, n, s; + struct inpcb *inp, **inp_list; + inp_gen_t gencnt; + struct xinpgen xig; + + /* + * The process of preparing the TCB list is too time-consuming and + * resource-intensive to repeat twice on every request. + */ + if (req->oldptr == 0) { + n = tcbinfo.ipi_count; + req->oldidx = 2 * (sizeof xig) + + (n + n/8) * sizeof(struct xtcpcb); + return 0; + } + + if (req->newptr != 0) + return EPERM; + + /* + * OK, now we're committed to doing something. + */ + s = splnet(); + gencnt = tcbinfo.ipi_gencnt; + n = tcbinfo.ipi_count; + splx(s); + + xig.xig_len = sizeof xig; + xig.xig_count = n; + xig.xig_gen = gencnt; + xig.xig_sogen = so_gencnt; + error = SYSCTL_OUT(req, &xig, sizeof xig); + if (error) + return error; + + inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); + if (inp_list == 0) + return ENOMEM; + + s = splnet(); + for (inp = LIST_FIRST(tcbinfo.listhead), i = 0; inp && i < n; + inp = LIST_NEXT(inp, inp_list)) { + if (inp->inp_gencnt <= gencnt && !prison_xinpcb(req->p, inp)) + inp_list[i++] = inp; + } + splx(s); + n = i; + + error = 0; + for (i = 0; i < n; i++) { + inp = inp_list[i]; + if (inp->inp_gencnt <= gencnt) { + struct xtcpcb xt; + caddr_t inp_ppcb; + xt.xt_len = sizeof xt; + /* XXX should avoid extra copy */ + bcopy(inp, &xt.xt_inp, sizeof *inp); + inp_ppcb = inp->inp_ppcb; + if (inp_ppcb != NULL) + bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp); + else + bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); + if (inp->inp_socket) + sotoxsocket(inp->inp_socket, &xt.xt_socket); + error = SYSCTL_OUT(req, &xt, sizeof xt); + } + } + if (!error) { + /* + * Give the user an updated idea of our state. + * If the generation differs from what we told + * her before, she knows that something happened + * while we were processing this request, and it + * might be necessary to retry. + */ + s = splnet(); + xig.xig_gen = tcbinfo.ipi_gencnt; + xig.xig_sogen = so_gencnt; + xig.xig_count = tcbinfo.ipi_count; + splx(s); + error = SYSCTL_OUT(req, &xig, sizeof xig); + } + free(inp_list, M_TEMP); + return error; +} + +SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0, + tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); + +static int +tcp_getcred(SYSCTL_HANDLER_ARGS) +{ + struct xucred xuc; + struct sockaddr_in addrs[2]; + struct inpcb *inp; + int error, s; + + error = suser(req->p); + if (error) + return (error); + error = SYSCTL_IN(req, addrs, sizeof(addrs)); + if (error) + return (error); + s = splnet(); + inp = in_pcblookup_hash(&tcbinfo, addrs[1].sin_addr, addrs[1].sin_port, + addrs[0].sin_addr, addrs[0].sin_port, 0, NULL); + if (inp == NULL || inp->inp_socket == NULL) { + error = ENOENT; + goto out; + } + bzero(&xuc, sizeof(xuc)); + xuc.cr_uid = inp->inp_socket->so_cred->cr_uid; + xuc.cr_ngroups = inp->inp_socket->so_cred->cr_ngroups; + bcopy(inp->inp_socket->so_cred->cr_groups, xuc.cr_groups, + sizeof(xuc.cr_groups)); + error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); +out: + splx(s); + return (error); +} + +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW, + 0, 0, tcp_getcred, "S,xucred", "Get the xucred of a TCP connection"); + +#ifdef INET6 +static int +tcp6_getcred(SYSCTL_HANDLER_ARGS) +{ + struct xucred xuc; + struct sockaddr_in6 addrs[2]; + struct inpcb *inp; + int error, s, mapped = 0; + + error = suser(req->p); + if (error) + return (error); + error = SYSCTL_IN(req, addrs, sizeof(addrs)); + if (error) + return (error); + if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) { + if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr)) + mapped = 1; + else + return (EINVAL); + } + s = splnet(); + if (mapped == 1) + inp = in_pcblookup_hash(&tcbinfo, + *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12], + addrs[1].sin6_port, + *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12], + addrs[0].sin6_port, + 0, NULL); + else + inp = in6_pcblookup_hash(&tcbinfo, &addrs[1].sin6_addr, + addrs[1].sin6_port, + &addrs[0].sin6_addr, addrs[0].sin6_port, + 0, NULL); + if (inp == NULL || inp->inp_socket == NULL) { + error = ENOENT; + goto out; + } + bzero(&xuc, sizeof(xuc)); + xuc.cr_uid = inp->inp_socket->so_cred->cr_uid; + xuc.cr_ngroups = inp->inp_socket->so_cred->cr_ngroups; + bcopy(inp->inp_socket->so_cred->cr_groups, xuc.cr_groups, + sizeof(xuc.cr_groups)); + error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); +out: + splx(s); + return (error); +} + +SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW, + 0, 0, + tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection"); +#endif + + +void +tcp_ctlinput(cmd, sa, vip) + int cmd; + struct sockaddr *sa; + void *vip; +{ + struct ip *ip = vip; + struct tcphdr *th; + struct in_addr faddr; + struct inpcb *inp; + struct tcpcb *tp; + void (*notify) __P((struct inpcb *, int)) = tcp_notify; + tcp_seq icmp_seq; + int s; + + faddr = ((struct sockaddr_in *)sa)->sin_addr; + if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) + return; + + if (cmd == PRC_QUENCH) + notify = tcp_quench; + else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || + cmd == PRC_UNREACH_PORT) && ip) + notify = tcp_drop_syn_sent; + else if (cmd == PRC_MSGSIZE) + notify = tcp_mtudisc; + else if (PRC_IS_REDIRECT(cmd)) { + ip = 0; + notify = in_rtchange; + } else if (cmd == PRC_HOSTDEAD) + ip = 0; + else if ((unsigned)cmd > PRC_NCMDS || inetctlerrmap[cmd] == 0) + return; + if (ip) { + s = splnet(); + th = (struct tcphdr *)((caddr_t)ip + + (IP_VHL_HL(ip->ip_vhl) << 2)); + inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport, + ip->ip_src, th->th_sport, 0, NULL); + if (inp != NULL && inp->inp_socket != NULL) { + icmp_seq = htonl(th->th_seq); + tp = intotcpcb(inp); + if (SEQ_GEQ(icmp_seq, tp->snd_una) && + SEQ_LT(icmp_seq, tp->snd_max)) + (*notify)(inp, inetctlerrmap[cmd]); + } + splx(s); + } else + in_pcbnotifyall(&tcb, faddr, inetctlerrmap[cmd], notify); +} + +#ifdef INET6 +void +tcp6_ctlinput(cmd, sa, d) + int cmd; + struct sockaddr *sa; + void *d; +{ + register struct tcphdr *thp; + struct tcphdr th; + void (*notify) __P((struct inpcb *, int)) = tcp_notify; + struct sockaddr_in6 sa6; + struct ip6_hdr *ip6; + struct mbuf *m; + int off; + + if (sa->sa_family != AF_INET6 || + sa->sa_len != sizeof(struct sockaddr_in6)) + return; + + if (cmd == PRC_QUENCH) + notify = tcp_quench; + else if (cmd == PRC_MSGSIZE) + notify = tcp_mtudisc; + else if (!PRC_IS_REDIRECT(cmd) && + ((unsigned)cmd > PRC_NCMDS || inet6ctlerrmap[cmd] == 0)) + return; + + /* if the parameter is from icmp6, decode it. */ + if (d != NULL) { + struct ip6ctlparam *ip6cp = (struct ip6ctlparam *)d; + m = ip6cp->ip6c_m; + ip6 = ip6cp->ip6c_ip6; + off = ip6cp->ip6c_off; + } else { + m = NULL; + ip6 = NULL; + off = 0; /* fool gcc */ + } + + /* + * Translate addresses into internal form. + * Sa check if it is AF_INET6 is done at the top of this funciton. + */ + sa6 = *(struct sockaddr_in6 *)sa; + if (IN6_IS_ADDR_LINKLOCAL(&sa6.sin6_addr) != 0 && m != NULL && + m->m_pkthdr.rcvif != NULL) + sa6.sin6_addr.s6_addr16[1] = htons(m->m_pkthdr.rcvif->if_index); + + if (ip6) { + /* + * XXX: We assume that when IPV6 is non NULL, + * M and OFF are valid. + */ + struct in6_addr s; + + /* translate addresses into internal form */ + memcpy(&s, &ip6->ip6_src, sizeof(s)); + if (IN6_IS_ADDR_LINKLOCAL(&s) != 0 && m != NULL && + m->m_pkthdr.rcvif != NULL) + s.s6_addr16[1] = htons(m->m_pkthdr.rcvif->if_index); + + /* check if we can safely examine src and dst ports */ + if (m->m_pkthdr.len < off + sizeof(th)) + return; + + if (m->m_len < off + sizeof(th)) { + /* + * this should be rare case + * because now MINCLSIZE is "(MHLEN + 1)", + * so we compromise on this copy... + */ + m_copydata(m, off, sizeof(th), (caddr_t)&th); + thp = &th; + } else + thp = (struct tcphdr *)(mtod(m, caddr_t) + off); + in6_pcbnotify(&tcb, (struct sockaddr *)&sa6, thp->th_dport, + &s, thp->th_sport, cmd, notify); + } else + in6_pcbnotify(&tcb, (struct sockaddr *)&sa6, 0, &zeroin6_addr, + 0, cmd, notify); +} +#endif /* INET6 */ + +#define TCP_RNDISS_ROUNDS 16 +#define TCP_RNDISS_OUT 7200 +#define TCP_RNDISS_MAX 30000 + +u_int8_t tcp_rndiss_sbox[128]; +u_int16_t tcp_rndiss_msb; +u_int16_t tcp_rndiss_cnt; +long tcp_rndiss_reseed; + +u_int16_t +tcp_rndiss_encrypt(val) + u_int16_t val; +{ + u_int16_t sum = 0, i; + + for (i = 0; i < TCP_RNDISS_ROUNDS; i++) { + sum += 0x79b9; + val ^= ((u_int16_t)tcp_rndiss_sbox[(val^sum) & 0x7f]) << 7; + val = ((val & 0xff) << 7) | (val >> 8); + } + + return val; +} + +void +tcp_rndiss_init() +{ + struct timeval time; + + getmicrotime(&time); + read_random(tcp_rndiss_sbox, sizeof(tcp_rndiss_sbox)); + + tcp_rndiss_reseed = time.tv_sec + TCP_RNDISS_OUT; + tcp_rndiss_msb = tcp_rndiss_msb == 0x8000 ? 0 : 0x8000; + tcp_rndiss_cnt = 0; +} + +tcp_seq +tcp_rndiss_next() +{ + u_int16_t tmp; + struct timeval time; + + getmicrotime(&time); + + if (tcp_rndiss_cnt >= TCP_RNDISS_MAX || + time.tv_sec > tcp_rndiss_reseed) + tcp_rndiss_init(); + + read_random(&tmp, sizeof(tmp)); + + /* (tmp & 0x7fff) ensures a 32768 byte gap between ISS */ + return ((tcp_rndiss_encrypt(tcp_rndiss_cnt++) | tcp_rndiss_msb) <<16) | + (tmp & 0x7fff); +} + + +/* + * When a source quench is received, close congestion window + * to one segment. We will gradually open it again as we proceed. + */ +void +tcp_quench(inp, errno) + struct inpcb *inp; + int errno; +{ + struct tcpcb *tp = intotcpcb(inp); + + if (tp) + tp->snd_cwnd = tp->t_maxseg; +} + +/* + * When a specific ICMP unreachable message is received and the + * connection state is SYN-SENT, drop the connection. This behavior + * is controlled by the icmp_may_rst sysctl. + */ +void +tcp_drop_syn_sent(inp, errno) + struct inpcb *inp; + int errno; +{ + struct tcpcb *tp = intotcpcb(inp); + + if (tp && tp->t_state == TCPS_SYN_SENT) + tcp_drop(tp, errno); +} + +/* + * When `need fragmentation' ICMP is received, update our idea of the MSS + * based on the new value in the route. Also nudge TCP to send something, + * since we know the packet we just sent was dropped. + * This duplicates some code in the tcp_mss() function in tcp_input.c. + */ +void +tcp_mtudisc(inp, errno) + struct inpcb *inp; + int errno; +{ + struct tcpcb *tp = intotcpcb(inp); + struct rtentry *rt; + struct rmxp_tao *taop; + struct socket *so = inp->inp_socket; + int offered; + int mss; +#ifdef INET6 + int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; +#endif /* INET6 */ + + if (tp) { +#ifdef INET6 + if (isipv6) + rt = tcp_rtlookup6(inp); + else +#endif /* INET6 */ + rt = tcp_rtlookup(inp); + if (!rt || !rt->rt_rmx.rmx_mtu) { + tp->t_maxopd = tp->t_maxseg = +#ifdef INET6 + isipv6 ? tcp_v6mssdflt : +#endif /* INET6 */ + tcp_mssdflt; + return; + } + taop = rmx_taop(rt->rt_rmx); + offered = taop->tao_mssopt; + mss = rt->rt_rmx.rmx_mtu - +#ifdef INET6 + (isipv6 ? + sizeof(struct ip6_hdr) + sizeof(struct tcphdr) : +#endif /* INET6 */ + sizeof(struct tcpiphdr) +#ifdef INET6 + ) +#endif /* INET6 */ + ; + + if (offered) + mss = min(mss, offered); + /* + * XXX - The above conditional probably violates the TCP + * spec. The problem is that, since we don't know the + * other end's MSS, we are supposed to use a conservative + * default. But, if we do that, then MTU discovery will + * never actually take place, because the conservative + * default is much less than the MTUs typically seen + * on the Internet today. For the moment, we'll sweep + * this under the carpet. + * + * The conservative default might not actually be a problem + * if the only case this occurs is when sending an initial + * SYN with options and data to a host we've never talked + * to before. Then, they will reply with an MSS value which + * will get recorded and the new parameters should get + * recomputed. For Further Study. + */ + if (tp->t_maxopd <= mss) + return; + tp->t_maxopd = mss; + + if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && + (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) + mss -= TCPOLEN_TSTAMP_APPA; + if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && + (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC) + mss -= TCPOLEN_CC_APPA; +#if (MCLBYTES & (MCLBYTES - 1)) == 0 + if (mss > MCLBYTES) + mss &= ~(MCLBYTES-1); +#else + if (mss > MCLBYTES) + mss = mss / MCLBYTES * MCLBYTES; +#endif + if (so->so_snd.sb_hiwat < mss) + mss = so->so_snd.sb_hiwat; + + tp->t_maxseg = mss; + + tcpstat.tcps_mturesent++; + tp->t_rtttime = 0; + tp->snd_nxt = tp->snd_una; + tcp_output(tp); + } +} + +/* + * Look-up the routing entry to the peer of this inpcb. If no route + * is found and it cannot be allocated the return NULL. This routine + * is called by TCP routines that access the rmx structure and by tcp_mss + * to get the interface MTU. + */ +struct rtentry * +tcp_rtlookup(inp) + struct inpcb *inp; +{ + struct route *ro; + struct rtentry *rt; + + ro = &inp->inp_route; + rt = ro->ro_rt; + if (rt == NULL || !(rt->rt_flags & RTF_UP)) { + /* No route yet, so try to acquire one */ + if (inp->inp_faddr.s_addr != INADDR_ANY) { + ro->ro_dst.sa_family = AF_INET; + ro->ro_dst.sa_len = sizeof(ro->ro_dst); + ((struct sockaddr_in *) &ro->ro_dst)->sin_addr = + inp->inp_faddr; + rtalloc(ro); + rt = ro->ro_rt; + } + } + return rt; +} + +#ifdef INET6 +struct rtentry * +tcp_rtlookup6(inp) + struct inpcb *inp; +{ + struct route_in6 *ro6; + struct rtentry *rt; + + ro6 = &inp->in6p_route; + rt = ro6->ro_rt; + if (rt == NULL || !(rt->rt_flags & RTF_UP)) { + /* No route yet, so try to acquire one */ + if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { + ro6->ro_dst.sin6_family = AF_INET6; + ro6->ro_dst.sin6_len = sizeof(ro6->ro_dst); + ro6->ro_dst.sin6_addr = inp->in6p_faddr; + rtalloc((struct route *)ro6); + rt = ro6->ro_rt; + } + } + return rt; +} +#endif /* INET6 */ + +#ifdef IPSEC +/* compute ESP/AH header size for TCP, including outer IP header. */ +size_t +ipsec_hdrsiz_tcp(tp) + struct tcpcb *tp; +{ + struct inpcb *inp; + struct mbuf *m; + size_t hdrsiz; + struct ip *ip; +#ifdef INET6 + struct ip6_hdr *ip6; +#endif /* INET6 */ + struct tcphdr *th; + + if (!tp || !tp->t_template || !(inp = tp->t_inpcb)) + return 0; + MGETHDR(m, M_DONTWAIT, MT_DATA); + if (!m) + return 0; + +#ifdef INET6 + if ((inp->inp_vflag & INP_IPV6) != 0) { + ip6 = mtod(m, struct ip6_hdr *); + th = (struct tcphdr *)(ip6 + 1); + m->m_pkthdr.len = m->m_len = + sizeof(struct ip6_hdr) + sizeof(struct tcphdr); + bcopy((caddr_t)tp->t_template->tt_ipgen, (caddr_t)ip6, + sizeof(struct ip6_hdr)); + bcopy((caddr_t)&tp->t_template->tt_t, (caddr_t)th, + sizeof(struct tcphdr)); + hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); + } else +#endif /* INET6 */ + { + ip = mtod(m, struct ip *); + th = (struct tcphdr *)(ip + 1); + m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr); + bcopy((caddr_t)tp->t_template->tt_ipgen, (caddr_t)ip, + sizeof(struct ip)); + bcopy((caddr_t)&tp->t_template->tt_t, (caddr_t)th, + sizeof(struct tcphdr)); + hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); + } + + m_free(m); + return hdrsiz; +} +#endif /*IPSEC*/ + +/* + * Return a pointer to the cached information about the remote host. + * The cached information is stored in the protocol specific part of + * the route metrics. + */ +struct rmxp_tao * +tcp_gettaocache(inp) + struct inpcb *inp; +{ + struct rtentry *rt; + +#ifdef INET6 + if ((inp->inp_vflag & INP_IPV6) != 0) + rt = tcp_rtlookup6(inp); + else +#endif /* INET6 */ + rt = tcp_rtlookup(inp); + + /* Make sure this is a host route and is up. */ + if (rt == NULL || + (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST)) + return NULL; + + return rmx_taop(rt->rt_rmx); +} + +/* + * Clear all the TAO cache entries, called from tcp_init. + * + * XXX + * This routine is just an empty one, because we assume that the routing + * routing tables are initialized at the same time when TCP, so there is + * nothing in the cache left over. + */ +static void +tcp_cleartaocache() +{ +} diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c new file mode 100644 index 0000000..25834d4 --- /dev/null +++ b/sys/netinet/tcp_usrreq.c @@ -0,0 +1,1151 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * From: @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94 + * $FreeBSD$ + */ + +#include "opt_ipsec.h" +#include "opt_inet6.h" +#include "opt_tcpdebug.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/mbuf.h> +#ifdef INET6 +#include <sys/domain.h> +#endif /* INET6 */ +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/protosw.h> +#include <sys/proc.h> +#include <sys/jail.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#ifdef INET6 +#include <netinet/ip6.h> +#endif +#include <netinet/in_pcb.h> +#ifdef INET6 +#include <netinet6/in6_pcb.h> +#endif +#include <netinet/in_var.h> +#include <netinet/ip_var.h> +#ifdef INET6 +#include <netinet6/ip6_var.h> +#endif +#include <netinet/tcp.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_seq.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#include <netinet/tcpip.h> +#ifdef TCPDEBUG +#include <netinet/tcp_debug.h> +#endif + +#ifdef IPSEC +#include <netinet6/ipsec.h> +#endif /*IPSEC*/ + +/* + * TCP protocol interface to socket abstraction. + */ +extern char *tcpstates[]; /* XXX ??? */ + +static int tcp_attach __P((struct socket *, struct proc *)); +static int tcp_connect __P((struct tcpcb *, struct sockaddr *, + struct proc *)); +#ifdef INET6 +static int tcp6_connect __P((struct tcpcb *, struct sockaddr *, + struct proc *)); +#endif /* INET6 */ +static struct tcpcb * + tcp_disconnect __P((struct tcpcb *)); +static struct tcpcb * + tcp_usrclosed __P((struct tcpcb *)); + +#ifdef TCPDEBUG +#define TCPDEBUG0 int ostate = 0 +#define TCPDEBUG1() ostate = tp ? tp->t_state : 0 +#define TCPDEBUG2(req) if (tp && (so->so_options & SO_DEBUG)) \ + tcp_trace(TA_USER, ostate, tp, 0, 0, req) +#else +#define TCPDEBUG0 +#define TCPDEBUG1() +#define TCPDEBUG2(req) +#endif + +/* + * TCP attaches to socket via pru_attach(), reserving space, + * and an internet control block. + */ +static int +tcp_usr_attach(struct socket *so, int proto, struct proc *p) +{ + int s = splnet(); + int error; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp = 0; + TCPDEBUG0; + + TCPDEBUG1(); + if (inp) { + error = EISCONN; + goto out; + } + + error = tcp_attach(so, p); + if (error) + goto out; + + if ((so->so_options & SO_LINGER) && so->so_linger == 0) + so->so_linger = TCP_LINGERTIME; + tp = sototcpcb(so); +out: + TCPDEBUG2(PRU_ATTACH); + splx(s); + return error; +} + +/* + * pru_detach() detaches the TCP protocol from the socket. + * If the protocol state is non-embryonic, then can't + * do this directly: have to initiate a pru_disconnect(), + * which may finish later; embryonic TCB's can just + * be discarded here. + */ +static int +tcp_usr_detach(struct socket *so) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp; + TCPDEBUG0; + + if (inp == 0) { + splx(s); + return EINVAL; /* XXX */ + } + tp = intotcpcb(inp); + TCPDEBUG1(); + tp = tcp_disconnect(tp); + + TCPDEBUG2(PRU_DETACH); + splx(s); + return error; +} + +#define COMMON_START() TCPDEBUG0; \ + do { \ + if (inp == 0) { \ + splx(s); \ + return EINVAL; \ + } \ + tp = intotcpcb(inp); \ + TCPDEBUG1(); \ + } while(0) + +#define COMMON_END(req) out: TCPDEBUG2(req); splx(s); return error; goto out + + +/* + * Give the socket an address. + */ +static int +tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct proc *p) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp; + struct sockaddr_in *sinp; + + COMMON_START(); + + /* + * Must check for multicast addresses and disallow binding + * to them. + */ + sinp = (struct sockaddr_in *)nam; + if (sinp->sin_family == AF_INET && + IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) { + error = EAFNOSUPPORT; + goto out; + } + error = in_pcbbind(inp, nam, p); + if (error) + goto out; + COMMON_END(PRU_BIND); + +} + +#ifdef INET6 +static int +tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct proc *p) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp; + struct sockaddr_in6 *sin6p; + + COMMON_START(); + + /* + * Must check for multicast addresses and disallow binding + * to them. + */ + sin6p = (struct sockaddr_in6 *)nam; + if (sin6p->sin6_family == AF_INET6 && + IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) { + error = EAFNOSUPPORT; + goto out; + } + inp->inp_vflag &= ~INP_IPV4; + inp->inp_vflag |= INP_IPV6; + if ((inp->inp_flags & IN6P_BINDV6ONLY) == 0) { + + if (IN6_IS_ADDR_UNSPECIFIED(&sin6p->sin6_addr)) + inp->inp_vflag |= INP_IPV4; + else if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { + struct sockaddr_in sin; + + in6_sin6_2_sin(&sin, sin6p); + inp->inp_vflag |= INP_IPV4; + inp->inp_vflag &= ~INP_IPV6; + error = in_pcbbind(inp, (struct sockaddr *)&sin, p); + goto out; + } + } + error = in6_pcbbind(inp, nam, p); + if (error) + goto out; + COMMON_END(PRU_BIND); +} +#endif /* INET6 */ + +/* + * Prepare to accept connections. + */ +static int +tcp_usr_listen(struct socket *so, struct proc *p) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp; + + COMMON_START(); + if (inp->inp_lport == 0) + error = in_pcbbind(inp, (struct sockaddr *)0, p); + if (error == 0) + tp->t_state = TCPS_LISTEN; + COMMON_END(PRU_LISTEN); +} + +#ifdef INET6 +static int +tcp6_usr_listen(struct socket *so, struct proc *p) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp; + + COMMON_START(); + if (inp->inp_lport == 0) { + inp->inp_vflag &= ~INP_IPV4; + if ((inp->inp_flags & IN6P_BINDV6ONLY) == 0) + inp->inp_vflag |= INP_IPV4; + error = in6_pcbbind(inp, (struct sockaddr *)0, p); + } + if (error == 0) + tp->t_state = TCPS_LISTEN; + COMMON_END(PRU_LISTEN); +} +#endif /* INET6 */ + +/* + * Initiate connection to peer. + * Create a template for use in transmissions on this connection. + * Enter SYN_SENT state, and mark socket as connecting. + * Start keep-alive timer, and seed output sequence space. + * Send initial segment on connection. + */ +static int +tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct proc *p) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp; + struct sockaddr_in *sinp; + + COMMON_START(); + + /* + * Must disallow TCP ``connections'' to multicast addresses. + */ + sinp = (struct sockaddr_in *)nam; + if (sinp->sin_family == AF_INET + && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) { + error = EAFNOSUPPORT; + goto out; + } + + if (p && jailed(p->p_ucred)) + prison_remote_ip(p->p_ucred, 0, &sinp->sin_addr.s_addr); + + if ((error = tcp_connect(tp, nam, p)) != 0) + goto out; + error = tcp_output(tp); + COMMON_END(PRU_CONNECT); +} + +#ifdef INET6 +static int +tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct proc *p) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp; + struct sockaddr_in6 *sin6p; + + COMMON_START(); + + /* + * Must disallow TCP ``connections'' to multicast addresses. + */ + sin6p = (struct sockaddr_in6 *)nam; + if (sin6p->sin6_family == AF_INET6 + && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) { + error = EAFNOSUPPORT; + goto out; + } + + if ((inp->inp_flags & IN6P_BINDV6ONLY) == 0 && + IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { + struct sockaddr_in sin; + + in6_sin6_2_sin(&sin, sin6p); + inp->inp_vflag |= INP_IPV4; + inp->inp_vflag &= ~INP_IPV6; + if ((error = tcp_connect(tp, (struct sockaddr *)&sin, p)) != 0) + goto out; + error = tcp_output(tp); + goto out; + } + inp->inp_vflag &= ~INP_IPV4; + inp->inp_vflag |= INP_IPV6; + if ((error = tcp6_connect(tp, nam, p)) != 0) + goto out; + error = tcp_output(tp); + COMMON_END(PRU_CONNECT); +} +#endif /* INET6 */ + +/* + * Initiate disconnect from peer. + * If connection never passed embryonic stage, just drop; + * else if don't need to let data drain, then can just drop anyways, + * else have to begin TCP shutdown process: mark socket disconnecting, + * drain unread data, state switch to reflect user close, and + * send segment (e.g. FIN) to peer. Socket will be really disconnected + * when peer sends FIN and acks ours. + * + * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. + */ +static int +tcp_usr_disconnect(struct socket *so) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp; + + COMMON_START(); + tp = tcp_disconnect(tp); + COMMON_END(PRU_DISCONNECT); +} + +/* + * Accept a connection. Essentially all the work is + * done at higher levels; just return the address + * of the peer, storing through addr. + */ +static int +tcp_usr_accept(struct socket *so, struct sockaddr **nam) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp = NULL; + TCPDEBUG0; + + if (so->so_state & SS_ISDISCONNECTED) { + error = ECONNABORTED; + goto out; + } + if (inp == 0) { + splx(s); + return (EINVAL); + } + tp = intotcpcb(inp); + TCPDEBUG1(); + in_setpeeraddr(so, nam); + COMMON_END(PRU_ACCEPT); +} + +#ifdef INET6 +static int +tcp6_usr_accept(struct socket *so, struct sockaddr **nam) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp = NULL; + TCPDEBUG0; + + if (so->so_state & SS_ISDISCONNECTED) { + error = ECONNABORTED; + goto out; + } + if (inp == 0) { + splx(s); + return (EINVAL); + } + tp = intotcpcb(inp); + TCPDEBUG1(); + in6_mapped_peeraddr(so, nam); + COMMON_END(PRU_ACCEPT); +} +#endif /* INET6 */ +/* + * Mark the connection as being incapable of further output. + */ +static int +tcp_usr_shutdown(struct socket *so) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp; + + COMMON_START(); + socantsendmore(so); + tp = tcp_usrclosed(tp); + if (tp) + error = tcp_output(tp); + COMMON_END(PRU_SHUTDOWN); +} + +/* + * After a receive, possibly send window update to peer. + */ +static int +tcp_usr_rcvd(struct socket *so, int flags) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp; + + COMMON_START(); + tcp_output(tp); + COMMON_END(PRU_RCVD); +} + +/* + * Do a send by putting data in output queue and updating urgent + * marker if URG set. Possibly send more data. Unlike the other + * pru_*() routines, the mbuf chains are our responsibility. We + * must either enqueue them or free them. The other pru_* routines + * generally are caller-frees. + */ +static int +tcp_usr_send(struct socket *so, int flags, struct mbuf *m, + struct sockaddr *nam, struct mbuf *control, struct proc *p) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp; +#ifdef INET6 + int isipv6; +#endif + TCPDEBUG0; + + if (inp == NULL) { + /* + * OOPS! we lost a race, the TCP session got reset after + * we checked SS_CANTSENDMORE, eg: while doing uiomove or a + * network interrupt in the non-splnet() section of sosend(). + */ + if (m) + m_freem(m); + if (control) + m_freem(control); + error = ECONNRESET; /* XXX EPIPE? */ + tp = NULL; + TCPDEBUG1(); + goto out; + } +#ifdef INET6 + isipv6 = nam && nam->sa_family == AF_INET6; +#endif /* INET6 */ + tp = intotcpcb(inp); + TCPDEBUG1(); + if (control) { + /* TCP doesn't do control messages (rights, creds, etc) */ + if (control->m_len) { + m_freem(control); + if (m) + m_freem(m); + error = EINVAL; + goto out; + } + m_freem(control); /* empty control, just free it */ + } + if(!(flags & PRUS_OOB)) { + sbappend(&so->so_snd, m); + if (nam && tp->t_state < TCPS_SYN_SENT) { + /* + * Do implied connect if not yet connected, + * initialize window to default value, and + * initialize maxseg/maxopd using peer's cached + * MSS. + */ +#ifdef INET6 + if (isipv6) + error = tcp6_connect(tp, nam, p); + else +#endif /* INET6 */ + error = tcp_connect(tp, nam, p); + if (error) + goto out; + tp->snd_wnd = TTCP_CLIENT_SND_WND; + tcp_mss(tp, -1); + } + + if (flags & PRUS_EOF) { + /* + * Close the send side of the connection after + * the data is sent. + */ + socantsendmore(so); + tp = tcp_usrclosed(tp); + } + if (tp != NULL) { + if (flags & PRUS_MORETOCOME) + tp->t_flags |= TF_MORETOCOME; + error = tcp_output(tp); + if (flags & PRUS_MORETOCOME) + tp->t_flags &= ~TF_MORETOCOME; + } + } else { + if (sbspace(&so->so_snd) < -512) { + m_freem(m); + error = ENOBUFS; + goto out; + } + /* + * According to RFC961 (Assigned Protocols), + * the urgent pointer points to the last octet + * of urgent data. We continue, however, + * to consider it to indicate the first octet + * of data past the urgent section. + * Otherwise, snd_up should be one lower. + */ + sbappend(&so->so_snd, m); + if (nam && tp->t_state < TCPS_SYN_SENT) { + /* + * Do implied connect if not yet connected, + * initialize window to default value, and + * initialize maxseg/maxopd using peer's cached + * MSS. + */ +#ifdef INET6 + if (isipv6) + error = tcp6_connect(tp, nam, p); + else +#endif /* INET6 */ + error = tcp_connect(tp, nam, p); + if (error) + goto out; + tp->snd_wnd = TTCP_CLIENT_SND_WND; + tcp_mss(tp, -1); + } + tp->snd_up = tp->snd_una + so->so_snd.sb_cc; + tp->t_force = 1; + error = tcp_output(tp); + tp->t_force = 0; + } + COMMON_END((flags & PRUS_OOB) ? PRU_SENDOOB : + ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND)); +} + +/* + * Abort the TCP. + */ +static int +tcp_usr_abort(struct socket *so) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp; + + COMMON_START(); + tp = tcp_drop(tp, ECONNABORTED); + COMMON_END(PRU_ABORT); +} + +/* + * Receive out-of-band data. + */ +static int +tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags) +{ + int s = splnet(); + int error = 0; + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp; + + COMMON_START(); + if ((so->so_oobmark == 0 && + (so->so_state & SS_RCVATMARK) == 0) || + so->so_options & SO_OOBINLINE || + tp->t_oobflags & TCPOOB_HADDATA) { + error = EINVAL; + goto out; + } + if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { + error = EWOULDBLOCK; + goto out; + } + m->m_len = 1; + *mtod(m, caddr_t) = tp->t_iobc; + if ((flags & MSG_PEEK) == 0) + tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); + COMMON_END(PRU_RCVOOB); +} + +/* xxx - should be const */ +struct pr_usrreqs tcp_usrreqs = { + tcp_usr_abort, tcp_usr_accept, tcp_usr_attach, tcp_usr_bind, + tcp_usr_connect, pru_connect2_notsupp, in_control, tcp_usr_detach, + tcp_usr_disconnect, tcp_usr_listen, in_setpeeraddr, tcp_usr_rcvd, + tcp_usr_rcvoob, tcp_usr_send, pru_sense_null, tcp_usr_shutdown, + in_setsockaddr, sosend, soreceive, sopoll +}; + +#ifdef INET6 +struct pr_usrreqs tcp6_usrreqs = { + tcp_usr_abort, tcp6_usr_accept, tcp_usr_attach, tcp6_usr_bind, + tcp6_usr_connect, pru_connect2_notsupp, in6_control, tcp_usr_detach, + tcp_usr_disconnect, tcp6_usr_listen, in6_mapped_peeraddr, tcp_usr_rcvd, + tcp_usr_rcvoob, tcp_usr_send, pru_sense_null, tcp_usr_shutdown, + in6_mapped_sockaddr, sosend, soreceive, sopoll +}; +#endif /* INET6 */ + +/* + * Common subroutine to open a TCP connection to remote host specified + * by struct sockaddr_in in mbuf *nam. Call in_pcbbind to assign a local + * port number if needed. Call in_pcbladdr to do the routing and to choose + * a local host address (interface). If there is an existing incarnation + * of the same connection in TIME-WAIT state and if the remote host was + * sending CC options and if the connection duration was < MSL, then + * truncate the previous TIME-WAIT state and proceed. + * Initialize connection parameters and enter SYN-SENT state. + */ +static int +tcp_connect(tp, nam, p) + register struct tcpcb *tp; + struct sockaddr *nam; + struct proc *p; +{ + struct inpcb *inp = tp->t_inpcb, *oinp; + struct socket *so = inp->inp_socket; + struct tcpcb *otp; + struct sockaddr_in *sin = (struct sockaddr_in *)nam; + struct sockaddr_in *ifaddr; + struct rmxp_tao *taop; + struct rmxp_tao tao_noncached; + int error; + + if (inp->inp_lport == 0) { + error = in_pcbbind(inp, (struct sockaddr *)0, p); + if (error) + return error; + } + + /* + * Cannot simply call in_pcbconnect, because there might be an + * earlier incarnation of this same connection still in + * TIME_WAIT state, creating an ADDRINUSE error. + */ + error = in_pcbladdr(inp, nam, &ifaddr); + if (error) + return error; + oinp = in_pcblookup_hash(inp->inp_pcbinfo, + sin->sin_addr, sin->sin_port, + inp->inp_laddr.s_addr != INADDR_ANY ? inp->inp_laddr + : ifaddr->sin_addr, + inp->inp_lport, 0, NULL); + if (oinp) { + if (oinp != inp && (otp = intotcpcb(oinp)) != NULL && + otp->t_state == TCPS_TIME_WAIT && + (ticks - otp->t_starttime) < tcp_msl && + (otp->t_flags & TF_RCVD_CC)) + otp = tcp_close(otp); + else + return EADDRINUSE; + } + if (inp->inp_laddr.s_addr == INADDR_ANY) + inp->inp_laddr = ifaddr->sin_addr; + inp->inp_faddr = sin->sin_addr; + inp->inp_fport = sin->sin_port; + in_pcbrehash(inp); + + tp->t_template = tcp_template(tp); + if (tp->t_template == 0) { + in_pcbdisconnect(inp); + return ENOBUFS; + } + + /* Compute window scaling to request. */ + while (tp->request_r_scale < TCP_MAX_WINSHIFT && + (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat) + tp->request_r_scale++; + + soisconnecting(so); + tcpstat.tcps_connattempt++; + tp->t_state = TCPS_SYN_SENT; + callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp); + tp->iss = tcp_rndiss_next(); + tcp_sendseqinit(tp); + + /* + * Generate a CC value for this connection and + * check whether CC or CCnew should be used. + */ + if ((taop = tcp_gettaocache(tp->t_inpcb)) == NULL) { + taop = &tao_noncached; + bzero(taop, sizeof(*taop)); + } + + tp->cc_send = CC_INC(tcp_ccgen); + if (taop->tao_ccsent != 0 && + CC_GEQ(tp->cc_send, taop->tao_ccsent)) { + taop->tao_ccsent = tp->cc_send; + } else { + taop->tao_ccsent = 0; + tp->t_flags |= TF_SENDCCNEW; + } + + return 0; +} + +#ifdef INET6 +static int +tcp6_connect(tp, nam, p) + register struct tcpcb *tp; + struct sockaddr *nam; + struct proc *p; +{ + struct inpcb *inp = tp->t_inpcb, *oinp; + struct socket *so = inp->inp_socket; + struct tcpcb *otp; + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; + struct in6_addr *addr6; + struct rmxp_tao *taop; + struct rmxp_tao tao_noncached; + int error; + + if (inp->inp_lport == 0) { + error = in6_pcbbind(inp, (struct sockaddr *)0, p); + if (error) + return error; + } + + /* + * Cannot simply call in_pcbconnect, because there might be an + * earlier incarnation of this same connection still in + * TIME_WAIT state, creating an ADDRINUSE error. + */ + error = in6_pcbladdr(inp, nam, &addr6); + if (error) + return error; + oinp = in6_pcblookup_hash(inp->inp_pcbinfo, + &sin6->sin6_addr, sin6->sin6_port, + IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) + ? addr6 + : &inp->in6p_laddr, + inp->inp_lport, 0, NULL); + if (oinp) { + if (oinp != inp && (otp = intotcpcb(oinp)) != NULL && + otp->t_state == TCPS_TIME_WAIT && + (ticks - otp->t_starttime) < tcp_msl && + (otp->t_flags & TF_RCVD_CC)) + otp = tcp_close(otp); + else + return EADDRINUSE; + } + if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) + inp->in6p_laddr = *addr6; + inp->in6p_faddr = sin6->sin6_addr; + inp->inp_fport = sin6->sin6_port; + if ((sin6->sin6_flowinfo & IPV6_FLOWINFO_MASK) != NULL) + inp->in6p_flowinfo = sin6->sin6_flowinfo; + in_pcbrehash(inp); + + tp->t_template = tcp_template(tp); + if (tp->t_template == 0) { + in6_pcbdisconnect(inp); + return ENOBUFS; + } + + /* Compute window scaling to request. */ + while (tp->request_r_scale < TCP_MAX_WINSHIFT && + (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat) + tp->request_r_scale++; + + soisconnecting(so); + tcpstat.tcps_connattempt++; + tp->t_state = TCPS_SYN_SENT; + callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp); + tp->iss = tcp_rndiss_next(); + tcp_sendseqinit(tp); + + /* + * Generate a CC value for this connection and + * check whether CC or CCnew should be used. + */ + if ((taop = tcp_gettaocache(tp->t_inpcb)) == NULL) { + taop = &tao_noncached; + bzero(taop, sizeof(*taop)); + } + + tp->cc_send = CC_INC(tcp_ccgen); + if (taop->tao_ccsent != 0 && + CC_GEQ(tp->cc_send, taop->tao_ccsent)) { + taop->tao_ccsent = tp->cc_send; + } else { + taop->tao_ccsent = 0; + tp->t_flags |= TF_SENDCCNEW; + } + + return 0; +} +#endif /* INET6 */ + +/* + * The new sockopt interface makes it possible for us to block in the + * copyin/out step (if we take a page fault). Taking a page fault at + * splnet() is probably a Bad Thing. (Since sockets and pcbs both now + * use TSM, there probably isn't any need for this function to run at + * splnet() any more. This needs more examination.) + */ +int +tcp_ctloutput(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + int error, opt, optval, s; + struct inpcb *inp; + struct tcpcb *tp; + + error = 0; + s = splnet(); /* XXX */ + inp = sotoinpcb(so); + if (inp == NULL) { + splx(s); + return (ECONNRESET); + } + if (sopt->sopt_level != IPPROTO_TCP) { +#ifdef INET6 + if (INP_CHECK_SOCKAF(so, AF_INET6)) + error = ip6_ctloutput(so, sopt); + else +#endif /* INET6 */ + error = ip_ctloutput(so, sopt); + splx(s); + return (error); + } + tp = intotcpcb(inp); + + switch (sopt->sopt_dir) { + case SOPT_SET: + switch (sopt->sopt_name) { + case TCP_NODELAY: + case TCP_NOOPT: + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + break; + + switch (sopt->sopt_name) { + case TCP_NODELAY: + opt = TF_NODELAY; + break; + case TCP_NOOPT: + opt = TF_NOOPT; + break; + default: + opt = 0; /* dead code to fool gcc */ + break; + } + + if (optval) + tp->t_flags |= opt; + else + tp->t_flags &= ~opt; + break; + + case TCP_NOPUSH: + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + break; + + if (optval) + tp->t_flags |= TF_NOPUSH; + else { + tp->t_flags &= ~TF_NOPUSH; + error = tcp_output(tp); + } + break; + + case TCP_MAXSEG: + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + break; + + if (optval > 0 && optval <= tp->t_maxseg) + tp->t_maxseg = optval; + else + error = EINVAL; + break; + + default: + error = ENOPROTOOPT; + break; + } + break; + + case SOPT_GET: + switch (sopt->sopt_name) { + case TCP_NODELAY: + optval = tp->t_flags & TF_NODELAY; + break; + case TCP_MAXSEG: + optval = tp->t_maxseg; + break; + case TCP_NOOPT: + optval = tp->t_flags & TF_NOOPT; + break; + case TCP_NOPUSH: + optval = tp->t_flags & TF_NOPUSH; + break; + default: + error = ENOPROTOOPT; + break; + } + if (error == 0) + error = sooptcopyout(sopt, &optval, sizeof optval); + break; + } + splx(s); + return (error); +} + +/* + * tcp_sendspace and tcp_recvspace are the default send and receive window + * sizes, respectively. These are obsolescent (this information should + * be set by the route). + */ +u_long tcp_sendspace = 1024*16; +SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW, + &tcp_sendspace , 0, "Maximum outgoing TCP datagram size"); +u_long tcp_recvspace = 1024*16; +SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW, + &tcp_recvspace , 0, "Maximum incoming TCP datagram size"); + +/* + * Attach TCP protocol to socket, allocating + * internet protocol control block, tcp control block, + * bufer space, and entering LISTEN state if to accept connections. + */ +static int +tcp_attach(so, p) + struct socket *so; + struct proc *p; +{ + register struct tcpcb *tp; + struct inpcb *inp; + int error; +#ifdef INET6 + int isipv6 = INP_CHECK_SOCKAF(so, AF_INET6) != NULL; +#endif + + if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { + error = soreserve(so, tcp_sendspace, tcp_recvspace); + if (error) + return (error); + } + error = in_pcballoc(so, &tcbinfo, p); + if (error) + return (error); + inp = sotoinpcb(so); +#ifdef IPSEC + error = ipsec_init_policy(so, &inp->inp_sp); + if (error) { +#ifdef INET6 + if (isipv6) + in6_pcbdetach(inp); + else +#endif + in_pcbdetach(inp); + return (error); + } +#endif /*IPSEC*/ +#ifdef INET6 + if (isipv6) { + inp->inp_vflag |= INP_IPV6; + inp->in6p_hops = -1; /* use kernel default */ + } + else +#endif + inp->inp_vflag |= INP_IPV4; + tp = tcp_newtcpcb(inp); + if (tp == 0) { + int nofd = so->so_state & SS_NOFDREF; /* XXX */ + + so->so_state &= ~SS_NOFDREF; /* don't free the socket yet */ +#ifdef INET6 + if (isipv6) + in6_pcbdetach(inp); + else +#endif + in_pcbdetach(inp); + so->so_state |= nofd; + return (ENOBUFS); + } + tp->t_state = TCPS_CLOSED; + return (0); +} + +/* + * Initiate (or continue) disconnect. + * If embryonic state, just send reset (once). + * If in ``let data drain'' option and linger null, just drop. + * Otherwise (hard), mark socket disconnecting and drop + * current input data; switch states based on user close, and + * send segment to peer (with FIN). + */ +static struct tcpcb * +tcp_disconnect(tp) + register struct tcpcb *tp; +{ + struct socket *so = tp->t_inpcb->inp_socket; + + if (tp->t_state < TCPS_ESTABLISHED) + tp = tcp_close(tp); + else if ((so->so_options & SO_LINGER) && so->so_linger == 0) + tp = tcp_drop(tp, 0); + else { + soisdisconnecting(so); + sbflush(&so->so_rcv); + tp = tcp_usrclosed(tp); + if (tp) + (void) tcp_output(tp); + } + return (tp); +} + +/* + * User issued close, and wish to trail through shutdown states: + * if never received SYN, just forget it. If got a SYN from peer, + * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. + * If already got a FIN from peer, then almost done; go to LAST_ACK + * state. In all other cases, have already sent FIN to peer (e.g. + * after PRU_SHUTDOWN), and just have to play tedious game waiting + * for peer to send FIN or not respond to keep-alives, etc. + * We can let the user exit from the close as soon as the FIN is acked. + */ +static struct tcpcb * +tcp_usrclosed(tp) + register struct tcpcb *tp; +{ + + switch (tp->t_state) { + + case TCPS_CLOSED: + case TCPS_LISTEN: + tp->t_state = TCPS_CLOSED; + tp = tcp_close(tp); + break; + + case TCPS_SYN_SENT: + case TCPS_SYN_RECEIVED: + tp->t_flags |= TF_NEEDFIN; + break; + + case TCPS_ESTABLISHED: + tp->t_state = TCPS_FIN_WAIT_1; + break; + + case TCPS_CLOSE_WAIT: + tp->t_state = TCPS_LAST_ACK; + break; + } + if (tp && tp->t_state >= TCPS_FIN_WAIT_2) { + soisdisconnected(tp->t_inpcb->inp_socket); + /* To prevent the connection hanging in FIN_WAIT_2 forever. */ + if (tp->t_state == TCPS_FIN_WAIT_2) + callout_reset(tp->tt_2msl, tcp_maxidle, + tcp_timer_2msl, tp); + } + return (tp); +} + diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h new file mode 100644 index 0000000..ecac451 --- /dev/null +++ b/sys/netinet/tcp_var.h @@ -0,0 +1,419 @@ +/* + * Copyright (c) 1982, 1986, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_var.h 8.4 (Berkeley) 5/24/95 + * $FreeBSD$ + */ + +#ifndef _NETINET_TCP_VAR_H_ +#define _NETINET_TCP_VAR_H_ +/* + * Kernel variables for tcp. + */ + +/* TCP segment queue entry */ +struct tseg_qent { + LIST_ENTRY(tseg_qent) tqe_q; + int tqe_len; /* TCP segment data length */ + struct tcphdr *tqe_th; /* a pointer to tcp header */ + struct mbuf *tqe_m; /* mbuf contains packet */ +}; +LIST_HEAD(tsegqe_head, tseg_qent); +#ifdef MALLOC_DECLARE +MALLOC_DECLARE(M_TSEGQ); +#endif + +struct tcptemp { + u_char tt_ipgen[40]; /* the size must be of max ip header, now IPv6 */ + struct tcphdr tt_t; +}; + +#define tcp6cb tcpcb /* for KAME src sync over BSD*'s */ + +/* + * Tcp control block, one per tcp; fields: + * Organized for 16 byte cacheline efficiency. + */ +struct tcpcb { + struct tsegqe_head t_segq; + int t_dupacks; /* consecutive dup acks recd */ + struct tcptemp *t_template; /* skeletal packet for transmit */ + + struct callout *tt_rexmt; /* retransmit timer */ + struct callout *tt_persist; /* retransmit persistence */ + struct callout *tt_keep; /* keepalive */ + struct callout *tt_2msl; /* 2*msl TIME_WAIT timer */ + struct callout *tt_delack; /* delayed ACK timer */ + + struct inpcb *t_inpcb; /* back pointer to internet pcb */ + int t_state; /* state of this connection */ + u_int t_flags; +#define TF_ACKNOW 0x00001 /* ack peer immediately */ +#define TF_DELACK 0x00002 /* ack, but try to delay it */ +#define TF_NODELAY 0x00004 /* don't delay packets to coalesce */ +#define TF_NOOPT 0x00008 /* don't use tcp options */ +#define TF_SENTFIN 0x00010 /* have sent FIN */ +#define TF_REQ_SCALE 0x00020 /* have/will request window scaling */ +#define TF_RCVD_SCALE 0x00040 /* other side has requested scaling */ +#define TF_REQ_TSTMP 0x00080 /* have/will request timestamps */ +#define TF_RCVD_TSTMP 0x00100 /* a timestamp was received in SYN */ +#define TF_SACK_PERMIT 0x00200 /* other side said I could SACK */ +#define TF_NEEDSYN 0x00400 /* send SYN (implicit state) */ +#define TF_NEEDFIN 0x00800 /* send FIN (implicit state) */ +#define TF_NOPUSH 0x01000 /* don't push */ +#define TF_REQ_CC 0x02000 /* have/will request CC */ +#define TF_RCVD_CC 0x04000 /* a CC was received in SYN */ +#define TF_SENDCCNEW 0x08000 /* send CCnew instead of CC in SYN */ +#define TF_MORETOCOME 0x10000 /* More data to be appended to sock */ +#define TF_LQ_OVERFLOW 0x20000 /* listen queue overflow */ + int t_force; /* 1 if forcing out a byte */ + + tcp_seq snd_una; /* send unacknowledged */ + tcp_seq snd_max; /* highest sequence number sent; + * used to recognize retransmits + */ + tcp_seq snd_nxt; /* send next */ + tcp_seq snd_up; /* send urgent pointer */ + + tcp_seq snd_wl1; /* window update seg seq number */ + tcp_seq snd_wl2; /* window update seg ack number */ + tcp_seq iss; /* initial send sequence number */ + tcp_seq irs; /* initial receive sequence number */ + + tcp_seq rcv_nxt; /* receive next */ + tcp_seq rcv_adv; /* advertised window */ + u_long rcv_wnd; /* receive window */ + tcp_seq rcv_up; /* receive urgent pointer */ + + u_long snd_wnd; /* send window */ + u_long snd_cwnd; /* congestion-controlled window */ + u_long snd_ssthresh; /* snd_cwnd size threshold for + * for slow start exponential to + * linear switch + */ + tcp_seq snd_recover; /* for use in fast recovery */ + + u_int t_maxopd; /* mss plus options */ + + u_long t_rcvtime; /* inactivity time */ + u_long t_starttime; /* time connection was established */ + int t_rtttime; /* round trip time */ + tcp_seq t_rtseq; /* sequence number being timed */ + + int t_rxtcur; /* current retransmit value (ticks) */ + u_int t_maxseg; /* maximum segment size */ + int t_srtt; /* smoothed round-trip time */ + int t_rttvar; /* variance in round-trip time */ + + int t_rxtshift; /* log(2) of rexmt exp. backoff */ + u_int t_rttmin; /* minimum rtt allowed */ + u_long t_rttupdated; /* number of times rtt sampled */ + u_long max_sndwnd; /* largest window peer has offered */ + + int t_softerror; /* possible error not yet reported */ +/* out-of-band data */ + char t_oobflags; /* have some */ + char t_iobc; /* input character */ +#define TCPOOB_HAVEDATA 0x01 +#define TCPOOB_HADDATA 0x02 +/* RFC 1323 variables */ + u_char snd_scale; /* window scaling for send window */ + u_char rcv_scale; /* window scaling for recv window */ + u_char request_r_scale; /* pending window scaling */ + u_char requested_s_scale; + u_long ts_recent; /* timestamp echo data */ + + u_long ts_recent_age; /* when last updated */ + tcp_seq last_ack_sent; +/* RFC 1644 variables */ + tcp_cc cc_send; /* send connection count */ + tcp_cc cc_recv; /* receive connection count */ +/* experimental */ + u_long snd_cwnd_prev; /* cwnd prior to retransmit */ + u_long snd_ssthresh_prev; /* ssthresh prior to retransmit */ + u_long t_badrxtwin; /* window for retransmit recovery */ +}; + +/* + * Structure to hold TCP options that are only used during segment + * processing (in tcp_input), but not held in the tcpcb. + * It's basically used to reduce the number of parameters + * to tcp_dooptions. + */ +struct tcpopt { + u_long to_flag; /* which options are present */ +#define TOF_TS 0x0001 /* timestamp */ +#define TOF_CC 0x0002 /* CC and CCnew are exclusive */ +#define TOF_CCNEW 0x0004 +#define TOF_CCECHO 0x0008 + u_long to_tsval; + u_long to_tsecr; + tcp_cc to_cc; /* holds CC or CCnew */ + tcp_cc to_ccecho; +}; + +/* + * The TAO cache entry which is stored in the protocol family specific + * portion of the route metrics. + */ +struct rmxp_tao { + tcp_cc tao_cc; /* latest CC in valid SYN */ + tcp_cc tao_ccsent; /* latest CC sent to peer */ + u_short tao_mssopt; /* peer's cached MSS */ +#ifdef notyet + u_short tao_flags; /* cache status flags */ +#define TAOF_DONT 0x0001 /* peer doesn't understand rfc1644 */ +#define TAOF_OK 0x0002 /* peer does understand rfc1644 */ +#define TAOF_UNDEF 0 /* we don't know yet */ +#endif /* notyet */ +}; +#define rmx_taop(r) ((struct rmxp_tao *)(r).rmx_filler) + +#define intotcpcb(ip) ((struct tcpcb *)(ip)->inp_ppcb) +#define sototcpcb(so) (intotcpcb(sotoinpcb(so))) + +/* + * The smoothed round-trip time and estimated variance + * are stored as fixed point numbers scaled by the values below. + * For convenience, these scales are also used in smoothing the average + * (smoothed = (1/scale)sample + ((scale-1)/scale)smoothed). + * With these scales, srtt has 3 bits to the right of the binary point, + * and thus an "ALPHA" of 0.875. rttvar has 2 bits to the right of the + * binary point, and is smoothed with an ALPHA of 0.75. + */ +#define TCP_RTT_SCALE 32 /* multiplier for srtt; 3 bits frac. */ +#define TCP_RTT_SHIFT 5 /* shift for srtt; 3 bits frac. */ +#define TCP_RTTVAR_SCALE 16 /* multiplier for rttvar; 2 bits */ +#define TCP_RTTVAR_SHIFT 4 /* shift for rttvar; 2 bits */ +#define TCP_DELTA_SHIFT 2 /* see tcp_input.c */ + +/* + * The initial retransmission should happen at rtt + 4 * rttvar. + * Because of the way we do the smoothing, srtt and rttvar + * will each average +1/2 tick of bias. When we compute + * the retransmit timer, we want 1/2 tick of rounding and + * 1 extra tick because of +-1/2 tick uncertainty in the + * firing of the timer. The bias will give us exactly the + * 1.5 tick we need. But, because the bias is + * statistical, we have to test that we don't drop below + * the minimum feasible timer (which is 2 ticks). + * This version of the macro adapted from a paper by Lawrence + * Brakmo and Larry Peterson which outlines a problem caused + * by insufficient precision in the original implementation, + * which results in inappropriately large RTO values for very + * fast networks. + */ +#define TCP_REXMTVAL(tp) \ + max((tp)->t_rttmin, (((tp)->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)) \ + + (tp)->t_rttvar) >> TCP_DELTA_SHIFT) + +/* + * TCP statistics. + * Many of these should be kept per connection, + * but that's inconvenient at the moment. + */ +struct tcpstat { + u_long tcps_connattempt; /* connections initiated */ + u_long tcps_accepts; /* connections accepted */ + u_long tcps_connects; /* connections established */ + u_long tcps_drops; /* connections dropped */ + u_long tcps_conndrops; /* embryonic connections dropped */ + u_long tcps_closed; /* conn. closed (includes drops) */ + u_long tcps_segstimed; /* segs where we tried to get rtt */ + u_long tcps_rttupdated; /* times we succeeded */ + u_long tcps_delack; /* delayed acks sent */ + u_long tcps_timeoutdrop; /* conn. dropped in rxmt timeout */ + u_long tcps_rexmttimeo; /* retransmit timeouts */ + u_long tcps_persisttimeo; /* persist timeouts */ + u_long tcps_keeptimeo; /* keepalive timeouts */ + u_long tcps_keepprobe; /* keepalive probes sent */ + u_long tcps_keepdrops; /* connections dropped in keepalive */ + + u_long tcps_sndtotal; /* total packets sent */ + u_long tcps_sndpack; /* data packets sent */ + u_long tcps_sndbyte; /* data bytes sent */ + u_long tcps_sndrexmitpack; /* data packets retransmitted */ + u_long tcps_sndrexmitbyte; /* data bytes retransmitted */ + u_long tcps_sndacks; /* ack-only packets sent */ + u_long tcps_sndprobe; /* window probes sent */ + u_long tcps_sndurg; /* packets sent with URG only */ + u_long tcps_sndwinup; /* window update-only packets sent */ + u_long tcps_sndctrl; /* control (SYN|FIN|RST) packets sent */ + + u_long tcps_rcvtotal; /* total packets received */ + u_long tcps_rcvpack; /* packets received in sequence */ + u_long tcps_rcvbyte; /* bytes received in sequence */ + u_long tcps_rcvbadsum; /* packets received with ccksum errs */ + u_long tcps_rcvbadoff; /* packets received with bad offset */ + u_long tcps_rcvmemdrop; /* packets dropped for lack of memory */ + u_long tcps_rcvshort; /* packets received too short */ + u_long tcps_rcvduppack; /* duplicate-only packets received */ + u_long tcps_rcvdupbyte; /* duplicate-only bytes received */ + u_long tcps_rcvpartduppack; /* packets with some duplicate data */ + u_long tcps_rcvpartdupbyte; /* dup. bytes in part-dup. packets */ + u_long tcps_rcvoopack; /* out-of-order packets received */ + u_long tcps_rcvoobyte; /* out-of-order bytes received */ + u_long tcps_rcvpackafterwin; /* packets with data after window */ + u_long tcps_rcvbyteafterwin; /* bytes rcvd after window */ + u_long tcps_rcvafterclose; /* packets rcvd after "close" */ + u_long tcps_rcvwinprobe; /* rcvd window probe packets */ + u_long tcps_rcvdupack; /* rcvd duplicate acks */ + u_long tcps_rcvacktoomuch; /* rcvd acks for unsent data */ + u_long tcps_rcvackpack; /* rcvd ack packets */ + u_long tcps_rcvackbyte; /* bytes acked by rcvd acks */ + u_long tcps_rcvwinupd; /* rcvd window update packets */ + u_long tcps_pawsdrop; /* segments dropped due to PAWS */ + u_long tcps_predack; /* times hdr predict ok for acks */ + u_long tcps_preddat; /* times hdr predict ok for data pkts */ + u_long tcps_pcbcachemiss; + u_long tcps_cachedrtt; /* times cached RTT in route updated */ + u_long tcps_cachedrttvar; /* times cached rttvar updated */ + u_long tcps_cachedssthresh; /* times cached ssthresh updated */ + u_long tcps_usedrtt; /* times RTT initialized from route */ + u_long tcps_usedrttvar; /* times RTTVAR initialized from rt */ + u_long tcps_usedssthresh; /* times ssthresh initialized from rt*/ + u_long tcps_persistdrop; /* timeout in persist state */ + u_long tcps_badsyn; /* bogus SYN, e.g. premature ACK */ + u_long tcps_mturesent; /* resends due to MTU discovery */ + u_long tcps_listendrop; /* listen queue overflows */ +}; + +/* + * TCB structure exported to user-land via sysctl(3). + * Evil hack: declare only if in_pcb.h and sys/socketvar.h have been + * included. Not all of our clients do. + */ +#if defined(_NETINET_IN_PCB_H_) && defined(_SYS_SOCKETVAR_H_) +struct xtcpcb { + size_t xt_len; + struct inpcb xt_inp; + struct tcpcb xt_tp; + struct xsocket xt_socket; + u_quad_t xt_alignment_hack; +}; +#endif + +/* + * Names for TCP sysctl objects + */ +#define TCPCTL_DO_RFC1323 1 /* use RFC-1323 extensions */ +#define TCPCTL_DO_RFC1644 2 /* use RFC-1644 extensions */ +#define TCPCTL_MSSDFLT 3 /* MSS default */ +#define TCPCTL_STATS 4 /* statistics (read-only) */ +#define TCPCTL_RTTDFLT 5 /* default RTT estimate */ +#define TCPCTL_KEEPIDLE 6 /* keepalive idle timer */ +#define TCPCTL_KEEPINTVL 7 /* interval to send keepalives */ +#define TCPCTL_SENDSPACE 8 /* send buffer space */ +#define TCPCTL_RECVSPACE 9 /* receive buffer space */ +#define TCPCTL_KEEPINIT 10 /* timeout for establishing syn */ +#define TCPCTL_PCBLIST 11 /* list of all outstanding PCBs */ +#define TCPCTL_DELACKTIME 12 /* time before sending delayed ACK */ +#define TCPCTL_V6MSSDFLT 13 /* MSS default for IPv6 */ +#define TCPCTL_MAXID 14 + +#define TCPCTL_NAMES { \ + { 0, 0 }, \ + { "rfc1323", CTLTYPE_INT }, \ + { "rfc1644", CTLTYPE_INT }, \ + { "mssdflt", CTLTYPE_INT }, \ + { "stats", CTLTYPE_STRUCT }, \ + { "rttdflt", CTLTYPE_INT }, \ + { "keepidle", CTLTYPE_INT }, \ + { "keepintvl", CTLTYPE_INT }, \ + { "sendspace", CTLTYPE_INT }, \ + { "recvspace", CTLTYPE_INT }, \ + { "keepinit", CTLTYPE_INT }, \ + { "pcblist", CTLTYPE_STRUCT }, \ + { "delacktime", CTLTYPE_INT }, \ + { "v6mssdflt", CTLTYPE_INT }, \ +} + + +#ifdef _KERNEL +#ifdef SYSCTL_DECL +SYSCTL_DECL(_net_inet_tcp); +#endif + +extern struct inpcbhead tcb; /* head of queue of active tcpcb's */ +extern struct inpcbinfo tcbinfo; +extern struct tcpstat tcpstat; /* tcp statistics */ +extern int tcp_mssdflt; /* XXX */ +extern int tcp_delack_enabled; +extern int tcp_do_newreno; +extern int ss_fltsz; +extern int ss_fltsz_local; + +void tcp_canceltimers __P((struct tcpcb *)); +struct tcpcb * + tcp_close __P((struct tcpcb *)); +void tcp_ctlinput __P((int, struct sockaddr *, void *)); +int tcp_ctloutput __P((struct socket *, struct sockopt *)); +struct tcpcb * + tcp_drop __P((struct tcpcb *, int)); +void tcp_drain __P((void)); +void tcp_fasttimo __P((void)); +struct rmxp_tao * + tcp_gettaocache __P((struct inpcb *)); +void tcp_init __P((void)); +void tcp_input __P((struct mbuf *, int, int)); +void tcp_mss __P((struct tcpcb *, int)); +int tcp_mssopt __P((struct tcpcb *)); +void tcp_drop_syn_sent __P((struct inpcb *, int)); +void tcp_mtudisc __P((struct inpcb *, int)); +struct tcpcb * + tcp_newtcpcb __P((struct inpcb *)); +int tcp_output __P((struct tcpcb *)); +void tcp_quench __P((struct inpcb *, int)); +void tcp_respond __P((struct tcpcb *, void *, + struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, int)); +struct rtentry * + tcp_rtlookup __P((struct inpcb *)); +void tcp_setpersist __P((struct tcpcb *)); +void tcp_slowtimo __P((void)); +struct tcptemp * + tcp_template __P((struct tcpcb *)); +struct tcpcb * + tcp_timers __P((struct tcpcb *, int)); +void tcp_trace __P((int, int, struct tcpcb *, void *, struct tcphdr *, + int)); + +extern struct pr_usrreqs tcp_usrreqs; +extern u_long tcp_sendspace; +extern u_long tcp_recvspace; +void tcp_rndiss_init __P((void)); +tcp_seq tcp_rndiss_next __P((void)); +u_int16_t + tcp_rndiss_encrypt __P((u_int16_t)); + +#endif /* _KERNEL */ + +#endif /* _NETINET_TCP_VAR_H_ */ diff --git a/sys/netinet/tcpip.h b/sys/netinet/tcpip.h new file mode 100644 index 0000000..92189b9 --- /dev/null +++ b/sys/netinet/tcpip.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcpip.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_TCPIP_H_ +#define _NETINET_TCPIP_H_ + +/* + * Tcp+ip header, after ip options removed. + */ +struct tcpiphdr { + struct ipovly ti_i; /* overlaid ip structure */ + struct tcphdr ti_t; /* tcp header */ +}; +#define ti_x1 ti_i.ih_x1 +#define ti_pr ti_i.ih_pr +#define ti_len ti_i.ih_len +#define ti_src ti_i.ih_src +#define ti_dst ti_i.ih_dst +#define ti_sport ti_t.th_sport +#define ti_dport ti_t.th_dport +#define ti_seq ti_t.th_seq +#define ti_ack ti_t.th_ack +#define ti_x2 ti_t.th_x2 +#define ti_off ti_t.th_off +#define ti_flags ti_t.th_flags +#define ti_win ti_t.th_win +#define ti_sum ti_t.th_sum +#define ti_urp ti_t.th_urp + +#endif diff --git a/sys/netinet/udp.h b/sys/netinet/udp.h new file mode 100644 index 0000000..635267f --- /dev/null +++ b/sys/netinet/udp.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)udp.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_UDP_H_ +#define _NETINET_UDP_H_ + +/* + * Udp protocol header. + * Per RFC 768, September, 1981. + */ +struct udphdr { + u_short uh_sport; /* source port */ + u_short uh_dport; /* destination port */ + u_short uh_ulen; /* udp length */ + u_short uh_sum; /* udp checksum */ +}; + +#endif diff --git a/sys/netinet/udp_usrreq.c b/sys/netinet/udp_usrreq.c new file mode 100644 index 0000000..350a384 --- /dev/null +++ b/sys/netinet/udp_usrreq.c @@ -0,0 +1,933 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)udp_usrreq.c 8.6 (Berkeley) 5/23/95 + * $FreeBSD$ + */ + +#include "opt_ipsec.h" +#include "opt_inet6.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/domain.h> +#include <sys/proc.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <sys/jail.h> + +#include <vm/vm_zone.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#ifdef INET6 +#include <netinet/ip6.h> +#endif +#include <netinet/in_pcb.h> +#include <netinet/in_var.h> +#include <netinet/ip_var.h> +#ifdef INET6 +#include <netinet6/ip6_var.h> +#endif +#include <netinet/ip_icmp.h> +#include <netinet/icmp_var.h> +#include <netinet/udp.h> +#include <netinet/udp_var.h> + +#ifdef IPSEC +#include <netinet6/ipsec.h> +#endif /*IPSEC*/ + +#include <machine/in_cksum.h> + +/* + * UDP protocol implementation. + * Per RFC 768, August, 1980. + */ +#ifndef COMPAT_42 +static int udpcksum = 1; +#else +static int udpcksum = 0; /* XXX */ +#endif +SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_RW, + &udpcksum, 0, ""); + +int log_in_vain = 0; +SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_RW, + &log_in_vain, 0, "Log all incoming UDP packets"); + +static int blackhole = 0; +SYSCTL_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_RW, + &blackhole, 0, "Do not send port unreachables for refused connects"); + +struct inpcbhead udb; /* from udp_var.h */ +#define udb6 udb /* for KAME src sync over BSD*'s */ +struct inpcbinfo udbinfo; + +#ifndef UDBHASHSIZE +#define UDBHASHSIZE 16 +#endif + +struct udpstat udpstat; /* from udp_var.h */ +SYSCTL_STRUCT(_net_inet_udp, UDPCTL_STATS, stats, CTLFLAG_RD, + &udpstat, udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)"); + +static struct sockaddr_in udp_in = { sizeof(udp_in), AF_INET }; +#ifdef INET6 +struct udp_in6 { + struct sockaddr_in6 uin6_sin; + u_char uin6_init_done : 1; +} udp_in6 = { + { sizeof(udp_in6.uin6_sin), AF_INET6 }, + 0 +}; +struct udp_ip6 { + struct ip6_hdr uip6_ip6; + u_char uip6_init_done : 1; +} udp_ip6; +#endif /* INET6 */ + +static void udp_append __P((struct inpcb *last, struct ip *ip, + struct mbuf *n, int off)); +#ifdef INET6 +static void ip_2_ip6_hdr __P((struct ip6_hdr *ip6, struct ip *ip)); +#endif + +static int udp_detach __P((struct socket *so)); +static int udp_output __P((struct inpcb *, struct mbuf *, struct sockaddr *, + struct mbuf *, struct proc *)); + +void +udp_init() +{ + LIST_INIT(&udb); + udbinfo.listhead = &udb; + udbinfo.hashbase = hashinit(UDBHASHSIZE, M_PCB, &udbinfo.hashmask); + udbinfo.porthashbase = hashinit(UDBHASHSIZE, M_PCB, + &udbinfo.porthashmask); + udbinfo.ipi_zone = zinit("udpcb", sizeof(struct inpcb), maxsockets, + ZONE_INTERRUPT, 0); +} + +void +udp_input(m, off, proto) + register struct mbuf *m; + int off, proto; +{ + int iphlen = off; + register struct ip *ip; + register struct udphdr *uh; + register struct inpcb *inp; + struct mbuf *opts = 0; + int len; + struct ip save_ip; + struct sockaddr *append_sa; + + udpstat.udps_ipackets++; + + /* + * Strip IP options, if any; should skip this, + * make available to user, and use on returned packets, + * but we don't yet have a way to check the checksum + * with options still present. + */ + if (iphlen > sizeof (struct ip)) { + ip_stripoptions(m, (struct mbuf *)0); + iphlen = sizeof(struct ip); + } + + /* + * Get IP and UDP header together in first mbuf. + */ + ip = mtod(m, struct ip *); + if (m->m_len < iphlen + sizeof(struct udphdr)) { + if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == 0) { + udpstat.udps_hdrops++; + return; + } + ip = mtod(m, struct ip *); + } + uh = (struct udphdr *)((caddr_t)ip + iphlen); + + /* destination port of 0 is illegal, based on RFC768. */ + if (uh->uh_dport == 0) + goto bad; + + /* + * Make mbuf data length reflect UDP length. + * If not enough data to reflect UDP length, drop. + */ + len = ntohs((u_short)uh->uh_ulen); + if (ip->ip_len != len) { + if (len > ip->ip_len || len < sizeof(struct udphdr)) { + udpstat.udps_badlen++; + goto bad; + } + m_adj(m, len - ip->ip_len); + /* ip->ip_len = len; */ + } + /* + * Save a copy of the IP header in case we want restore it + * for sending an ICMP error message in response. + */ + if (!blackhole) + save_ip = *ip; + + /* + * Checksum extended UDP header and data. + */ + if (uh->uh_sum) { + if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { + if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) + uh->uh_sum = m->m_pkthdr.csum_data; + else + uh->uh_sum = in_pseudo(ip->ip_src.s_addr, + ip->ip_dst.s_addr, htonl((u_short)len + + m->m_pkthdr.csum_data + IPPROTO_UDP)); + uh->uh_sum ^= 0xffff; + } else { + bzero(((struct ipovly *)ip)->ih_x1, 9); + ((struct ipovly *)ip)->ih_len = uh->uh_ulen; + uh->uh_sum = in_cksum(m, len + sizeof (struct ip)); + } + if (uh->uh_sum) { + udpstat.udps_badsum++; + m_freem(m); + return; + } + } else + udpstat.udps_nosum++; + + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || + in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) { + struct inpcb *last; + /* + * Deliver a multicast or broadcast datagram to *all* sockets + * for which the local and remote addresses and ports match + * those of the incoming datagram. This allows more than + * one process to receive multi/broadcasts on the same port. + * (This really ought to be done for unicast datagrams as + * well, but that would cause problems with existing + * applications that open both address-specific sockets and + * a wildcard socket listening to the same port -- they would + * end up receiving duplicates of every unicast datagram. + * Those applications open the multiple sockets to overcome an + * inadequacy of the UDP socket interface, but for backwards + * compatibility we avoid the problem here rather than + * fixing the interface. Maybe 4.5BSD will remedy this?) + */ + + /* + * Construct sockaddr format source address. + */ + udp_in.sin_port = uh->uh_sport; + udp_in.sin_addr = ip->ip_src; + /* + * Locate pcb(s) for datagram. + * (Algorithm copied from raw_intr().) + */ + last = NULL; +#ifdef INET6 + udp_in6.uin6_init_done = udp_ip6.uip6_init_done = 0; +#endif + LIST_FOREACH(inp, &udb, inp_list) { +#ifdef INET6 + if ((inp->inp_vflag & INP_IPV4) == 0) + continue; +#endif + if (inp->inp_lport != uh->uh_dport) + continue; + if (inp->inp_laddr.s_addr != INADDR_ANY) { + if (inp->inp_laddr.s_addr != + ip->ip_dst.s_addr) + continue; + } + if (inp->inp_faddr.s_addr != INADDR_ANY) { + if (inp->inp_faddr.s_addr != + ip->ip_src.s_addr || + inp->inp_fport != uh->uh_sport) + continue; + } + + if (last != NULL) { + struct mbuf *n; + +#ifdef IPSEC + /* check AH/ESP integrity. */ + if (ipsec4_in_reject_so(m, last->inp_socket)) + ipsecstat.in_polvio++; + /* do not inject data to pcb */ + else +#endif /*IPSEC*/ + if ((n = m_copy(m, 0, M_COPYALL)) != NULL) + udp_append(last, ip, n, + iphlen + + sizeof(struct udphdr)); + } + last = inp; + /* + * Don't look for additional matches if this one does + * not have either the SO_REUSEPORT or SO_REUSEADDR + * socket options set. This heuristic avoids searching + * through all pcbs in the common case of a non-shared + * port. It * assumes that an application will never + * clear these options after setting them. + */ + if ((last->inp_socket->so_options&(SO_REUSEPORT|SO_REUSEADDR)) == 0) + break; + } + + if (last == NULL) { + /* + * No matching pcb found; discard datagram. + * (No need to send an ICMP Port Unreachable + * for a broadcast or multicast datgram.) + */ + udpstat.udps_noportbcast++; + goto bad; + } +#ifdef IPSEC + /* check AH/ESP integrity. */ + if (ipsec4_in_reject_so(m, last->inp_socket)) { + ipsecstat.in_polvio++; + goto bad; + } +#endif /*IPSEC*/ + udp_append(last, ip, m, iphlen + sizeof(struct udphdr)); + return; + } + /* + * Locate pcb for datagram. + */ + inp = in_pcblookup_hash(&udbinfo, ip->ip_src, uh->uh_sport, + ip->ip_dst, uh->uh_dport, 1, m->m_pkthdr.rcvif); + if (inp == NULL) { + if (log_in_vain) { + char buf[4*sizeof "123"]; + + strcpy(buf, inet_ntoa(ip->ip_dst)); + log(LOG_INFO, + "Connection attempt to UDP %s:%d from %s:%d\n", + buf, ntohs(uh->uh_dport), inet_ntoa(ip->ip_src), + ntohs(uh->uh_sport)); + } + udpstat.udps_noport++; + if (m->m_flags & (M_BCAST | M_MCAST)) { + udpstat.udps_noportbcast++; + goto bad; + } + if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0) + goto bad; + if (blackhole) + goto bad; + *ip = save_ip; + ip->ip_len += iphlen; + icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0); + return; + } +#ifdef IPSEC + if (ipsec4_in_reject_so(m, inp->inp_socket)) { + ipsecstat.in_polvio++; + goto bad; + } +#endif /*IPSEC*/ + + /* + * Construct sockaddr format source address. + * Stuff source address and datagram in user buffer. + */ + udp_in.sin_port = uh->uh_sport; + udp_in.sin_addr = ip->ip_src; + if (inp->inp_flags & INP_CONTROLOPTS + || inp->inp_socket->so_options & SO_TIMESTAMP) { +#ifdef INET6 + if (inp->inp_vflag & INP_IPV6) { + int savedflags; + + ip_2_ip6_hdr(&udp_ip6.uip6_ip6, ip); + savedflags = inp->inp_flags; + inp->inp_flags &= ~INP_UNMAPPABLEOPTS; + ip6_savecontrol(inp, &opts, &udp_ip6.uip6_ip6, m); + inp->inp_flags = savedflags; + } else +#endif + ip_savecontrol(inp, &opts, ip, m); + } + iphlen += sizeof(struct udphdr); + m_adj(m, iphlen); +#ifdef INET6 + if (inp->inp_vflag & INP_IPV6) { + in6_sin_2_v4mapsin6(&udp_in, &udp_in6.uin6_sin); + append_sa = (struct sockaddr *)&udp_in6; + } else +#endif + append_sa = (struct sockaddr *)&udp_in; + if (sbappendaddr(&inp->inp_socket->so_rcv, append_sa, m, opts) == 0) { + udpstat.udps_fullsock++; + goto bad; + } + sorwakeup(inp->inp_socket); + return; +bad: + m_freem(m); + if (opts) + m_freem(opts); + return; +} + +#ifdef INET6 +static void +ip_2_ip6_hdr(ip6, ip) + struct ip6_hdr *ip6; + struct ip *ip; +{ + bzero(ip6, sizeof(*ip6)); + + ip6->ip6_vfc = IPV6_VERSION; + ip6->ip6_plen = ip->ip_len; + ip6->ip6_nxt = ip->ip_p; + ip6->ip6_hlim = ip->ip_ttl; + ip6->ip6_src.s6_addr32[2] = ip6->ip6_dst.s6_addr32[2] = + IPV6_ADDR_INT32_SMP; + ip6->ip6_src.s6_addr32[3] = ip->ip_src.s_addr; + ip6->ip6_dst.s6_addr32[3] = ip->ip_dst.s_addr; +} +#endif + +/* + * subroutine of udp_input(), mainly for source code readability. + * caller must properly init udp_ip6 and udp_in6 beforehand. + */ +static void +udp_append(last, ip, n, off) + struct inpcb *last; + struct ip *ip; + struct mbuf *n; + int off; +{ + struct sockaddr *append_sa; + struct mbuf *opts = 0; + + if (last->inp_flags & INP_CONTROLOPTS || + last->inp_socket->so_options & SO_TIMESTAMP) { +#ifdef INET6 + if (last->inp_vflag & INP_IPV6) { + int savedflags; + + if (udp_ip6.uip6_init_done == 0) { + ip_2_ip6_hdr(&udp_ip6.uip6_ip6, ip); + udp_ip6.uip6_init_done = 1; + } + savedflags = last->inp_flags; + last->inp_flags &= ~INP_UNMAPPABLEOPTS; + ip6_savecontrol(last, &opts, &udp_ip6.uip6_ip6, n); + last->inp_flags = savedflags; + } else +#endif + ip_savecontrol(last, &opts, ip, n); + } +#ifdef INET6 + if (last->inp_vflag & INP_IPV6) { + if (udp_in6.uin6_init_done == 0) { + in6_sin_2_v4mapsin6(&udp_in, &udp_in6.uin6_sin); + udp_in6.uin6_init_done = 1; + } + append_sa = (struct sockaddr *)&udp_in6.uin6_sin; + } else +#endif + append_sa = (struct sockaddr *)&udp_in; + m_adj(n, off); + if (sbappendaddr(&last->inp_socket->so_rcv, append_sa, n, opts) == 0) { + m_freem(n); + if (opts) + m_freem(opts); + udpstat.udps_fullsock++; + } else + sorwakeup(last->inp_socket); +} + +/* + * Notify a udp user of an asynchronous error; + * just wake up so that he can collect error status. + */ +void +udp_notify(inp, errno) + register struct inpcb *inp; + int errno; +{ + inp->inp_socket->so_error = errno; + sorwakeup(inp->inp_socket); + sowwakeup(inp->inp_socket); +} + +void +udp_ctlinput(cmd, sa, vip) + int cmd; + struct sockaddr *sa; + void *vip; +{ + struct ip *ip = vip; + struct udphdr *uh; + void (*notify) __P((struct inpcb *, int)) = udp_notify; + struct in_addr faddr; + struct inpcb *inp; + int s; + + faddr = ((struct sockaddr_in *)sa)->sin_addr; + if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) + return; + + if (PRC_IS_REDIRECT(cmd)) { + ip = 0; + notify = in_rtchange; + } else if (cmd == PRC_HOSTDEAD) + ip = 0; + else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) + return; + if (ip) { + s = splnet(); + uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2)); + inp = in_pcblookup_hash(&udbinfo, faddr, uh->uh_dport, + ip->ip_src, uh->uh_sport, 0, NULL); + if (inp != NULL && inp->inp_socket != NULL) + (*notify)(inp, inetctlerrmap[cmd]); + splx(s); + } else + in_pcbnotifyall(&udb, faddr, inetctlerrmap[cmd], notify); +} + +static int +udp_pcblist(SYSCTL_HANDLER_ARGS) +{ + int error, i, n, s; + struct inpcb *inp, **inp_list; + inp_gen_t gencnt; + struct xinpgen xig; + + /* + * The process of preparing the TCB list is too time-consuming and + * resource-intensive to repeat twice on every request. + */ + if (req->oldptr == 0) { + n = udbinfo.ipi_count; + req->oldidx = 2 * (sizeof xig) + + (n + n/8) * sizeof(struct xinpcb); + return 0; + } + + if (req->newptr != 0) + return EPERM; + + /* + * OK, now we're committed to doing something. + */ + s = splnet(); + gencnt = udbinfo.ipi_gencnt; + n = udbinfo.ipi_count; + splx(s); + + xig.xig_len = sizeof xig; + xig.xig_count = n; + xig.xig_gen = gencnt; + xig.xig_sogen = so_gencnt; + error = SYSCTL_OUT(req, &xig, sizeof xig); + if (error) + return error; + + inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); + if (inp_list == 0) + return ENOMEM; + + s = splnet(); + for (inp = LIST_FIRST(udbinfo.listhead), i = 0; inp && i < n; + inp = LIST_NEXT(inp, inp_list)) { + if (inp->inp_gencnt <= gencnt && !prison_xinpcb(req->p, inp)) + inp_list[i++] = inp; + } + splx(s); + n = i; + + error = 0; + for (i = 0; i < n; i++) { + inp = inp_list[i]; + if (inp->inp_gencnt <= gencnt) { + struct xinpcb xi; + xi.xi_len = sizeof xi; + /* XXX should avoid extra copy */ + bcopy(inp, &xi.xi_inp, sizeof *inp); + if (inp->inp_socket) + sotoxsocket(inp->inp_socket, &xi.xi_socket); + error = SYSCTL_OUT(req, &xi, sizeof xi); + } + } + if (!error) { + /* + * Give the user an updated idea of our state. + * If the generation differs from what we told + * her before, she knows that something happened + * while we were processing this request, and it + * might be necessary to retry. + */ + s = splnet(); + xig.xig_gen = udbinfo.ipi_gencnt; + xig.xig_sogen = so_gencnt; + xig.xig_count = udbinfo.ipi_count; + splx(s); + error = SYSCTL_OUT(req, &xig, sizeof xig); + } + free(inp_list, M_TEMP); + return error; +} + +SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0, + udp_pcblist, "S,xinpcb", "List of active UDP sockets"); + +static int +udp_getcred(SYSCTL_HANDLER_ARGS) +{ + struct xucred xuc; + struct sockaddr_in addrs[2]; + struct inpcb *inp; + int error, s; + + error = suser(req->p); + if (error) + return (error); + error = SYSCTL_IN(req, addrs, sizeof(addrs)); + if (error) + return (error); + s = splnet(); + inp = in_pcblookup_hash(&udbinfo, addrs[1].sin_addr, addrs[1].sin_port, + addrs[0].sin_addr, addrs[0].sin_port, 1, NULL); + if (inp == NULL || inp->inp_socket == NULL) { + error = ENOENT; + goto out; + } + bzero(&xuc, sizeof(xuc)); + xuc.cr_uid = inp->inp_socket->so_cred->cr_uid; + xuc.cr_ngroups = inp->inp_socket->so_cred->cr_ngroups; + bcopy(inp->inp_socket->so_cred->cr_groups, xuc.cr_groups, + sizeof(xuc.cr_groups)); + error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); +out: + splx(s); + return (error); +} + +SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW, + 0, 0, udp_getcred, "S,xucred", "Get the xucred of a UDP connection"); + +static int +udp_output(inp, m, addr, control, p) + register struct inpcb *inp; + struct mbuf *m; + struct sockaddr *addr; + struct mbuf *control; + struct proc *p; +{ + register struct udpiphdr *ui; + register int len = m->m_pkthdr.len; + struct in_addr laddr; + struct sockaddr_in *sin; + int s = 0, error = 0; + + if (control) + m_freem(control); /* XXX */ + + if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) { + error = EMSGSIZE; + goto release; + } + + if (addr) { + sin = (struct sockaddr_in *)addr; + if (p && jailed(p->p_ucred)) + prison_remote_ip(p->p_ucred, 0, &sin->sin_addr.s_addr); + laddr = inp->inp_laddr; + if (inp->inp_faddr.s_addr != INADDR_ANY) { + error = EISCONN; + goto release; + } + /* + * Must block input while temporarily connected. + */ + s = splnet(); + error = in_pcbconnect(inp, addr, p); + if (error) { + splx(s); + goto release; + } + } else { + if (inp->inp_faddr.s_addr == INADDR_ANY) { + error = ENOTCONN; + goto release; + } + } + /* + * Calculate data length and get a mbuf + * for UDP and IP headers. + */ + M_PREPEND(m, sizeof(struct udpiphdr), M_DONTWAIT); + if (m == 0) { + error = ENOBUFS; + if (addr) + splx(s); + goto release; + } + + /* + * Fill in mbuf with extended UDP header + * and addresses and length put into network format. + */ + ui = mtod(m, struct udpiphdr *); + bzero(ui->ui_x1, sizeof(ui->ui_x1)); /* XXX still needed? */ + ui->ui_pr = IPPROTO_UDP; + ui->ui_src = inp->inp_laddr; + ui->ui_dst = inp->inp_faddr; + ui->ui_sport = inp->inp_lport; + ui->ui_dport = inp->inp_fport; + ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr)); + + /* + * Set up checksum and output datagram. + */ + if (udpcksum) { + ui->ui_sum = in_pseudo(ui->ui_src.s_addr, ui->ui_dst.s_addr, + htons((u_short)len + sizeof(struct udphdr) + IPPROTO_UDP)); + m->m_pkthdr.csum_flags = CSUM_UDP; + m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); + } else { + ui->ui_sum = 0; + } + ((struct ip *)ui)->ip_len = sizeof (struct udpiphdr) + len; + ((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl; /* XXX */ + ((struct ip *)ui)->ip_tos = inp->inp_ip_tos; /* XXX */ + udpstat.udps_opackets++; + +#ifdef IPSEC + ipsec_setsocket(m, inp->inp_socket); +#endif /*IPSEC*/ + error = ip_output(m, inp->inp_options, &inp->inp_route, + (inp->inp_socket->so_options & (SO_DONTROUTE | SO_BROADCAST)), + inp->inp_moptions); + + if (addr) { + in_pcbdisconnect(inp); + inp->inp_laddr = laddr; /* XXX rehash? */ + splx(s); + } + return (error); + +release: + m_freem(m); + return (error); +} + +u_long udp_sendspace = 9216; /* really max datagram size */ + /* 40 1K datagrams */ +SYSCTL_INT(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW, + &udp_sendspace, 0, "Maximum outgoing UDP datagram size"); + +u_long udp_recvspace = 40 * (1024 + +#ifdef INET6 + sizeof(struct sockaddr_in6) +#else + sizeof(struct sockaddr_in) +#endif + ); +SYSCTL_INT(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW, + &udp_recvspace, 0, "Maximum incoming UDP datagram size"); + +static int +udp_abort(struct socket *so) +{ + struct inpcb *inp; + int s; + + inp = sotoinpcb(so); + if (inp == 0) + return EINVAL; /* ??? possible? panic instead? */ + soisdisconnected(so); + s = splnet(); + in_pcbdetach(inp); + splx(s); + return 0; +} + +static int +udp_attach(struct socket *so, int proto, struct proc *p) +{ + struct inpcb *inp; + int s, error; + + inp = sotoinpcb(so); + if (inp != 0) + return EINVAL; + + error = soreserve(so, udp_sendspace, udp_recvspace); + if (error) + return error; + s = splnet(); + error = in_pcballoc(so, &udbinfo, p); + splx(s); + if (error) + return error; + + inp = (struct inpcb *)so->so_pcb; + inp->inp_vflag |= INP_IPV4; + inp->inp_ip_ttl = ip_defttl; +#ifdef IPSEC + error = ipsec_init_policy(so, &inp->inp_sp); + if (error != 0) { + in_pcbdetach(inp); + return error; + } +#endif /*IPSEC*/ + return 0; +} + +static int +udp_bind(struct socket *so, struct sockaddr *nam, struct proc *p) +{ + struct inpcb *inp; + int s, error; + + inp = sotoinpcb(so); + if (inp == 0) + return EINVAL; + s = splnet(); + error = in_pcbbind(inp, nam, p); + splx(s); + return error; +} + +static int +udp_connect(struct socket *so, struct sockaddr *nam, struct proc *p) +{ + struct inpcb *inp; + int s, error; + struct sockaddr_in *sin; + + inp = sotoinpcb(so); + if (inp == 0) + return EINVAL; + if (inp->inp_faddr.s_addr != INADDR_ANY) + return EISCONN; + s = splnet(); + sin = (struct sockaddr_in *)nam; + if (p && jailed(p->p_ucred)) + prison_remote_ip(p->p_ucred, 0, &sin->sin_addr.s_addr); + error = in_pcbconnect(inp, nam, p); + splx(s); + if (error == 0) + soisconnected(so); + return error; +} + +static int +udp_detach(struct socket *so) +{ + struct inpcb *inp; + int s; + + inp = sotoinpcb(so); + if (inp == 0) + return EINVAL; + s = splnet(); + in_pcbdetach(inp); + splx(s); + return 0; +} + +static int +udp_disconnect(struct socket *so) +{ + struct inpcb *inp; + int s; + + inp = sotoinpcb(so); + if (inp == 0) + return EINVAL; + if (inp->inp_faddr.s_addr == INADDR_ANY) + return ENOTCONN; + + s = splnet(); + in_pcbdisconnect(inp); + inp->inp_laddr.s_addr = INADDR_ANY; + splx(s); + so->so_state &= ~SS_ISCONNECTED; /* XXX */ + return 0; +} + +static int +udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, + struct mbuf *control, struct proc *p) +{ + struct inpcb *inp; + + inp = sotoinpcb(so); + if (inp == 0) { + m_freem(m); + return EINVAL; + } + return udp_output(inp, m, addr, control, p); +} + +int +udp_shutdown(struct socket *so) +{ + struct inpcb *inp; + + inp = sotoinpcb(so); + if (inp == 0) + return EINVAL; + socantsendmore(so); + return 0; +} + +struct pr_usrreqs udp_usrreqs = { + udp_abort, pru_accept_notsupp, udp_attach, udp_bind, udp_connect, + pru_connect2_notsupp, in_control, udp_detach, udp_disconnect, + pru_listen_notsupp, in_setpeeraddr, pru_rcvd_notsupp, + pru_rcvoob_notsupp, udp_send, pru_sense_null, udp_shutdown, + in_setsockaddr, sosend, soreceive, sopoll +}; diff --git a/sys/netinet/udp_var.h b/sys/netinet/udp_var.h new file mode 100644 index 0000000..b0b2667 --- /dev/null +++ b/sys/netinet/udp_var.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)udp_var.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_UDP_VAR_H_ +#define _NETINET_UDP_VAR_H_ + +/* + * UDP kernel structures and variables. + */ +struct udpiphdr { + struct ipovly ui_i; /* overlaid ip structure */ + struct udphdr ui_u; /* udp header */ +}; +#define ui_x1 ui_i.ih_x1 +#define ui_pr ui_i.ih_pr +#define ui_len ui_i.ih_len +#define ui_src ui_i.ih_src +#define ui_dst ui_i.ih_dst +#define ui_sport ui_u.uh_sport +#define ui_dport ui_u.uh_dport +#define ui_ulen ui_u.uh_ulen +#define ui_sum ui_u.uh_sum + +struct udpstat { + /* input statistics: */ + u_long udps_ipackets; /* total input packets */ + u_long udps_hdrops; /* packet shorter than header */ + u_long udps_badsum; /* checksum error */ + u_long udps_nosum; /* no checksum */ + u_long udps_badlen; /* data length larger than packet */ + u_long udps_noport; /* no socket on port */ + u_long udps_noportbcast; /* of above, arrived as broadcast */ + u_long udps_fullsock; /* not delivered, input socket full */ + u_long udpps_pcbcachemiss; /* input packets missing pcb cache */ + u_long udpps_pcbhashmiss; /* input packets not for hashed pcb */ + /* output statistics: */ + u_long udps_opackets; /* total output packets */ + u_long udps_fastout; /* output packets on fast path */ + /* of no socket on port, arrived as multicast */ + u_long udps_noportmcast; +}; + +/* + * Names for UDP sysctl objects + */ +#define UDPCTL_CHECKSUM 1 /* checksum UDP packets */ +#define UDPCTL_STATS 2 /* statistics (read-only) */ +#define UDPCTL_MAXDGRAM 3 /* max datagram size */ +#define UDPCTL_RECVSPACE 4 /* default receive buffer space */ +#define UDPCTL_PCBLIST 5 /* list of PCBs for UDP sockets */ +#define UDPCTL_MAXID 6 + +#define UDPCTL_NAMES { \ + { 0, 0 }, \ + { "checksum", CTLTYPE_INT }, \ + { "stats", CTLTYPE_STRUCT }, \ + { "maxdgram", CTLTYPE_INT }, \ + { "recvspace", CTLTYPE_INT }, \ + { "pcblist", CTLTYPE_STRUCT }, \ +} + +#ifdef _KERNEL +SYSCTL_DECL(_net_inet_udp); + +extern struct pr_usrreqs udp_usrreqs; +extern struct inpcbhead udb; +extern struct inpcbinfo udbinfo; +extern u_long udp_sendspace; +extern u_long udp_recvspace; +extern struct udpstat udpstat; +extern int log_in_vain; + +void udp_ctlinput __P((int, struct sockaddr *, void *)); +void udp_init __P((void)); +void udp_input __P((struct mbuf *, int, int)); + +void udp_notify __P((struct inpcb *inp, int errno)); +int udp_shutdown __P((struct socket *so)); +#endif + +#endif |